In [35]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.metrics import accuracy_score

In [25]:
import nltk
nltk.download('all')
from nltk.corpus import stopwords
stop_words = list(stopwords.words('english'))

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_r

#Problem_1
Use the popular SMS Spam Collection dataset (available on Kaggle), which contains labeled
messages as either ”spam” or ”ham” (not spam), stored in a Pandas DataFrame with columns

Label (spam/ham) and Message (text). Perform the following tasks:

• Preprocess each message by tokenizing, removing stop words, and lowercasing the text.

• Load the pre-trained Google News Word2Vec model using gensim.

• Convert each message into a fixed-length vector by averaging the Word2Vec vectors of
all the words in the message (ignore words not found in the model vocabulary).

• Split the dataset into training (80%) and testing (20%) sets using train test split.

• Train a Logistic Regression classifier on the vectorized training data and print the
accuracy on the test set.

• Write a Python function predict message class(model, w2v model, message) that
takes a trained classifier, the Word2Vec model, and a single message (string), and
returns the predicted class (spam or ham).

In [73]:
data = pd.read_csv('/content/spam.csv', encoding='latin1')
df = pd.DataFrame(data)
df.head(4)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,


In [74]:
def to_lower_case(text):
  text = text.lower()
  return text

def remove_stop_words(text_list):
  filtered_text = [word for word in text_list if word not in stop_words]
  return filtered_text

def tokenize(text):
  return nltk.word_tokenize(text)

In [75]:
df['lower_case_messages'] = df['v2'].apply(lambda x: to_lower_case(x))
df['tokenized_messages'] = df['lower_case_messages'].apply(lambda y: tokenize(y))
df['filtered_messages'] = df['tokenized_messages'].apply(lambda z: remove_stop_words(z))
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4,lower_case_messages,tokenized_messages,filtered_messages
0,ham,"Go until jurong point, crazy.. Available only ...",,,,"go until jurong point, crazy.. available only ...","[go, until, jurong, point, ,, crazy, .., avail...","[go, jurong, point, ,, crazy, .., available, b..."
1,ham,Ok lar... Joking wif u oni...,,,,ok lar... joking wif u oni...,"[ok, lar, ..., joking, wif, u, oni, ...]","[ok, lar, ..., joking, wif, u, oni, ...]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...","[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,ham,U dun say so early hor... U c already then say...,,,,u dun say so early hor... u c already then say...,"[u, dun, say, so, early, hor, ..., u, c, alrea...","[u, dun, say, early, hor, ..., u, c, already, ..."
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,,"nah i don't think he goes to usf, he lives aro...","[nah, i, do, n't, think, he, goes, to, usf, ,,...","[nah, n't, think, goes, usf, ,, lives, around,..."


In [76]:
!pip install gensim



In [77]:
import gensim.downloader as api
model = api.load('word2vec-google-news-300')

In [78]:
def avg_word2vec(filtered_list):
  valid_vectors = [model[word] for word in filtered_list if word in model]
  if not valid_vectors:
    return np.zeros(model.vector_size)

  return np.mean(valid_vectors, axis=0)

df['avg_word2vec'] = df['filtered_messages'].apply(lambda x: avg_word2vec(x))
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4,lower_case_messages,tokenized_messages,filtered_messages,avg_word2vec
0,ham,"Go until jurong point, crazy.. Available only ...",,,,"go until jurong point, crazy.. available only ...","[go, until, jurong, point, ,, crazy, .., avail...","[go, jurong, point, ,, crazy, .., available, b...","[-0.019805908, 0.05167062, 0.02709961, 0.21868..."
1,ham,Ok lar... Joking wif u oni...,,,,ok lar... joking wif u oni...,"[ok, lar, ..., joking, wif, u, oni, ...]","[ok, lar, ..., joking, wif, u, oni, ...]","[-0.06323496, 0.0803833, 0.060943604, 0.102498..."
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...","[free, entry, 2, wkly, comp, win, fa, cup, fin...","[-0.03482437, -0.00703014, -0.06348601, 0.1161..."
3,ham,U dun say so early hor... U c already then say...,,,,u dun say so early hor... u c already then say...,"[u, dun, say, so, early, hor, ..., u, c, alrea...","[u, dun, say, early, hor, ..., u, c, already, ...","[-0.06568061, 0.0262146, 0.1081543, 0.0869751,..."
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,,"nah i don't think he goes to usf, he lives aro...","[nah, i, do, n't, think, he, goes, to, usf, ,,...","[nah, n't, think, goes, usf, ,, lives, around,...","[0.01461792, 0.07184219, -0.005203247, 0.14686..."


In [79]:
df=df.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'],axis=0)
df.head()

Unnamed: 0,v1,v2,lower_case_messages,tokenized_messages,filtered_messages,avg_word2vec
0,ham,"Go until jurong point, crazy.. Available only ...","go until jurong point, crazy.. available only ...","[go, until, jurong, point, ,, crazy, .., avail...","[go, jurong, point, ,, crazy, .., available, b...","[-0.019805908, 0.05167062, 0.02709961, 0.21868..."
1,ham,Ok lar... Joking wif u oni...,ok lar... joking wif u oni...,"[ok, lar, ..., joking, wif, u, oni, ...]","[ok, lar, ..., joking, wif, u, oni, ...]","[-0.06323496, 0.0803833, 0.060943604, 0.102498..."
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...","[free, entry, 2, wkly, comp, win, fa, cup, fin...","[-0.03482437, -0.00703014, -0.06348601, 0.1161..."
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor... u c already then say...,"[u, dun, say, so, early, hor, ..., u, c, alrea...","[u, dun, say, early, hor, ..., u, c, already, ...","[-0.06568061, 0.0262146, 0.1081543, 0.0869751,..."
4,ham,"Nah I don't think he goes to usf, he lives aro...","nah i don't think he goes to usf, he lives aro...","[nah, i, do, n't, think, he, goes, to, usf, ,,...","[nah, n't, think, goes, usf, ,, lives, around,...","[0.01461792, 0.07184219, -0.005203247, 0.14686..."


In [80]:
X = np.vstack(df['avg_word2vec'].values)
y = df['v1']
X_train, X_test, Y_train, Y_test = train_test_split(X,y,test_size=0.2,random_state=42)
model_LR = LogisticRegression()
model_LR.fit(X_train,Y_train)
y_predicted = model_LR.predict(X_test)
acc = accuracy_score(y_predicted, Y_test)
print("Accuracy of model: ", np.round(acc*100,2), "%")

Accuracy of model:  94.53 %


In [87]:
def predict_message_class(model_LR, avg_word2vec, message):
  # Using above defined functions for removing stop_words and tokenizing
  message = to_lower_case(message)
  tokenized_message = tokenize(message)
  filtered_message = remove_stop_words(message)
  # Using above defined function to convert it into word2vec
  word_vec = avg_word2vec(filtered_message).reshape(1,-1)
  prediction = model_LR.predict(word_vec)
  print("Message is ", prediction)

In [88]:
predict_message_class(model_LR, avg_word2vec, "You Have won a prize of Rs.1000 Provide your detail like pan card no to redeem")

Message is  ['spam']


#Problem 2
Use the Twitter US Airline Sentiment dataset (available on Kaggle), which contains tweets
labeled with the sentiment of the user toward airlines (positive, negative, or neutral). The
data is stored in a Pandas DataFrame with columns such as airline sentiment (target)
and text (tweet content). Perform the following tasks:

• Preprocess each tweet using the following steps:

- Convert the text to lowercase.
- Remove URLs, mentions (e.g., @username), hashtags, and punctuation.
- Expand common contractions (e.g., "don’t" → "do not").
- Lemmatize the words (use NLTK).
- Optionally remove emojis and special symbols.

• Load the pre-trained Google News Word2Vec model using gensim.

• Convert each tweet into a fixed-length vector by averaging the Word2Vec word vectors
for all words in the tweet. Ignore words not found in the embeddings.

• Split the dataset into training (80%) and testing (20%) sets using train test split.

• Train a Logistic Regression classifier on the vectorized training data and report the
accuracy on the test set.

• Write a Python function predict tweet sentiment(model, glove model, tweet)
that takes the trained classifier, the GloVe model, and a single tweet (string), and
returns the predicted sentiment (positive, negative, or neutral).

In [136]:
import re
import string
from nltk.corpus import wordnet
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer

In [122]:
data_tweets = pd.read_csv('/content/Tweets.csv', encoding='latin1')
df_new = pd.DataFrame(data_tweets)

In [123]:
#I am using above defined functions to preprocess the data
df_new['lower_case_tweets'] = df_new['text'].apply(lambda x: to_lower_case(x))
def remove_url_punctuations(text):
  text = re.sub(r'http\S+|www\.\S+','',text)
  text = re.sub(r'@\w+|#\w+', '', text)
  text = text.translate(str.maketrans('', '', string.punctuation))
  return text
df_new['removed_urls_punctuations'] = df_new['lower_case_tweets'].apply(lambda x: remove_url_punctuations(x))

In [112]:
!pip install contractions
import contractions



In [125]:
def expand_text(text):
  expanded_text = contractions.fix(text)
  return expanded_text
df_new['expanded_filtered_text'] = df_new['removed_urls_punctuations'].apply(lambda y: expand_text(y))

In [126]:
df_new = df_new.drop(columns=['airline_sentiment_confidence','negativereason','negativereason_confidence',
                              'airline_sentiment_gold','negativereason_gold','retweet_count','tweet_location',
                              'user_timezone','tweet_id','tweet_coord'])
df_new.head()

Unnamed: 0,airline_sentiment,airline,name,text,tweet_created,lower_case_tweets,removed_urls_punctuations,expanded_filtered_text
0,neutral,Virgin America,cairdin,@VirginAmerica What @dhepburn said.,2015-02-24 11:35:52 -0800,@virginamerica what @dhepburn said.,what said,what said
1,positive,Virgin America,jnardino,@VirginAmerica plus you've added commercials t...,2015-02-24 11:15:59 -0800,@virginamerica plus you've added commercials t...,plus youve added commercials to the experienc...,plus you have added commercials to the experi...
2,neutral,Virgin America,yvonnalynn,@VirginAmerica I didn't today... Must mean I n...,2015-02-24 11:15:48 -0800,@virginamerica i didn't today... must mean i n...,i didnt today must mean i need to take anothe...,i did not today must mean i need to take anot...
3,negative,Virgin America,jnardino,@VirginAmerica it's really aggressive to blast...,2015-02-24 11:15:36 -0800,@virginamerica it's really aggressive to blast...,its really aggressive to blast obnoxious ente...,its really aggressive to blast obnoxious ente...
4,negative,Virgin America,jnardino,@VirginAmerica and it's a really big bad thing...,2015-02-24 11:14:45 -0800,@virginamerica and it's a really big bad thing...,and its a really big bad thing about it,and its a really big bad thing about it


In [137]:
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def Lemmatize(text):
    tokens = word_tokenize(text)
    tagged = pos_tag(tokens)
    lemmatized_words = [
        lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in tagged
    ]
    return ' '.join(lemmatized_words)

df_new['lemmatized_text'] = df_new['expanded_filtered_text'].apply(lambda z: Lemmatize(z))

In [139]:
df_new.head()

Unnamed: 0,airline_sentiment,airline,name,text,tweet_created,lower_case_tweets,removed_urls_punctuations,expanded_filtered_text,lemmatized_text
0,neutral,Virgin America,cairdin,@VirginAmerica What @dhepburn said.,2015-02-24 11:35:52 -0800,@virginamerica what @dhepburn said.,what said,what said,what say
1,positive,Virgin America,jnardino,@VirginAmerica plus you've added commercials t...,2015-02-24 11:15:59 -0800,@virginamerica plus you've added commercials t...,plus youve added commercials to the experienc...,plus you have added commercials to the experi...,plus you have add commercial to the experience...
2,neutral,Virgin America,yvonnalynn,@VirginAmerica I didn't today... Must mean I n...,2015-02-24 11:15:48 -0800,@virginamerica i didn't today... must mean i n...,i didnt today must mean i need to take anothe...,i did not today must mean i need to take anot...,i do not today must mean i need to take anothe...
3,negative,Virgin America,jnardino,@VirginAmerica it's really aggressive to blast...,2015-02-24 11:15:36 -0800,@virginamerica it's really aggressive to blast...,its really aggressive to blast obnoxious ente...,its really aggressive to blast obnoxious ente...,it really aggressive to blast obnoxious entert...
4,negative,Virgin America,jnardino,@VirginAmerica and it's a really big bad thing...,2015-02-24 11:14:45 -0800,@virginamerica and it's a really big bad thing...,and its a really big bad thing about it,and its a really big bad thing about it,and it a really big bad thing about it


In [140]:
def word2vec(text):
  tokens = nltk.word_tokenize(text)
  valid_vectors_ = [model[words] for words in tokens if words in model]
  if not valid_vectors_:
    return np.zeros(model.vector_size)
  return np.mean(valid_vectors_, axis=0)

df_new['word2vec'] = df_new['lemmatized_text'].apply(lambda x: word2vec(x))
df_new.head()

Unnamed: 0,airline_sentiment,airline,name,text,tweet_created,lower_case_tweets,removed_urls_punctuations,expanded_filtered_text,lemmatized_text,word2vec
0,neutral,Virgin America,cairdin,@VirginAmerica What @dhepburn said.,2015-02-24 11:35:52 -0800,@virginamerica what @dhepburn said.,what said,what said,what say,"[0.051757812, -0.06362915, 0.1743164, 0.093505..."
1,positive,Virgin America,jnardino,@VirginAmerica plus you've added commercials t...,2015-02-24 11:15:59 -0800,@virginamerica plus you've added commercials t...,plus youve added commercials to the experienc...,plus you have added commercials to the experi...,plus you have add commercial to the experience...,"[0.022262573, -0.0069885254, -0.02242279, 0.10..."
2,neutral,Virgin America,yvonnalynn,@VirginAmerica I didn't today... Must mean I n...,2015-02-24 11:15:48 -0800,@virginamerica i didn't today... must mean i n...,i didnt today must mean i need to take anothe...,i did not today must mean i need to take anot...,i do not today must mean i need to take anothe...,"[-0.023753773, 0.019930752, 0.05172452, 0.0965..."
3,negative,Virgin America,jnardino,@VirginAmerica it's really aggressive to blast...,2015-02-24 11:15:36 -0800,@virginamerica it's really aggressive to blast...,its really aggressive to blast obnoxious ente...,its really aggressive to blast obnoxious ente...,it really aggressive to blast obnoxious entert...,"[0.019074503, 0.07301521, 0.0019124349, 0.0949..."
4,negative,Virgin America,jnardino,@VirginAmerica and it's a really big bad thing...,2015-02-24 11:14:45 -0800,@virginamerica and it's a really big bad thing...,and its a really big bad thing about it,and its a really big bad thing about it,and it a really big bad thing about it,"[0.11593192, 0.024156298, 0.043247767, 0.08426..."


In [143]:
x_train, x_test, y_train, y_test = train_test_split(np.vstack(df_new['word2vec'].values),df_new['airline_sentiment'])
model_LR2 = LogisticRegression()
model_LR2.fit(x_train, y_train)
predictions = model_LR2.predict(x_test)
acc = accuracy_score(predictions, y_test)*100
print("Accuracy: ", acc)

Accuracy:  76.80327868852459


In [148]:
def predict_tweet_sentiment(model, word2vec, text):
  text = to_lower_case(text)
  text = remove_url_punctuations(text)
  text = expand_text(text)
  predicted_sentiment = model.predict(word2vec(text).reshape((1,-1)))
  return predicted_sentiment
print(predict_tweet_sentiment(model_LR2, word2vec, "It was a bad experience"))

['negative']


#Problem 3
Manually implement the TF-IDF algorithm and compare the results with the outputs from scikit-learn's CountVectorizer and TfidfVectorizer.

In [167]:
corpus = [
    'the sun is a star',
    'the moon is a satellite',
    'the sun and moon are celestial bodies'
]

In [169]:
df = pd.DataFrame({'the':[1,1,1],'sun':[1,0,1],'is':[1,1,0],'a':[1,1,0],'star':[1,0,0],
                   'moon':[0,1,1],'satellite':[0,1,0],'and':[0,0,1], 'are':[0,0,1],
                   'celestial':[0,0,1],'bodies':[0,0,1]}, index=['Doc1','Doc2','Doc3'])
df

Unnamed: 0,the,sun,is,a,star,moon,satellite,and,are,celestial,bodies
Doc1,1,1,1,1,1,0,0,0,0,0,0
Doc2,1,0,1,1,0,1,1,0,0,0,0
Doc3,1,1,0,0,0,1,0,1,1,1,1


In [189]:
columns = list(df.columns)
sum = []
for column in columns:
  sum.append(np.sum(df[column]))
sum

[3, 2, 2, 2, 1, 2, 1, 1, 1, 1, 1]

In [190]:
sum = np.array(sum)
idf = []
for s in sum:
  idf.append(np.log10(3/s))
idf = np.array(idf)
idf = np.vstack((idf,idf,idf))
idf

array([[0.        , 0.17609126, 0.17609126, 0.17609126, 0.47712125,
        0.17609126, 0.47712125, 0.47712125, 0.47712125, 0.47712125,
        0.47712125],
       [0.        , 0.17609126, 0.17609126, 0.17609126, 0.47712125,
        0.17609126, 0.47712125, 0.47712125, 0.47712125, 0.47712125,
        0.47712125],
       [0.        , 0.17609126, 0.17609126, 0.17609126, 0.47712125,
        0.17609126, 0.47712125, 0.47712125, 0.47712125, 0.47712125,
        0.47712125]])

In [192]:
tf_idf = idf*df
tf_idf

Unnamed: 0,the,sun,is,a,star,moon,satellite,and,are,celestial,bodies
Doc1,0.0,0.176091,0.176091,0.176091,0.477121,0.0,0.0,0.0,0.0,0.0,0.0
Doc2,0.0,0.0,0.176091,0.176091,0.0,0.176091,0.477121,0.0,0.0,0.0,0.0
Doc3,0.0,0.176091,0.0,0.0,0.0,0.176091,0.0,0.477121,0.477121,0.477121,0.477121


In [194]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
print(vectorizer.fit_transform(corpus))

  (0, 9)	0.3731188059313277
  (0, 8)	0.4804583972923858
  (0, 4)	0.4804583972923858
  (0, 7)	0.6317450542765208
  (1, 9)	0.3731188059313277
  (1, 4)	0.4804583972923858
  (1, 5)	0.4804583972923858
  (1, 6)	0.6317450542765208
  (2, 9)	0.2517108425440014
  (2, 8)	0.3241235393856436
  (2, 5)	0.3241235393856436
  (2, 0)	0.42618350336974425
  (2, 1)	0.42618350336974425
  (2, 3)	0.42618350336974425
  (2, 2)	0.42618350336974425
