# Sentiment analysis : Logistic regression

_deeplearning.ai/Natural Language Processing Specialization_

In [1]:
import nltk
import os
import pandas as pd
import numpy as np
from nltk.corpus import twitter_samples

#### Prepare the data

In [2]:
#nltk.download("twitter_samples")

In [3]:
all_pos_tweets = twitter_samples.strings("positive_tweets.json")
all_neg_tweets = twitter_samples.strings("negative_tweets.json")

In [4]:
#len(all_pos_tweets),len(all_pos_tweets)
print(all_pos_tweets[0],'\n',all_neg_tweets[0])

#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :) 
 hopeless for tmr :(


#### train-test-split

In [5]:
X = np.hstack((np.array(all_pos_tweets),np.array(all_pos_tweets)))
y = np.hstack((np.zeros(len(all_pos_tweets)),np.ones(len(all_neg_tweets))))

In [6]:
import sklearn
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.2, random_state=42)

#### Preprocessing word

In [7]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
import re
import string

def process_tweet(tweet):
    
    stopwords_ = stopwords.words('english')
    # remove stock market ticker
    tweet = re.sub(r'\$\w*','',tweet)
    # remove RT
    tweet = re.sub(r'^RT[\s]+','',tweet)
    # remove hyperlink
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtag
    tweet = re.sub(r'#', '', tweet)
    
    tokenizer = TweetTokenizer(preserve_case = False, strip_handles = True, reduce_len = True)
    tweet_tokens = tokenizer.tokenize(tweet)
    tweets_clean = []
    
    for word in tweet_tokens:
        if word not in stopwords_ and word not in string.punctuation:
            stem_word = PorterStemmer().stem(word)
            tweets_clean.append(stem_word)
    return tweets_clean

In [8]:
# tweets = X_train.tolist()
# print(tweets[0])
# process_tweet(tweets[0])

In [9]:
def word_freqs(tweets,class_):
    freq_dict = {}
    tweets = tweets.tolist()
    class_ = class_.tolist()
    for tweet,c in zip(tweets,class_):
        for word in process_tweet(tweet):
            if (word,c) not in freq_dict:
                freq_dict[(word,c)] = 1
            else:
                freq_dict[(word,c)] += 1
    return freq_dict

In [10]:
freqs = word_freqs(X_train,y_train)
freq_dict = pd.DataFrame({'Pairs': list(freqs.keys()), 'Frequence': list(freqs.values())})
freq_dict.head()

Unnamed: 0,Pairs,Frequence
0,"(friday, 1.0)",91
1,"(:), 1.0)",2891
2,"(good, 0.0)",195
3,"(thank, 0.0)",464
4,"(anyway, 0.0)",13


#### Create the freq table like the lecture note

In [11]:
word_freq = freq_dict.copy()
word_freq["Vocabulary"] = word_freq["Pairs"].apply(lambda x : x[0])
word_freq

Unnamed: 0,Pairs,Frequence,Vocabulary
0,"(friday, 1.0)",91,friday
1,"(:), 1.0)",2891,:)
2,"(good, 0.0)",195,good
3,"(thank, 0.0)",464,thank
4,"(anyway, 0.0)",13,anyway
...,...,...,...
11518,"(unreal, 1.0)",1,unreal
11519,"(zain, 1.0)",1,zain
11520,"(zac, 1.0)",1,zac
11521,"(isaac, 1.0)",1,isaac


In [12]:

key_words = list(set(list(word_freq.Vocabulary.values)))
freq_dict_info = pd.DataFrame(columns = ["Vocabulary","PosFreq(1)","NegFreq(1)"])

for w in key_words:
    pos_freq = freqs.get((w,1.0),0)
    neg_freq = freqs.get((w,0.0),0)
    freq_dict_info = freq_dict_info.append({"Vocabulary": w, "PosFreq(1)" : pos_freq,"NegFreq(1)":neg_freq},ignore_index=True)

In [13]:
freq_dict_info = freq_dict_info.sort_values(['Vocabulary'])
freq_dict_info

Unnamed: 0,Vocabulary,PosFreq(1),NegFreq(1)
3934,(-:,2,2
2126,(:,0,1
945,):,5,5
3474,--->,1,0
5222,-->,2,2
...,...,...,...
3569,🚂,0,1
544,🚮,1,1
3605,🚲,2,2
644,󾌴,1,0


In [14]:
def extract_features(processed_tweet):
    pos_freq = freq_dict_info.loc[freq_dict_info.Vocabulary.isin(processed_tweet)].sum().values[1]
    neg_freq = freq_dict_info.loc[freq_dict_info.Vocabulary.isin(processed_tweet)].sum().values[2]
    return [1,pos_freq,neg_freq]

In [15]:
train_df = pd.DataFrame(X_train,columns = ["Tweet"])
train_df["Processed_tweet"] = train_df.Tweet.apply(process_tweet)
train_df["Features"] = train_df.Processed_tweet.apply(extract_features)
train_df

Unnamed: 0,Tweet,Processed_tweet,Features
0,Friday!:) http://t.co/HUoq4txhmb,"[friday, :)]","[1, 2982, 2945]"
1,"@Lachdog_AU @Posica all good, thanks anyway :)","[good, thank, anyway, :)]","[1, 3605, 3523]"
2,my bf is mean :),"[bf, mean, :)]","[1, 2910, 2874]"
3,@bwoyblunder @rajudasonline Sorted :). Thanks....,"[sort, :), thank, daaru, parti, chaddi, bro]","[1, 3429, 3345]"
4,@narrhallamarsch Good Flight! :),"[good, flight, :)]","[1, 3088, 3048]"
...,...,...,...
7995,@ChaSilveo follow @jnlazts &amp; http://t.co/...,[follow],"[1, 304, 300]"
7996,Hi BAM ! @BarsAndMelody \nCan you follow my be...,"[hi, bam, follow, bestfriend, love, lot, :), s...","[1, 4050, 4080]"
7997,@hostclubhowell no prob!:),"[prob, :)]","[1, 2893, 2856]"
7998,@jeremygutsche we like this amazing donut reci...,"[like, amaz, donut, recip, share, foiegra, lov...","[1, 3176, 3112]"


In [16]:
test_df = pd.DataFrame(X_test,columns = ["Tweet"])
test_df["Processed_tweet"] = test_df.Tweet.apply(process_tweet)
test_df["Features"] = test_df.Processed_tweet.apply(extract_features)
test_df

Unnamed: 0,Tweet,Processed_tweet,Features
0,@Malan_Sanjaya yes switched back :) my lap is ...,"[ye, switch, back, :), lap, optim, window, 7, ...","[1, 3041, 3005]"
1,@mayusushita @dildeewana_ @sonalp2591 @deepti_...,"[thank, guy, :)]","[1, 3441, 3357]"
2,"Your love, O Lord, is better than life. :) &lt...","[love, lord, better, life, :), <3]","[1, 3390, 3372]"
3,@yasminyasir96 yeah but it will be better if w...,"[yeah, better, use, offici, account, :), like,...","[1, 3274, 3197]"
4,Ok good night I wish troye wasn't ugly and I m...,"[ok, good, night, wish, troy, ugli, met, today...","[1, 3504, 3473]"
...,...,...,...
1995,Agree?? :) #Phone #WiFi #LifeStyle \n\n#Qatar...,"[agre, :), phone, wifi, lifestyl, qatarday]","[1, 2916, 2882]"
1996,@Rl191459Alex Hey and thank you for following :),"[hey, thank, follow, :)]","[1, 3761, 3674]"
1997,See yah on Sunday @carmenkvarnen! :) https://t...,"[see, yah, sunday, :)]","[1, 3041, 3010]"
1998,@JSHUGI we must!!! Hahaha :p,"[must, hahaha, :p]","[1, 127, 136]"


#### Train the model

In [17]:
x_feat_train = np.stack(train_df["Features"].values)
x_feat_test = np.stack(test_df["Features"].values)

In [18]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression().fit(x_feat_train,y_train)
clf.score(x_feat_train,y_train)

0.531125

In [19]:
clf.coef_

array([[-2.53252524e-06,  5.33830815e-03, -5.40788661e-03]])

In [20]:
x_feat_test[0]

array([1.000e+00, 3.041e+03, 3.005e+03])

In [21]:
clf.predict(x_feat_test)

array([0., 1., 0., ..., 0., 0., 0.])

In [22]:
test_df.head()

Unnamed: 0,Tweet,Processed_tweet,Features
0,@Malan_Sanjaya yes switched back :) my lap is ...,"[ye, switch, back, :), lap, optim, window, 7, ...","[1, 3041, 3005]"
1,@mayusushita @dildeewana_ @sonalp2591 @deepti_...,"[thank, guy, :)]","[1, 3441, 3357]"
2,"Your love, O Lord, is better than life. :) &lt...","[love, lord, better, life, :), <3]","[1, 3390, 3372]"
3,@yasminyasir96 yeah but it will be better if w...,"[yeah, better, use, offici, account, :), like,...","[1, 3274, 3197]"
4,Ok good night I wish troye wasn't ugly and I m...,"[ok, good, night, wish, troy, ugli, met, today...","[1, 3504, 3473]"


In [23]:
test_df["Processed_tweet"].iloc[:5]

0    [ye, switch, back, :), lap, optim, window, 7, ...
1                                     [thank, guy, :)]
2                   [love, lord, better, life, :), <3]
3    [yeah, better, use, offici, account, :), like,...
4    [ok, good, night, wish, troy, ugli, met, today...
Name: Processed_tweet, dtype: object

In [24]:
clf.predict(x_feat_test[0:5])

array([0., 1., 0., 1., 0.])

In [25]:
y_test[0:5]

array([1., 0., 0., 0., 0.])

In [26]:
from sklearn.model_selection import cross_validate
scores = cross_validate(clf,x_feat_train,y_train,return_train_score = True)
pd.DataFrame(scores)

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.01229,0.000567,0.528125,0.532344
1,0.015079,0.000636,0.54875,0.527031
2,0.056194,0.000835,0.53,0.534062
3,0.020995,0.000436,0.5175,0.532813
4,0.010293,0.000426,0.51875,0.533594
