In [44]:
import re
import nltk
import string
import numpy as np
import pandas as pd

from nltk.corpus import twitter_samples, stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [3]:
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

In [4]:
type(all_positive_tweets)

list

In [6]:
len(all_positive_tweets)

5000

In [7]:
all_positive_tweets[6]

"We don't like to keep our lovely customers waiting for long! We hope you enjoy! Happy Friday! - LWWF :) https://t.co/smyYriipxI"

In [59]:
train_pos = all_positive_tweets[:4000]
test_pos = all_positive_tweets[4000:]
train_neg = all_negative_tweets[:4000]
test_neg = all_negative_tweets[4000:]

train_x = train_pos + train_neg
test_x = test_pos + test_neg

In [17]:
train_y = np.append(np.ones((len(train_pos),1)), np.zeros((len(train_neg),1)))
test_y = np.append(np.ones((len(test_pos),1)), np.zeros((len(test_neg),1)))

In [18]:
train_y.shape

(8000,)

In [19]:
test_y.shape

(2000,)

In [23]:
englis_stopwords = stopwords.words('english')
stemmer = PorterStemmer()
tokenizer = TweetTokenizer(preserve_case=False,reduce_len=True, strip_handles=True)

In [26]:
def process_tweet(tweet):
    
    tweet = re.sub(r'\$\w*', '',tweet)
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    tweet = re.sub(r'#', '', tweet)
    tokenized_tweet = tokenizer.tokenize(tweet)
    
    tweet_clean = []
    for word in tokenized_tweet:
        if (word not in englis_stopwords and word not in string.punctuation):
            stem_word = stemmer.stem(word)
            tweet_clean.append(stem_word)
        
    return tweet_clean

In [39]:
print('This is an example of a positive tweet:\n', train_x[0])
print()
print('This is the processed version of the tweet:\n', process_tweet(train_x[0]))

This is an example of a positive tweet:
 #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)

This is the processed version of the tweet:
 ['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)']


In [34]:
def build_freq(tweets, ys):
    
    ylist = np.squeeze(ys).tolist()
    freqs = dict()
    for y, tweet in zip(ylist,tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            if pair in freqs:
                freqs[pair] +=1
            else:
                freqs[pair] = 1
    return freqs

In [36]:
freqs = build_freq(train_x, train_y)

print(len(freqs.keys()))
print(type(freqs))

11340
<class 'dict'>


In [60]:
tokens = [process_tweet(tweet) for tweet in train_x]
# train_x = np.array()
tokens[0]

['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)']

In [56]:
train_x[0]

['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)']

In [57]:
cv = CountVectorizer()
x_train_cv = cv.fit_transform(train_x)

AttributeError: 'list' object has no attribute 'lower'

In [48]:
tfidf = TfidfVectorizer()
x_train_tf = tfidf.fit_transform(train_x)

In [49]:
lr_cv = LogisticRegression()
lr_cv.fit(x_train_cv, train_y)

LogisticRegression()

In [52]:
x_test_cv = cv.fit_transform(test_x)
predictions = lr_cv.predict(x_test_cv)

ValueError: X has 6040 features per sample; expecting 17488

In [94]:
x = all_positive_tweets + all_negative_tweets
y = np.append(np.ones((len(all_positive_tweets),1)), np.zeros((len(all_negative_tweets),1)))

In [95]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [96]:
df = pd.DataFrame(data={'tweet':x, 'label':y})

In [97]:
df.head()

Unnamed: 0,tweet,label
0,#FollowFriday @France_Inte @PKuchly57 @Milipol...,1.0
1,@Lamb2ja Hey James! How odd :/ Please call our...,1.0
2,@DespiteOfficial we had a listen last night :)...,1.0
3,@97sides CONGRATS :),1.0
4,yeaaaah yippppy!!! my accnt verified rqst has...,1.0


In [98]:
df.tail()

Unnamed: 0,tweet,label
9995,I wanna change my avi but uSanele :(,0.0
9996,MY PUPPY BROKE HER FOOT :(,0.0
9997,where's all the jaebum baby pictures :((,0.0
9998,But but Mr Ahmad Maslan cooks too :( https://t...,0.0
9999,@eawoman As a Hull supporter I am expecting a ...,0.0


In [99]:
sid = SentimentIntensityAnalyzer()

In [100]:
sid.polarity_scores('I am happy')

{'neg': 0.0, 'neu': 0.213, 'pos': 0.787, 'compound': 0.5719}

In [101]:
df['result'] = df['tweet'].apply(lambda tweet: sid.polarity_scores(tweet))

In [102]:
df.head()

Unnamed: 0,tweet,label,result
0,#FollowFriday @France_Inte @PKuchly57 @Milipol...,1.0,"{'neg': 0.0, 'neu': 0.615, 'pos': 0.385, 'comp..."
1,@Lamb2ja Hey James! How odd :/ Please call our...,1.0,"{'neg': 0.145, 'neu': 0.585, 'pos': 0.27, 'com..."
2,@DespiteOfficial we had a listen last night :)...,1.0,"{'neg': 0.0, 'neu': 0.706, 'pos': 0.294, 'comp..."
3,@97sides CONGRATS :),1.0,"{'neg': 0.0, 'neu': 0.123, 'pos': 0.877, 'comp..."
4,yeaaaah yippppy!!! my accnt verified rqst has...,1.0,"{'neg': 0.0, 'neu': 0.718, 'pos': 0.282, 'comp..."


In [103]:
df['com'] = df['result'].apply(lambda d: d['compound'])

In [104]:
df.head()

Unnamed: 0,tweet,label,result,com
0,#FollowFriday @France_Inte @PKuchly57 @Milipol...,1.0,"{'neg': 0.0, 'neu': 0.615, 'pos': 0.385, 'comp...",0.7579
1,@Lamb2ja Hey James! How odd :/ Please call our...,1.0,"{'neg': 0.145, 'neu': 0.585, 'pos': 0.27, 'com...",0.6229
2,@DespiteOfficial we had a listen last night :)...,1.0,"{'neg': 0.0, 'neu': 0.706, 'pos': 0.294, 'comp...",0.7959
3,@97sides CONGRATS :),1.0,"{'neg': 0.0, 'neu': 0.123, 'pos': 0.877, 'comp...",0.7983
4,yeaaaah yippppy!!! my accnt verified rqst has...,1.0,"{'neg': 0.0, 'neu': 0.718, 'pos': 0.282, 'comp...",0.795


In [105]:
df['pred'] = df['com'].apply(lambda c: 1.0 if c>0.5 else 0.0)

In [106]:
df.head()

Unnamed: 0,tweet,label,result,com,pred
0,#FollowFriday @France_Inte @PKuchly57 @Milipol...,1.0,"{'neg': 0.0, 'neu': 0.615, 'pos': 0.385, 'comp...",0.7579,1.0
1,@Lamb2ja Hey James! How odd :/ Please call our...,1.0,"{'neg': 0.145, 'neu': 0.585, 'pos': 0.27, 'com...",0.6229,1.0
2,@DespiteOfficial we had a listen last night :)...,1.0,"{'neg': 0.0, 'neu': 0.706, 'pos': 0.294, 'comp...",0.7959,1.0
3,@97sides CONGRATS :),1.0,"{'neg': 0.0, 'neu': 0.123, 'pos': 0.877, 'comp...",0.7983,1.0
4,yeaaaah yippppy!!! my accnt verified rqst has...,1.0,"{'neg': 0.0, 'neu': 0.718, 'pos': 0.282, 'comp...",0.795,1.0


In [107]:
df.tail()

Unnamed: 0,tweet,label,result,com,pred
9995,I wanna change my avi but uSanele :(,0.0,"{'neg': 0.391, 'neu': 0.609, 'pos': 0.0, 'comp...",-0.5927,0.0
9996,MY PUPPY BROKE HER FOOT :(,0.0,"{'neg': 0.617, 'neu': 0.383, 'pos': 0.0, 'comp...",-0.7531,0.0
9997,where's all the jaebum baby pictures :((,0.0,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0.0
9998,But but Mr Ahmad Maslan cooks too :( https://t...,0.0,"{'neg': 0.325, 'neu': 0.675, 'pos': 0.0, 'comp...",-0.5927,0.0
9999,@eawoman As a Hull supporter I am expecting a ...,0.0,"{'neg': 0.198, 'neu': 0.635, 'pos': 0.167, 'co...",-0.1027,0.0


In [108]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [109]:
accuracy_score(df['label'],df['pred'])

0.7842

In [110]:
print(classification_report(df['label'],df['pred']))

              precision    recall  f1-score   support

         0.0       0.72      0.93      0.81      5000
         1.0       0.90      0.64      0.75      5000

    accuracy                           0.78     10000
   macro avg       0.81      0.78      0.78     10000
weighted avg       0.81      0.78      0.78     10000



In [111]:
print(confusion_matrix(df['label'],df['pred']))

[[4627  373]
 [1785 3215]]
