In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression

import scipy.sparse

# Data Loading Stage + Pre-processing

In [2]:
# From https://www.kaggle.com/kazanova/sentiment140
# Added custom column header line to the csv after download
df = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='latin-1')
df.columns = ['target','ids','date','flag','user','text']

In [3]:
df.head(10)

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
5,0,1467811592,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,mybirch,Need a hug
6,0,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,@LOLTrish hey long time no see! Yes.. Rains a...
7,0,1467811795,Mon Apr 06 22:20:05 PDT 2009,NO_QUERY,2Hood4Hollywood,@Tatiana_K nope they didn't have it
8,0,1467812025,Mon Apr 06 22:20:09 PDT 2009,NO_QUERY,mimismo,@twittera que me muera ?
9,0,1467812416,Mon Apr 06 22:20:16 PDT 2009,NO_QUERY,erinx3leannexo,spring break in plain city... it's snowing


In [4]:
final_large_df = df.drop(['ids', 'date', 'flag', 'user'], axis=1) # drop cols that aren't useful for our model
final_large_df

Unnamed: 0,target,text
0,0,is upset that he can't update his Facebook by ...
1,0,@Kenichan I dived many times for the ball. Man...
2,0,my whole body feels itchy and like its on fire
3,0,"@nationwideclass no, it's not behaving at all...."
4,0,@Kwesidei not the whole crew
...,...,...
1599994,4,Just woke up. Having no school is the best fee...
1599995,4,TheWDB.com - Very cool to hear old Walt interv...
1599996,4,Are you ready for your MoJo Makeover? Ask me f...
1599997,4,Happy 38th Birthday to my boo of alll time!!! ...


In [5]:
print('Number of Negative Sentiment Tweets:', len(final_large_df[final_large_df['target'] == 0]))
print('Number of Neutral Sentiment Tweets:', len(final_large_df[final_large_df['target'] == 2]))
print('Number of Positive Sentiment Tweets:', len(final_large_df[final_large_df['target'] == 4]))

Number of Negative Sentiment Tweets: 799999
Number of Neutral Sentiment Tweets: 0
Number of Positive Sentiment Tweets: 800000


### Notice no neutral sentiment data and way too much data!

In [6]:
final_large_df.target = final_large_df.target / 4 # convert the target column to 0 and 1 labels where 1 is positive

In [7]:
final_large_df

Unnamed: 0,target,text
0,0.0,is upset that he can't update his Facebook by ...
1,0.0,@Kenichan I dived many times for the ball. Man...
2,0.0,my whole body feels itchy and like its on fire
3,0.0,"@nationwideclass no, it's not behaving at all...."
4,0.0,@Kwesidei not the whole crew
...,...,...
1599994,1.0,Just woke up. Having no school is the best fee...
1599995,1.0,TheWDB.com - Very cool to hear old Walt interv...
1599996,1.0,Are you ready for your MoJo Makeover? Ask me f...
1599997,1.0,Happy 38th Birthday to my boo of alll time!!! ...


In [8]:
final_df = final_large_df.sample(200000)
del final_large_df, df

In [9]:
final_df

Unnamed: 0,target,text
260971,0.0,"Lake Geneva was NICE, but I came back &amp; Na..."
635907,0.0,Woo the magnet culmination was cool aww ima mi...
953834,1.0,Fabulous lunch with a fabulous friend
1170116,1.0,good morning! just got up. its beautiful outsi...
1479335,1.0,talking about my scary ghost dream with lisa
...,...,...
1184151,1.0,@mileycyrus Good luck at the mtv movie awards ...
1004240,1.0,@iLoveColbyO he's mine too!!! lol
220613,0.0,Umm...why am I the only one in the theater abo...
755341,0.0,"http://twitpic.com/85l0y - Almost done, I'm go..."


### Split into Training and Test Splits for Model Evaluation

In [10]:
train_df, test_df = train_test_split(final_df, test_size=0.20)

In [11]:
print('Training Set Stats:')
print('Size of Training Set:', len(train_df))
print('Number of Negative Sentiment Tweets:', len(train_df[train_df['target'] == 0]))
print('Number of Positive Sentiment Tweets:', len(train_df[train_df['target'] == 1]))

Training Set Stats:
Size of Training Set: 160000
Number of Negative Sentiment Tweets: 80055
Number of Positive Sentiment Tweets: 79945


In [12]:
print('Test Set Stats:')
print('Size of Test Set:', len(test_df))
print('Number of Negative Sentiment Tweets:', len(test_df[test_df['target'] == 0]))
print('Number of Positive Sentiment Tweets:', len(test_df[test_df['target'] == 1]))

Test Set Stats:
Size of Test Set: 40000
Number of Negative Sentiment Tweets: 20070
Number of Positive Sentiment Tweets: 19930


# Count-Vectorizer Model 

In [13]:
count_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,3), max_features=100000)
X_train = count_vectorizer.fit_transform([entry['text'] for i, entry in train_df.iterrows()])
Y_train = np.array([int(entry['target']) for i, entry in train_df.iterrows()])

X_test = count_vectorizer.transform([entry['text'] for i, entry in test_df.iterrows()])
Y_test = np.array([int(entry['target']) for i, entry in test_df.iterrows()])

In [14]:
count_vect_model = LogisticRegression(C = 0.1, max_iter=15000)
count_vect_model.fit(X_train, Y_train)

LogisticRegression(C=0.1, max_iter=15000)

In [15]:
print(f'Training Accuracy: {np.mean(count_vect_model.predict(X_train) == Y_train)}')
print(f'Testing Accuracy: {np.mean(count_vect_model.predict(X_test) == Y_test)}')

Training Accuracy: 0.80974375
Testing Accuracy: 0.766575


In [16]:
# Write everything to files
pickle.dump(count_vectorizer, open('count_vectorizer.pickle', 'wb'))
pickle.dump(count_vect_model, open('count_vect_model.pickle', 'wb'))
scipy.sparse.save_npz('count_vect_X_train.npz', X_train)
np.save('count_vect_Y_train.npy', Y_train)
scipy.sparse.save_npz('count_vect_X_test.npz', X_test)
np.save('count_vect_Y_test.npy', Y_test)

In [17]:
# Test that pickling is working
read_vect = pickle.load(open('count_vectorizer.pickle', 'rb'))
read_model = pickle.load(open('count_vect_model.pickle', 'rb'))

read_X_train = scipy.sparse.load_npz('count_vect_X_train.npz')
read_Y_train = np.load('count_vect_Y_train.npy')
read_X_test = scipy.sparse.load_npz('count_vect_X_test.npz')
read_Y_test = np.load('count_vect_Y_test.npy')
print(f'Pickled Training Accuracy: {np.mean(read_model.predict(read_X_train) == read_Y_train)}')
print(f'Pickled Testing Accuracy: {np.mean(read_model.predict(read_X_test) == read_Y_test)}')

Pickled Training Accuracy: 0.80974375
Pickled Testing Accuracy: 0.766575


In [18]:
# Check probabilities work as expected
np.mean(np.argmax(read_model.predict_proba(read_X_train), axis=1) == read_Y_train)

0.80974375

# Tf-Idf Model

In [19]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,3), max_features=100000)
X_train = tfidf_vectorizer.fit_transform([entry['text'] for i, entry in train_df.iterrows()])
Y_train = np.array([int(entry['target']) for i, entry in train_df.iterrows()])

X_test = tfidf_vectorizer.transform([entry['text'] for i, entry in test_df.iterrows()])
Y_test = np.array([int(entry['target']) for i, entry in test_df.iterrows()])

In [20]:
tfidf_model = LogisticRegression(C = 1.5, max_iter=15000)
tfidf_model.fit(X_train, Y_train)

LogisticRegression(C=1.5, max_iter=15000)

In [21]:
print(f'Training Accuracy: {np.mean(tfidf_model.predict(X_train) == Y_train)}')
print(f'Testing Accuracy: {np.mean(tfidf_model.predict(X_test) == Y_test)}')

Training Accuracy: 0.84830625
Testing Accuracy: 0.769725


In [22]:
pickle.dump(tfidf_vectorizer, open('tfidf_vect.pickle', 'wb'))
pickle.dump(tfidf_model, open('tfidf_model.pickle', 'wb'))
scipy.sparse.save_npz('tfidf_vect_X_train.npz', X_train)
np.save('tfidf_vect_Y_train.npy', Y_train)
scipy.sparse.save_npz('tfidf_vect_X_test.npz', X_test)
np.save('tfidf_vect_Y_test.npy', Y_test)

In [23]:
# Test that pickling is working
read_vect = pickle.load(open('tfidf_vect.pickle', 'rb'))
read_model = pickle.load(open('tfidf_model.pickle', 'rb'))

read_X_train = scipy.sparse.load_npz('tfidf_vect_X_train.npz')
read_Y_train = np.load('tfidf_vect_Y_train.npy')
read_X_test = scipy.sparse.load_npz('tfidf_vect_X_test.npz')
read_Y_test = np.load('tfidf_vect_Y_test.npy')
print(f'Pickled Training Accuracy: {np.mean(read_model.predict(read_X_train) == read_Y_train)}')
print(f'Pickled Testing Accuracy: {np.mean(read_model.predict(read_X_test) == read_Y_test)}')

Pickled Training Accuracy: 0.84830625
Pickled Testing Accuracy: 0.769725


# Model and Vectorizer Learned Information

In [24]:
def getExtremeWords(vectorizer, model):
    feature_names = np.array(read_vect.get_feature_names())
    order = np.argsort(read_model.coef_)
    
    print("Top 50 Most Negative Words/Phrases in Order:")
    print(feature_names[order[0, :50]])
    print()
    print("Top 50 Most Positive Words/Phrases in Order:")
    print(feature_names[order[0, -50:]][::-1])
    
    return feature_names[order[0, :50]], feature_names[order[0, -50:]][::-1] # negative, positive    

def predict(tweets, vectorizer, model):
    tweet_vectors = vectorizer.transform(tweets)
    preds = model.predict_proba(tweet_vectors)
    returnList = []
    for i, tweet in enumerate(tweets):
        print(f'Tweet: {tweet}')
        pred = "Negative" if np.argmax(preds[i]) == 0 else "Positive"
        print(f'Prediction: {pred}')
        print(f'Confidence of {pred} Prediction (0 to 1): {np.max(preds[i])}')
        print()
        returnList.append((tweet, pred, np.max(preds[i])))
    return returnList

def analyzeTweets(tweets, vectorizer, model):
    returnList = []
    for tweet in tweets:
        tweetList = []
        for word in tweet.split():
            word = word.lower()
            if word in vectorizer.get_feature_names():
                index = vectorizer.get_feature_names().index(word)
                print(f'Word: {word}, Connotation: {model.coef_[0, index]:.3f}')
                tweetList.append((word, model.coef_[0, index]))
            else: # not a top feature
                print(f'Word: {word}, Connotation: {0:.3f}')
                tweetList.append((word, 0))
        returnList.append(tweetList)
        print()
    return returnList

In [25]:
# read_vect = pickle.load(open('count_vectorizer.pickle', 'rb'))
# read_model = pickle.load(open('count_vect_model.pickle', 'rb'))
read_vect = pickle.load(open('tfidf_vect.pickle', 'rb'))
read_model = pickle.load(open('tfidf_model.pickle', 'rb'))

getExtremeWords(vectorizer=read_vect, model=read_model)

Top 50 Most Negative Words/Phrases in Order:
['sad' 'miss' 'wish' 'sick' 'poor' 'sucks' 'unfortunately' 'sadly' 'hurts'
 'missing' 'ugh' 'hate' 'bummed' 'headache' 'broken' 'lost' 'sorry'
 'bummer' 'upset' 'depressing' 'died' 'bad' 'missed' 'didn' 'rip' 'gutted'
 'broke' 'boo' 'cancelled' 'hurt' 'stuck' 'anymore' 'depressed' 'lonely'
 'crying' 'damn' 'disappointed' 'fail' 'worst' 'forgot' 'failed' 'ruined'
 'shame' 'misses' 'awful' 'horrible' 'alas' 'gone' 'hates' 'closed']

Top 50 Most Positive Words/Phrases in Order:
['thanks' 'thank' 'wish luck' 'welcome' 'glad' 'awesome' 'yay' 'great'
 'happy' 'excited' 'hehe' 'love' 'amazing' 'don forget' 'congratulations'
 'best' 'hi' 'proud' 'enjoy' 'followfriday' 'cute' 'loving' 'yummy'
 'hello' 'www' 'hehehe' 'nice' 'sweet' 'isn bad' 'fantastic' 'wonderful'
 'lovin' 'pleasure' 'hey' 'enjoyed' 'cheers' 'lovely' 'congrats' 'smile'
 'rocks' 'don miss' 'heh' 'thx' 'excellent' 'haha' 'worries'
 'don feel bad' 'woohoo' 'cool' 'thankyou']


(array(['sad', 'miss', 'wish', 'sick', 'poor', 'sucks', 'unfortunately',
        'sadly', 'hurts', 'missing', 'ugh', 'hate', 'bummed', 'headache',
        'broken', 'lost', 'sorry', 'bummer', 'upset', 'depressing', 'died',
        'bad', 'missed', 'didn', 'rip', 'gutted', 'broke', 'boo',
        'cancelled', 'hurt', 'stuck', 'anymore', 'depressed', 'lonely',
        'crying', 'damn', 'disappointed', 'fail', 'worst', 'forgot',
        'failed', 'ruined', 'shame', 'misses', 'awful', 'horrible', 'alas',
        'gone', 'hates', 'closed'], dtype='<U68'),
 array(['thanks', 'thank', 'wish luck', 'welcome', 'glad', 'awesome',
        'yay', 'great', 'happy', 'excited', 'hehe', 'love', 'amazing',
        'don forget', 'congratulations', 'best', 'hi', 'proud', 'enjoy',
        'followfriday', 'cute', 'loving', 'yummy', 'hello', 'www',
        'hehehe', 'nice', 'sweet', 'isn bad', 'fantastic', 'wonderful',
        'lovin', 'pleasure', 'hey', 'enjoyed', 'cheers', 'lovely',
        'congrats', 'sm

In [26]:
tweets = ['This bag of chips is disgusting yuck', 'i really enjoy riding my bike', 'it will be 70 degrees tomorrow']
predict(tweets, vectorizer=read_vect, model=read_model)

Tweet: This bag of chips is disgusting yuck
Prediction: Negative
Confidence of Negative Prediction (0 to 1): 0.7676623849916626

Tweet: i really enjoy riding my bike
Prediction: Positive
Confidence of Positive Prediction (0 to 1): 0.8214782411338701

Tweet: it will be 70 degrees tomorrow
Prediction: Negative
Confidence of Negative Prediction (0 to 1): 0.6281994780525346



[('This bag of chips is disgusting yuck', 'Negative', 0.7676623849916626),
 ('i really enjoy riding my bike', 'Positive', 0.8214782411338701),
 ('it will be 70 degrees tomorrow', 'Negative', 0.6281994780525346)]

In [27]:
analyzeTweets(tweets, vectorizer=read_vect, model=read_model)

Word: this, Connotation: 0.000
Word: bag, Connotation: -0.197
Word: of, Connotation: 0.000
Word: chips, Connotation: -0.150
Word: is, Connotation: 0.000
Word: disgusting, Connotation: -2.606
Word: yuck, Connotation: -0.911

Word: i, Connotation: 0.000
Word: really, Connotation: -1.299
Word: enjoy, Connotation: 3.644
Word: riding, Connotation: -0.053
Word: my, Connotation: 0.000
Word: bike, Connotation: 0.524

Word: it, Connotation: 0.000
Word: will, Connotation: 0.000
Word: be, Connotation: 0.000
Word: 70, Connotation: 0.183
Word: degrees, Connotation: -1.264
Word: tomorrow, Connotation: -0.464



[[('this', 0),
  ('bag', -0.19674551397103285),
  ('of', 0),
  ('chips', -0.14989987383719386),
  ('is', 0),
  ('disgusting', -2.605549562692836),
  ('yuck', -0.9107116164737056)],
 [('i', 0),
  ('really', -1.2989759532567624),
  ('enjoy', 3.6437202642003683),
  ('riding', -0.052755683831452155),
  ('my', 0),
  ('bike', 0.5235275738164094)],
 [('it', 0),
  ('will', 0),
  ('be', 0),
  ('70', 0.18260201844797638),
  ('degrees', -1.2641657773643316),
  ('tomorrow', -0.46417964343026413)]]

# Word2Vec Model 
### Generate similarity scores of words for visualization
Referenced https://stackabuse.com/implementing-word2vec-with-gensim-library-in-python/ and https://radimrehurek.com/gensim/auto_examples/core/run_similarity_queries.html#sphx-glr-auto-examples-core-run-similarity-queries-py

In [30]:
from gensim.models import Word2Vec
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/arhanna/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [31]:
tweets = [entry['text'].lower() for i, entry in train_df.iterrows()]
words = [nltk.word_tokenize(tweet) for tweet in tweets]

In [32]:
w2v = Word2Vec(words, min_count=5)
w2v.save('word2vec.model') # write to file

In [33]:
def getMostSimilarWords(tweets, w2v_model):
    returnList = []
    for tweet in tweets:
        tweetList = []
        for word in nltk.word_tokenize(tweet.lower()):
            if word in w2v_model.wv.vocab:
                tweetList.append((word, w2v_model.wv.most_similar(word)))
            else:
                tweetList.append((word, []))
        returnList.append(tweetList)
    return returnList

In [34]:
tweets = ['This bag of chips is disgusting yuck', 'i really enjoy riding my bike', 'it will be 70 degrees tomorrow']
getMostSimilarWords(tweets, w2v_model=w2v)

[[('this',
   [('which', 0.5913453698158264),
    ('that', 0.5436644554138184),
    ('next', 0.5003108978271484),
    ('it', 0.4632309675216675),
    ('every', 0.4562388062477112),
    ('the', 0.4461026191711426),
    ('life', 0.4057804346084595),
    ('everything', 0.39627110958099365),
    ('another', 0.3860161602497101),
    ('there', 0.3629739582538605)]),
  ('bag',
   [('glass', 0.8469563722610474),
    ('hole', 0.8219588994979858),
    ('pair', 0.8011025190353394),
    ('copy', 0.7925000190734863),
    ('bottle', 0.7881885766983032),
    ('truck', 0.7877194881439209),
    ('edge', 0.7855754494667053),
    ('jeans', 0.7831417322158813),
    ('bird', 0.7783561944961548),
    ('flat', 0.7778118252754211)]),
  ('of',
   [('whole', 0.4544753432273865),
    ('tree', 0.45442265272140503),
    ('newer', 0.3826151192188263),
    ('recent', 0.37735623121261597),
    ('other', 0.3723407983779907),
    ('under', 0.370388388633728),
    ('small', 0.367741197347641),
    ('throughout', 0.36459

In [35]:
# Reading from file
read_w2v = Word2Vec.load('word2vec.model')
getMostSimilarWords(tweets, w2v_model=read_w2v)

[[('this',
   [('which', 0.5913453698158264),
    ('that', 0.5436644554138184),
    ('next', 0.5003108978271484),
    ('it', 0.4632309675216675),
    ('every', 0.4562388062477112),
    ('the', 0.4461026191711426),
    ('life', 0.4057804346084595),
    ('everything', 0.39627110958099365),
    ('another', 0.3860161602497101),
    ('there', 0.3629739582538605)]),
  ('bag',
   [('glass', 0.8469563722610474),
    ('hole', 0.8219588994979858),
    ('pair', 0.8011025190353394),
    ('copy', 0.7925000190734863),
    ('bottle', 0.7881885766983032),
    ('truck', 0.7877194881439209),
    ('edge', 0.7855754494667053),
    ('jeans', 0.7831417322158813),
    ('bird', 0.7783561944961548),
    ('flat', 0.7778118252754211)]),
  ('of',
   [('whole', 0.4544753432273865),
    ('tree', 0.45442265272140503),
    ('newer', 0.3826151192188263),
    ('recent', 0.37735623121261597),
    ('other', 0.3723407983779907),
    ('under', 0.370388388633728),
    ('small', 0.367741197347641),
    ('throughout', 0.36459