In [3]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression

import scipy.sparse

# Data Loading Stage + Pre-processing

In [4]:
# From https://www.kaggle.com/kazanova/sentiment140
# Added custom column header line to the csv after download
df = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='latin-1')

In [5]:
df.head(10)
df = df.rename(columns={'0': 'target', '1467810369': 'id', 'Mon Apr 06 22:19:45 PDT 2009': 'date', 'NO_QUERY': 'flag', '_TheSpecialOne_': 'user', '@switchfoot http://twitpic.com/2y1zl - Awww, that\'s a bummer. You shoulda got David Carr of Third Day to do it. ;D': 'text'})

In [6]:
final_large_df = df.drop(['id', 'date', 'flag', 'user'], axis=1) # drop cols that aren't useful for our model
final_large_df

Unnamed: 0,target,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,is upset that he can't update his Facebook by ...
1,0,@Kenichan I dived many times for the ball. Man...
2,0,my whole body feels itchy and like its on fire
3,0,"@nationwideclass no, it's not behaving at all...."
4,0,@Kwesidei not the whole crew
...,...,...
1599994,4,Just woke up. Having no school is the best fee...
1599995,4,TheWDB.com - Very cool to hear old Walt interv...
1599996,4,Are you ready for your MoJo Makeover? Ask me f...
1599997,4,Happy 38th Birthday to my boo of alll time!!! ...


In [7]:
print('Number of Negative Sentiment Tweets:', len(final_large_df[final_large_df['target'] == 0]))
print('Number of Neutral Sentiment Tweets:', len(final_large_df[final_large_df['target'] == 2]))
print('Number of Positive Sentiment Tweets:', len(final_large_df[final_large_df['target'] == 4]))

Number of Negative Sentiment Tweets: 799999
Number of Neutral Sentiment Tweets: 0
Number of Positive Sentiment Tweets: 800000


### Notice no neutral sentiment data and way too much data!

In [8]:
final_large_df.target = final_large_df.target / 4 # convert the target column to 0 and 1 labels where 1 is positive

In [9]:
final_large_df

Unnamed: 0,target,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0.0,is upset that he can't update his Facebook by ...
1,0.0,@Kenichan I dived many times for the ball. Man...
2,0.0,my whole body feels itchy and like its on fire
3,0.0,"@nationwideclass no, it's not behaving at all...."
4,0.0,@Kwesidei not the whole crew
...,...,...
1599994,1.0,Just woke up. Having no school is the best fee...
1599995,1.0,TheWDB.com - Very cool to hear old Walt interv...
1599996,1.0,Are you ready for your MoJo Makeover? Ask me f...
1599997,1.0,Happy 38th Birthday to my boo of alll time!!! ...


In [10]:
final_df = final_large_df.sample(200000)
del final_large_df, df

In [11]:
final_df

Unnamed: 0,target,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
875230,1.0,"@HannahxCx aha, my fav colour is purple whats..."
1252875,1.0,my phone works!
414593,0.0,@teatotally I want @mollieofficial's 'blonde' ...
221361,0.0,I was in Mosouri last weekend and came back to...
150619,0.0,still hunting for the chicago @nkotb show. its...
...,...,...
1007459,1.0,@tanyasgoodies Thanks 4 following
430633,0.0,@thepjmorton
1099437,1.0,@chorale We have sunshine aplenty here today -...
1080812,1.0,@sadironman RENT was absolutely FANTASTIC. I c...


### Split into Training and Test Splits for Model Evaluation

In [12]:
train_df, test_df = train_test_split(final_df, test_size=0.20)

In [13]:
print('Training Set Stats:')
print('Size of Training Set:', len(train_df))
print('Number of Negative Sentiment Tweets:', len(train_df[train_df['target'] == 0]))
print('Number of Positive Sentiment Tweets:', len(train_df[train_df['target'] == 1]))

Training Set Stats:
Size of Training Set: 160000
Number of Negative Sentiment Tweets: 80048
Number of Positive Sentiment Tweets: 79952


In [14]:
print('Test Set Stats:')
print('Size of Test Set:', len(test_df))
print('Number of Negative Sentiment Tweets:', len(test_df[test_df['target'] == 0]))
print('Number of Positive Sentiment Tweets:', len(test_df[test_df['target'] == 1]))

Test Set Stats:
Size of Test Set: 40000
Number of Negative Sentiment Tweets: 20133
Number of Positive Sentiment Tweets: 19867


# Count-Vectorizer Model 

In [15]:
count_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,3), max_features=100000)
X_train = count_vectorizer.fit_transform([entry['text'] for i, entry in train_df.iterrows()])
Y_train = np.array([int(entry['target']) for i, entry in train_df.iterrows()])

X_test = count_vectorizer.transform([entry['text'] for i, entry in test_df.iterrows()])
Y_test = np.array([int(entry['target']) for i, entry in test_df.iterrows()])

KeyError: 'text'

In [14]:
count_vect_model = LogisticRegression(C = 0.1, max_iter=15000)
count_vect_model.fit(X_train, Y_train)

LogisticRegression(C=0.1, max_iter=15000)

In [15]:
print(f'Training Accuracy: {np.mean(count_vect_model.predict(X_train) == Y_train)}')
print(f'Testing Accuracy: {np.mean(count_vect_model.predict(X_test) == Y_test)}')

Training Accuracy: 0.81019375
Testing Accuracy: 0.765775


In [16]:
# Write everything to files
pickle.dump(count_vectorizer, open('count_vectorizer.pickle', 'wb'))
pickle.dump(count_vect_model, open('count_vect_model.pickle', 'wb'))
scipy.sparse.save_npz('count_vect_X_train.npz', X_train)
np.save('count_vect_Y_train.npy', Y_train)
scipy.sparse.save_npz('count_vect_X_test.npz', X_test)
np.save('count_vect_Y_test.npy', Y_test)

In [17]:
# Test that pickling is working
read_vect = pickle.load(open('count_vectorizer.pickle', 'rb'))
read_model = pickle.load(open('count_vect_model.pickle', 'rb'))

read_X_train = scipy.sparse.load_npz('count_vect_X_train.npz')
read_Y_train = np.load('count_vect_Y_train.npy')
read_X_test = scipy.sparse.load_npz('count_vect_X_test.npz')
read_Y_test = np.load('count_vect_Y_test.npy')
print(f'Pickled Training Accuracy: {np.mean(read_model.predict(read_X_train) == read_Y_train)}')
print(f'Pickled Testing Accuracy: {np.mean(read_model.predict(read_X_test) == read_Y_test)}')

Pickled Training Accuracy: 0.81019375
Pickled Testing Accuracy: 0.765775


In [18]:
# Check probabilities work as expected
np.mean(np.argmax(read_model.predict_proba(read_X_train), axis=1) == read_Y_train)

0.81019375

# Tf-Idf Model

In [19]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,3), max_features=100000)
X_train = tfidf_vectorizer.fit_transform([entry['text'] for i, entry in train_df.iterrows()])
Y_train = np.array([int(entry['target']) for i, entry in train_df.iterrows()])

X_test = tfidf_vectorizer.transform([entry['text'] for i, entry in test_df.iterrows()])
Y_test = np.array([int(entry['target']) for i, entry in test_df.iterrows()])

In [20]:
tfidf_model = LogisticRegression(C = 1.5, max_iter=15000)
tfidf_model.fit(X_train, Y_train)

LogisticRegression(C=1.5, max_iter=15000)

In [21]:
print(f'Training Accuracy: {np.mean(tfidf_model.predict(X_train) == Y_train)}')
print(f'Testing Accuracy: {np.mean(tfidf_model.predict(X_test) == Y_test)}')

Training Accuracy: 0.84886875
Testing Accuracy: 0.770725


In [22]:
pickle.dump(tfidf_vectorizer, open('tfidf_vect.pickle', 'wb'))
pickle.dump(tfidf_model, open('tfidf_model.pickle', 'wb'))
scipy.sparse.save_npz('tfidf_vect_X_train.npz', X_train)
np.save('tfidf_vect_Y_train.npy', Y_train)
scipy.sparse.save_npz('tfidf_vect_X_test.npz', X_test)
np.save('tfidf_vect_Y_test.npy', Y_test)

In [23]:
# Test that pickling is working
read_vect = pickle.load(open('tfidf_vect.pickle', 'rb'))
read_model = pickle.load(open('tfidf_model.pickle', 'rb'))

read_X_train = scipy.sparse.load_npz('tfidf_vect_X_train.npz')
read_Y_train = np.load('tfidf_vect_Y_train.npy')
read_X_test = scipy.sparse.load_npz('tfidf_vect_X_test.npz')
read_Y_test = np.load('tfidf_vect_Y_test.npy')
print(f'Pickled Training Accuracy: {np.mean(read_model.predict(read_X_train) == read_Y_train)}')
print(f'Pickled Testing Accuracy: {np.mean(read_model.predict(read_X_test) == read_Y_test)}')

Pickled Training Accuracy: 0.84886875
Pickled Testing Accuracy: 0.770725


# Model and Vectorizer Learned Information

In [24]:
def getExtremeWords(vectorizer, model):
    feature_names = np.array(read_vect.get_feature_names())
    order = np.argsort(read_model.coef_)
    
    print("Top 50 Most Negative Words/Phrases in Order:")
    print(feature_names[order[0, :50]])
    print()
    print("Top 50 Most Positive Words/Phrases in Order:")
    print(feature_names[order[0, -50:]][::-1])
    
    return feature_names[order[0, :50]], feature_names[order[0, -50:]][::-1] # negative, positive    

def predict(tweets, vectorizer, model):
    tweet_vectors = vectorizer.transform(tweets)
    preds = model.predict_proba(tweet_vectors)
    returnList = []
    for i, tweet in enumerate(tweets):
        print(f'Tweet: {tweet}')
        pred = "Negative" if np.argmax(preds[i]) == 0 else "Positive"
        print(f'Prediction: {pred}')
        print(f'Confidence of {pred} Prediction (0 to 1): {np.max(preds[i])}')
        print()
        returnList.append((tweet, pred, np.max(preds[i])))
    return returnList

def analyzeTweets(tweets, vectorizer, model):
    returnList = []
    for tweet in tweets:
        tweetList = []
        for word in tweet.split():
            word = word.lower()
            if word in vectorizer.get_feature_names():
                index = vectorizer.get_feature_names().index(word)
                print(f'Word: {word}, Connotation: {model.coef_[0, index]:.3f}')
                tweetList.append((word, model.coef_[0, index]))
            else: # not a top feature
                print(f'Word: {word}, Connotation: {0:.3f}')
                tweetList.append((word, 0))
        returnList.append(tweetList)
        print()
    return returnList

In [25]:
# read_vect = pickle.load(open('count_vectorizer.pickle', 'rb'))
# read_model = pickle.load(open('count_vect_model.pickle', 'rb'))
read_vect = pickle.load(open('tfidf_vect.pickle', 'rb'))
read_model = pickle.load(open('tfidf_model.pickle', 'rb'))

getExtremeWords(vectorizer=read_vect, model=read_model)

Top 50 Most Negative Words/Phrases in Order:
['sad' 'sick' 'miss' 'poor' 'unfortunately' 'wish' 'sucks' 'hurts'
 'missing' 'hate' 'ugh' 'sadly' 'headache' 'bummed' 'died' 'sorry'
 'disappointed' 'broke' 'bad' 'bummer' 'lost' 'upset' 'didn' 'shame'
 'lonely' 'cancelled' 'missed' 'gutted' 'damn' 'broken' 'rip' 'closed'
 'crying' 'anymore' 'horrible' 'worst' 'sore' 'hurt' 'didnt' 'stuck'
 'depressing' 'misses' 'stupid' 'depressed' 'booo' 'doesn' 'awful' 'hates'
 'killing' 'burnt']

Top 50 Most Positive Words/Phrases in Order:
['thank' 'thanks' 'welcome' 'congrats' 'smile' 'excited' 'yay' 'wish luck'
 'awesome' 'glad' 'great' 'love' 'congratulations' 'wasn bad' 'hi'
 'smiling' 'cute' 'amazing' 'hehe' 'happy' 'wonderful' 'haha' 'enjoy'
 'proud' 'hello' 'loving' 'pleasure' 'nice' 'adorable' 'hey' 'cool'
 'beautiful' 'woo' 'www' 'yummy' 'hehehe' 'fantastic' 'followfriday'
 'lovely' 'thx' 'good' 'perfect' 'heh' 'excellent' 'hard work' 'listening'
 'sweet' 'woohoo' 'best' 'don need']


(array(['sad', 'sick', 'miss', 'poor', 'unfortunately', 'wish', 'sucks',
        'hurts', 'missing', 'hate', 'ugh', 'sadly', 'headache', 'bummed',
        'died', 'sorry', 'disappointed', 'broke', 'bad', 'bummer', 'lost',
        'upset', 'didn', 'shame', 'lonely', 'cancelled', 'missed',
        'gutted', 'damn', 'broken', 'rip', 'closed', 'crying', 'anymore',
        'horrible', 'worst', 'sore', 'hurt', 'didnt', 'stuck',
        'depressing', 'misses', 'stupid', 'depressed', 'booo', 'doesn',
        'awful', 'hates', 'killing', 'burnt'], dtype='<U59'),
 array(['thank', 'thanks', 'welcome', 'congrats', 'smile', 'excited',
        'yay', 'wish luck', 'awesome', 'glad', 'great', 'love',
        'congratulations', 'wasn bad', 'hi', 'smiling', 'cute', 'amazing',
        'hehe', 'happy', 'wonderful', 'haha', 'enjoy', 'proud', 'hello',
        'loving', 'pleasure', 'nice', 'adorable', 'hey', 'cool',
        'beautiful', 'woo', 'www', 'yummy', 'hehehe', 'fantastic',
        'followfriday', 'l

In [26]:
tweets = ['This bag of chips is disgusting yuck', 'i really enjoy riding my bike', 'it will be 70 degrees tomorrow']
predict(tweets, vectorizer=read_vect, model=read_model)

Tweet: This bag of chips is disgusting yuck
Prediction: Negative
Confidence of Negative Prediction (0 to 1): 0.769240077207706

Tweet: i really enjoy riding my bike
Prediction: Positive
Confidence of Positive Prediction (0 to 1): 0.6867906070710776

Tweet: it will be 70 degrees tomorrow
Prediction: Negative
Confidence of Negative Prediction (0 to 1): 0.6523543402046126



[('This bag of chips is disgusting yuck', 'Negative', 0.769240077207706),
 ('i really enjoy riding my bike', 'Positive', 0.6867906070710776),
 ('it will be 70 degrees tomorrow', 'Negative', 0.6523543402046126)]

In [27]:
analyzeTweets(tweets, vectorizer=read_vect, model=read_model)

Word: this, Connotation: 0.000
Word: bag, Connotation: 0.287
Word: of, Connotation: 0.000
Word: chips, Connotation: 0.053
Word: is, Connotation: 0.000
Word: disgusting, Connotation: -1.885
Word: yuck, Connotation: -1.171

Word: i, Connotation: 0.000
Word: really, Connotation: -1.614
Word: enjoy, Connotation: 3.597
Word: riding, Connotation: -0.327
Word: my, Connotation: 0.000
Word: bike, Connotation: -0.266

Word: it, Connotation: 0.000
Word: will, Connotation: 0.000
Word: be, Connotation: 0.000
Word: 70, Connotation: 0.026
Word: degrees, Connotation: -1.080
Word: tomorrow, Connotation: -0.761



[[('this', 0),
  ('bag', 0.2874278446361069),
  ('of', 0),
  ('chips', 0.05292501433667785),
  ('is', 0),
  ('disgusting', -1.8845646237164995),
  ('yuck', -1.1713829204188018)],
 [('i', 0),
  ('really', -1.6136558188748473),
  ('enjoy', 3.5967345548108685),
  ('riding', -0.32724736526303483),
  ('my', 0),
  ('bike', -0.2662721397711939)],
 [('it', 0),
  ('will', 0),
  ('be', 0),
  ('70', 0.02597830123092517),
  ('degrees', -1.0801505686959454),
  ('tomorrow', -0.7607353447835417)]]

# Word2Vec Model 
### Generate similarity scores of words for visualization
Referenced https://stackabuse.com/implementing-word2vec-with-gensim-library-in-python/ and https://radimrehurek.com/gensim/auto_examples/core/run_similarity_queries.html#sphx-glr-auto-examples-core-run-similarity-queries-py

In [28]:
from gensim.models import Word2Vec
import nltk

In [30]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Chris\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [31]:
tweets = [entry['text'].lower() for i, entry in train_df.iterrows()]
words = [nltk.word_tokenize(tweet) for tweet in tweets]

In [32]:
w2v = Word2Vec(words, min_count=5)
w2v.save('word2vec.model') # write to file

In [33]:
def getMostSimilarWords(tweets, w2v_model):
    returnList = []
    for tweet in tweets:
        tweetList = []
        for word in nltk.word_tokenize(tweet.lower()):
            if word in w2v_model.wv.vocab:
                tweetList.append((word, w2v_model.wv.most_similar(word)))
            else:
                tweetList.append((word, []))
        returnList.append(tweetList)
    return returnList

In [34]:
tweets = ['This bag of chips is disgusting yuck', 'i really enjoy riding my bike', 'it will be 70 degrees tomorrow']
getMostSimilarWords(tweets, w2v_model=w2v)

[[('this',
   [('next', 0.5328178405761719),
    ('which', 0.5173956155776978),
    ('that', 0.5160244107246399),
    ('every', 0.4625844657421112),
    ('another', 0.45456063747406006),
    ('the', 0.432054340839386),
    ('whole', 0.4233904182910919),
    ('it', 0.4096474051475525),
    ('college', 0.39660054445266724),
    ('forecast', 0.380306214094162)]),
  ('bag',
   [('box', 0.8275642395019531),
    ('truck', 0.8130146861076355),
    ('glass', 0.7917367219924927),
    ('bottle', 0.7726737856864929),
    ('pot', 0.7644366025924683),
    ('bowl', 0.7644003629684448),
    ('lump', 0.7609724998474121),
    ('pair', 0.7606635689735413),
    ('woods', 0.7596918940544128),
    ('wall', 0.7590094804763794)]),
  ('of',
   [('parking', 0.4510842561721802),
    ('other', 0.4028436839580536),
    ('words', 0.4018063545227051),
    ('whole', 0.3989813029766083),
    ('row', 0.39098289608955383),
    ('social', 0.390688419342041),
    ('single', 0.3854885697364807),
    ('entire', 0.384782433

In [35]:
# Reading from file
read_w2v = Word2Vec.load('word2vec.model')
getMostSimilarWords(tweets, w2v_model=read_w2v)

[[('this',
   [('next', 0.5328178405761719),
    ('which', 0.5173956155776978),
    ('that', 0.5160244107246399),
    ('every', 0.4625844657421112),
    ('another', 0.45456063747406006),
    ('the', 0.432054340839386),
    ('whole', 0.4233904182910919),
    ('it', 0.4096474051475525),
    ('college', 0.39660054445266724),
    ('forecast', 0.380306214094162)]),
  ('bag',
   [('box', 0.8275642395019531),
    ('truck', 0.8130146861076355),
    ('glass', 0.7917367219924927),
    ('bottle', 0.7726737856864929),
    ('pot', 0.7644366025924683),
    ('bowl', 0.7644003629684448),
    ('lump', 0.7609724998474121),
    ('pair', 0.7606635689735413),
    ('woods', 0.7596918940544128),
    ('wall', 0.7590094804763794)]),
  ('of',
   [('parking', 0.4510842561721802),
    ('other', 0.4028436839580536),
    ('words', 0.4018063545227051),
    ('whole', 0.3989813029766083),
    ('row', 0.39098289608955383),
    ('social', 0.390688419342041),
    ('single', 0.3854885697364807),
    ('entire', 0.384782433

In [None]:
pickle.dump(count_vectorizer, open('count_vectorizer.pickle', 'wb'))