In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression

import scipy.sparse

# Data Loading Stage + Pre-processing

In [2]:
# Sentiment140 dataset, downloaded from https://www.kaggle.com/kazanova/sentiment140
# A custom column-header line was added to the csv by hand after download.
df = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    encoding='latin-1',
)

In [3]:
# Preview the first 10 rows to check that the columns loaded as expected
df.head(10)

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
5,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
6,0,1467811592,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,mybirch,Need a hug
7,0,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,@LOLTrish hey long time no see! Yes.. Rains a...
8,0,1467811795,Mon Apr 06 22:20:05 PDT 2009,NO_QUERY,2Hood4Hollywood,@Tatiana_K nope they didn't have it
9,0,1467812025,Mon Apr 06 22:20:09 PDT 2009,NO_QUERY,mimismo,@twittera que me muera ?


In [4]:
# Keep only what the model uses: every column other than the four dropped here
# (ids, date, flag, user) is carried over unchanged.
final_large_df = df.drop(columns=['ids', 'date', 'flag', 'user'])
final_large_df

Unnamed: 0,target,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."
...,...,...
1599995,4,Just woke up. Having no school is the best fee...
1599996,4,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,Happy 38th Birthday to my boo of alll time!!! ...


In [5]:
# Class balance of the full dataset, counted per raw target value
# (0 is reported as negative, 2 as neutral, 4 as positive).
print('Number of Negative Sentiment Tweets:', int((final_large_df['target'] == 0).sum()))
print('Number of Neutral Sentiment Tweets:', int((final_large_df['target'] == 2).sum()))
print('Number of Positive Sentiment Tweets:', int((final_large_df['target'] == 4).sum()))

Number of Negative Sentiment Tweets: 800000
Number of Neutral Sentiment Tweets: 0
Number of Positive Sentiment Tweets: 800000


### Note: the dataset contains no neutral-sentiment tweets, and 1.6 million tweets is far more data than we need!

In [6]:
# Rescale the target column by 1/4 so the labels become 0 and 1, where 1 is positive.
# Not idempotent: re-running this cell divides the column by 4 again.
final_large_df['target'] = final_large_df['target'].div(4)

In [7]:
# Display the frame again to inspect the target column after it was divided by 4
final_large_df

Unnamed: 0,target,text
0,0.0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0.0,is upset that he can't update his Facebook by ...
2,0.0,@Kenichan I dived many times for the ball. Man...
3,0.0,my whole body feels itchy and like its on fire
4,0.0,"@nationwideclass no, it's not behaving at all...."
...,...,...
1599995,1.0,Just woke up. Having no school is the best fee...
1599996,1.0,TheWDB.com - Very cool to hear old Walt interv...
1599997,1.0,Are you ready for your MoJo Makeover? Ask me f...
1599998,1.0,Happy 38th Birthday to my boo of alll time!!! ...


In [8]:
# Work on a random 200,000-tweet subsample of the full dataset.
# A fixed random_state makes Restart Kernel -> Run All pick the same rows every
# time, so the statistics and accuracies below are reproducible.
final_df = final_large_df.sample(200000, random_state=42)
# Release the full-size frames; only the subsample is used from here on.
del final_large_df, df

In [9]:
# Display the 200,000-row random subsample drawn with .sample() above
final_df

Unnamed: 0,target,text
1321962,1.0,@bertstephani hence my previous tweet
190289,0.0,"Wow, I feel like an asshole."
1398596,1.0,just made a twitter
1266132,1.0,dear twitter i just updated my blog. You like?...
275193,0.0,must finish laundry
...,...,...
747421,0.0,Now on my face? Ugh...this just keeps getting ...
198270,0.0,"@Sweetashoneey chilling, its a beautiful day o..."
724963,0.0,its my last day of teen-dom
310428,0.0,I still feel sick what a time to get sick.


### Split into Training and Test Splits for Model Evaluation

In [10]:
# Hold out 20% of the subsample for evaluation. The fixed random_state keeps the
# split (and therefore every reported accuracy) identical across re-runs.
train_df, test_df = train_test_split(final_df, test_size=0.20, random_state=42)

In [11]:
# Size and class balance of the training split (target 0 = negative, 1 = positive)
print('Training Set Stats:')
print('Size of Training Set:', len(train_df))
print('Number of Negative Sentiment Tweets:', int((train_df['target'] == 0).sum()))
print('Number of Positive Sentiment Tweets:', int((train_df['target'] == 1).sum()))

Training Set Stats:
Size of Training Set: 160000
Number of Negative Sentiment Tweets: 79911
Number of Positive Sentiment Tweets: 80089


In [12]:
# Size and class balance of the held-out test split (target 0 = negative, 1 = positive)
print('Test Set Stats:')
print('Size of Test Set:', len(test_df))
print('Number of Negative Sentiment Tweets:', int((test_df['target'] == 0).sum()))
print('Number of Positive Sentiment Tweets:', int((test_df['target'] == 1).sum()))

Test Set Stats:
Size of Test Set: 40000
Number of Negative Sentiment Tweets: 19940
Number of Positive Sentiment Tweets: 20060


# Count-Vectorizer Model 

In [13]:
# Count features over 1- to 3-grams with English stop words removed, capped at
# 100,000 features. The vocabulary is fitted on the training split only and then
# re-used to transform the test split.
count_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,3), max_features=100000)

# Hand the columns to scikit-learn directly: looping with iterrows() builds a
# Series per row and is needlessly slow on 160k/40k rows.
X_train = count_vectorizer.fit_transform(train_df['text'])
Y_train = train_df['target'].astype(int).to_numpy()

X_test = count_vectorizer.transform(test_df['text'])
Y_test = test_df['target'].astype(int).to_numpy()

In [14]:
# Logistic regression on the count features; C and max_iter are set explicitly
# instead of relying on scikit-learn's defaults.
count_vect_model = LogisticRegression(C = 0.1, max_iter=15000)
# fit() is the last expression of the cell, so the fitted estimator's repr is displayed
count_vect_model.fit(X_train, Y_train)

LogisticRegression(C=0.1, max_iter=15000)

In [15]:
# Accuracy = fraction of predictions that match the labels, on each split
print(f'Training Accuracy: {(count_vect_model.predict(X_train) == Y_train).mean()}')
print(f'Testing Accuracy: {(count_vect_model.predict(X_test) == Y_test).mean()}')

Training Accuracy: 0.80860625
Testing Accuracy: 0.771975


In [16]:
# Write everything to files
# Context managers guarantee each pickle file is flushed and closed before the
# next cell reads it back, instead of relying on garbage collection to close it.
with open('count_vectorizer.pickle', 'wb') as pickle_file:
    pickle.dump(count_vectorizer, pickle_file)
with open('count_vect_model.pickle', 'wb') as pickle_file:
    pickle.dump(count_vect_model, pickle_file)

# Feature matrices are sparse (.npz); label vectors are dense arrays (.npy).
scipy.sparse.save_npz('count_vect_X_train.npz', X_train)
np.save('count_vect_Y_train.npy', Y_train)
scipy.sparse.save_npz('count_vect_X_test.npz', X_test)
np.save('count_vect_Y_test.npy', Y_test)

In [17]:
# Test that pickling is working: reload every artifact from disk and confirm the
# reloaded model reproduces the accuracies printed above.
# NOTE: pickle.load can execute arbitrary code; only load pickle files you created yourself.
with open('count_vectorizer.pickle', 'rb') as pickle_file:
    read_vect = pickle.load(pickle_file)
with open('count_vect_model.pickle', 'rb') as pickle_file:
    read_model = pickle.load(pickle_file)

read_X_train = scipy.sparse.load_npz('count_vect_X_train.npz')
read_Y_train = np.load('count_vect_Y_train.npy')
read_X_test = scipy.sparse.load_npz('count_vect_X_test.npz')
read_Y_test = np.load('count_vect_Y_test.npy')
print(f'Pickled Training Accuracy: {np.mean(read_model.predict(read_X_train) == read_Y_train)}')
print(f'Pickled Testing Accuracy: {np.mean(read_model.predict(read_X_test) == read_Y_test)}')

Pickled Training Accuracy: 0.80860625
Pickled Testing Accuracy: 0.771975


In [18]:
# Sanity check: taking the most probable class from predict_proba should give
# the same training accuracy as predict() did.
(read_model.predict_proba(read_X_train).argmax(axis=1) == read_Y_train).mean()

0.80860625

# Tf-Idf Model

In [19]:
# Tf-idf features over 1- to 3-grams with English stop words removed, capped at
# 100,000 features. Fitted on the training split only, then applied to the test split.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,3), max_features=100000)

# Hand the columns to scikit-learn directly: looping with iterrows() builds a
# Series per row and is needlessly slow on 160k/40k rows.
X_train = tfidf_vectorizer.fit_transform(train_df['text'])
Y_train = train_df['target'].astype(int).to_numpy()

X_test = tfidf_vectorizer.transform(test_df['text'])
Y_test = test_df['target'].astype(int).to_numpy()

In [20]:
# Logistic regression on the tf-idf features; note C differs from the value used
# for the count-vectorizer model (1.5 here vs 0.1 there).
tfidf_model = LogisticRegression(C = 1.5, max_iter=15000)
# fit() is the last expression of the cell, so the fitted estimator's repr is displayed
tfidf_model.fit(X_train, Y_train)

LogisticRegression(C=1.5, max_iter=15000)

In [21]:
# Accuracy = fraction of predictions that match the labels, on each split
print(f'Training Accuracy: {(tfidf_model.predict(X_train) == Y_train).mean()}')
print(f'Testing Accuracy: {(tfidf_model.predict(X_test) == Y_test).mean()}')

Training Accuracy: 0.8463125
Testing Accuracy: 0.774


In [22]:
# Persist the tf-idf vectorizer, model and data splits.
# Context managers guarantee each pickle file is flushed and closed before the
# next cell reads it back, instead of relying on garbage collection to close it.
with open('tfidf_vect.pickle', 'wb') as pickle_file:
    pickle.dump(tfidf_vectorizer, pickle_file)
with open('tfidf_model.pickle', 'wb') as pickle_file:
    pickle.dump(tfidf_model, pickle_file)

# Feature matrices are sparse (.npz); label vectors are dense arrays (.npy).
scipy.sparse.save_npz('tfidf_vect_X_train.npz', X_train)
np.save('tfidf_vect_Y_train.npy', Y_train)
scipy.sparse.save_npz('tfidf_vect_X_test.npz', X_test)
np.save('tfidf_vect_Y_test.npy', Y_test)

In [23]:
# Test that pickling is working: reload every tf-idf artifact from disk and
# confirm the reloaded model reproduces the accuracies printed above.
# NOTE: pickle.load can execute arbitrary code; only load pickle files you created yourself.
with open('tfidf_vect.pickle', 'rb') as pickle_file:
    read_vect = pickle.load(pickle_file)
with open('tfidf_model.pickle', 'rb') as pickle_file:
    read_model = pickle.load(pickle_file)

read_X_train = scipy.sparse.load_npz('tfidf_vect_X_train.npz')
read_Y_train = np.load('tfidf_vect_Y_train.npy')
read_X_test = scipy.sparse.load_npz('tfidf_vect_X_test.npz')
read_Y_test = np.load('tfidf_vect_Y_test.npy')
print(f'Pickled Training Accuracy: {np.mean(read_model.predict(read_X_train) == read_Y_train)}')
print(f'Pickled Testing Accuracy: {np.mean(read_model.predict(read_X_test) == read_Y_test)}')

Pickled Training Accuracy: 0.8463125
Pickled Testing Accuracy: 0.774


# Model and Vectorizer Learned Information

In [24]:
def getExtremeWords(vectorizer, model):
    """Print and return the 50 most negative and 50 most positive features.

    Features are ranked by the coefficient in row 0 of ``model.coef_``: the
    smallest coefficients are reported as the most negative words/phrases and
    the largest as the most positive.

    Args:
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()`` or
            the older ``get_feature_names()``.
        model: Fitted linear model whose ``coef_`` is a 2-D array with one
            column per vectorizer feature.

    Returns:
        Tuple ``(negative, positive)`` of NumPy string arrays, each ordered
        from most extreme to least extreme.
    """
    # Use the arguments that were passed in; the previous version silently read
    # the notebook-level globals read_vect / read_model instead.
    if hasattr(vectorizer, 'get_feature_names_out'):
        # get_feature_names() was removed from newer scikit-learn releases
        raw_names = vectorizer.get_feature_names_out()
    else:
        raw_names = vectorizer.get_feature_names()
    feature_names = np.asarray(raw_names, dtype=str)

    # argsort is ascending, so the most negative coefficients come first
    order = np.argsort(model.coef_)
    negative = feature_names[order[0, :50]]
    positive = feature_names[order[0, -50:]][::-1]

    print("Top 50 Most Negative Words/Phrases in Order:")
    print(negative)
    print()
    print("Top 50 Most Positive Words/Phrases in Order:")
    print(positive)

    return negative, positive

def predict(tweets, vectorizer, model):
    """Classify each tweet as Negative/Positive and report the confidence.

    The tweets are vectorized with ``vectorizer.transform`` and scored with
    ``model.predict_proba``. Column 0 of the probabilities is treated as the
    negative class; any other arg-max is reported as positive. A short report
    is printed for every tweet.

    Returns:
        List of ``(tweet, label, confidence)`` tuples, where ``confidence`` is
        the largest class probability for that tweet.
    """
    probabilities = model.predict_proba(vectorizer.transform(tweets))
    results = []
    for tweet, class_probs in zip(tweets, probabilities):
        if np.argmax(class_probs) == 0:
            label = "Negative"
        else:
            label = "Positive"
        confidence = np.max(class_probs)

        print(f'Tweet: {tweet}')
        print(f'Prediction: {label}')
        print(f'Confidence of {label} Prediction (0 to 1): {confidence}')
        print()
        results.append((tweet, label, confidence))
    return results

def analyzeTweets(tweets, vectorizer, model):
    """Print and return the model coefficient ("connotation") of each word.

    Every tweet is split on whitespace and lower-cased. A word that is one of
    the vectorizer's features gets the coefficient from row 0 of
    ``model.coef_``; any other word gets 0.

    Args:
        tweets: Iterable of tweet strings.
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()`` or
            the older ``get_feature_names()``.
        model: Fitted linear model whose ``coef_`` is a 2-D array with one
            column per vectorizer feature.

    Returns:
        One list per tweet of ``(word, coefficient)`` tuples, in word order.
    """
    # Build the name -> column lookup once. The previous version rebuilt the
    # full feature list and scanned it linearly twice for every single word.
    if hasattr(vectorizer, 'get_feature_names_out'):
        # get_feature_names() was removed from newer scikit-learn releases
        names = vectorizer.get_feature_names_out()
    else:
        names = vectorizer.get_feature_names()
    feature_index = {}
    for position, name in enumerate(names):
        # setdefault keeps the first position, matching list.index() semantics
        feature_index.setdefault(name, position)

    returnList = []
    for tweet in tweets:
        tweetList = []
        for word in tweet.split():
            word = word.lower()
            index = feature_index.get(word)
            if index is not None:
                print(f'Word: {word}, Connotation: {model.coef_[0, index]:.3f}')
                tweetList.append((word, model.coef_[0, index]))
            else: # not a top feature
                print(f'Word: {word}, Connotation: {0:.3f}')
                tweetList.append((word, 0))
        returnList.append(tweetList)
        print()
    return returnList

In [25]:
# Inspect the tf-idf model. To inspect the count-vectorizer model instead, load
# 'count_vectorizer.pickle' and 'count_vect_model.pickle' here.
# NOTE: pickle.load can execute arbitrary code; only load pickle files you created yourself.
with open('tfidf_vect.pickle', 'rb') as pickle_file:
    read_vect = pickle.load(pickle_file)
with open('tfidf_model.pickle', 'rb') as pickle_file:
    read_model = pickle.load(pickle_file)

getExtremeWords(vectorizer=read_vect, model=read_model)

Top 50 Most Negative Words/Phrases in Order:
['sad' 'miss' 'wish' 'sadly' 'sick' 'poor' 'missing' 'hate' 'sucks'
 'unfortunately' 'sorry' 'hurts' 'disappointed' 'missed' 'ugh' 'headache'
 'upset' 'worst' 'died' 'lost' 'broken' 'gutted' 'bad' 'broke' 'didnt'
 'rip' 'bummer' 'bummed' 'crying' 'hurt' 'horrible' 'didn' 'worried'
 'lonely' 'depressing' 'happened' 'depressed' 'hates' 'shame' 'stupid'
 'sold' 'cancelled' 'boo' 'disappointing' 'scared' 'missin' 'anymore'
 'wishing' 'alas' 'misses']

Top 50 Most Positive Words/Phrases in Order:
['thanks' 'thank' 'welcome' 'wish luck' 'awesome' 'great' 'yay' 'glad'
 'followfriday' 'hehe' 'congrats' 'happy' 'excited' 'congratulations'
 'amazing' 'love' 'ftw' 'cute' 'fantastic' 'heh' 'nice' 'hi' 'yummy'
 'smile' 'proud' 'excellent' 'don worry' 'cheers' 'loving' 'haha' 'thx'
 'isn bad' 'sweet' 'hello' 'www' 'smiling' 'enjoy' 'woohoo' 'don forget'
 'wonderful' 'interesting' 'pleasure' 'loves' 'feels good' 'hahaha' 'cool'
 'best' 'hilarious' 'good' '

(array(['sad', 'miss', 'wish', 'sadly', 'sick', 'poor', 'missing', 'hate',
        'sucks', 'unfortunately', 'sorry', 'hurts', 'disappointed',
        'missed', 'ugh', 'headache', 'upset', 'worst', 'died', 'lost',
        'broken', 'gutted', 'bad', 'broke', 'didnt', 'rip', 'bummer',
        'bummed', 'crying', 'hurt', 'horrible', 'didn', 'worried',
        'lonely', 'depressing', 'happened', 'depressed', 'hates', 'shame',
        'stupid', 'sold', 'cancelled', 'boo', 'disappointing', 'scared',
        'missin', 'anymore', 'wishing', 'alas', 'misses'], dtype='<U47'),
 array(['thanks', 'thank', 'welcome', 'wish luck', 'awesome', 'great',
        'yay', 'glad', 'followfriday', 'hehe', 'congrats', 'happy',
        'excited', 'congratulations', 'amazing', 'love', 'ftw', 'cute',
        'fantastic', 'heh', 'nice', 'hi', 'yummy', 'smile', 'proud',
        'excellent', 'don worry', 'cheers', 'loving', 'haha', 'thx',
        'isn bad', 'sweet', 'hello', 'www', 'smiling', 'enjoy', 'woohoo',
    

In [26]:
# Three hand-written example tweets to run through the loaded vectorizer and model
tweets = [
    'This bag of chips is disgusting yuck',
    'i really enjoy riding my bike',
    'it will be 70 degrees tomorrow',
]
predict(tweets, vectorizer=read_vect, model=read_model)

Tweet: This bag of chips is disgusting yuck
Prediction: Negative
Confidence of Negative Prediction (0 to 1): 0.8482958179119067

Tweet: i really enjoy riding my bike
Prediction: Positive
Confidence of Positive Prediction (0 to 1): 0.5754688288287676

Tweet: it will be 70 degrees tomorrow
Prediction: Negative
Confidence of Negative Prediction (0 to 1): 0.6703189484214831



[('This bag of chips is disgusting yuck', 'Negative', 0.8482958179119067),
 ('i really enjoy riding my bike', 'Positive', 0.5754688288287676),
 ('it will be 70 degrees tomorrow', 'Negative', 0.6703189484214831)]

In [27]:
# Show the per-word model coefficient ("connotation") for each example tweet
analyzeTweets(tweets, vectorizer=read_vect, model=read_model)

Word: this, Connotation: 0.000
Word: bag, Connotation: 0.342
Word: of, Connotation: 0.000
Word: chips, Connotation: 0.460
Word: is, Connotation: 0.000
Word: disgusting, Connotation: -2.387
Word: yuck, Connotation: -2.482

Word: i, Connotation: 0.000
Word: really, Connotation: -1.449
Word: enjoy, Connotation: 3.164
Word: riding, Connotation: 0.870
Word: my, Connotation: 0.000
Word: bike, Connotation: -0.049

Word: it, Connotation: 0.000
Word: will, Connotation: 0.000
Word: be, Connotation: 0.000
Word: 70, Connotation: -0.718
Word: degrees, Connotation: -0.852
Word: tomorrow, Connotation: -0.377



[[('this', 0),
  ('bag', 0.3423577609979143),
  ('of', 0),
  ('chips', 0.45981979122814143),
  ('is', 0),
  ('disgusting', -2.386586185338159),
  ('yuck', -2.48193471891813)],
 [('i', 0),
  ('really', -1.4492154011103868),
  ('enjoy', 3.1637664915811623),
  ('riding', 0.8695688712971726),
  ('my', 0),
  ('bike', -0.048906772296951936)],
 [('it', 0),
  ('will', 0),
  ('be', 0),
  ('70', -0.7179836835723201),
  ('degrees', -0.8522556574264579),
  ('tomorrow', -0.37719673053026603)]]

# Word2Vec Model 
### Generate similarity scores of words for visualization
References: https://stackabuse.com/implementing-word2vec-with-gensim-library-in-python/ and https://radimrehurek.com/gensim/auto_examples/core/run_similarity_queries.html#sphx-glr-auto-examples-core-run-similarity-queries-py

In [28]:
from gensim.models import Word2Vec
import nltk

ModuleNotFoundError: No module named 'gensim'

In [None]:
# Lower-case the training tweets with the vectorized string method instead of a
# slow iterrows() loop, then tokenize each tweet into a list of words.
tweets = train_df['text'].str.lower().tolist()
words = [nltk.word_tokenize(tweet) for tweet in tweets]

In [None]:
# Train Word2Vec on the tokenized training tweets.
# min_count=5: gensim ignores tokens that occur fewer than 5 times in total.
w2v = Word2Vec(words, min_count=5)
w2v.save('word2vec.model') # persist the trained model so it can be reloaded later

In [None]:
def getMostSimilarWords(tweets, w2v_model):
    """Look up the most similar Word2Vec words for every token of every tweet.

    Args:
        tweets: Iterable of tweet strings; each is lower-cased and tokenized
            with ``nltk.word_tokenize``.
        w2v_model: Trained gensim ``Word2Vec`` model.

    Returns:
        One list per tweet of ``(token, similar)`` tuples, where ``similar`` is
        the result of ``w2v_model.wv.most_similar(token)``, or ``[]`` when the
        token is not in the model's vocabulary.
    """
    word_vectors = w2v_model.wv
    # gensim 4 replaced KeyedVectors.vocab with key_to_index (accessing .vocab
    # now raises); fall back to .vocab so gensim 3.x models keep working.
    if hasattr(word_vectors, 'key_to_index'):
        vocabulary = word_vectors.key_to_index
    else:
        vocabulary = word_vectors.vocab

    returnList = []
    for tweet in tweets:
        tweetList = []
        for word in nltk.word_tokenize(tweet.lower()):
            if word in vocabulary:
                tweetList.append((word, word_vectors.most_similar(word)))
            else:
                # out-of-vocabulary token: no neighbours to report
                tweetList.append((word, []))
        returnList.append(tweetList)
    return returnList

In [None]:
# Same three example tweets as before, now run through the freshly trained Word2Vec model
tweets = [
    'This bag of chips is disgusting yuck',
    'i really enjoy riding my bike',
    'it will be 70 degrees tomorrow',
]
getMostSimilarWords(tweets, w2v_model=w2v)

In [None]:
# Reload the Word2Vec model saved to 'word2vec.model' and run the same
# similarity lookup on the example tweets with the reloaded copy.
read_w2v = Word2Vec.load('word2vec.model')
getMostSimilarWords(tweets, w2v_model=read_w2v)