In [18]:
import pandas as pd
import numpy as np

import pickle
import scipy.sparse

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec

from sklearn.linear_model import LogisticRegression

from models import train_classifier, train_w2v, getExtremeWords, predict, analyzeTweets, getMostSimilarWords

# Data Loading Stage + Pre-Processing

In [2]:
# From https://www.kaggle.com/kazanova/sentiment140
df = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='latin-1', header=None)
df.columns = ['target','ids','date','flag','user','text']

In [3]:
df.head(10)

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
5,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
6,0,1467811592,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,mybirch,Need a hug
7,0,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,@LOLTrish hey long time no see! Yes.. Rains a...
8,0,1467811795,Mon Apr 06 22:20:05 PDT 2009,NO_QUERY,2Hood4Hollywood,@Tatiana_K nope they didn't have it
9,0,1467812025,Mon Apr 06 22:20:09 PDT 2009,NO_QUERY,mimismo,@twittera que me muera ?


In [4]:
final_large_df = df.drop(['ids', 'date', 'flag', 'user'], axis=1) # drop cols that aren't useful for our model
final_large_df

Unnamed: 0,target,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."
...,...,...
1599995,4,Just woke up. Having no school is the best fee...
1599996,4,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,Happy 38th Birthday to my boo of alll time!!! ...


In [5]:
print('Number of Negative Sentiment Tweets:', len(final_large_df[final_large_df['target'] == 0]))
print('Number of Neutral Sentiment Tweets:', len(final_large_df[final_large_df['target'] == 2]))
print('Number of Positive Sentiment Tweets:', len(final_large_df[final_large_df['target'] == 4]))

Number of Negative Sentiment Tweets: 800000
Number of Neutral Sentiment Tweets: 0
Number of Positive Sentiment Tweets: 800000


### Notice no neutral sentiment data and way too much data!
The dataset only holds positive and negative sentiment tweets, so the targets are only 0 or 4. The dataset is also 1.6 million tweets which is too large for our compute resources. 

In [6]:
final_large_df.target = final_large_df.target / 4 # convert the target column to 0 and 1 labels where 1 is positive
final_df = final_large_df.sample(800000) # random sample 800k rows
del final_large_df, df
final_df

Unnamed: 0,target,text
1594484,1.0,and...my LAST/ BACK ROW CREW!! my SWEAT 6TENer...
1103711,1.0,heyyy I have 14* new followers last nite &amp;...
1530556,1.0,@heycassadee I just like how Cassadee Pope tre...
1007094,1.0,@LilCease Awwww that was sweet of you...thats ...
655573,0.0,One of those rainy days in chicago..I wanna st...
...,...,...
797663,0.0,has done a lot of work today. The bedroom is v...
82245,0.0,I woke up from a dream. I was playing the slo...
826584,1.0,@toddbeltz I'm good! I think your work is like...
140017,0.0,fuck! i hate streaming seasons of shows online...


### Split into Training and Test Splits for Model Evaluation
Notice with the sampling and splits of the data, we still maintain strong class balance in the datasets. 

In [7]:
train_df, test_df = train_test_split(final_df, test_size=0.20)

In [8]:
print('Training Set Stats:')
print('Size of Training Set:', len(train_df))
print('Number of Negative Sentiment Tweets:', len(train_df[train_df['target'] == 0]))
print('Number of Positive Sentiment Tweets:', len(train_df[train_df['target'] == 1]))

Training Set Stats:
Size of Training Set: 640000
Number of Negative Sentiment Tweets: 320220
Number of Positive Sentiment Tweets: 319780


In [9]:
print('Test Set Stats:')
print('Size of Test Set:', len(test_df))
print('Number of Negative Sentiment Tweets:', len(test_df[test_df['target'] == 0]))
print('Number of Positive Sentiment Tweets:', len(test_df[test_df['target'] == 1]))

Test Set Stats:
Size of Test Set: 160000
Number of Negative Sentiment Tweets: 79864
Number of Positive Sentiment Tweets: 80136


# Count-Vectorizer Model 

In [10]:
# train function writes the model, vectorizer, and data used to files for use by the web app or other functions
train_classifier(train_df, test_df, CountVectorizer, stop_words='english', max_features=200000, ngram_range=(1,3), C=0.1)

Vectorizing data...
Training Model...

LogisticRegression Classifier using CountVectorizer
Training Accuracy: 0.8206421875
Testing Accuracy: 0.79059375


# Tf-Idf Model

In [11]:
train_classifier(train_df, test_df, TfidfVectorizer, stop_words='english', max_features=200000, ngram_range=(1,3), C=1.5)

Vectorizing data...
Training Model...

LogisticRegression Classifier using TfidfVectorizer
Training Accuracy: 0.84041875
Testing Accuracy: 0.79191875


## Boilerplate Code for Loading Saved Model/Vectorizer/Data
Notice the training and testing accuracies match the ones above, which means we have saved and loaded everything correctly!

In [12]:
# Test how to load the files that were written by train_classifier
read_vect = pickle.load(open('./out/tfidf_vect.pickle', 'rb'))
read_model = pickle.load(open('./out/tfidf_model.pickle', 'rb'))

read_X_train = scipy.sparse.load_npz('./out/tfidf_vect_X_train.npz')
read_Y_train = np.load('./out/tfidf_vect_Y_train.npy')
read_X_test = scipy.sparse.load_npz('./out/tfidf_vect_X_test.npz')
read_Y_test = np.load('./out/tfidf_vect_Y_test.npy')
print(f'Pickled Training Accuracy: {np.mean(read_model.predict(read_X_train) == read_Y_train)}')
print(f'Pickled Testing Accuracy: {np.mean(read_model.predict(read_X_test) == read_Y_test)}')

Pickled Training Accuracy: 0.84041875
Pickled Testing Accuracy: 0.79191875


# Model and Vectorizer Learned Information
### Analyze a chosen model and its predictions on some sample tweets

In [13]:
# read_vect = pickle.load(open('./out/count_vect.pickle', 'rb'))
# read_model = pickle.load(open('./out/count_model.pickle', 'rb'))
read_vect = pickle.load(open('./out/tfidf_vect.pickle', 'rb'))
read_model = pickle.load(open('./out/tfidf_model.pickle', 'rb'))

getExtremeWords(vectorizer=read_vect, model=read_model)

(array(['sad', 'sick', 'miss', "can't", 'poor', 'missing', 'sadly',
        'sucks', 'hurts', 'bummed', 'headache', 'wish', 'unfortunately',
        'died', 'gutted', 'ugh', 'hate', 'broke', 'lost', 'disappointed',
        'depressing', 'upset', 'missed', 'bummer', 'rip', 'disappointing',
        'lonely', 'ruined', 'missin', 'depressed', 'cancelled', 'broken',
        'worst', 'sorry', "didn't", 'horrible', 'anymore', 'bad', 'sigh',
        '. miss', 'closed', 'hurt', 'failed', 'stuck', 'misses', 'damn',
        'ouch', 'hates', 'argh', 'hurting'], dtype='<U59'),
 array(["can't wait", 'wish luck', 'thank', 'thanks', 'welcome', 'smile',
        'yay', "don't forget", '#followfriday', 'glad', 'awesome', '=(',
        "isn't bad", 'congrats', 'excited', 'proud', 'congratulations',
        'happy', 'amazing', 'gotta love', 'blessed', 'smiling', 'loving',
        "don't worry", 'hehe', 'wonderful', 'let know', 'great', 'hi',
        'love', 'sweet', "don't need", 'hehehe', 'pleasure', "was

In [14]:
tweets = ['This bag of chips is disgusting yuck', 'i really enjoy riding my bike', 'This cameraman is nice!']
predict(tweets, vectorizer=read_vect, model=read_model)

[('This bag of chips is disgusting yuck', 'Negative', 0.93666026929223),
 ('i really enjoy riding my bike', 'Positive', 0.7951799929885445),
 ('This cameraman is nice!', 'Positive', 0.9681563649737266)]

In [15]:
analyzeTweets(tweets, vectorizer=read_vect, model=read_model)

[[('this', 0),
  ('bag', 0.11614393968954008),
  ('of', 0),
  ('chips', -0.012730931477821544),
  ('is', 0),
  ('disgusting', -3.331450053894557),
  ('yuck', -3.587036261276761)],
 [('i', 0),
  ('really', -1.895889860493235),
  ('enjoy', 3.9429369045956975),
  ('riding', 1.1123229038476234),
  ('my', 0),
  ('bike', -0.38522295107391946)],
 [('this', 0),
  ('cameraman', 0),
  ('is', 0),
  ('nice', 3.6284864899735725),
  ('!', 2.631494531320365)]]

# Word2Vec Model 
### Generate the most similar words and similarity scores per word in sample tweets for visualization
Referenced https://stackabuse.com/implementing-word2vec-with-gensim-library-in-python/ and https://radimrehurek.com/gensim/auto_examples/core/run_similarity_queries.html#sphx-glr-auto-examples-core-run-similarity-queries-py

In [16]:
train_w2v(train_df)

In [17]:
# Reading model from file
read_w2v = Word2Vec.load('./out/word2vec.model')

tweets = ['This bag of chips is disgusting yuck', 'i really enjoy riding my bike', 'This cameraman is nice!']
getMostSimilarWords(tweets, w2v_model=read_w2v)

[[('this',
   [('that', 0.5846254825592041),
    ('which', 0.5327487587928772),
    ('next', 0.5212511420249939),
    ('it', 0.5002129077911377),
    ('the', 0.49814873933792114),
    ('every', 0.46154963970184326),
    ('today', 0.410861611366272),
    ('mid', 0.4086470901966095),
    ('a', 0.40743839740753174),
    ('another', 0.40245521068573)]),
  ('bag',
   [('box', 0.7764383554458618),
    ('shoe', 0.7533471584320068),
    ('jacket', 0.74973064661026),
    ('bottle', 0.746452808380127),
    ('mug', 0.7421192526817322),
    ('belt', 0.7348504066467285),
    ('pen', 0.7292981147766113),
    ('pocket', 0.7237828969955444),
    ('pot', 0.7233998775482178),
    ('tank', 0.7175831198692322)]),
  ('of',
   [('obstacle', 0.43935900926589966),
    ('recent', 0.434344619512558),
    ('actual', 0.4305911362171173),
    ("world's", 0.4273523688316345),
    ('current', 0.41370707750320435),
    ('thereof', 0.41030368208885193),
    ('parking', 0.4006973206996918),
    ('male', 0.3994006812572