In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression

import scipy.sparse

from nltk.tokenize import TweetTokenizer

In [2]:
# Shared tokenizer used by both vectorizers and the Word2Vec section below.
# Configured with strip_handles=True and preserve_case=False.
tweetTokenizer = TweetTokenizer(strip_handles=True, preserve_case=False)

# Data Loading Stage + Pre-processing

In [3]:
# Sentiment140 dataset: https://www.kaggle.com/kazanova/sentiment140
# The file is parsed with header=None (first row is treated as data), and the
# six column names are supplied explicitly.
# NOTE(review): an earlier comment said a header line was added to the CSV
# after download; that conflicts with header=None -- confirm the file on disk
# has no header row.
df = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    encoding='latin-1',
    header=None,
    names=['target', 'ids', 'date', 'flag', 'user', 'text'],
)

In [4]:
# Preview the first ten rows of the raw frame.
df.head(10)

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
5,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
6,0,1467811592,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,mybirch,Need a hug
7,0,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,@LOLTrish hey long time no see! Yes.. Rains a...
8,0,1467811795,Mon Apr 06 22:20:05 PDT 2009,NO_QUERY,2Hood4Hollywood,@Tatiana_K nope they didn't have it
9,0,1467812025,Mon Apr 06 22:20:09 PDT 2009,NO_QUERY,mimismo,@twittera que me muera ?


In [5]:
# Keep only the label ('target') and the tweet body ('text'); the remaining
# columns aren't useful for our model.
final_large_df = df.drop(columns=['ids', 'date', 'flag', 'user'])
final_large_df

Unnamed: 0,target,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."
...,...,...
1599995,4,Just woke up. Having no school is the best fee...
1599996,4,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,Happy 38th Birthday to my boo of alll time!!! ...


In [6]:
# Count tweets per raw label value: 0 is reported as negative, 2 as neutral
# and 4 as positive.
print('Number of Negative Sentiment Tweets:', int((final_large_df['target'] == 0).sum()))
print('Number of Neutral Sentiment Tweets:', int((final_large_df['target'] == 2).sum()))
print('Number of Positive Sentiment Tweets:', int((final_large_df['target'] == 4).sum()))

Number of Negative Sentiment Tweets: 800000
Number of Neutral Sentiment Tweets: 0
Number of Positive Sentiment Tweets: 800000


### Note: the dataset contains no neutral-sentiment tweets, and 1.6M tweets is more than we need, so we subsample below.

In [7]:
# Rescale the raw labels from {0, 4} to {0, 1}, where 1 is positive.
# Note: this overwrites the column, so re-running the cell divides by 4 again.
final_large_df['target'] = final_large_df['target'].div(4)

In [8]:
# Display the frame to confirm the rescaled 'target' values.
final_large_df

Unnamed: 0,target,text
0,0.0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0.0,is upset that he can't update his Facebook by ...
2,0.0,@Kenichan I dived many times for the ball. Man...
3,0.0,my whole body feels itchy and like its on fire
4,0.0,"@nationwideclass no, it's not behaving at all...."
...,...,...
1599995,1.0,Just woke up. Having no school is the best fee...
1599996,1.0,TheWDB.com - Very cool to hear old Walt interv...
1599997,1.0,Are you ready for your MoJo Makeover? Ask me f...
1599998,1.0,Happy 38th Birthday to my boo of alll time!!! ...


In [9]:
# Work on a random subsample of 800,000 tweets (half of the 1.6M loaded).
# A fixed random_state makes the subsample -- and every number reported
# further down -- reproducible across kernel restarts.
final_df = final_large_df.sample(800000, random_state=42)
# Release the full-size frames; only the subsample is used from here on.
del final_large_df, df

In [10]:
# Display the subsampled frame (the index keeps the original row numbers).
final_df

Unnamed: 0,target,text
1552753,1.0,good morning twitter!
1195104,1.0,@etvthetrainer It's cool bro... I had your bac...
151484,0.0,"Finally saw Star Trek, It was good. Had some s..."
151130,0.0,ughh headached
77189,0.0,eating toast and hoping I'm not gonna be sick ...
...,...,...
97397,0.0,ugghhh.. i have a headache
1289824,1.0,@Strok Have fun on the sleeper
290325,0.0,I didn't watch MTV MA sucks..
412779,0.0,PS. i'm like 2 shades darker now after playing...


### Split into Training and Test Splits for Model Evaluation

In [11]:
# Hold out 20% of the subsample for evaluation. The fixed random_state keeps
# the split identical between runs so accuracies are comparable.
train_df, test_df = train_test_split(final_df, test_size=0.20, random_state=42)

In [12]:
# Size and class balance of the training split.
print('Training Set Stats:')
print('Size of Training Set:', train_df.shape[0])
print('Number of Negative Sentiment Tweets:', int((train_df['target'] == 0).sum()))
print('Number of Positive Sentiment Tweets:', int((train_df['target'] == 1).sum()))

Training Set Stats:
Size of Training Set: 640000
Number of Negative Sentiment Tweets: 319778
Number of Positive Sentiment Tweets: 320222


In [13]:
# Size and class balance of the held-out test split.
print('Test Set Stats:')
print('Size of Test Set:', test_df.shape[0])
print('Number of Negative Sentiment Tweets:', int((test_df['target'] == 0).sum()))
print('Number of Positive Sentiment Tweets:', int((test_df['target'] == 1).sum()))

Test Set Stats:
Size of Test Set: 160000
Number of Negative Sentiment Tweets: 80040
Number of Positive Sentiment Tweets: 79960


# Count-Vectorizer Model 

In [14]:
# Bag-of-words features over 1- to 3-grams, capped at 200,000 features and
# tokenized with the shared tweet tokenizer.
count_vectorizer = CountVectorizer(
    stop_words='english',
    max_features=200000,
    tokenizer=tweetTokenizer.tokenize,
    ngram_range=(1, 3),
)

# Read the columns directly rather than looping with iterrows(), which builds
# a Series object for every row and was executed four times over the frames.
# The vocabulary is fitted on the training split only and reused for the test split.
X_train = count_vectorizer.fit_transform(train_df['text'].tolist())
Y_train = train_df['target'].astype(int).to_numpy()

X_test = count_vectorizer.transform(test_df['text'].tolist())
Y_test = test_df['target'].astype(int).to_numpy()

In [15]:
# Train a logistic-regression classifier on the count features.
# fit() returns the estimator itself, so the assignment holds the fitted model;
# the bare name on the last line displays it as the cell output.
count_vect_model = LogisticRegression(C=0.1, max_iter=15000).fit(X_train, Y_train)
count_vect_model

LogisticRegression(C=0.1, max_iter=15000)

In [16]:
# Accuracy = fraction of predictions that equal the true label.
print(f'Training Accuracy: {(count_vect_model.predict(X_train) == Y_train).mean()}')
print(f'Testing Accuracy: {(count_vect_model.predict(X_test) == Y_test).mean()}')

Training Accuracy: 0.8215390625
Testing Accuracy: 0.78840625


In [17]:
# Write everything to files.
# `with` guarantees each pickle file is flushed and closed once the dump is
# done, instead of leaving the handle open until garbage collection.
with open('count_vectorizer.pickle', 'wb') as vectorizer_file:
    pickle.dump(count_vectorizer, vectorizer_file)
with open('count_vect_model.pickle', 'wb') as model_file:
    pickle.dump(count_vect_model, model_file)

# Feature matrices are sparse (.npz); labels are dense arrays (.npy).
scipy.sparse.save_npz('count_vect_X_train.npz', X_train)
np.save('count_vect_Y_train.npy', Y_train)
scipy.sparse.save_npz('count_vect_X_test.npz', X_test)
np.save('count_vect_Y_test.npy', Y_test)

In [18]:
# Test that pickling is working: reload every artifact written above and
# check that the reloaded model reproduces the same accuracies.
# The files are opened with `with` so the handles are closed after loading.
# pickle.load runs arbitrary code from the file -- only load pickles you wrote yourself.
with open('count_vectorizer.pickle', 'rb') as vectorizer_file:
    read_vect = pickle.load(vectorizer_file)
with open('count_vect_model.pickle', 'rb') as model_file:
    read_model = pickle.load(model_file)

read_X_train = scipy.sparse.load_npz('count_vect_X_train.npz')
read_Y_train = np.load('count_vect_Y_train.npy')
read_X_test = scipy.sparse.load_npz('count_vect_X_test.npz')
read_Y_test = np.load('count_vect_Y_test.npy')
print(f'Pickled Training Accuracy: {np.mean(read_model.predict(read_X_train) == read_Y_train)}')
print(f'Pickled Testing Accuracy: {np.mean(read_model.predict(read_X_test) == read_Y_test)}')

Pickled Training Accuracy: 0.8215390625
Pickled Testing Accuracy: 0.78840625


In [19]:
# Sanity check: taking the most probable class from predict_proba should give
# the same training accuracy as predict().
(read_model.predict_proba(read_X_train).argmax(axis=1) == read_Y_train).mean()

0.8215390625

# Tf-Idf Model

In [20]:
# TF-IDF weighted features over 1- to 3-grams, capped at 200,000 features and
# tokenized with the shared tweet tokenizer.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=200000,
    tokenizer=tweetTokenizer.tokenize,
    ngram_range=(1, 3),
)

# Read the columns directly rather than looping with iterrows(), which builds
# a Series object for every row and was executed four times over the frames.
# The vocabulary/IDF weights are fitted on the training split only.
X_train = tfidf_vectorizer.fit_transform(train_df['text'].tolist())
Y_train = train_df['target'].astype(int).to_numpy()

X_test = tfidf_vectorizer.transform(test_df['text'].tolist())
Y_test = test_df['target'].astype(int).to_numpy()

In [21]:
# Train a logistic-regression classifier on the TF-IDF features.
# fit() returns the estimator itself, so the assignment holds the fitted model;
# the bare name on the last line displays it as the cell output.
tfidf_model = LogisticRegression(C=1.5, max_iter=15000).fit(X_train, Y_train)
tfidf_model

LogisticRegression(C=1.5, max_iter=15000)

In [22]:
# Accuracy = fraction of predictions that equal the true label.
print(f'Training Accuracy: {(tfidf_model.predict(X_train) == Y_train).mean()}')
print(f'Testing Accuracy: {(tfidf_model.predict(X_test) == Y_test).mean()}')

Training Accuracy: 0.8408359375
Testing Accuracy: 0.7907375


In [23]:
# Persist the fitted TF-IDF vectorizer, the trained model and the feature data.
# `with` guarantees each pickle file is flushed and closed once the dump is
# done, instead of leaving the handle open until garbage collection.
with open('tfidf_vect.pickle', 'wb') as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)
with open('tfidf_model.pickle', 'wb') as model_file:
    pickle.dump(tfidf_model, model_file)

# Feature matrices are sparse (.npz); labels are dense arrays (.npy).
scipy.sparse.save_npz('tfidf_vect_X_train.npz', X_train)
np.save('tfidf_vect_Y_train.npy', Y_train)
scipy.sparse.save_npz('tfidf_vect_X_test.npz', X_test)
np.save('tfidf_vect_Y_test.npy', Y_test)

In [24]:
# Test that pickling is working: reload every TF-IDF artifact written above
# and check that the reloaded model reproduces the same accuracies.
# The files are opened with `with` so the handles are closed after loading.
# pickle.load runs arbitrary code from the file -- only load pickles you wrote yourself.
with open('tfidf_vect.pickle', 'rb') as vectorizer_file:
    read_vect = pickle.load(vectorizer_file)
with open('tfidf_model.pickle', 'rb') as model_file:
    read_model = pickle.load(model_file)

read_X_train = scipy.sparse.load_npz('tfidf_vect_X_train.npz')
read_Y_train = np.load('tfidf_vect_Y_train.npy')
read_X_test = scipy.sparse.load_npz('tfidf_vect_X_test.npz')
read_Y_test = np.load('tfidf_vect_Y_test.npy')
print(f'Pickled Training Accuracy: {np.mean(read_model.predict(read_X_train) == read_Y_train)}')
print(f'Pickled Testing Accuracy: {np.mean(read_model.predict(read_X_test) == read_Y_test)}')

Pickled Training Accuracy: 0.8408359375
Pickled Testing Accuracy: 0.7907375


# Model and Vectorizer Learned Information

In [25]:
def getExtremeWords(vectorizer, model):
    """Print and return the 50 most negative and 50 most positive features.

    Features are ranked by the first row of ``model.coef_``: the lowest
    coefficients are reported as the most negative words/phrases and the
    highest as the most positive.

    Args:
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()`` or
            the older ``get_feature_names()``.
        model: Fitted linear model exposing a 2-D ``coef_`` array whose
            columns line up with the vectorizer's features.

    Returns:
        Tuple ``(negative, positive)`` of string arrays, each ordered from
        most to least extreme (at most 50 entries each).
    """
    # Use the arguments that were passed in; the previous version silently read
    # the notebook globals `read_vect` / `read_model` instead.
    # Newer scikit-learn only provides get_feature_names_out(); older releases
    # only provide get_feature_names(). Use whichever is available.
    get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    feature_names = np.asarray(get_names(), dtype=str)

    # argsort works along the last axis, so row 0 lists feature indices from
    # the smallest coefficient to the largest.
    order = np.argsort(model.coef_)
    most_negative = feature_names[order[0, :50]]
    most_positive = feature_names[order[0, -50:]][::-1]

    print("Top 50 Most Negative Words/Phrases in Order:")
    print(most_negative)
    print()
    print("Top 50 Most Positive Words/Phrases in Order:")
    print(most_positive)

    return most_negative, most_positive  # negative, positive

def predict(tweets, vectorizer, model):
    """Classify each tweet, print the verdicts and return them.

    Args:
        tweets: Sequence of raw tweet strings.
        vectorizer: Fitted vectorizer used to turn the tweets into features.
        model: Fitted classifier exposing ``predict_proba``.

    Returns:
        List of ``(tweet, label, confidence)`` tuples, where ``label`` is
        "Negative" when the first probability column is the largest and
        "Positive" otherwise, and ``confidence`` is that largest probability.
    """
    probabilities = model.predict_proba(vectorizer.transform(tweets))
    results = []
    for position, tweet in enumerate(tweets):
        class_probs = probabilities[position]
        # Column 0 winning means the negative class; anything else is positive.
        if np.argmax(class_probs) == 0:
            pred = "Negative"
        else:
            pred = "Positive"
        confidence = np.max(class_probs)
        print(f'Tweet: {tweet}')
        print(f'Prediction: {pred}')
        print(f'Confidence of {pred} Prediction (0 to 1): {confidence}')
        print()
        results.append((tweet, pred, confidence))
    return results

def analyzeTweets(tweets, vectorizer, model):
    """Print and return the model coefficient of every word in each tweet.

    Each tweet is split on whitespace and lower-cased. A word that is one of
    the vectorizer's features gets the matching entry of ``model.coef_[0]``;
    any other word is reported with a connotation of 0.

    Args:
        tweets: Iterable of raw tweet strings.
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()`` or
            the older ``get_feature_names()``.
        model: Fitted linear model exposing a 2-D ``coef_`` array whose
            columns line up with the vectorizer's features.

    Returns:
        One list per tweet, each holding ``(word, coefficient)`` tuples in
        the order the words appear.
    """
    # Build the feature -> column lookup once. The previous version rebuilt the
    # full feature list (and scanned it linearly) for every single word.
    # Newer scikit-learn only provides get_feature_names_out(); older releases
    # only provide get_feature_names(). Use whichever is available.
    get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    feature_index = {}
    for position, name in enumerate(get_names()):
        # setdefault keeps the first position, matching list.index().
        feature_index.setdefault(name, position)

    returnList = []
    for tweet in tweets:
        tweetList = []
        for word in tweet.split():
            word = word.lower()
            index = feature_index.get(word)
            if index is not None:
                weight = model.coef_[0, index]
                print(f'Word: {word}, Connotation: {weight:.3f}')
                tweetList.append((word, weight))
            else:  # not a top feature
                print(f'Word: {word}, Connotation: {0:.3f}')
                tweetList.append((word, 0))
        returnList.append(tweetList)
        print()
    return returnList

In [26]:
# Load the TF-IDF vectorizer and model for inspection.
# To inspect the count-vectorizer model instead, load
# 'count_vectorizer.pickle' and 'count_vect_model.pickle' here.
# The files are opened with `with` so the handles are closed after loading.
# pickle.load runs arbitrary code from the file -- only load pickles you wrote yourself.
with open('tfidf_vect.pickle', 'rb') as vectorizer_file:
    read_vect = pickle.load(vectorizer_file)
with open('tfidf_model.pickle', 'rb') as model_file:
    read_model = pickle.load(model_file)

getExtremeWords(vectorizer=read_vect, model=read_model)

Top 50 Most Negative Words/Phrases in Order:
['sad' 'miss' 'sick' 'poor' 'missing' 'sucks' "can't" 'sadly' 'hurts'
 'bummed' 'wish' 'headache' 'died' 'unfortunately' 'gutted' 'upset' 'hate'
 'lost' 'ugh' 'cancelled' 'disappointed' 'broke' 'depressed' 'rip'
 'missed' 'bummer' 'lonely' 'depressing' 'broken' "didn't" 'ruined' 'hurt'
 'missin' 'horrible' 'misses' 'closed' 'sold' 'worst' 'disappointing'
 'fml' 'hates' 'crying' 'sorry' 'failed' 'didnt' 'passed away' 'stuck'
 'going miss' 'ouch' 'bad']

Top 50 Most Positive Words/Phrases in Order:
["can't wait" 'wish luck' 'thank' 'thanks' 'smile' 'welcome' 'yay' 'proud'
 'glad' 'congrats' 'awesome' "isn't bad" 'congratulations' 'blessed' '=('
 'amazing' 'hehe' "don't forget" 'excited' 'great' 'let know'
 '#followfriday' 'love' "doesn't mean" 'hi' 'hello' 'pleasure' 'happy'
 "don't feel bad" 'feels good' 'hilarious' 'cute' "doesn't hurt" 'loving'
 'hehehe' 'yummy' 'heh' 'best' 'smiling' 'nice' 'wonderful' "don't need"
 'woohoo' 'sweet' 'feel 

(array(['sad', 'miss', 'sick', 'poor', 'missing', 'sucks', "can't",
        'sadly', 'hurts', 'bummed', 'wish', 'headache', 'died',
        'unfortunately', 'gutted', 'upset', 'hate', 'lost', 'ugh',
        'cancelled', 'disappointed', 'broke', 'depressed', 'rip', 'missed',
        'bummer', 'lonely', 'depressing', 'broken', "didn't", 'ruined',
        'hurt', 'missin', 'horrible', 'misses', 'closed', 'sold', 'worst',
        'disappointing', 'fml', 'hates', 'crying', 'sorry', 'failed',
        'didnt', 'passed away', 'stuck', 'going miss', 'ouch', 'bad'],
       dtype='<U59'),
 array(["can't wait", 'wish luck', 'thank', 'thanks', 'smile', 'welcome',
        'yay', 'proud', 'glad', 'congrats', 'awesome', "isn't bad",
        'congratulations', 'blessed', '=(', 'amazing', 'hehe',
        "don't forget", 'excited', 'great', 'let know', '#followfriday',
        'love', "doesn't mean", 'hi', 'hello', 'pleasure', 'happy',
        "don't feel bad", 'feels good', 'hilarious', 'cute',
        

In [27]:
# Three hand-written sample tweets to spot-check the loaded model.
tweets = [
    'This bag of chips is disgusting yuck',
    'i really enjoy riding my bike',
    'it will be 70 degrees tomorrow',
]
predict(tweets, vectorizer=read_vect, model=read_model)

Tweet: This bag of chips is disgusting yuck
Prediction: Negative
Confidence of Negative Prediction (0 to 1): 0.956246861827081

Tweet: i really enjoy riding my bike
Prediction: Positive
Confidence of Positive Prediction (0 to 1): 0.7709826743661118

Tweet: it will be 70 degrees tomorrow
Prediction: Positive
Confidence of Positive Prediction (0 to 1): 0.6524418146486487



[('This bag of chips is disgusting yuck', 'Negative', 0.956246861827081),
 ('i really enjoy riding my bike', 'Positive', 0.7709826743661118),
 ('it will be 70 degrees tomorrow', 'Positive', 0.6524418146486487)]

In [28]:
# Show the per-word coefficients behind the predictions for the sample tweets.
analyzeTweets(tweets, vectorizer=read_vect, model=read_model)

Word: this, Connotation: 0.000
Word: bag, Connotation: -0.120
Word: of, Connotation: 0.000
Word: chips, Connotation: -0.022
Word: is, Connotation: 0.000
Word: disgusting, Connotation: -3.183
Word: yuck, Connotation: -3.294

Word: i, Connotation: 0.000
Word: really, Connotation: -2.093
Word: enjoy, Connotation: 3.382
Word: riding, Connotation: 1.433
Word: my, Connotation: 0.000
Word: bike, Connotation: -0.039

Word: it, Connotation: 0.000
Word: will, Connotation: 0.000
Word: be, Connotation: 0.000
Word: 70, Connotation: 0.348
Word: degrees, Connotation: -1.107
Word: tomorrow, Connotation: -0.678



[[('this', 0),
  ('bag', -0.1203894116242802),
  ('of', 0),
  ('chips', -0.021783181965084244),
  ('is', 0),
  ('disgusting', -3.1827745202146605),
  ('yuck', -3.2939277923415817)],
 [('i', 0),
  ('really', -2.092914940798131),
  ('enjoy', 3.381730950190004),
  ('riding', 1.4327370573246485),
  ('my', 0),
  ('bike', -0.03879990782602385)],
 [('it', 0),
  ('will', 0),
  ('be', 0),
  ('70', 0.34848290843005564),
  ('degrees', -1.1068396914910374),
  ('tomorrow', -0.6779486903221125)]]

# Word2Vec Model 
### Generate similarity scores of words for visualization
Referenced https://stackabuse.com/implementing-word2vec-with-gensim-library-in-python/ and https://radimrehurek.com/gensim/auto_examples/core/run_similarity_queries.html#sphx-glr-auto-examples-core-run-similarity-queries-py

In [29]:
# Word2Vec is used in the cells below to train and reload word embeddings.
# NOTE(review): these imports sit mid-notebook; consider moving them to the
# import cell at the top so a fresh kernel run has them available.
from gensim.models import Word2Vec
import nltk
# Fetch the NLTK 'punkt' package (reported as already up-to-date if present).
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/nikhilpathak/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [30]:
# Lower-case every training tweet, then tokenize it for Word2Vec.
# Iterating over the 'text' column directly avoids iterrows(), which builds a
# Series object for every row of the training frame.
tweets = [text.lower() for text in train_df['text']]
words = [tweetTokenizer.tokenize(tweet) for tweet in tweets]

In [3]:
# Train word embeddings on the tokenized training tweets, keeping only tokens
# that occur at least 5 times, then write the model to file.
w2v = Word2Vec(sentences=words, min_count=5)
w2v.save('word2vec.model')

NameError: name 'Word2Vec' is not defined

In [1]:
def getMostSimilarWords(tweets, w2v_model):
    """Look up the nearest Word2Vec neighbours of every token in each tweet.

    Args:
        tweets: Iterable of raw tweet strings; each is lower-cased and split
            with the notebook's shared ``tweetTokenizer``.
        w2v_model: Trained gensim Word2Vec model. Only its ``wv`` attribute
            is used.

    Returns:
        One list per tweet, each holding ``(token, neighbours)`` tuples where
        ``neighbours`` is the result of ``wv.most_similar(token)``, or an
        empty list when the token is not in the model's vocabulary.
    """
    # `w2v_model` is now a real parameter: the body and every call site already
    # used that name, but the old signature only accepted `tweets`.
    word_vectors = w2v_model.wv
    # gensim >= 4 exposes the vocabulary as `key_to_index`; older releases
    # expose it as `vocab`. Both support the `in` test used below.
    vocabulary = getattr(word_vectors, 'key_to_index', None)
    if vocabulary is None:
        vocabulary = word_vectors.vocab

    returnList = []
    for tweet in tweets:
        tweetList = []
        for word in tweetTokenizer.tokenize(tweet.lower()):
            if word in vocabulary:
                tweetList.append((word, word_vectors.most_similar(word)))
            else:
                tweetList.append((word, []))
        returnList.append(tweetList)
    return returnList

In [2]:
# Same three sample tweets as in the prediction section, queried against the
# in-memory Word2Vec model `w2v` (the training cell must have run first).
tweets = ['This bag of chips is disgusting yuck', 'i really enjoy riding my bike', 'it will be 70 degrees tomorrow']
getMostSimilarWords(tweets, w2v_model=w2v)

NameError: name 'w2v' is not defined

In [34]:
# Reading from file: reload the saved Word2Vec model and run the same
# similarity lookup on the sample tweets.
read_w2v = Word2Vec.load('word2vec.model')
getMostSimilarWords(tweets, w2v_model=read_w2v)

[[('this',
   [('that', 0.5935733914375305),
    ('which', 0.5301090478897095),
    ('the', 0.49837833642959595),
    ('it', 0.47747719287872314),
    ('every', 0.4759232997894287),
    ('next', 0.4756498634815216),
    ('another', 0.40533578395843506),
    ('a', 0.40285879373550415),
    ('today', 0.3960303068161011),
    ('ths', 0.39152267575263977)]),
  ('bag',
   [('pocket', 0.7745541930198669),
    ('box', 0.7635934352874756),
    ('closet', 0.729519784450531),
    ('purse', 0.7203100919723511),
    ('hole', 0.715181827545166),
    ('mug', 0.7074861526489258),
    ('fridge', 0.7056729197502136),
    ('shoe', 0.7011345624923706),
    ('bottle', 0.6992654204368591),
    ('truck', 0.6980395913124084)]),
  ('of',
   [('current', 0.4695361852645874),
    ("world's", 0.435147225856781),
    ('valuable', 0.4214920103549957),
    ('stanley', 0.42116615176200867),
    ('involving', 0.41132527589797974),
    ('recent', 0.4069414436817169),
    ('gun', 0.3961132764816284),
    ('led', 0.3896