In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression

import scipy.sparse

# Data Loading Stage + Pre-processing

In [2]:
# Data source: https://www.kaggle.com/kazanova/sentiment140
# A custom column header line was added to the CSV by hand after download.
# The file is decoded as latin-1.
df = pd.read_csv(
    filepath_or_buffer='training.1600000.processed.noemoticon.csv',
    encoding='latin-1',
)

In [3]:
# Preview the first ten rows of the raw frame.
df.head(n=10)

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
5,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
6,0,1467811592,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,mybirch,Need a hug
7,0,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,@LOLTrish hey long time no see! Yes.. Rains a...
8,0,1467811795,Mon Apr 06 22:20:05 PDT 2009,NO_QUERY,2Hood4Hollywood,@Tatiana_K nope they didn't have it
9,0,1467812025,Mon Apr 06 22:20:09 PDT 2009,NO_QUERY,mimismo,@twittera que me muera ?


In [4]:
# Keep the label and the tweet text; these metadata columns aren't useful for our model.
unused_columns = ['ids', 'date', 'flag', 'user']
final_large_df = df.drop(columns=unused_columns)
final_large_df

Unnamed: 0,target,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."
...,...,...
1599995,4,Just woke up. Having no school is the best fee...
1599996,4,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,Happy 38th Birthday to my boo of alll time!!! ...


In [5]:
# Count the tweets carrying each raw label value (0, 2 and 4).
raw_targets = final_large_df['target']
print('Number of Negative Sentiment Tweets:', (raw_targets == 0).sum())
print('Number of Neutral Sentiment Tweets:', (raw_targets == 2).sum())
print('Number of Positive Sentiment Tweets:', (raw_targets == 4).sum())

Number of Negative Sentiment Tweets: 800000
Number of Neutral Sentiment Tweets: 0
Number of Positive Sentiment Tweets: 800000


### Notice: there is no neutral-sentiment data, and there is far more data than we need!

In [6]:
# Divide the labels by 4 so that the target column holds 0 and 1, where 1 is positive.
# NOTE: this cell is not idempotent -- running it again divides the labels by 4 once more.
final_large_df['target'] = final_large_df['target'].div(4)

In [7]:
# Display the frame to inspect the target column after the division by 4.
final_large_df

Unnamed: 0,target,text
0,0.0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0.0,is upset that he can't update his Facebook by ...
2,0.0,@Kenichan I dived many times for the ball. Man...
3,0.0,my whole body feels itchy and like its on fire
4,0.0,"@nationwideclass no, it's not behaving at all...."
...,...,...
1599995,1.0,Just woke up. Having no school is the best fee...
1599996,1.0,TheWDB.com - Very cool to hear old Walt interv...
1599997,1.0,Are you ready for your MoJo Makeover? Ask me f...
1599998,1.0,Happy 38th Birthday to my boo of alll time!!! ...


In [8]:
# Work with a random 200,000-row subset of the full frame.
# A fixed random_state makes the subset -- and every statistic and accuracy
# computed from it -- reproducible after a kernel restart.
final_df = final_large_df.sample(n=200000, random_state=42)

# Drop the references to the full-size frames so their memory can be reclaimed.
del final_large_df, df

In [9]:
# Display the sampled subset.
final_df

Unnamed: 0,target,text
1484929,1.0,@ACEU hello
689387,0.0,I got a huge penis. Oh wait. Nevermind. That w...
1310043,1.0,@THE_WOCKEEZ i wanna say thanks for brining hi...
637531,0.0,http://twitpic.com/7s2rw - 6am sunrise got kin...
516209,0.0,Sarcoid masses in both of them. URI's are dea...
...,...,...
182340,0.0,@Lucy_nessa same I don't want it to close xx...
397657,0.0,i have a massive headace that wont go away....
595996,0.0,So tired. Taylor just left
1468934,1.0,@jcornell26 I am well aware


### Split into Training and Test Sets for Model Evaluation

In [10]:
# Hold out 20% of the sampled tweets for evaluation.
# random_state pins the shuffle so the same rows land in each split on every run.
train_df, test_df = train_test_split(final_df, test_size=0.20, random_state=42)

In [11]:
# Report the size and class balance of the training split.
train_labels = train_df['target']
print('Training Set Stats:')
print('Size of Training Set:', len(train_df))
print('Number of Negative Sentiment Tweets:', (train_labels == 0).sum())
print('Number of Positive Sentiment Tweets:', (train_labels == 1).sum())

Training Set Stats:
Size of Training Set: 160000
Number of Negative Sentiment Tweets: 80071
Number of Positive Sentiment Tweets: 79929


In [12]:
# Report the size and class balance of the held-out split.
test_labels = test_df['target']
print('Test Set Stats:')
print('Size of Test Set:', len(test_df))
print('Number of Negative Sentiment Tweets:', (test_labels == 0).sum())
print('Number of Positive Sentiment Tweets:', (test_labels == 1).sum())

Test Set Stats:
Size of Test Set: 40000
Number of Negative Sentiment Tweets: 19936
Number of Positive Sentiment Tweets: 20064


# Count-Vectorizer Model 

In [13]:
# Bag-of-n-grams features: unigrams to trigrams, English stop words removed,
# vocabulary capped at 100,000 entries.
count_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,3), max_features=100000)

# Read the columns directly instead of looping with iterrows(): iterrows()
# builds a Series per row and was being run four times over the data.
# The vocabulary is fitted on the training text only and reused for the test text.
X_train = count_vectorizer.fit_transform(train_df['text'])
Y_train = train_df['target'].astype(int).to_numpy()

X_test = count_vectorizer.transform(test_df['text'])
Y_test = test_df['target'].astype(int).to_numpy()

In [14]:
# Fit a logistic regression on the count features.
# fit() returns the estimator itself, so construction and fitting can be chained;
# the trailing expression keeps the estimator repr as the cell output.
count_vect_model = LogisticRegression(C=0.1, max_iter=15000).fit(X_train, Y_train)
count_vect_model

LogisticRegression(C=0.1, max_iter=15000)

In [15]:
# Accuracy = fraction of predictions that match the labels, on each split.
train_matches = count_vect_model.predict(X_train) == Y_train
test_matches = count_vect_model.predict(X_test) == Y_test
print(f'Training Accuracy: {train_matches.mean()}')
print(f'Testing Accuracy: {test_matches.mean()}')

Training Accuracy: 0.80885625
Testing Accuracy: 0.770825


In [16]:
# Write everything to files
# Context managers make sure each pickle file is flushed and closed as soon as
# it is written, rather than whenever the file object is garbage-collected.
with open('count_vectorizer.pickle', 'wb') as vectorizer_file:
    pickle.dump(count_vectorizer, vectorizer_file)
with open('count_vect_model.pickle', 'wb') as model_file:
    pickle.dump(count_vect_model, model_file)

# Feature matrices are sparse (.npz); label vectors are plain NumPy arrays (.npy).
scipy.sparse.save_npz('count_vect_X_train.npz', X_train)
np.save('count_vect_Y_train.npy', Y_train)
scipy.sparse.save_npz('count_vect_X_test.npz', X_test)
np.save('count_vect_Y_test.npy', Y_test)

In [27]:
# Test that pickling is working
# NOTE: pickle.load executes arbitrary code from the file; only load pickles
# produced by this notebook or another trusted source.
# Context managers close the file handles right after reading.
with open('count_vectorizer.pickle', 'rb') as vectorizer_file:
    read_vect = pickle.load(vectorizer_file)
with open('count_vect_model.pickle', 'rb') as model_file:
    read_model = pickle.load(model_file)

read_X_train = scipy.sparse.load_npz('count_vect_X_train.npz')
read_Y_train = np.load('count_vect_Y_train.npy')
read_X_test = scipy.sparse.load_npz('count_vect_X_test.npz')
read_Y_test = np.load('count_vect_Y_test.npy')

# The reloaded model on the reloaded data should reproduce the accuracies
# printed by the in-memory model.
print(f'Pickled Training Accuracy: {np.mean(read_model.predict(read_X_train) == read_Y_train)}')
print(f'Pickled Testing Accuracy: {np.mean(read_model.predict(read_X_test) == read_Y_test)}')

Pickled Training Accuracy: 0.80885625
Pickled Testing Accuracy: 0.770825


In [28]:
# Check probabilities work as expected: the index of the most probable column
# is compared with the stored labels.
# NOTE(review): this presumes column i of predict_proba corresponds to label i -- confirm via read_model.classes_.
train_probabilities = read_model.predict_proba(read_X_train)
most_likely_column = train_probabilities.argmax(axis=1)
(most_likely_column == read_Y_train).mean()

0.80885625

### Model and Vectorizer Learned Information

In [19]:
# get_feature_names() is removed in newer scikit-learn releases in favour of
# get_feature_names_out(); use whichever this installation provides.
if hasattr(read_vect, 'get_feature_names_out'):
    feature_names = np.asarray(read_vect.get_feature_names_out())
else:
    feature_names = np.asarray(read_vect.get_feature_names())

# Sort the feature indices once by coefficient, ascending: the most negative
# weights come first and the most positive weights come last.
coef_order = np.argsort(read_model.coef_[0])

print("Top 50 Most Negative Words/Phrases in Order:")
print(feature_names[coef_order[:50]])

print("Top 50 Most Positive Words/Phrases in Order:")
# Reverse the tail so the largest coefficient is listed first.
print(feature_names[coef_order[-50:]][::-1])

Top 50 Most Negative Words/Phrases in Order:
['sad' 'missing' 'hurts' 'poor' 'sucks' 'miss' 'sadly' 'sick'
 'unfortunately' 'headache' 'gutted' 'rip' 'horrible' 'wish' 'died'
 'bummed' 'lost' 'ugh' 'cancelled' 'hate' 'lonely' 'broken' 'ruined'
 'depressed' 'missed' 'broke' 'upset' 'hates' 'worst' 'depressing' 'stuck'
 'failed' 'crying' 'bummer' 'missin' 'anymore' 'closed' 'hurting'
 'disappointed' 'sore' 'sigh' 'pain' 'sorry' 'boo' 'tummy' 'noooo'
 'happened' 'wishing' 'afford' 'goodbye']
Top 50 Most Positive Words/Phrases in Order:
['welcome' 'thank' 'thanks' 'wish luck' 'smile' 'congrats'
 'congratulations' 'excited' 'yay' 'sweet' 'hehe' 'proud' 'woohoo' 'glad'
 'hello' 'happy' 'goodnight' 'awesome' 'smiling' 'pleasure' 'loving'
 'followfriday' 'hehehe' 'hahahaha' 'adorable' 'heh' 'hi' 'woo' 'amazing'
 'wonderful' 'love' 'thx' 'cute' 'don need' 'lets' 'gorgeous' 'best'
 'great' 'don worry' 'let know' 'www' 'relaxing' 'listening' 'enjoy'
 'haha' 'nice' 'cool' 'cheers' 'thankyou' 'feel

### Model Prediction Analysis

In [34]:
# Score a few hand-written tweets with the reloaded vectorizer and model.
tweets = [
    'This bag of chips is disgusting yuck',
    'i really enjoy riding my bike',
    'it will be 70 degrees tomorrow',
]
preds = read_model.predict_proba(read_vect.transform(tweets))

for tweet, class_probs in zip(tweets, preds):
    # Column 0 is reported as "Negative"; any other winning column as "Positive".
    if class_probs.argmax() == 0:
        pred = "Negative"
    else:
        pred = "Positive"
    print(f'Tweet: {tweet}')
    print(f'Prediction: {pred}')
    print(f'Confidence of {pred} Prediction (0 to 1): {class_probs.max()}')
    print()

Tweet: This bag of chips is disgusting yuck
Prediction: Negative
Confidence of Negative Prediction (0 to 1): 0.6697804408467531

Tweet: i really enjoy riding my bike
Prediction: Positive
Confidence of Positive Prediction (0 to 1): 0.69840496422425

Tweet: it will be 70 degrees tomorrow
Prediction: Negative
Confidence of Negative Prediction (0 to 1): 0.5632257548822567



# Tf-Idf Model

In [21]:
# Tf-idf weighted n-gram features: unigrams to trigrams, English stop words
# removed, vocabulary capped at 100,000 entries.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,3), max_features=100000)

# Read the columns directly instead of looping with iterrows(): iterrows()
# builds a Series per row and was being run four times over the data.
# The vocabulary and idf weights are fitted on the training text only and
# reused for the test text.
X_train = tfidf_vectorizer.fit_transform(train_df['text'])
Y_train = train_df['target'].astype(int).to_numpy()

X_test = tfidf_vectorizer.transform(test_df['text'])
Y_test = test_df['target'].astype(int).to_numpy()

In [22]:
# Fit a logistic regression on the tf-idf features.
# fit() returns the estimator itself, so construction and fitting can be chained;
# the trailing expression keeps the estimator repr as the cell output.
tfidf_model = LogisticRegression(C=1.5, max_iter=15000).fit(X_train, Y_train)
tfidf_model

LogisticRegression(C=1.5, max_iter=15000)

In [23]:
# Accuracy = fraction of predictions that match the labels, on each split.
train_matches = tfidf_model.predict(X_train) == Y_train
test_matches = tfidf_model.predict(X_test) == Y_test
print(f'Training Accuracy: {train_matches.mean()}')
print(f'Testing Accuracy: {test_matches.mean()}')

Training Accuracy: 0.84753125
Testing Accuracy: 0.775125


In [24]:
# Persist the tf-idf vectorizer, the fitted model and the feature/label arrays.
# Context managers make sure each pickle file is flushed and closed as soon as
# it is written, rather than whenever the file object is garbage-collected.
with open('tfidf_vect.pickle', 'wb') as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)
with open('tfidf_model.pickle', 'wb') as model_file:
    pickle.dump(tfidf_model, model_file)

# Feature matrices are sparse (.npz); label vectors are plain NumPy arrays (.npy).
scipy.sparse.save_npz('tfidf_vect_X_train.npz', X_train)
np.save('tfidf_vect_Y_train.npy', Y_train)
scipy.sparse.save_npz('tfidf_vect_X_test.npz', X_test)
np.save('tfidf_vect_Y_test.npy', Y_test)

In [35]:
# Test that pickling is working
# NOTE: pickle.load executes arbitrary code from the file; only load pickles
# produced by this notebook or another trusted source.
# Context managers close the file handles right after reading.
with open('tfidf_vect.pickle', 'rb') as vectorizer_file:
    read_vect = pickle.load(vectorizer_file)
with open('tfidf_model.pickle', 'rb') as model_file:
    read_model = pickle.load(model_file)

read_X_train = scipy.sparse.load_npz('tfidf_vect_X_train.npz')
read_Y_train = np.load('tfidf_vect_Y_train.npy')
read_X_test = scipy.sparse.load_npz('tfidf_vect_X_test.npz')
read_Y_test = np.load('tfidf_vect_Y_test.npy')

# The reloaded model on the reloaded data should reproduce the accuracies
# printed by the in-memory model.
print(f'Pickled Training Accuracy: {np.mean(read_model.predict(read_X_train) == read_Y_train)}')
print(f'Pickled Testing Accuracy: {np.mean(read_model.predict(read_X_test) == read_Y_test)}')

Pickled Training Accuracy: 0.84753125
Pickled Testing Accuracy: 0.775125


### Model and Vectorizer Learned Information

In [37]:
# get_feature_names() is removed in newer scikit-learn releases in favour of
# get_feature_names_out(); use whichever this installation provides.
if hasattr(read_vect, 'get_feature_names_out'):
    feature_names = np.asarray(read_vect.get_feature_names_out())
else:
    feature_names = np.asarray(read_vect.get_feature_names())

# Sort the feature indices once by coefficient, ascending: the most negative
# weights come first and the most positive weights come last.
coef_order = np.argsort(read_model.coef_[0])

print("Top 50 Most Negative Words/Phrases in Order:")
print(feature_names[coef_order[:50]])

print("Top 50 Most Positive Words/Phrases in Order:")
# Reverse the tail so the largest coefficient is listed first.
print(feature_names[coef_order[-50:]][::-1])

Top 50 Most Negative Words/Phrases in Order:
['sad' 'sick' 'miss' 'poor' 'missing' 'sucks' 'hurts' 'sadly' 'wish'
 'unfortunately' 'rip' 'lost' 'gutted' 'horrible' 'died' 'ugh' 'hate'
 'bummed' 'headache' 'sorry' 'bad' 'broken' 'cancelled' 'missed' 'ruined'
 'upset' 'depressed' 'stuck' 'broke' 'failed' 'didn' 'anymore'
 'depressing' 'crying' 'lonely' 'worst' 'damn' 'didnt' 'missin' 'closed'
 'hurting' 'tried' 'sore' 'pain' 'hates' 'hurt' 'sigh' 'disappointed'
 'happened' 'booo']
Top 50 Most Positive Words/Phrases in Order:
['thanks' 'thank' 'welcome' 'wish luck' 'smile' 'yay' 'awesome' 'glad'
 'excited' 'sweet' 'love' 'great' 'hehe' 'congrats' 'congratulations'
 'hello' 'happy' 'woohoo' 'best' 'proud' 'smiling' 'followfriday' 'haha'
 'amazing' 'hi' 'hehehe' 'adorable' 'loving' 'www' 'good luck' 'pleasure'
 'cute' 'feels good' 'lets' 'don need' 'heh' 'hey' 'don worry' 'goodnight'
 'hahahaha' 'let know' 'wonderful' 'beautiful' 'relaxing' 'gorgeous'
 'feel free' 'hell yeah' 'woo' 'enjoy' 

### Model Prediction Analysis

In [38]:
# Score a few hand-written tweets with the reloaded vectorizer and model.
tweets = [
    'This bag of chips is disgusting yuck',
    'i really enjoy riding my bike',
    'it will be 70 degrees tomorrow',
]
preds = read_model.predict_proba(read_vect.transform(tweets))

for tweet, class_probs in zip(tweets, preds):
    # Column 0 is reported as "Negative"; any other winning column as "Positive".
    if class_probs.argmax() == 0:
        pred = "Negative"
    else:
        pred = "Positive"
    print(f'Tweet: {tweet}')
    print(f'Prediction: {pred}')
    print(f'Confidence of {pred} Prediction (0 to 1): {class_probs.max()}')
    print()

Tweet: This bag of chips is disgusting yuck
Prediction: Negative
Confidence of Negative Prediction (0 to 1): 0.8276104024048417

Tweet: i really enjoy riding my bike
Prediction: Positive
Confidence of Positive Prediction (0 to 1): 0.7459360851060887

Tweet: it will be 70 degrees tomorrow
Prediction: Negative
Confidence of Negative Prediction (0 to 1): 0.6133153661578392



# Word2Vec Model 
### Generate similarity scores of words for visualization