# Twitter Sentiment Analysis

#### Goal: To create a model that can effectively predict sentiment (Positive or Negative) in tweets.

The data comes from Sentiment140, which provides 1.6 million labeled tweets.

### Plan
1. Take a small subset of the data in order to tune the XGB model
2. Once the best model parameters are chosen, use grid search to tune the vectorizer.
3. Increase the amount of data to train the model on.

In [1]:
############################### Imports ##################################

# Basic
import numpy as np
import pandas as pd
import scipy
import re
import time
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import boto3
import io
import warnings
warnings.filterwarnings('ignore')

# NLP
import nltk
import spacy
spacy.load('en')
from nltk.corpus import stopwords
import preprocessor as p

# Model Infrastructure
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score


# Models
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

In [2]:
#################################### Bring in Data #############################################
# Fetch the training CSV from S3 and load it into a DataFrame, timing the whole step.
start_time = time.time()

# The CSV has no header row, so the column names are supplied here
cols = ['sentiment', 'id', 'date', 'query_string', 'user', 'text']

s3 = boto3.client('s3')
obj = s3.get_object(
    Bucket='data-science-project-data',
    Key='Twitter_Sentiment_Analysis/training.1600000.processed.noemoticon.csv',
)

# The object body is read fully into memory and handed to pandas as a byte buffer
tweets = pd.read_csv(
    io.BytesIO(obj['Body'].read()),
    header=None,
    names=cols,
    encoding="ISO-8859-1",
)

print("-- Execution time: %s seconds ---" % (time.time() - start_time))

-- Execution time: 6.1940016746521 seconds ---


In [3]:
# Preview the first rows of the freshly loaded frame (all six columns are still present here)
tweets.head()

Unnamed: 0,sentiment,id,date,query_string,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [3]:
# Just need the sentiment label and the tweet text.
# The result is reassigned (no in-place mutation) and already-missing columns are ignored,
# so re-running this cell is harmless instead of raising a KeyError.
tweets = tweets.drop(columns=['id', 'date', 'query_string', 'user'], errors='ignore')

In [4]:
# Clean the tweets
# Configure the tweet preprocessor to handle URLs, emojis, @mentions, reserved words and hashtags.
# NOTE(review): p.OPT.EMOJI used to be passed twice; if p.OPT.SMILEY was the intent, add it here.
p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION, p.OPT.RESERVED, p.OPT.HASHTAG)


def preprocess_tweet(tweet):
    """Return `tweet` after running it through p.clean with the options set above."""
    return p.clean(tweet)


# Clean every raw tweet into a new 'Clean' column and time the pass
start_time = time.time()
tweets['Clean'] = tweets['text'].apply(preprocess_tweet)
print("-- Execution time: %s seconds ---" % (time.time() - start_time))

-- Execution time: 54.30187749862671 seconds ---


In [5]:
# Down Sample: tweets_subsampled_2 holds a random 10% of the tweets and is what the
# tuning cells work on. The split is seeded (same seed as the XGB models) so the
# subsample, and every score computed from it, is reproducible across kernel restarts.
tweets_subsampled_1, tweets_subsampled_2 = train_test_split(tweets, test_size=0.1, random_state=10)

In [6]:
# Separate the features (cleaned tweet text) from the outcome (sentiment label)
X, y = tweets_subsampled_2['Clean'], tweets_subsampled_2['sentiment']

### Model Training
Hyperparameter tuning (grid search) is used to find the best parameters for the model.

#### Transform Data

In [9]:
start_time = time.time()

# spaCy English language object; it tokenizes the text and supplies each token's lemma
lemmatizer = spacy.lang.en.English()


def custom_tokenizer(doc):
    """Tokenize `doc` with spaCy and return the lemma of every non-punctuation token."""
    lemmas = []
    for token in lemmatizer(doc):
        if token.is_punct:
            continue
        lemmas.append(token.lemma_)
    return lemmas


# TF-IDF settings: unigrams + bigrams, vocabulary capped at 10,000 features
tfidf_settings = dict(
    tokenizer=custom_tokenizer,
    stop_words='english',
    lowercase=True,
    use_idf=True,
    max_df=0.5,
    max_features=10000,
    min_df=2,
    norm='l2',
    smooth_idf=True,
    ngram_range=(1, 2),
)
vectorizer = TfidfVectorizer(**tfidf_settings)

tweets_tfidf = vectorizer.fit_transform(X)
print("Vectorizing Finished. Number of features: %d" % tweets_tfidf.shape[1])
print("-- Execution time: %s seconds ---" % (time.time() - start_time))

Vectorizing Finished. Number of features: 10000
-- Execution time: 19.35665512084961 seconds ---


In [12]:
start_time = time.time()

# Reduce the TF-IDF matrix to 5000 SVD components, then rescale each row with Normalizer.
# TruncatedSVD is seeded so the components (and every downstream score) are reproducible.
# NOTE(review): this is fit on all rows before the train/test split, so the test rows
# influence the projection -- consider fitting on the training rows only.
pipe = Pipeline(steps=[
    ('svd', TruncatedSVD(n_components=5000, random_state=10)),
    ('norm', Normalizer(copy=False)),
])

tweets_transform = pipe.fit_transform(tweets_tfidf)

print("-- Execution time: %s seconds ---" % (time.time() - start_time))

-- Execution time: 597.3474566936493 seconds ---


In [13]:
# How much of the variance does the SVD explain?
# With max_features NOT set:
#   500 components explain 47% of the variance
#   1000 components explain 59% of the variance
#   2000 components explain 71% of the variance
#   3000 components explain 77% of the variance
#   4000 components explain 82% of the variance
# With max_features set to 10,000:
#   500 components explain 43% of the variance, but it is much faster (34 seconds)
#   5000 components explain 86% of the variance, at 597 seconds
pipe.named_steps['svd'].explained_variance_ratio_.sum()

0.8685416919446195

In [14]:
# Split the transformed features into training (75%) and test (25%) sets.
# Seeded so that the scores reported by the tuning cells are comparable between runs.
X_train, X_test, y_train, y_test = train_test_split(tweets_transform, y, test_size=0.25, random_state=10)

In [10]:
# Validate that multithreading is working: time one default XGB fit per n_jobs setting.
# Note: this test was done with only 0.05 of the data, truncated to 100 SVD components.
num_jobs = [1, 2, 3, 4]
results = []
for thread_count in num_jobs:
    fit_started = time.time()
    model = XGBClassifier(n_jobs=thread_count)
    model.fit(X_train, y_train)
    fit_seconds = time.time() - fit_started
    print(thread_count, fit_seconds)
    results.append(fit_seconds)

1 41.16322708129883
2 20.507357120513916
3 13.794840574264526
4 10.435452938079834


#### Grid Search Model

In [17]:
###### Test the max features of 10,000 and SVD of 5000 components #########
warnings.filterwarnings('ignore')
start_time = time.time()

# Model settings for this run, collected in one place
xgb_settings = dict(
    learning_rate=0.1,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    n_estimators=500,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    random_state=10,
    n_jobs=-1,
)
xgb_model = XGBClassifier(**xgb_settings)
xgb_model.fit(X_train, y_train)

# Score the fitted model on the held-out test split
test_score = xgb_model.score(X_test, y_test)
print("Test Set Score: " + str(test_score))
print("-- Execution time: %s seconds ---" % (time.time() - start_time))

Test Set Score: 0.736
-- Execution time: 711.132809638977 seconds ---


In [9]:
####### Base Line Model ########
####### This cell was not re-run: its output is kept for documentation and predates the max_features setting #######
warnings.filterwarnings('ignore')
start_time = time.time()

# Default XGB model; only the seed is fixed
xgb_model = XGBClassifier(random_state=10)

# Single-point "grid": it only serves to get a 3-fold CV score for the default model
parameters = {'n_jobs': [-1]}

clf = GridSearchCV(xgb_model, parameters, cv=3, verbose=0, n_jobs=1)
clf.fit(X_train, y_train)

cv_score = clf.best_score_
test_score = clf.score(X_test, y_test)
print("-- CV Score: " + str(cv_score))
print("-- Test Set Score: " + str(test_score))
print("-- Execution time: %s seconds ---" % (time.time() - start_time))

-- CV Score: 0.6926416666666667
-- Test Set Score: 0.690625
-- Execution time: 34.98430633544922 seconds ---


In [14]:
# Tune n_estimators given the learning rate
# n_estimators is the number of trees to use
# learning_rate makes the model more robust by shrinking the weights on each step;
# it determines the impact of each tree on the final outcome
warnings.filterwarnings('ignore')
start_time = time.time()

# Everything except n_estimators is held fixed during this search
fixed_params = dict(
    learning_rate=0.1,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    random_state=10,
)
xgb_model = XGBClassifier(**fixed_params)

# Candidate tree counts: 100, 300, 500, 700, 900 (n_jobs is a single fixed value, not searched)
parameters = {
    'n_jobs': [30],
    'n_estimators': range(100, 1000, 200),
}

clf = GridSearchCV(xgb_model, parameters, cv=3, verbose=0, n_jobs=1)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
print("-- Execution time: %s seconds ---" % (time.time() - start_time))

0.72365
-- Execution time: 1408.1651225090027 seconds ---


In [15]:
# Find Best Parameters
# Displays the parameter combination chosen by the n_estimators grid search above.
clf.best_params_

{'n_estimators': 500, 'n_jobs': 30}

In [16]:
# Tune max_depth and min_child_weight
# max_depth is the maximum depth of a tree; the deeper the trees, the higher the likelihood of overfitting
# min_child_weight defines the minimum sum of weights of all observations required in a child;
# it is used to control overfitting as well
warnings.filterwarnings('ignore')
start_time = time.time()

# n_estimators is now fixed at 500; the remaining settings are unchanged
fixed_params = dict(
    learning_rate=0.1,
    n_estimators=500,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    random_state=10,
)
xgb_model = XGBClassifier(**fixed_params)

# Depths 3, 5, 7, 9 crossed with child weights 1, 3, 5
parameters = {
    'n_jobs': [35],
    'max_depth': range(3, 10, 2),
    'min_child_weight': range(1, 6, 2),
}

clf = GridSearchCV(xgb_model, parameters, cv=3, verbose=0, n_jobs=1)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
print("-- Execution time: %s seconds ---" % (time.time() - start_time))

0.72605
-- Execution time: 3803.7127678394318 seconds ---


In [17]:
# Find Best Parameters
# Displays the max_depth / min_child_weight combination chosen by the grid search above.
clf.best_params_

{'max_depth': 5, 'min_child_weight': 5, 'n_jobs': 35}

In [18]:
# Tune gamma
# gamma specifies the minimum loss reduction required to make a split (on positive reduction in the loss function).
warnings.filterwarnings('ignore')
start_time = time.time()

# max_depth and min_child_weight are now fixed at 5
fixed_params = dict(
    learning_rate=0.1,
    n_estimators=500,
    max_depth=5,
    min_child_weight=5,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    random_state=10,
)
xgb_model = XGBClassifier(**fixed_params)

# Candidate gammas: 0.0, 0.1, 0.2, 0.3, 0.4
parameters = {
    'n_jobs': [35],
    'gamma': [step / 10.0 for step in range(5)],
}

clf = GridSearchCV(xgb_model, parameters, cv=3, verbose=0, n_jobs=1)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
print("-- Execution time: %s seconds ---" % (time.time() - start_time))


0.727625
-- Execution time: 1406.7063238620758 seconds ---


In [19]:
# Find Best Parameters
# Displays the gamma value chosen by the grid search above.
clf.best_params_

{'gamma': 0.1, 'n_jobs': 35}

In [22]:
# Tune subsample and colsample_bytree
# subsample is the fraction of observations randomly sampled; lower values make the
# algorithm more conservative, but values that are too small lead to underfitting
# colsample_bytree is the fraction of columns randomly sampled for each tree

warnings.filterwarnings('ignore')
start_time = time.time()

# gamma is now fixed at 0.1
fixed_params = dict(
    learning_rate=0.1,
    n_estimators=500,
    max_depth=5,
    min_child_weight=5,
    gamma=0.1,
    scale_pos_weight=1,
    random_state=10,
)
xgb_model = XGBClassifier(**fixed_params)

# Both sampling fractions are searched over 0.75, 0.80, 0.85
sampling_fractions = [pct / 100.0 for pct in range(75, 90, 5)]
parameters = {
    'n_jobs': [200],
    'subsample': sampling_fractions,
    'colsample_bytree': sampling_fractions,
}

clf = GridSearchCV(xgb_model, parameters, cv=3, verbose=0, n_jobs=1)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
print("-- Execution time: %s seconds ---" % (time.time() - start_time))

0.727625
-- Execution time: 5729.693859577179 seconds ---


In [23]:
# Find Best Parameters
# Displays the subsample / colsample_bytree combination chosen by the grid search above.
clf.best_params_

{'colsample_bytree': 0.8, 'n_jobs': 200, 'subsample': 0.8}

In [None]:
# Tune the regularization parameters
# reg_alpha is the L1 regularization on the weights; it can be used with high dimensionality so it runs faster
# reg_lambda is the L2 regularization on the weights
# This search never finished (the EC2 instance was terminated first), so the default
# L1 and L2 regularization values were kept.

warnings.filterwarnings('ignore')
start_time = time.time()

# All previously tuned settings are held fixed
fixed_params = dict(
    learning_rate=0.1,
    n_estimators=500,
    max_depth=5,
    min_child_weight=5,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    random_state=10,
)
xgb_model = XGBClassifier(**fixed_params)

parameters = {
    'n_jobs': [-1],
    'reg_alpha': [0, 0.001, 0.005, 0.01, 0.05],
    'reg_lambda': [0, 0.5, 1],
}

clf = GridSearchCV(xgb_model, parameters, cv=3, verbose=0, n_jobs=1)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
print("-- Execution time: %s seconds ---" % (time.time() - start_time))


In [None]:
# Find Best Parameters
# Displays the reg_alpha / reg_lambda combination chosen by the grid search above.
clf.best_params_

In [None]:
# Tune the number of estimators together with the learning rate
warnings.filterwarnings('ignore')
start_time = time.time()

# Tree-shape and sampling settings stay at their tuned values
fixed_params = dict(
    max_depth=5,
    min_child_weight=5,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    random_state=10,
)
xgb_model = XGBClassifier(**fixed_params)

# Tree counts 500, 1500, ..., 5500 crossed with five learning rates
parameters = {
    'n_jobs': [-1],
    'n_estimators': range(500, 5501, 1000),
    'learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.2],
}

clf = GridSearchCV(xgb_model, parameters, cv=3, verbose=0, n_jobs=1)
clf.fit(X_train, y_train)
print("-- CV Score: " + str(clf.best_score_))
print("-- Execution time: %s seconds ---" % (time.time() - start_time))

### Vectorizer Tuning

In [None]:
# Split the cleaned tweet text (not the pre-computed TF-IDF/SVD features) into
# training (75%) and test (25%) sets; vectorizing now happens inside the pipeline.
# Seeded so the vectorizer comparison is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=10)

In [None]:
start_time = time.time()
# Create lemmatizer using spacy
lemmatizer = spacy.lang.en.English()


def custom_tokenizer(doc):
    """Tokenize `doc` with spaCy and return the lemmas of its non-punctuation tokens."""
    tokens = lemmatizer(doc)
    return [token.lemma_ for token in tokens if not token.is_punct]


# NOTE(review): learning_rate and n_estimators are left at the library defaults here,
# although n_estimators=500 was selected by the earlier search -- confirm this is intended.
xgb_model = XGBClassifier(max_depth=5,
                          min_child_weight=5,
                          gamma=0.1,
                          subsample=0.8,
                          colsample_bytree=0.8,
                          scale_pos_weight=1,
                          random_state=10,
                          n_jobs=200)

# Vectorizer -> SVD -> classifier. The Normalizer step is disabled for this search.
# TruncatedSVD is seeded so that score differences between grid points come from the
# vectorizer settings rather than from run-to-run noise in the decomposition.
pipe = Pipeline(steps=[
    ('vectidf', TfidfVectorizer(tokenizer=custom_tokenizer, stop_words='english',
                                lowercase=True, use_idf=True, max_df=0.5)),
    ('svd', TruncatedSVD(n_components=500, random_state=10)),
    ('xgb', xgb_model),
])

# Vectorizer settings under comparison
parameters = {'vectidf__ngram_range': [(1, 1), (1, 2)],
              'vectidf__min_df': (1, 2),
              'vectidf__norm': ['l1', 'l2']
              }

grid = GridSearchCV(pipe, parameters, n_jobs=1, cv=3, verbose=0)
grid.fit(X_train, y_train)

print("-- Execution time: %s seconds ---" % (time.time() - start_time))

### Increasing Training Data

In [None]:
# Split between outcome and features, this time using ALL of the tweets
y = tweets['sentiment']
X = tweets['Clean']

# Split into training and test sets even though k-fold CV is still run on the training data.
# Seeded so the held-out test set is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=10)

In [None]:
# Train Model

# The Normalizer step is removed: TfidfVectorizer already normalizes its output, and the
# model does not need normalized inputs.

start_time = time.time()
# Create lemmatizer using spacy
lemmatizer = spacy.lang.en.English()


def custom_tokenizer(doc):
    """Tokenize `doc` with spaCy and return the lemmas of its non-punctuation tokens."""
    tokens = lemmatizer(doc)
    return [token.lemma_ for token in tokens if not token.is_punct]


# NOTE(review): learning_rate and n_estimators are left at the library defaults here,
# although n_estimators=500 was selected by the earlier search -- confirm this is intended.
xgb_model = XGBClassifier(max_depth=5,
                          min_child_weight=5,
                          gamma=0.1,
                          subsample=0.8,
                          colsample_bytree=0.8,
                          scale_pos_weight=1,
                          random_state=10,
                          n_jobs=-1)

# TruncatedSVD is seeded so that score differences between grid points come from the
# vectorizer settings rather than from run-to-run noise in the decomposition.
pipe = Pipeline(steps=[
    ('vectidf', TfidfVectorizer(tokenizer=custom_tokenizer, stop_words='english',
                                lowercase=True, use_idf=True, max_df=0.5)),
    ('svd', TruncatedSVD(n_components=2000, random_state=10)),
    ('xgb', xgb_model),
])

parameters = {'vectidf__ngram_range': [(1, 1), (1, 2)],
              'vectidf__min_df': (1, 2),
              'vectidf__norm': ['l1', 'l2'],
              'vectidf__smooth_idf': [True, False]
              }
# pre_dispatch limits how many jobs are dispatched ahead of time during parallel execution.
# NOTE(review): no n_jobs is passed to GridSearchCV here, so the search itself is not
# parallelised and pre_dispatch is unlikely to have any effect -- confirm.
grid = GridSearchCV(pipe, parameters, pre_dispatch=3, cv=3, verbose=0)
grid.fit(X_train, y_train)

print("-- Execution time: %s seconds ---" % (time.time() - start_time))

### Model Check

Check that the fitted pipeline can take raw (unvectorized) tweet text and predict its sentiment directly.

In [7]:
# Down Sample: work on a random 10% of the tweets (seeded for reproducibility)
tweets_subsampled_1, tweets_subsampled_2 = train_test_split(tweets, test_size=0.1, random_state=10)

# Split between outcome and features
y = tweets_subsampled_2['sentiment']
X = tweets_subsampled_2['Clean']

# Split into train and test sets (seeded for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=10)

# The timer starts here, after the splits, so only the pipeline setup and fit are timed.
start_time = time.time()
# Create lemmatizer using spacy
lemmatizer = spacy.lang.en.English()


def custom_tokenizer(doc):
    """Tokenize `doc` with spaCy and return the lemmas of its non-punctuation tokens."""
    tokens = lemmatizer(doc)
    return [token.lemma_ for token in tokens if not token.is_punct]


xgb_model = XGBClassifier(max_depth=5,
                          min_child_weight=5,
                          gamma=0.1,
                          subsample=0.8,
                          colsample_bytree=0.8,
                          scale_pos_weight=1,
                          random_state=10,
                          n_jobs=-1)

# End-to-end pipeline: raw cleaned text -> TF-IDF -> SVD -> XGB, so that predict()
# can be called directly on tweet strings. TruncatedSVD is seeded for reproducibility.
pipe = Pipeline(steps=[
    ('vectidf', TfidfVectorizer(tokenizer=custom_tokenizer, stop_words='english',
                                lowercase=True, use_idf=True, max_df=0.5, max_features=1000,
                                min_df=2, norm='l2', smooth_idf=True, ngram_range=(1, 2))),
    ('svd', TruncatedSVD(n_components=500, random_state=10)),
    ('xgb', xgb_model),
])

pipe.fit(X_train, y_train)
print("-- Execution time: %s seconds ---" % (time.time() - start_time))

-- Execution time: 60.7186336517334 seconds ---


In [17]:
# Look at the first five held-out tweets (cleaned text) that will be fed to the pipeline
X_test[:5]

753429                Dyinn , waitin on mother nature uggghh
493063     Update on snake tweet: the copperhead has been...
565646                           Allah yesms3 mink ya nawaal
1463444                    is a virtual hug good enough too?
536247                      Why is my throat still hurting?!
Name: Clean, dtype: object

In [18]:
# Run the fitted pipeline on the same five tweets; the text strings are passed in as-is
prediction = pipe.predict(X_test[:5])

  if diff:


In [19]:
# Show the predicted labels for the five tweets above
prediction

array([4, 0, 4, 4, 0])