# Twitter Sentiment Analysis

#### Goal: To create a model that can effectively predict sentiment (Positive or Negative) in tweets.

The data comes from Sentiment140, which provides 1.6 million labeled tweets.

In [1]:
############################### Imports ##################################

# Basic
import numpy as np
import pandas as pd
import scipy
import re
import time
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import boto3
import io
import warnings
warnings.filterwarnings('ignore')

# NLP
import nltk
import spacy
spacy.load('en')
from nltk.corpus import stopwords
import preprocessor as p

# Model Infrastructure
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report


# Models
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

In [None]:
#################################### Bring in Data #############################################
start_time = time.time()

# Fetch the Sentiment140 training CSV from S3 and parse it directly from memory.
s3 = boto3.client('s3')
obj = s3.get_object(
    Bucket='data-science-project-data',
    Key='Twitter_Sentiment_Analysis/training.1600000.processed.noemoticon.csv',
)

# The file is read without a header row, so column names are supplied explicitly.
cols = ['sentiment', 'id', 'date', 'query_string', 'user', 'text']
tweets = pd.read_csv(
    io.BytesIO(obj['Body'].read()),
    header=None,
    names=cols,
    encoding="ISO-8859-1",
)

print("-- Execution time: %s seconds ---" % (time.time() - start_time))

In [None]:
# Preview the first five rows of the freshly loaded frame
tweets.head(n=5)

In [None]:
# Just need the sentiment and the text.
# Reassign rather than drop in place, and tolerate already-removed columns, so this
# cell can be re-run without raising KeyError once the columns are gone.
tweets = tweets.drop(columns=['id', 'date', 'query_string', 'user'], errors='ignore')

In [None]:
# Clean the tweets
# Configure tweet-preprocessor to strip URLs, emojis, mentions and reserved words.
# NOTE(review): the original call listed p.OPT.EMOJI twice; the redundant duplicate is
# dropped here. If p.OPT.SMILEY was the intended fifth option, add it explicitly — confirm.
p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION, p.OPT.RESERVED)


def preprocess_tweet(tweet):
    """Return `tweet` after cleaning it with `p.clean` under the options set above.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    The cleaned text produced by `p.clean`.
    """
    return p.clean(tweet)


# Apply the cleaner to every tweet and store the result in a new 'Clean' column
start_time = time.time()
tweets['Clean'] = tweets['text'].apply(preprocess_tweet)
print("-- Execution time: %s seconds ---" % (time.time() - start_time))

In [None]:
# Down Sample: hold out a random 10% of the tweets (tweets_subsampled_2) for modelling.
# random_state is fixed so the same subsample is drawn on every Restart & Run All.
tweets_subsampled_1, tweets_subsampled_2 = train_test_split(
    tweets, test_size=0.1, random_state=27
)

In [None]:
# Separate the feature (cleaned tweet text) from the outcome (sentiment label)
X = tweets_subsampled_2['Clean']
y = tweets_subsampled_2['sentiment']

### Model Training

#### Transform Data

In [None]:
start_time = time.time()
# spaCy English language object used to tokenize tweets and expose token lemmas
lemmatizer = spacy.lang.en.English()


def custom_tokenizer(doc):
    """Tokenize `doc` with spaCy and return the lemma of every non-punctuation token."""
    lemmas = []
    for token in lemmatizer(doc):
        if token.is_punct:
            continue
        lemmas.append(token.lemma_)
    return lemmas

# Feature pipeline: TF-IDF vectors -> 500-component truncated SVD -> row normalisation.
# NOTE(review): the pipeline is fitted on all of X before the train/test split, so the
# test rows influence the fitted TF-IDF and SVD steps. Consider fitting on the training
# portion only and calling pipe.transform on the test portion.
pipe = Pipeline(steps=[
    ('vectidf', TfidfVectorizer(tokenizer=custom_tokenizer, stop_words='english',
                                lowercase=True, use_idf=True, max_df=0.5,
                                min_df=2, norm='l2', smooth_idf=True)),
    # random_state pins the randomized SVD solver so the reduced features are reproducible
    ('svd', TruncatedSVD(n_components=500, random_state=27)),
    ('norm', Normalizer(copy=False)),
])

tweets_transform = pipe.fit_transform(X)

print("-- Execution time: %s seconds ---" % (time.time() - start_time))

In [None]:
# Splitting into training (75%) and test (25%) sets.
# random_state is fixed so the split, and therefore the reported score, is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    tweets_transform, y, test_size=0.25, random_state=27
)

#### Grid Search Model

In [None]:
###################### XGB #############################
start_time = time.time()

# Base estimator. The max_depth and min_child_weight given here are only placeholders:
# the grid search overrides both for every candidate.
xgb_model = XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    scale_pos_weight=1,
    n_jobs=50,        # XGBoost threads; was a one-value grid entry, which is not a tuned parameter
    random_state=27,  # replaces the deprecated `seed` alias
)

# Hyper-parameters actually being tuned: 4 depths x 3 child weights = 12 candidates.
parameters = {
    'max_depth': range(3, 10, 2),
    'min_child_weight': range(1, 6, 2),
}

# 3-fold cross-validated search. The search itself runs serially (n_jobs=1),
# presumably because each XGBoost fit is already multi-threaded.
clf = GridSearchCV(xgb_model, parameters, cv=3, verbose=0, n_jobs=1)
clf.fit(X_train, y_train)

# Score of the best estimator found by the search on the held-out test set
print(clf.score(X_test, y_test))
print("-- Execution time: %s seconds ---" % (time.time() - start_time))