In [None]:
import re
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import string
import nltk
import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)

%matplotlib inline

# Load the training data from 'training_cleaned.csv' (path is relative to the
# notebook's working directory). Later cells read its 'Tweet' and 'Sentiment' columns.
df = pd.read_csv('training_cleaned.csv')


# Data Exploration

Keep only the rows whose `Tweet` value is a string by selecting the `str`-typed entries from the initial dataset, and work on a copy of the resulting dataframe.

In [None]:
# Boolean mask: True where the 'Tweet' cell holds exactly a Python str.
is_plain_str = df['Tweet'].map(lambda value: type(value) is str)
# Work on an independent copy so later column assignments do not touch df.
df_new = df.loc[is_plain_str].copy()

# Per-column non-null counts before and after the filter.
old_counts = df.count()
new_counts = df_new.count()
print("Old Dataset with all types", old_counts)
print("New Dataset with only str types", new_counts)

Remove single-character words (e.g. 'A', 'I')

In [None]:
def _drop_single_char_tokens(text):
    """Rejoin the whitespace-separated tokens of ``text`` that are longer than one character."""
    return ' '.join(token for token in text.split() if len(token) > 1)


# Cleaned copy of each tweet with one-character tokens removed.
df_new['Tweet_Clean'] = df_new['Tweet'].map(_drop_single_char_tokens)

df_new.head()

Does length of tweet affect sentiment?

In [None]:
# Character length of each cleaned tweet (vectorised string accessor).
df_new['Tweet_Length'] = df_new['Tweet_Clean'].str.len()

# Average the numeric columns per sentiment class. numeric_only=True is needed
# because df_new also holds text columns, which recent pandas versions refuse
# to average (TypeError) instead of silently dropping them.
sentence_length = df_new.groupby(['Sentiment']).mean(numeric_only=True)

# Bar chart of the mean tweet length per sentiment class.
ax = sentence_length.plot(y='Tweet_Length', kind='bar', legend=False)
ax.set(xlabel='Sentiment', ylabel='Mean tweet length (characters)',
       title='Mean tweet length by sentiment');

Create a feature column showing the number of "!" occurrences

In [None]:
# NOTE(review): this cell is disabled. If it is re-enabled:
#   * r'([!]+)' counts *runs* of '!' ("!!!" counts once), not individual characters;
#   * the plot line below draws sentence_length / 'Tweet_Length', not the
#     exclamation-count aggregate computed just above it.
#df_new['Countof!'] = df_new['Tweet_Clean'].str.count(r'([!]+)')

#CountofExcalamation = df_new.groupby(['Sentiment']).mean()
#sentence_length.plot(y='Tweet_Length',kind='bar')
#CountofExcalamation.head()

Create a feature column showing the number of occurrences of at least ".."

In [None]:
# NOTE(review): this cell is disabled. The pattern r'([..]+)' is a character class
# containing only '.', so it matches any run of one or more dots (a single '.'
# included); matching "at least two dots" would need something like r'\.{2,}'.
#df_new['Countof.'] = df_new['Tweet_Clean'].str.count(r'([..]+)')

# Word Clouds

We want to understand which words are most common across all the tweets.

In [None]:
from wordcloud import WordCloud

# One long space-separated string containing every cleaned tweet.
all_words = ' '.join(df_new['Tweet_Clean'])

# Configure the cloud first, then feed it the corpus (generate() returns the same object).
cloud_builder = WordCloud(width=800, height=500, random_state=21, max_font_size=110)
wordcloud = cloud_builder.generate(all_words)

# Render the cloud as an image without axis ticks or frame.
fig, ax = plt.subplots(figsize=(10, 7))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis('off')
plt.show()

We can see that many of the most frequent words are ones you would associate with positive sentiment (e.g. love, good, well, will...).

It is harder to spot any negative words, so we will now look at the positive-labeled and negative-labeled tweets separately.

## Negative Sentiment Words

In [None]:
# Cleaned tweets labelled with Sentiment == 0, merged into a single string.
negative_mask = df_new['Sentiment'] == 0
normal_words = ' '.join(df_new.loc[negative_mask, 'Tweet_Clean'])

# Same cloud settings as the overall cloud so the pictures are comparable.
cloud_builder = WordCloud(width=800, height=500, random_state=21, max_font_size=110)
wordcloud = cloud_builder.generate(normal_words)

fig, ax = plt.subplots(figsize=(10, 7))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis('off')
plt.show()

We can see the most commonly occurring words in the negative-sentiment tweets are: sad, miss, sleep, still, work, suck...


## Positive Sentiment Words

In [None]:
# Cleaned tweets labelled with Sentiment == 4, merged into a single string.
positive_mask = df_new['Sentiment'] == 4
normal_words = ' '.join(df_new.loc[positive_mask, 'Tweet_Clean'])

# Same cloud settings as the overall cloud so the pictures are comparable.
cloud_builder = WordCloud(width=800, height=500, random_state=21, max_font_size=110)
wordcloud = cloud_builder.generate(normal_words)

fig, ax = plt.subplots(figsize=(10, 7))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis('off')
plt.show()

We see the positive word cloud is very similar to the overall word cloud, with the most common words being love, go, good, nice...

# Feature Engineering

## Bag of Words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts restricted to 100 vocabulary entries, English stop words removed.
cv = CountVectorizer(max_features=100, stop_words='english')
# Fit on the cleaned tweets and densify the sparse result in one step.
cv_matrix = cv.fit_transform(df_new['Tweet_Clean']).toarray()
cv_matrix

Check to See Number of Words

In [None]:
# Number of columns in the count matrix, i.e. the size of the learned vocabulary.
cv_matrix.shape[1]

In [None]:
# Vocabulary learned by the vectorizer, in the column order of cv_matrix.
# get_feature_names() was removed from scikit-learn; get_feature_names_out()
# is its replacement.
vocab = list(cv.get_feature_names_out())
# Document-feature matrix as a frame. It reuses df_new's index: df_new keeps the
# original row labels after filtering, and the later concat with
# df_new['Sentiment'] aligns on index, so a fresh 0..n-1 index could pair a
# tweet's counts with another tweet's label (or drop rows).
BOW = pd.DataFrame(cv_matrix, columns=vocab, index=df_new.index)

In [None]:
# Put the Sentiment label in column 0 followed by the bag-of-words columns.
# The concat aligns on index labels and join='inner' keeps only labels present
# in both inputs.
BOW_dataset = pd.concat([df_new['Sentiment'], BOW], axis=1, join='inner')

In [None]:
# Preview the first rows of the label + bag-of-words frame.
BOW_dataset.head()

## Bag of N-Grams Model

### 2 Grams Model

In [None]:
# Bigram-only counts; use ngram_range=(1, 2) instead to include unigrams as well.
bv = CountVectorizer(ngram_range=(2, 2), max_df=0.90, max_features=1000, stop_words='english')
bv_matrix = bv.fit_transform(df_new['Tweet_Clean']).toarray()

# get_feature_names() was removed from scikit-learn; get_feature_names_out()
# is its replacement.
vocab_2gram = list(bv.get_feature_names_out())
# Reuse df_new's index so the index-aligned concat below pairs every row of
# counts with its own Sentiment label (df_new keeps its original row labels
# after filtering, so a fresh 0..n-1 index could misalign or drop rows).
TwoGram = pd.DataFrame(bv_matrix, columns=vocab_2gram, index=df_new.index)

# Label in column 0, bigram counts in the remaining columns.
TwoGram_dataset = pd.concat([df_new['Sentiment'], TwoGram], axis=1, join='inner')
TwoGram_dataset.head()

### 3 Grams Model

In [None]:
# Trigram-only counts; widen ngram_range (e.g. (1, 3)) to include shorter n-grams as well.
# NOTE: bv / bv_matrix are rebound here, replacing the bigram vectorizer and matrix.
bv = CountVectorizer(ngram_range=(3, 3), max_df=0.90, max_features=1000, stop_words='english')
bv_matrix = bv.fit_transform(df_new['Tweet_Clean']).toarray()

# get_feature_names() was removed from scikit-learn; get_feature_names_out()
# is its replacement.
vocab_3gram = list(bv.get_feature_names_out())
# Reuse df_new's index so the index-aligned concat below pairs every row of
# counts with its own Sentiment label (df_new keeps its original row labels
# after filtering, so a fresh 0..n-1 index could misalign or drop rows).
ThreeGram = pd.DataFrame(bv_matrix, columns=vocab_3gram, index=df_new.index)

# Label in column 0, trigram counts in the remaining columns.
ThreeGram_dataset = pd.concat([df_new['Sentiment'], ThreeGram], axis=1, join='inner')
ThreeGram_dataset.head()

##  TF-IDF Model


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weights over at most 1000 terms; terms appearing in more than 90% of
# the tweets and English stop words are ignored.
tv = TfidfVectorizer(max_df=0.90, max_features=1000, stop_words='english', use_idf=True)
tv_matrix = tv.fit_transform(df_new['Tweet_Clean']).toarray()

# get_feature_names() was removed from scikit-learn; get_feature_names_out()
# is its replacement.
vocab_tf = list(tv.get_feature_names_out())
# Weights rounded to 2 decimals. The frame reuses df_new's index so the
# index-aligned concat below pairs every row with its own Sentiment label
# (df_new keeps its original row labels after filtering, so a fresh 0..n-1
# index could misalign or drop rows).
tf_idf = pd.DataFrame(np.round(tv_matrix, 2), columns=vocab_tf, index=df_new.index)
# Label in column 0, TF-IDF features in the remaining columns.
tf_idf_dataset = pd.concat([df_new['Sentiment'], tf_idf], axis=1, join='inner')
tf_idf_dataset.head()

# Feature Selection

Now that we have engineered some features, we want to select the ones that are most important for identifying the sentiment.

## Feature Importance

We will create multiple random decision trees to help us find our most important features

In [None]:

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

dataframe = tf_idf_dataset
array = dataframe.values
# Column 0 is the Sentiment label; every remaining column is a feature.
# (The previous upper bound, len(array) - 1, was the ROW count, so it would
# silently drop trailing feature columns whenever there were fewer rows than
# columns.)
X = array[:, 1:]
Y = array[:, 0]

# Feature extraction: fit an ensemble of randomized trees to obtain importances.
# A fixed seed makes the selected feature set reproducible across runs.
model = ExtraTreesClassifier(random_state=7)
model.fit(X, Y)

# Select the important features of the already-fitted model
# (SelectFromModel's default threshold is used).
sel = SelectFromModel(model, prefit=True)

# Subset features
X_new = sel.transform(X)

# Carry over the row labels of the frame X was taken from, so the index-aligned
# concat below pairs each row with its own Sentiment label.
X_new_df = pd.DataFrame(X_new, index=dataframe.index)

tf_idf_data_final = pd.concat([df_new['Sentiment'], X_new_df], axis=1, join='inner')



In [None]:
# Compare the column counts of the feature matrix before and after selection.
n_features_before = X.shape[1]
n_features_after = X_new.shape[1]
print("Old Number of Features: ", n_features_before)
print("New Number of Features: ", n_features_after)


In [None]:
# Merge the TF-IDF and 3-gram features. The Sentiment column (column 0) of
# ThreeGram_dataset is dropped because tf_idf_dataset already carries it.
dataframe = pd.concat([tf_idf_dataset, ThreeGram_dataset.iloc[:, 1:]], axis=1, join='inner')
array = dataframe.values
# Column 0 is the Sentiment label; every remaining column is a feature.
# (The previous upper bound, len(array) - 1, was the ROW count, not a column bound.)
X = array[:, 1:]
Y = array[:, 0]

# Feature extraction: fit an ensemble of randomized trees to obtain importances.
# A fixed seed makes the selected feature set reproducible across runs.
model = ExtraTreesClassifier(random_state=7)
model.fit(X, Y)

# Select the important features of the already-fitted model
# (SelectFromModel's default threshold is used).
sel = SelectFromModel(model, prefit=True)

# Subset features
X_mix = sel.transform(X)

# Build the frame from X_mix (the selection computed in this cell). It was
# previously built from X_new, the TF-IDF-only selection of the earlier cell,
# which made this dataset a duplicate of tf_idf_data_final.
# The row labels of the frame X was taken from are carried over so the
# index-aligned concat below pairs each row with its own Sentiment label.
X_mix_df = pd.DataFrame(X_mix, index=dataframe.index)

tf_idf_3GRAM_data_final = pd.concat([df_new['Sentiment'], X_mix_df], axis=1, join='inner')

# Feature counts before and after selection.
print(X.shape[1])
print(X_mix.shape[1])

# Model Selection and Evaluation

## Logistic Regression

In [None]:
# Cross-validated ROC-AUC and accuracy of Logistic Regression on every feature set.
import pandas
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score

# TF-IDF features plus the 2-gram counts (the 2-gram frame's label column is dropped).
df_2g_tf = pd.concat([tf_idf_dataset, TwoGram_dataset.iloc[:, 1:]], axis=1, join='inner')

# Row labels of the score table, listed in the same order as `data` below.
metric_names = ['BOW', '2GRAM','3GRAM', 'TF_IDF','TF_IDF_Opt','3GRAM+TF_IDF_OPT','2GRAM+TF_IDF']
data = [BOW_dataset,TwoGram_dataset,ThreeGram_dataset,tf_idf_dataset,tf_idf_data_final,tf_idf_3GRAM_data_final,df_2g_tf]

scores_df = pd.DataFrame(index=metric_names, columns=['roc_auc', 'accuracy']) # to store the scores
model = LogisticRegression(solver='lbfgs')
test_size = 0.2
seed = 7
# random_state only has an effect together with shuffle=True; current
# scikit-learn raises a ValueError when it is given without shuffling.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)

for metric, dataset in zip(metric_names, data):
    dataframe = dataset
    array = dataframe.values
    # Column 0 is the Sentiment label; every remaining column is a feature.
    # (The previous upper bound, len(array) - 1, was the ROW count, not a column bound.)
    X = array[:, 1:]
    Y = array[:, 0]
    # Hold out 20% of the rows; cross-validation only sees the training part.
    # X_test / Y_test are not used further in this cell.
    X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size=test_size, random_state=seed)
    # Score both metrics in a single pass so each fold is fitted once, not twice.
    cv_results = model_selection.cross_validate(model, X_train, Y_train, scoring=['roc_auc', 'accuracy'], cv=kfold)
    AUC = cv_results['test_roc_auc'].mean()
    accuracy = cv_results['test_accuracy'].mean()
    scores_df.loc[metric] = [AUC, accuracy]

In [None]:
# Show the mean cross-validated ROC-AUC / accuracy table filled in by the cell above.
scores_df

## Naïve Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# TF-IDF features plus the 2-gram counts (the 2-gram frame's label column is dropped).
df_2g_tf = pd.concat([tf_idf_dataset, TwoGram_dataset.iloc[:, 1:]], axis=1, join='inner')

# Row labels of the score table, listed in the same order as `data` below.
metric_names = ['BOW', '2GRAM','3GRAM', 'TF_IDF','TF_IDF_Opt','3GRAM+TF_IDF_OPT','2GRAM+TF_IDF']
data = [BOW_dataset,TwoGram_dataset,ThreeGram_dataset,tf_idf_dataset,tf_idf_data_final,tf_idf_3GRAM_data_final,df_2g_tf]

# NOTE: scores_df is rebound here, so any scores stored under this name by an
# earlier cell are replaced.
scores_df = pd.DataFrame(index=metric_names, columns=['roc_auc', 'accuracy']) # to store the scores
model = GaussianNB()
test_size = 0.2
seed = 7
# random_state only has an effect together with shuffle=True; current
# scikit-learn raises a ValueError when it is given without shuffling.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)

for metric, dataset in zip(metric_names, data):
    dataframe = dataset
    array = dataframe.values
    # Column 0 is the Sentiment label; every remaining column is a feature.
    # (The previous upper bound, len(array) - 1, was the ROW count, not a column bound.)
    X = array[:, 1:]
    Y = array[:, 0]
    # Hold out 20% of the rows; cross-validation only sees the training part.
    # X_test / Y_test are not used further in this cell.
    X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size=test_size, random_state=seed)
    # Score both metrics in a single pass so each fold is fitted once, not twice.
    cv_results = model_selection.cross_validate(model, X_train, Y_train, scoring=['roc_auc', 'accuracy'], cv=kfold)
    AUC = cv_results['test_roc_auc'].mean()
    accuracy = cv_results['test_accuracy'].mean()
    scores_df.loc[metric] = [AUC, accuracy]
    print(metric)
    print(accuracy)

In [None]:
# Show the mean cross-validated ROC-AUC / accuracy table filled in by the cell above.
scores_df