In [1]:
#Importing required Libraries 
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

# For preprocessing
import unicodedata   ## Removing Accented Characters
import contractions #from contractions.py ##Expanding Contractions
import re ## Removing Special Character
import spacy  ## Lemmatization
nlp = spacy.load('en_core_web_sm')
import string ## Removing Punctuation
import nltk ##Stemming
from nltk.tokenize import ToktokTokenizer ## Removing Stopwords
tokenizer = ToktokTokenizer() ## Removing Stopwords
stopword_list = nltk.corpus.stopwords.words('english') ## Removing Stopwords
from sklearn.feature_extraction.text import TfidfVectorizer ##TfIdf
from sklearn.model_selection import train_test_split ##Spliting the data
from sklearn.model_selection import cross_val_score ##Cross Validation
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
#Reading Data
train_data = pd.read_csv("train-balanced.csv", sep='\t')
keys = pd.read_csv("key.csv", sep='\t')

In [3]:
#Adding column names to train and test dataframe
train_data.columns = keys.columns
train_data.info()  ##Some comments are missing. I dropped them

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1010825 entries, 0 to 1010824
Data columns (total 10 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   label           1010825 non-null  int64 
 1   comment         1010772 non-null  object
 2   author          1010825 non-null  object
 3   subreddit       1010825 non-null  object
 4   score           1010825 non-null  int64 
 5   ups             1010825 non-null  int64 
 6   downs           1010825 non-null  int64 
 7   date            1010825 non-null  object
 8   created_utc     1010825 non-null  int64 
 9   parent_comment  1010825 non-null  object
dtypes: int64(5), object(5)
memory usage: 77.1+ MB


In [4]:
train_data.dropna(subset=['comment'], inplace=True)
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1010772 entries, 0 to 1010824
Data columns (total 10 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   label           1010772 non-null  int64 
 1   comment         1010772 non-null  object
 2   author          1010772 non-null  object
 3   subreddit       1010772 non-null  object
 4   score           1010772 non-null  int64 
 5   ups             1010772 non-null  int64 
 6   downs           1010772 non-null  int64 
 7   date            1010772 non-null  object
 8   created_utc     1010772 non-null  int64 
 9   parent_comment  1010772 non-null  object
dtypes: int64(5), object(5)
memory usage: 84.8+ MB


In [5]:
train_data.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,1477959850,The blazers and Mavericks (The wests 5 and 6 s...
1,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,1474580737,They're favored to win.
2,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,1476824627,deadass don't kill my buzz
3,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,1483117213,Yep can confirm I saw the tool they use for th...
4,0,"I don't pay attention to her, but as long as s...",only7inches,AskReddit,0,0,0,2016-09,1472812508,do you find ariana grande sexy ?


In [6]:
sar_train_data = train_data[train_data['label'] == 1]
neu_train_data = train_data[train_data['label'] == 0]
sar_train_data = sar_train_data[:25000]
neu_train_data = neu_train_data[:25000]

frames = [sar_train_data, neu_train_data]
data_train = pd.concat(frames, sort=False)
data_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50000 entries, 32 to 42338
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   label           50000 non-null  int64 
 1   comment         50000 non-null  object
 2   author          50000 non-null  object
 3   subreddit       50000 non-null  object
 4   score           50000 non-null  int64 
 5   ups             50000 non-null  int64 
 6   downs           50000 non-null  int64 
 7   date            50000 non-null  object
 8   created_utc     50000 non-null  int64 
 9   parent_comment  50000 non-null  object
dtypes: int64(5), object(5)
memory usage: 4.2+ MB


In [7]:
# Removing Accented Characters

# Funtion Definition
def remove_accented_chars(text):
    new_text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return new_text

# Function Call
data_train['cleancomment']=data_train['comment'].map(lambda s:remove_accented_chars(s))

In [8]:
# Expanding Contractions

# Funtion Definition
def expand_contractions(text):
    return contractions.fix(text)

# Function Call 
data_train['cleancomment']=data_train['cleancomment'].map(lambda s:expand_contractions(s))

In [9]:
# Removing Special Characters

# Funtion Definition
def remove_special_characters(text):
    # define the pattern to keep
    pat = r'[^a-zA-z0-9.,!?/:;\"\'\s]' 
    return re.sub(pat, '', text)
 
# Function Call 
data_train['cleancomment']=data_train['cleancomment'].map(lambda s:remove_special_characters(s))

In [10]:
# Removing Punctuation

# Funtion Definition
def remove_punctuation(text):
    text = ''.join([c for c in text if c not in string.punctuation])
    return text

# Function Call
data_train['cleancomment']=data_train['cleancomment'].map(lambda s:remove_punctuation(s))

In [11]:
# Removing Stopwords

# Funtion Definition
def remove_stopwords(text):
    # convert sentence into token of words
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    # check in lowercase 
    t = [token for token in tokens if token.lower() not in stopword_list]
    text = ' '.join(t)    
    return text

# Function Call
data_train['cleancomment']=data_train['cleancomment'].map(lambda s:remove_stopwords(s))

In [12]:
# Removing extra whitespaces and tabs

# Funtion Definition
def remove_extra_whitespace_tabs(text):
    pattern = r'^\s*|\s\s*'
    return re.sub(pattern, ' ', text).strip()

# Function Call
data_train['cleancomment']=data_train['cleancomment'].map(lambda s:remove_extra_whitespace_tabs(s))

In [13]:
# Lowercase

# Funtion Definition
def to_lowercase(text):
    return text.lower()

# Function Call
data_train['cleancomment']=data_train['cleancomment'].map(lambda s:to_lowercase(s))

In [14]:
data_train.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment,cleancomment
32,1,But they'll have all those reviews!,RoguishPoppet,ProductTesting,0,-1,-1,2016-11,1477965899,"The dumb thing is, they are risking their sell...",reviews
43,1,wow it is totally unreasonable to assume that ...,pb2crazy,politics,2,-1,-1,2016-11,1477968131,Clinton campaign accuses FBI of 'blatant doubl...,wow totally unreasonable assume agency covered...
44,1,Ho ho ho... But Melania said that there is no ...,pb2crazy,politics,8,-1,-1,2016-10,1476807653,Anyone else think that it was interesting the ...,ho ho ho melania said way could happened know ...
65,1,I can't wait until @potus starts a twitter war...,kitduncan,politics,3,-1,-1,2016-11,1477970553,Here's what happens when Obama gives up his Tw...,wait potus starts twitter war morning joe
68,1,gotta love the teachers who give exams on the ...,DEP61,CFBOffTopic,3,-1,-1,2016-11,1477971011,Monday night Drinking thread Brought to You by...,got love teachers give exams day halloween


In [15]:
data_train['label'].value_counts()

0    25000
1    25000
Name: label, dtype: int64

In [24]:
X = data_train['cleancomment']
y = data_train['label']

from sklearn.model_selection import train_test_split
SEED = 101
x_train, x_validation_and_test, y_train, y_validation_and_test = train_test_split(X, y, test_size=.1, random_state=SEED)
x_validation, x_test, y_validation, y_test = train_test_split(x_validation_and_test, y_validation_and_test, test_size=.5, random_state=SEED)

In [25]:
print("Train set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_train),
                                                                             (len(x_train[y_train == 0]) / (len(x_train)*1.))*100,
                                                                            (len(x_train[y_train == 1]) / (len(x_train)*1.))*100))
print("Validation set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_validation),
                                                                             (len(x_validation[y_validation == 0]) / (len(x_validation)*1.))*100,
                                                                            (len(x_validation[y_validation == 1]) / (len(x_validation)*1.))*100))
print("Test set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_test),
                                                                             (len(x_test[y_test == 0]) / (len(x_test)*1.))*100,
                                                                            (len(x_test[y_test == 1]) / (len(x_test)*1.))*100))

Train set has total 45000 entries with 50.00% negative, 50.00% positive
Validation set has total 2500 entries with 50.16% negative, 49.84% positive
Test set has total 2500 entries with 49.76% negative, 50.24% positive


In [26]:
#Creating CountVerctorization matrix
cv = CountVectorizer(stop_words='english')
cv.fit_transform(x_train)

<45000x33401 sparse matrix of type '<class 'numpy.int64'>'
	with 207827 stored elements in Compressed Sparse Row format>

In [27]:
x_train_cv = cv.transform(x_train)

In [28]:
x_validation_cv = cv.transform(x_validation).toarray()

In [29]:
%%time
#Defining and training Random Forest model
model = RandomForestClassifier()
model.fit(x_train_cv, y_train)

Wall time: 8min 21s


RandomForestClassifier()

In [31]:
model.score(x_validation_cv, y_validation)

0.6252

In [34]:
model.score(x_train_cv, y_train)

0.9814222222222222

# 2nd model with "author" feature

In [35]:
#Top 10 sarcastic authors
sarcastic_author = data_train[data_train['label'] == 1]
sarcastic_author = sarcastic_author[['label', 'author']].copy()
sarcastic_author = sarcastic_author.groupby('author').agg(['count']).reset_index()
sarcastic_author.columns = ["author", "Count"]
sarcastic_author = sarcastic_author.sort_values(by=['Count'], ascending=False) 
sarcastic_author.head(10)

Unnamed: 0,author,Count
19308,sleaze_bag_alert,31
10994,TouchMeHerePls,25
3220,EggCouncil,21
9348,ShyBiDude89,20
18217,pokemon_fetish,15
20995,xVoltage360,14
17168,mindlessrabble,13
2392,Cynner,13
13151,brainiac3397,12
10941,TombstoneAintThatBad,12


In [36]:
#Top 10 neutral authors
neutral_author = data_train[data_train['label'] == 0]
neutral_author = neutral_author[['label', 'author']].copy()
neutral_author = neutral_author.groupby('author').agg(['count']).reset_index()
neutral_author.columns = ["author", "Count"]
neutral_author = neutral_author.sort_values(by=['Count'], ascending=False) 
neutral_author.head(10)

Unnamed: 0,author,Count
11678,TouchMeHerePls,16
12132,Vince5970,13
22064,xVoltage360,13
18191,mindlessrabble,12
20369,sleaze_bag_alert,12
8592,PianoRainMelody,9
16350,hookyboysb,9
6883,Makdranon,8
21552,ukulelej,8
11624,TombstoneAintThatBad,8


In [37]:
#Top 10 subreddits for sarcastic comments
sarcastic_subs = data_train[data_train['label'] == 1]
sarcastic_subs = sarcastic_subs[['label', 'subreddit']].copy()
sarcastic_subs = sarcastic_subs.groupby('subreddit').agg(['count']).reset_index()
sarcastic_subs.columns = ["subreddit", "Count"]
sarcastic_subs = sarcastic_subs.sort_values(by=['Count'], ascending=False) 
sarcastic_subs.head(10)

Unnamed: 0,subreddit,Count
2033,politics,2043
94,AskReddit,1307
1068,The_Donald,672
2406,worldnews,624
1997,pcmasterrace,486
1795,leagueoflegends,434
1913,news,411
1915,nfl,399
483,GlobalOffensive,270
2014,pics,267


In [38]:
#Top 10 subreddits for normal comments
normal_subs = data_train[data_train['label'] == 0]
normal_subs = normal_subs[['label', 'subreddit']].copy()
normal_subs = normal_subs.groupby('subreddit').agg(['count']).reset_index()
normal_subs.columns = ["subreddit", "Count"]
normal_subs = normal_subs.sort_values(by=['Count'], ascending=False) 
normal_subs.head(10)

Unnamed: 0,subreddit,Count
127,AskReddit,1852
2661,politics,1205
1408,The_Donald,676
2360,leagueoflegends,425
2619,pcmasterrace,372
2531,nfl,364
3147,worldnews,337
2112,funny,268
2508,nba,264
2525,news,262


In [39]:
data_train.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment,cleancomment
32,1,But they'll have all those reviews!,RoguishPoppet,ProductTesting,0,-1,-1,2016-11,1477965899,"The dumb thing is, they are risking their sell...",reviews
43,1,wow it is totally unreasonable to assume that ...,pb2crazy,politics,2,-1,-1,2016-11,1477968131,Clinton campaign accuses FBI of 'blatant doubl...,wow totally unreasonable assume agency covered...
44,1,Ho ho ho... But Melania said that there is no ...,pb2crazy,politics,8,-1,-1,2016-10,1476807653,Anyone else think that it was interesting the ...,ho ho ho melania said way could happened know ...
65,1,I can't wait until @potus starts a twitter war...,kitduncan,politics,3,-1,-1,2016-11,1477970553,Here's what happens when Obama gives up his Tw...,wait potus starts twitter war morning joe
68,1,gotta love the teachers who give exams on the ...,DEP61,CFBOffTopic,3,-1,-1,2016-11,1477971011,Monday night Drinking thread Brought to You by...,got love teachers give exams day halloween


In [40]:
data_train['cc_aut'] = data_train.agg('{0[cleancomment]} {0[author]}'.format, axis=1)
X = data_train['cc_aut']
y = data_train['label']

SEED = 101
x_train, x_validation_and_test, y_train, y_validation_and_test = train_test_split(X, y, test_size=.1, random_state=SEED)
x_validation, x_test, y_validation, y_test = train_test_split(x_validation_and_test, y_validation_and_test, test_size=.5, random_state=SEED)


print("Train set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_train),
                                                                             (len(x_train[y_train == 0]) / (len(x_train)*1.))*100,
                                                                            (len(x_train[y_train == 1]) / (len(x_train)*1.))*100))
print("Validation set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_validation),
                                                                             (len(x_validation[y_validation == 0]) / (len(x_validation)*1.))*100,
                                                                            (len(x_validation[y_validation == 1]) / (len(x_validation)*1.))*100))
print("Test set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_test),
                                                                             (len(x_test[y_test == 0]) / (len(x_test)*1.))*100,
                                                                            (len(x_test[y_test == 1]) / (len(x_test)*1.))*100))


Train set has total 45000 entries with 50.00% negative, 50.00% positive
Validation set has total 2500 entries with 50.16% negative, 49.84% positive
Test set has total 2500 entries with 49.76% negative, 50.24% positive


In [41]:
cv.fit_transform(x_train)

<45000x67951 sparse matrix of type '<class 'numpy.int64'>'
	with 253759 stored elements in Compressed Sparse Row format>

In [42]:
x_train_cv = cv.transform(x_train)

In [43]:
x_validation_cv = cv.transform(x_validation).toarray()

In [44]:
%%time
#Defining and training Random Forest model
model = RandomForestClassifier()
model.fit(x_train_cv, y_train)

Wall time: 8min 27s


RandomForestClassifier()

In [45]:
model.score(x_validation_cv, y_validation)

0.6396

In [46]:
model.score(x_train_cv, y_train)

0.9998444444444444

# 3rd model with subreddit feature

In [48]:
data_train['cc_subr'] = data_train.agg('{0[cleancomment]} {0[subreddit]}'.format, axis=1)
X = data_train['cc_subr']
y = data_train['label']

SEED = 101
x_train, x_validation_and_test, y_train, y_validation_and_test = train_test_split(X, y, test_size=.1, random_state=SEED)
x_validation, x_test, y_validation, y_test = train_test_split(x_validation_and_test, y_validation_and_test, test_size=.5, random_state=SEED)


print("Train set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_train),
                                                                             (len(x_train[y_train == 0]) / (len(x_train)*1.))*100,
                                                                            (len(x_train[y_train == 1]) / (len(x_train)*1.))*100))
print("Validation set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_validation),
                                                                             (len(x_validation[y_validation == 0]) / (len(x_validation)*1.))*100,
                                                                            (len(x_validation[y_validation == 1]) / (len(x_validation)*1.))*100))
print("Test set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_test),
                                                                             (len(x_test[y_test == 0]) / (len(x_test)*1.))*100,
                                                                            (len(x_test[y_test == 1]) / (len(x_test)*1.))*100))


Train set has total 45000 entries with 50.00% negative, 50.00% positive
Validation set has total 2500 entries with 50.16% negative, 49.84% positive
Test set has total 2500 entries with 49.76% negative, 50.24% positive


In [49]:
cv.fit_transform(x_train)

<45000x36242 sparse matrix of type '<class 'numpy.int64'>'
	with 252379 stored elements in Compressed Sparse Row format>

In [50]:
x_train_cv = cv.transform(x_train)

In [51]:
x_validation_cv = cv.transform(x_validation).toarray()

In [52]:
%%time
#Defining and training Random Forest model
model = RandomForestClassifier()
model.fit(x_train_cv, y_train)

Wall time: 6min 55s


RandomForestClassifier()

In [53]:
model.score(x_validation_cv, y_validation)

0.6292

In [54]:
model.score(x_train_cv, y_train)

0.9961333333333333

# Final model

In [55]:
data_train['all'] = data_train.agg('{0[cleancomment]} {0[subreddit]} {0[author]}'.format, axis=1)
X = data_train['all']
y = data_train['label']

SEED = 101
x_train, x_validation_and_test, y_train, y_validation_and_test = train_test_split(X, y, test_size=.1, random_state=SEED)
x_validation, x_test, y_validation, y_test = train_test_split(x_validation_and_test, y_validation_and_test, test_size=.5, random_state=SEED)


print("Train set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_train),
                                                                             (len(x_train[y_train == 0]) / (len(x_train)*1.))*100,
                                                                            (len(x_train[y_train == 1]) / (len(x_train)*1.))*100))
print("Validation set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_validation),
                                                                             (len(x_validation[y_validation == 0]) / (len(x_validation)*1.))*100,
                                                                            (len(x_validation[y_validation == 1]) / (len(x_validation)*1.))*100))
print("Test set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_test),
                                                                             (len(x_test[y_test == 0]) / (len(x_test)*1.))*100,
                                                                            (len(x_test[y_test == 1]) / (len(x_test)*1.))*100))


Train set has total 45000 entries with 50.00% negative, 50.00% positive
Validation set has total 2500 entries with 50.16% negative, 49.84% positive
Test set has total 2500 entries with 49.76% negative, 50.24% positive


In [56]:
cv.fit_transform(x_train)

<45000x70784 sparse matrix of type '<class 'numpy.int64'>'
	with 298303 stored elements in Compressed Sparse Row format>

In [57]:
x_train_cv = cv.transform(x_train)

In [58]:
x_validation_cv = cv.transform(x_validation).toarray()

In [59]:
%%time
#Defining and training Random Forest model
model = RandomForestClassifier()
model.fit(x_train_cv, y_train)

Wall time: 8min 6s


RandomForestClassifier()

In [60]:
model.score(x_validation_cv, y_validation)

0.6356

In [61]:
model.score(x_train_cv, y_train)

0.9998888888888889

# Saving all the models (Ignore this Section)

In [None]:
import pickle

In [None]:
# Save the Modle to file in the current working directory

Pkl1_Filename = "Clean_Comment.pkl"  

with open(Pkl1_Filename, 'wb') as file:  
    pickle.dump(model, file)
    
    

Pkl2_Filename = "CC_with_author.pkl"  

with open(Pkl2_Filename, 'wb') as file:  
    pickle.dump(model1, file)
    
    

    
Pkl3_Filename = "CC_with_subreddit.pkl"  

with open(Pkl3_Filename, 'wb') as file:  
    pickle.dump(model2, file)
    
    
    
    
Pkl4_Filename = "CC_with_both.pkl"  

with open(Pkl4_Filename, 'wb') as file:  
    pickle.dump(model3, file)

https://www.kaggle.com/prmohanty/python-how-to-save-and-load-ml-models

# Trying fresh model with tfidf

In [62]:
X = data_train['all']
y = data_train['label']

SEED = 101
x_train, x_validation_and_test, y_train, y_validation_and_test = train_test_split(X, y, test_size=.1, random_state=SEED)
x_validation, x_test, y_validation, y_test = train_test_split(x_validation_and_test, y_validation_and_test, test_size=.5, random_state=SEED)


print("Train set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_train),
                                                                             (len(x_train[y_train == 0]) / (len(x_train)*1.))*100,
                                                                            (len(x_train[y_train == 1]) / (len(x_train)*1.))*100))
print("Validation set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_validation),
                                                                             (len(x_validation[y_validation == 0]) / (len(x_validation)*1.))*100,
                                                                            (len(x_validation[y_validation == 1]) / (len(x_validation)*1.))*100))
print("Test set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_test),
                                                                             (len(x_test[y_test == 0]) / (len(x_test)*1.))*100,
                                                                            (len(x_test[y_test == 1]) / (len(x_test)*1.))*100))


Train set has total 45000 entries with 50.00% negative, 50.00% positive
Validation set has total 2500 entries with 50.16% negative, 49.84% positive
Test set has total 2500 entries with 49.76% negative, 50.24% positive


In [63]:
tvec1 = TfidfVectorizer()
tvec1.fit(x_train)

TfidfVectorizer()

In [64]:
x_train_tfidf = tvec1.transform(x_train)

In [65]:
x_validation_tfidf = tvec1.transform(x_validation).toarray()

In [68]:
%%time
#Defining and training Random Forest model
model = RandomForestClassifier()
model.fit(x_train_tfidf, y_train)

Wall time: 7min 14s


RandomForestClassifier()

In [70]:
model.score(x_validation_tfidf, y_validation)

0.668

In [71]:
model.score(x_train_tfidf, y_train)

0.9999555555555556

# Trying fresh model with tfidf with bi-gram

In [72]:
tvec1 = TfidfVectorizer(ngram_range=(1,2))
tvec1.fit(x_train)

TfidfVectorizer(ngram_range=(1, 2))

In [73]:
x_train_tfidf = tvec1.transform(x_train)

In [74]:
x_validation_tfidf = tvec1.transform(x_validation).toarray()

In [75]:
%%time
#Defining and training Random Forest model
model = RandomForestClassifier()
model.fit(x_train_tfidf, y_train)

Wall time: 20min 40s


RandomForestClassifier()

In [76]:
model.score(x_validation_tfidf, y_validation)

0.6544

In [77]:
model.score(x_train_tfidf, y_train)

0.9999555555555556

## Note: TfIdf using bigram proved to be less acrurate. Therefore, I will ignore this and not save the model using pickle

Future work can be making use of Dimentionality reduction (PCA) or Word Embedding (Word2Vec)

https://towardsdatascience.com/another-twitter-sentiment-analysis-with-python-part-8-dimensionality-reduction-chi2-pca-c6d06fb3fcf3