## Data Engineering Project 1 (ML Model Building)
by Brenden BALANE and Paul BASNIER

In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
import pickle
from nlppreprocess import NLP

In [39]:
#loading data
#Reddit Sentimental analysis Dataset by Chaithanya Kumar A
df = pd.read_csv('Reddit_Data.csv', encoding='latin1')
df.columns = ['text', 'label']
df.head(30)

Unnamed: 0,text,label
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1
5,you should all sit down together and watch the...,-1
6,was teens when discovered zen meditation was ...,1
7,jesus was zen meets jew,0
8,there are two varieties christians dogmatic th...,-1
9,dont worry about trying explain yourself just ...,1


In [40]:
len(df)

37249

In [41]:
df.groupby(['label']).size()

label
-1     8277
 0    13142
 1    15830
dtype: int64

In [42]:
df['text'] = df['text'].astype(str)

In [43]:
#lowercasing every text inputs
df['text'] = df.text.str.lower()

In [44]:
#word tokenization
df['text'] = [word_tokenize(str(entry)) for entry in df['text'].dropna()]

In [45]:
#preprocessing
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

word_Lemmatized = WordNetLemmatizer()

stopWords = stopwords.words('english')

for index,entry in enumerate(df['text']):
    final_words = []
    
    for word,tag in pos_tag(entry):
        if word.isalpha():
            #print(word)
            final_words.append(word_Lemmatized.lemmatize(word,tag_map[tag[0]]))
            
    df.loc[index,'text_final'] = str(final_words)

In [46]:
#data splitting for training and testing the model
train_x, test_x, train_y, test_y = model_selection.train_test_split(df['text_final'],df['label'],test_size=0.2, random_state=32)

In [47]:
encoder = LabelEncoder()
train_y_encoded = encoder.fit_transform(train_y)
test_y_encoded = encoder.fit_transform(test_y)

In [59]:
df.head()

Unnamed: 0,text,label,text_final
0,"[family, mormon, have, never, tried, explain, ...",1,"['family', 'mormon', 'never', 'try', 'explain'..."
1,"[buddhism, has, very, much, lot, compatible, w...",1,"['buddhism', 'much', 'lot', 'compatible', 'chr..."
2,"[seriously, don, say, thing, first, all, they,...",-1,"['seriously', 'say', 'thing', 'first', 'get', ..."
3,"[what, you, have, learned, yours, and, only, y...",0,"['learn', 'want', 'teach', 'different', 'focus..."
4,"[for, your, own, benefit, you, may, want, read...",1,"['benefit', 'may', 'want', 'read', 'live', 'bu..."


In [60]:
#df.to_csv(r'df_sentiment_analysis.csv', index = False)

In [48]:
#Text vectorization
tfidf_vect = TfidfVectorizer(max_features=5000)
tfidf_vect.fit(df['text_final'])
train_x_tfidf = tfidf_vect.transform(train_x)
test_x_tfidf = tfidf_vect.transform(test_x)

In [50]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(train_x_tfidf,train_y)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(test_x_tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, test_y)*100)

SVM Accuracy Score ->  84.69798657718121


In [54]:
#testing the model
speech_to_classify = "i love Paul"
SVM.predict(tfidf_vect.transform([speech_to_classify]))

array([1])

In [55]:
#save the model to disk
sentiment_classifier = 'sentiment_analysis_model.sav'
pickle.dump(SVM, open(sentiment_classifier, 'wb'))

In [57]:
#loading pre-trained model and testing it
loaded_model = pickle.load(open('sentiment_analysis_model.sav', 'rb'))
loaded_model.predict(tfidf_vect.transform([speech_to_classify]))[0]

1