In [1]:
#Importing required libraries

#Data handling
import pandas as pd 

#For data preprocessing
from textblob import TextBlob #
import re
import nltk
from nltk.corpus import stopwords
stop = stopwords.words('english')

#For data Visualization
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split #Training and testing data split
from sklearn.pipeline import Pipeline #Model pipeline
from sklearn.model_selection import GridSearchCV #Parameter tuning
import pickle #To store data in bytes

#Models and metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report

In [2]:
#Load data
train = pd.read_csv('Data/train.csv')

In [3]:
train.head(10)

Unnamed: 0,review_id,title,year,user_review,user_suggestion
0,1,Spooky's Jump Scare Mansion,2016.0,I'm scared and hearing creepy voices. So I'll...,1
1,2,Spooky's Jump Scare Mansion,2016.0,"Best game, more better than Sam Pepper's YouTu...",1
2,3,Spooky's Jump Scare Mansion,2016.0,"A littly iffy on the controls, but once you kn...",1
3,4,Spooky's Jump Scare Mansion,2015.0,"Great game, fun and colorful and all that.A si...",1
4,5,Spooky's Jump Scare Mansion,2015.0,Not many games have the cute tag right next to...,1
5,6,Spooky's Jump Scare Mansion,2015.0,"Early Access ReviewIt's pretty cute at first, ...",1
6,7,Spooky's Jump Scare Mansion,2017.0,Great game. it's a cute little horror game tha...,1
7,8,Spooky's Jump Scare Mansion,2015.0,Spooky's Jump Scare Mansion is a Free Retro ma...,1
8,9,Spooky's Jump Scare Mansion,2015.0,"Somewhere between light hearted, happy parody ...",0
9,10,Spooky's Jump Scare Mansion,2015.0,This game with its cute little out of the wall...,1


In [4]:
train.shape

(17494, 5)

In [5]:
#We are going to predict user suggestion only based on user_review. Hence drop Review_id, titile and year
train=train[['user_review', 'user_suggestion']]

In [6]:
train.isnull().sum()

user_review        0
user_suggestion    0
dtype: int64

In [7]:
#Check the target class balance
train.user_suggestion.value_counts(normalize=True)

1    0.569795
0    0.430205
Name: user_suggestion, dtype: float64

**This shows that our target class is not highly imbalanced. 56:43 distribution would be enough to make good model.**

In [8]:
def text_process(data): #function to remove Special characters like @#[]()!
    tweet_blob = TextBlob(data)
    words = tweet_blob.words
    sent = ' '.join(words)
    return sent 

train.user_review = train.user_review.apply(text_process)

In [9]:
def remove_junk(data): #function to keep only characters and remove 'user'- which is not required 
    words=[words for words in data.split() if words != 'user']    
    clean_tokens = [t for t in words if re.match(r'[^\W\d]*$', t)] # Remove punctuations')]
    sent_join  = ' '.join(clean_tokens)
    return sent_join

train.user_review = train.user_review.apply(remove_junk)

In [10]:
# Remove stopwords
train['user_review'] = train['user_review'].apply(lambda words: ' '.join(word.lower() for word in words.split() if word not in stop))

In [11]:
train.head()

Unnamed: 0,user_review,user_suggestion
0,i scared hearing creepy voices so i pause mome...,1
1,best game better sam pepper youtube account ne...,1
2,a littly iffy controls know play easy master i...,1
3,great game fun colorful side note though when ...,1
4,not many games cute tag right next horror tag ...,1


In [12]:
#Lemmatizing the words
lemmatizer = nltk.stem.WordNetLemmatizer()
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()

def lemmatize_text(text):
    return ' '.join([lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)])

train.user_review = train.user_review.apply(lemmatize_text) 

In [13]:
train.head()

Unnamed: 0,user_review,user_suggestion
0,i scared hearing creepy voice so i pause momen...,1
1,best game better sam pepper youtube account ne...,1
2,a littly iffy control know play easy master i ...,1
3,great game fun colorful side note though when ...,1
4,not many game cute tag right next horror tag f...,1


In [14]:
#Separate Dependent and independent featuere
X= train['user_review']
y=train['user_suggestion']

### Model building

In [15]:
#Tfidfvectorizer
tf = TfidfVectorizer(max_features=3500)
tfidf = tf.fit_transform(X)

In [16]:
#Train test split
x_train, x_test, y_train, y_test = train_test_split(tfidf,y, test_size=0.25, random_state=42)

In [17]:
#Logistic regression
lr=LogisticRegression()
lr.fit(x_train, y_train)

#Predict test cases
lr_predict = lr.predict(x_test)

In [20]:
#Metrics used to check the performance
print(classification_report(y_test, lr_predict))
print('Train score: ', lr.score(x_train, y_train))
print('Test score: ', lr.score(x_test, y_test))

              precision    recall  f1-score   support

           0       0.84      0.79      0.82      1842
           1       0.85      0.89      0.87      2532

    accuracy                           0.85      4374
   macro avg       0.85      0.84      0.84      4374
weighted avg       0.85      0.85      0.85      4374

Train score:  0.889405487804878
Test score:  0.8500228623685414


In [21]:
#creating the pickle file for Logistic model
pickle.dump(lr, open('Model/LG_model.pickle', 'wb'))
# Creating a pickle file for the CountVectorizer
pickle.dump(tf, open('Model/tfidf-transformer.pickle', 'wb'))

In [34]:
#Naive bayes
NB=MultinomialNB()
NB.fit(x_train, y_train)

#Predict test cases
NB_predict = NB.predict(x_test)

In [35]:
#Metrics used to check the performance
print(classification_report(y_test, NB_predict))
print('Train score: ', NB.score(x_train, y_train))
print('Test score: ', NB.score(x_test, y_test))

              precision    recall  f1-score   support

           0       0.85      0.74      0.79      1842
           1       0.83      0.90      0.86      2532

    accuracy                           0.83      4374
   macro avg       0.84      0.82      0.83      4374
weighted avg       0.84      0.83      0.83      4374

Train score:  0.8567835365853659
Test score:  0.8344764517604024
