In [1]:
#Importing required libraries

#Data handling
import pandas as pd 

#For data preprocessing
from textblob import TextBlob #
import re
import nltk
from nltk.corpus import stopwords
stop = stopwords.words('english')

#For data Visualization
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split #Training and testing data split
from sklearn.pipeline import Pipeline #Model pipeline
from sklearn.model_selection import GridSearchCV #Parameter tuning
import joblib #To dump trained models
import pickle #To store data in bytes

#Models and metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report

In [2]:
#Load data
train = pd.read_csv('train.csv')

In [3]:
train.head(10)

Unnamed: 0,review_id,title,year,user_review,user_suggestion
0,1,Spooky's Jump Scare Mansion,2016.0,I'm scared and hearing creepy voices. So I'll...,1
1,2,Spooky's Jump Scare Mansion,2016.0,"Best game, more better than Sam Pepper's YouTu...",1
2,3,Spooky's Jump Scare Mansion,2016.0,"A littly iffy on the controls, but once you kn...",1
3,4,Spooky's Jump Scare Mansion,2015.0,"Great game, fun and colorful and all that.A si...",1
4,5,Spooky's Jump Scare Mansion,2015.0,Not many games have the cute tag right next to...,1
5,6,Spooky's Jump Scare Mansion,2015.0,"Early Access ReviewIt's pretty cute at first, ...",1
6,7,Spooky's Jump Scare Mansion,2017.0,Great game. it's a cute little horror game tha...,1
7,8,Spooky's Jump Scare Mansion,2015.0,Spooky's Jump Scare Mansion is a Free Retro ma...,1
8,9,Spooky's Jump Scare Mansion,2015.0,"Somewhere between light hearted, happy parody ...",0
9,10,Spooky's Jump Scare Mansion,2015.0,This game with its cute little out of the wall...,1


In [4]:
train.shape

(17494, 5)

In [5]:
#We are going to predict user suggestion only based on user_review. Hence drop Review_id, titile and year
train=train[['user_review', 'user_suggestion']]

In [6]:
train.isnull().sum()

user_review        0
user_suggestion    0
dtype: int64

In [7]:
#Check the target class balance
train.user_suggestion.value_counts(normalize=True)

1    0.569795
0    0.430205
Name: user_suggestion, dtype: float64

#### This shows that our target class is not highly imbalanced. This data will help us to make good trainable model. 

In [8]:
def text_process(data): #function to remove Special characters like @#[]()!
    tweet_blob = TextBlob(data)
    words = tweet_blob.words
    sent = ' '.join(words)
    return sent 

train.user_review = train.user_review.apply(text_process)

In [9]:
def remove_junk(data): #function to keep only characters and remove 'user'- which is not required 
    words=[words for words in data.split() if words != 'user']    
    clean_tokens = [t for t in words if re.match(r'[^\W\d]*$', t)] # Remove punctuations')]
    sent_join  = ' '.join(clean_tokens)
    return sent_join

train.user_review = train.user_review.apply(remove_junk)

In [10]:
# Remove stopwords
train['user_review'] = train['user_review'].apply(lambda words: ' '.join(word.lower() for word in words.split() if word not in stop))

In [11]:
train.head()

Unnamed: 0,user_review,user_suggestion
0,i scared hearing creepy voices so i pause mome...,1
1,best game better sam pepper youtube account ne...,1
2,a littly iffy controls know play easy master i...,1
3,great game fun colorful side note though when ...,1
4,not many games cute tag right next horror tag ...,1


In [12]:
#Lemmatizing the words
lemmatizer = nltk.stem.WordNetLemmatizer()
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()

def lemmatize_text(text):
    return ' '.join([lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)])

train.user_review = train.user_review.apply(lemmatize_text) 

In [13]:
train.head()

Unnamed: 0,user_review,user_suggestion
0,i scared hearing creepy voice so i pause momen...,1
1,best game better sam pepper youtube account ne...,1
2,a littly iffy control know play easy master i ...,1
3,great game fun colorful side note though when ...,1
4,not many game cute tag right next horror tag f...,1


In [14]:
#Dependent and independent featuere
X= train['user_review']
y=train['user_suggestion']

In [15]:
#Train test split
x_train, x_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=42)

### Models Pipeline -- NB with TFIDF

In [68]:
pipe_nb = Pipeline(steps=[('tf', TfidfVectorizer()), ('NB', MultinomialNB())])

# Create Parameter Grid
pgrid_mnnb = {
#  'tf__max_features' : [1000, 3000, 4000, 5000],
#  'tf__stop_words' : ['english', None],
 'tf__ngram_range' : [(1,1),(1,2), (2,2)],
 'tf__use_idf' : [True, False],
 'NB__alpha' : [0.01, 0.1, 0.5, 1]
}

# Apply GridSearch to Pipeline to find the best parameters
gs_mnnb = GridSearchCV(pipe_nb, pgrid_mnnb, cv=5, n_jobs=-1, verbose=2)

In [62]:
# Fit the model
gs_mnnb.fit(x_train, y_train)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tf', TfidfVectorizer()),
                                       ('NB', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'NB__alpha': [0.01, 0.1, 0.5, 1],
                         'tf__max_features': [1000, 3000, 4000, 5000],
                         'tf__ngram_range': [(1, 1), (1, 2), (2, 2)],
                         'tf__use_idf': [True, False]},
             verbose=2)

In [65]:
#check the best parameter for the model
gs_mnnb.best_params_

{'NB__alpha': 0.1,
 'tf__max_features': 5000,
 'tf__ngram_range': (1, 2),
 'tf__use_idf': True}

In [67]:
print('Train score: ', gs_mnnb.score(x_train, y_train))
print('Test score: ', gs_mnnb.score(x_test, y_test))

Train score:  0.8690548780487805
Test score:  0.83493369913123


In [21]:
NB_predict = gs_mnnb.predict(x_test)
print(classification_report(y_test, NB_predict))

              precision    recall  f1-score   support

           0       0.85      0.76      0.80      1842
           1       0.84      0.90      0.87      2532

    accuracy                           0.84      4374
   macro avg       0.84      0.83      0.84      4374
weighted avg       0.84      0.84      0.84      4374



### Logistic regression with TFIDF

In [87]:
lg = Pipeline(steps=[('tf', TfidfVectorizer()), ('LR', LogisticRegression())])

# Create Parameter Grid
pgrid_lr = {
#  'tf__max_features' : [1000, 2000, 3000],
#  'tf__stop_words' : ['english', None],
#  'tf__ngram_range' : [(1,1),(1,2)],
#  'tf__use_idf' : [True, False],
 'LR__C' : [0.9, 1.0, 0.1],
    'LR__max_iter':[100,200,500],
    'LR__penalty':['l1', 'l2']
}

# Apply GridSearch to Pipeline to find the best parameters
gs_lr = GridSearchCV(lg, pgrid_lr, cv=5, n_jobs=-1, verbose=2)


In [88]:
%%time
gs_lr.fit(x_train,y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


        nan 0.84443598        nan 0.84443598        nan 0.84443598
        nan 0.80480183        nan 0.80480183        nan 0.80480183]


Wall time: 22.5 s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tf', TfidfVectorizer()),
                                       ('LR', LogisticRegression())]),
             n_jobs=-1,
             param_grid={'LR__C': [0.9, 1.0, 0.1],
                         'LR__max_iter': [100, 200, 500],
                         'LR__penalty': ['l1', 'l2']},
             verbose=2)

In [89]:
gs_lr.best_params_

{'LR__C': 1.0, 'LR__max_iter': 100, 'LR__penalty': 'l2'}

In [90]:
print('Train score: ', gs_lr.score(x_train, y_train))
print('Test score: ', gs_lr.score(x_test, y_test))

Train score:  0.9078506097560975
Test score:  0.8529949702789209


In [91]:
#Predict test cases
lr_predict = gs_lr.predict(x_test)
print(classification_report(y_test, lr_predict))

              precision    recall  f1-score   support

           0       0.85      0.79      0.82      1842
           1       0.86      0.90      0.88      2532

    accuracy                           0.85      4374
   macro avg       0.85      0.84      0.85      4374
weighted avg       0.85      0.85      0.85      4374



In [95]:
# Creating a pickle file for the CountVectorizer
pickle.dump(cv, open('tfidf-transform.pkl', 'wb'))

NameError: name 'pickle' is not defined

In [94]:
# save the model to disk
filename = 'Model/LG_model.pkl'
joblib.dump(gs_lr, filename)
 
# load the model from disk
LG_model = joblib.load(filename)

### SGD Classifier - 
https://towardsdatascience.com/multi-class-text-classification-model-comparison-and-selection-5eb066197568

In [42]:
from sklearn.linear_model import SGDClassifier

In [43]:
pipe_SGD = Pipeline(steps=[('tf', TfidfVectorizer()), ('SGD', SGDClassifier())])

# Create Parameter Grid
pgrid_SGD = {
#  'tf__max_features' : [1000, 5000, 6000],
#  'tf__stop_words' : ['english', None],
#  'tf__ngram_range' : [(1,1),(1,2), (2,2)],
#  'tf__use_idf' : [True, False],
 'SGD__loss' : ['hinge', 'log', 'perceptron'],
    'SGD__alpha': [0.00001, 0.00001, 0.0001, 0.001],
    'SGD__max_iter' : [200, 500, 700, 1000]
}

# Apply GridSearch to Pipeline to find the best parameters
gs_SGD = GridSearchCV(pipe_SGD, pgrid_SGD, cv=5, n_jobs=-1, verbose=2)

In [44]:
%time%
gs_SGD.fit(x_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Wall time: 53.6 s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tf', TfidfVectorizer()),
                                       ('SGD', SGDClassifier())]),
             n_jobs=-1,
             param_grid={'SGD__alpha': [1e-05, 1e-05, 0.0001, 0.001],
                         'SGD__loss': ['hinge', 'log', 'perceptron'],
                         'SGD__max_iter': [200, 500, 700, 1000]},
             verbose=2)

In [45]:
gs_SGD.best_params_

{'SGD__alpha': 0.0001, 'SGD__loss': 'log', 'SGD__max_iter': 500}

In [46]:
print('Train score: ',gs_SGD.score(x_train, y_train))
print('Train score: ',gs_SGD.score(x_test, y_test))

Train score:  0.9009908536585366
Train score:  0.8511659807956105


In [None]:
# import joblib
# # save the model to disk
# filename = 'NB_model.sav'
# joblib.dump(gs_mnnb, filename)
 
# # load the model from disk
# NB_model = joblib.load(filename)
# # result = loaded_model.score(X_test, Y_test)
# # print(result)