In [36]:
import re
import string
from  matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd

In [37]:
#Import dataframe
df = pd.read_csv("dataset/train.csv")
df.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,not happy
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,not happy
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,not happy
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,not happy


EDA

In [38]:
#Get shape
df.shape

(38932, 5)

In [39]:
#Describe dataset
df.describe().transpose()

Unnamed: 0,count,unique,top,freq
User_ID,38932,38932,id10326,1
Description,38932,38932,The room was kind of clean but had a VERY stro...,1
Browser_Used,38932,11,Firefox,7367
Device_Used,38932,3,Desktop,15026
Is_Response,38932,2,happy,26521


In [40]:
#Get null values
df.isnull().sum()

User_ID         0
Description     0
Browser_Used    0
Device_Used     0
Is_Response     0
dtype: int64

In [41]:
#Get columns
df.columns

Index(['User_ID', 'Description', 'Browser_Used', 'Device_Used', 'Is_Response'], dtype='object')

In [42]:
#remove un necessary columns
df.drop(columns=['User_ID', 'Browser_Used', 'Device_Used'], inplace=True)
df.head()

Unnamed: 0,Description,Is_Response
0,The room was kind of clean but had a VERY stro...,not happy
1,I stayed at the Crown Plaza April -- - April -...,not happy
2,I booked this hotel through Hotwire at the low...,not happy
3,Stayed here with husband and sons on the way t...,happy
4,My girlfriends and I stayed here to celebrate ...,not happy


In [43]:
#Clean the Description column
def cleaningDescription(descColumn):
    descColumn = descColumn.lower()
    descColumn = re.sub('\[.*?\]', '', descColumn)
    descColumn = re.sub('[%s]' % re.escape(string.punctuation), '', descColumn)
    descColumn = re.sub('\w*\d\w*', '', descColumn)
    descColumn = re.sub('[‘’“”…]', '', descColumn)
    descColumn = re.sub('\n', '', descColumn)
    return descColumn

cleaned1 = lambda x: cleaningDescription(x)

In [44]:
df['new_Description'] = pd.DataFrame(df.Description.apply(cleaned1))
df.head()

Unnamed: 0,Description,Is_Response,new_Description
0,The room was kind of clean but had a VERY stro...,not happy,the room was kind of clean but had a very stro...
1,I stayed at the Crown Plaza April -- - April -...,not happy,i stayed at the crown plaza april april th...
2,I booked this hotel through Hotwire at the low...,not happy,i booked this hotel through hotwire at the low...
3,Stayed here with husband and sons on the way t...,happy,stayed here with husband and sons on the way t...
4,My girlfriends and I stayed here to celebrate ...,not happy,my girlfriends and i stayed here to celebrate ...


In [48]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

#Define variables 
X = df.new_Description
y = df.Is_Response	
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.1)

lrModel = LogisticRegression()
tfid = TfidfVectorizer()

model = Pipeline(
    [("vectorizer",tfid),
     ("classifier",lrModel)]
)

model.fit(X_train, y_train)

Model Accuracy

In [50]:
#Model accuracy
from sklearn.metrics import confusion_matrix

pred = model.predict(X_test)

confusion_matrix(pred, y_test)

array([[2427,  287],
       [ 190,  990]])

In [51]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

print("Accuracy : ", accuracy_score(pred, y_test))
print("Precision : ", precision_score(pred, y_test, average = 'weighted'))
print("Recall : ", recall_score(pred, y_test, average = 'weighted'))

Accuracy :  0.8775038520801233
Precision :  0.8812937591821159
Recall :  0.8775038520801233


Model in action

In [62]:
inputText = str(input("Enter review"))
outcome = model.predict([inputText])
print(outcome)

['not happy']


Saving the model

In [64]:
import pickle
file_name = "LogisticRegressionModel.pkl"
pickle.dump(model, open(file_name,"wb"))