# predict whether the cusomer's reviews are positive/negative

In [1]:
#importing necessary libraries
import nltk

from nltk.corpus import stopwords
import string
import re
from nltk.tokenize import word_tokenize
from collections import Counter
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import pandas as pd

In [3]:
dataset=pd.read_csv("train.csv",encoding='utf-8',
                   header=0,sep=',')

In [4]:
dataset

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,not happy
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,not happy
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,not happy
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,not happy
...,...,...,...,...,...
38927,id49253,We arrived late at night and walked in to a ch...,Edge,Desktop,happy
38928,id49254,The only positive impression is location and p...,InternetExplorer,Mobile,not happy
38929,id49255,Traveling with friends for shopping and a show...,Firefox,Mobile,not happy
38930,id49256,The experience was just ok. We paid extra for ...,Chrome,Desktop,not happy


In [5]:
#counts the response
dataset['Is_Response'].value_counts()

happy        26521
not happy    12411
Name: Is_Response, dtype: int64

In [6]:
#set happy as positive response and unhappy as negative response

dataset["Is_Response"] = dataset["Is_Response"].map({"happy": "positive", "not happy" : "negative"})
dataset.sample(3)

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
11578,id21904,If you are going to Universal Studios then thi...,Edge,Desktop,negative
15592,id25918,We stayed - nights and wow! what a joy! We wan...,Mozilla Firefox,Mobile,positive
16227,id26553,Modern business hotel located one block from b...,Mozilla Firefox,Tablet,negative


In [7]:
#drop the columns which are not need for this task

dataset.drop(columns = ['User_ID', 'Browser_Used', 'Device_Used'], inplace = True)

In [8]:
# Preprocessing the data
import re
from bs4 import BeautifulSoup
from nltk.tokenize import WordPunctTokenizer

tokenizer = WordPunctTokenizer()

# remove email (@username)
email_handle = r'@[A-Za-z0-9_]+' 

# remove website URLs that start with 'https?://'
url_handle = r'http[^ ]+'                                  
combined_handle = r'|'.join((email_handle, url_handle)) 

# remove website URLs that start with 'www.'
www_handle = r'www.[^ ]+' 

#punctuation handle
punctuation_handle = r'\W+'

In [9]:
stop_words = set(stopwords.words("english"))

In [10]:
def process_text(text):
    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text()

    try:
        text = souped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        text = souped

    cleaned_text = re.sub(punctuation_handle, " ",(re.sub(www_handle, '', re.sub(combined_handle, '', text)).lower()))
    cleaned_text = ' '.join([word for word in cleaned_text.split() if word not in stop_words])

    return (" ".join([word for word in tokenizer.tokenize(cleaned_text) if len(word) > 1])).strip()

In [11]:
#retrieve cleaned text
cleaned_text = []

for text in dataset.Description:
    cleaned_text.append(process_text(text))

clean_text = pd.DataFrame({'clean_text' : cleaned_text})
data = pd.concat([dataset, clean_text], axis = 1)

data.sample(5)

Unnamed: 0,Description,Is_Response,clean_text
34952,Travelling by myself to New York for the first...,positive,travelling new york first time easy find train...
8451,Modern B&B was one of my best lodging experien...,positive,modern one best lodging experiences traveling ...
5273,We stayed at this hotel for a wedding. We paid...,negative,stayed hotel wedding paid group good things co...
35750,"The hotel is in a great location, but it is no...",negative,hotel great location worth family ate dinner p...
34813,Enjoyed our stay at this hotel. Close and conv...,positive,enjoyed stay hotel close convenient everything...


In [12]:
from sklearn.model_selection import train_test_split

x= data["clean_text"]
y= dataset["Is_Response"]

In [13]:
x_train, x_test,y_train, y_test = train_test_split(x,y, test_size = 0.3, random_state = 41)

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

tvec = TfidfVectorizer()
clf2 = LogisticRegression()

In [15]:
from sklearn.pipeline import Pipeline

model = Pipeline([('vectorizer',tvec)
                 ,('classifier',clf2)])

model.fit(x_train, y_train)

Pipeline(memory=None,
         steps=[('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('classifier',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, inter

In [16]:
from sklearn.metrics import confusion_matrix

verdict = model.predict(x_test)

confusion_matrix(verdict, y_test)

array([[2800,  476],
       [ 884, 7520]], dtype=int64)

In [17]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

print("Accuracy : ", accuracy_score(verdict, y_test))
print("Precision : ", precision_score(verdict, y_test, average = 'weighted'))
print("Recall : ", recall_score(verdict, y_test, average = 'weighted'))

Accuracy :  0.8835616438356164
Precision :  0.8898642239775905
Recall :  0.8835616438356164


In [18]:
test=pd.read_csv("test.csv",encoding="utf-8")
example_result = model.predict(test)
example_result

array(['positive', 'negative', 'positive', 'positive'], dtype=object)