In [1]:
# This file contains codes for Data cleaning and Model building only
# Visualisation and basic EDA is excluded here(refer EDA file for these details)

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import string # special operations on strings
import spacy # language models

from matplotlib.pyplot import imread
from matplotlib import pyplot as plt
from wordcloud import WordCloud
%matplotlib inline
import re

In [3]:
#Importing dataset#
df=pd.read_excel("hotel_reviews.xlsx")

In [4]:
df

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5
...,...,...
20486,"best kept secret 3rd time staying charm, not 5...",5
20487,great location price view hotel great quick pl...,4
20488,"ok just looks nice modern outside, desk staff ...",2
20489,hotel theft ruined vacation hotel opened sept ...,1


In [5]:
df1=df.copy()

In [6]:
def clean_Review(Review):
    Review = Review.lower()
    Review = re.sub('\[.*?\]','',Review)
    Review = re.sub('@[A-Za-z0–9]+', '', Review)  #Removing tag(@)
    Review = re.sub('#', '', Review)   #Removing hashtag(#)
    Review = re.sub('RT[\s]+', '', Review)  #Removing RT 
    Review = re.sub('https?:\/\/\S+', '', Review)  #Removing links
    Review = re.sub("[^A-Za-z" "]+"," ",Review).lower()  #remove special character
    Review = re.sub("[0-9" "]+"," ",Review)  #remove numbers
    Review=re.sub("\[.*?\]","",Review)
    Review=re.sub("[%s]" % re.escape(string.punctuation),'',Review)
    Review=re.sub("\w*\d\w*","",Review)
    Review=re.sub("\n","",Review)
    return Review

#any function whenever we want to apply it on the text data ,we can not directly apply it.for that we use lambda.
#lambda is an Intermediate function.

cleaned1 = lambda Review: clean_Review(Review)

In [7]:
#Applying function to dataset
df1["Cleaned_Reviews"]=pd.DataFrame(df1.Review.apply(cleaned1)) # adding new column

In [8]:
df1

Unnamed: 0,Review,Rating,Cleaned_Reviews
0,nice hotel expensive parking got good deal sta...,4,nice hotel expensive parking got good deal sta...
1,ok nothing special charge diamond member hilto...,2,ok nothing special charge diamond member hilto...
2,nice rooms not 4* experience hotel monaco seat...,3,nice rooms not experience hotel monaco seattle...
3,"unique, great stay, wonderful time hotel monac...",5,unique great stay wonderful time hotel monaco ...
4,"great stay great stay, went seahawk game aweso...",5,great stay great stay went seahawk game awesom...
...,...,...,...
20486,"best kept secret 3rd time staying charm, not 5...",5,best kept secret rd time staying charm not sta...
20487,great location price view hotel great quick pl...,4,great location price view hotel great quick pl...
20488,"ok just looks nice modern outside, desk staff ...",2,ok just looks nice modern outside desk staff n...
20489,hotel theft ruined vacation hotel opened sept ...,1,hotel theft ruined vacation hotel opened sept ...


In [9]:
#Importing libraries for text preprocessing

import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer 
from nltk.stem import WordNetLemmatizer 

In [10]:
lemmatizer = WordNetLemmatizer()
from nltk.tokenize import word_tokenize
#nltk.download('punkt')
stemmer=PorterStemmer()
import nltk
#nltk.download('wordnet')

In [11]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
STOP_WORDS |= {"nt","hotel","room"}
print(STOP_WORDS)

{'sometime', 'not', 'without', "'re", 'toward', 'sixty', 'up', 'whither', 'ca', 'yet', 'another', 'ever', 'into', 'namely', 'empty', 'would', 'else', 'when', 'eleven', 'always', 'less', 'ourselves', 'an', 'besides', 'whereafter', 're', 'upon', 'thus', 'as', 'alone', 'itself', 'whole', 'eight', 'n’t', 'it', '’ve', 'even', 'serious', 'seeming', 'done', 'become', 'move', "'s", 'being', 'two', 'thereby', 'hence', 'hers', 'will', 'yourselves', 'between', 'put', 'part', 'such', 'himself', 'otherwise', 'any', 'whatever', 'nowhere', 'thence', 'least', 'herein', 'indeed', 'take', 'thereupon', 'bottom', 'this', 'nobody', 'him', 'with', 'i', 'is', 'about', 'thereafter', 'nor', 'four', 'through', 'themselves', 'out', 'somehow', 'you', 'nt', 'latter', 'while', 'in', 'using', 'quite', 'and', 'whose', '’d', 'unless', 'get', 'me', 'could', 'on', 'back', 'above', 'own', 'same', 'all', 'do', 'fifteen', 'front', 'keep', 'nevertheless', '‘ve', '‘ll', 'hereafter', 'name', 'go', 'beyond', "'ve", 'most', 'ho

In [12]:
Reviews1=df1.copy()
Reviews1.drop(["Review","Rating"],axis=1,inplace=True)
Reviews1["Cleaned_Reviews"][6]

'cozy stay rainy city husband spent nights monaco early january business trip chance come ride we booked monte carlo suite proved comfortable longish stay room located street building street noise not problem view interesting rooms building look dank alley midsection large office building suite comfortable plenty room spread bathroom attractive squeaky clean small comparison generous proportions sitting sleeping areas lots comfortable seating options good lighting plenty storage clothing luggage hotel staff friendly efficient housekeeping staff did great job pleasant requests responded quickly the location quite good easy walk pike street market seattle art museum notch shopping dining options a positive experience '

In [13]:
Reviews1

Unnamed: 0,Cleaned_Reviews
0,nice hotel expensive parking got good deal sta...
1,ok nothing special charge diamond member hilto...
2,nice rooms not experience hotel monaco seattle...
3,unique great stay wonderful time hotel monaco ...
4,great stay great stay went seahawk game awesom...
...,...
20486,best kept secret rd time staying charm not sta...
20487,great location price view hotel great quick pl...
20488,ok just looks nice modern outside desk staff n...
20489,hotel theft ruined vacation hotel opened sept ...


In [14]:
#test line
#Stemming
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [15]:
corpus=[]

In [16]:
# modifications are made in these lines to improve the EDA result(Stemming/lemmatisation results where not satisfactory in earlier code)
for i in range  (0,len(Reviews1)):
    review=re.sub("[^a-zA-Z]"," ",Reviews1["Cleaned_Reviews"][i])
    review=review.split()
    review = [ps.stem(word) for word in review if not word in STOP_WORDS]
    #print(stemmed_tokens[0:40])
    review=" ".join(review)
    corpus.append(review)

In [17]:
corpus

['nice expens park got good deal stay anniversari arriv late even took advic previou review valet park check quick easi littl disappoint non exist view clean nice size bed comfort woke stiff neck high pillow soundproof like heard music night morn loud bang door open close hear peopl talk hallway mayb noisi neighbor aveda bath product nice goldfish stay nice touch taken advantag stay longer locat great walk distanc shop overal nice experi have pay park night',
 'ok special charg diamond member hilton decid chain shot th anniversari seattl start book suit paid extra websit descript suit bedroom bathroom standard took print reserv desk show said thing like tv couch ect desk clerk told oh mix suit descript kimpton websit sorri free breakfast got kid embassi suit sit bathroom bedroom unlik kimpton call suit day stay offer correct fals advertis send kimpton prefer guest websit email ask failur provid suit advertis websit reserv descript furnish hard copi reserv printout websit desk manag dut

In [18]:
'''
#old code only for comparision
    for i in range  (0,len(Reviews1)):
    review=re.sub("[^a-zA-Z]"," ",Reviews1["Cleaned_Reviews"][i])
    
    review=review.split()
    review=[lemmatizer.lemmatize(word) for word in review if not word in STOP_WORDS]
    review=" ".join(review)
    corpus.append(review)'''

'\n#old code only for comparision\n    for i in range  (0,len(Reviews1)):\n    review=re.sub("[^a-zA-Z]"," ",Reviews1["Cleaned_Reviews"][i])\n    \n    review=review.split()\n    review=[lemmatizer.lemmatize(word) for word in review if not word in STOP_WORDS]\n    review=" ".join(review)\n    corpus.append(review)'

In [19]:
len(corpus)

20491

In [20]:
corpus[6]

'cozi stay raini citi husband spent night monaco earli januari busi trip chanc come ride book mont carlo suit prove comfort longish stay locat street build street nois problem view interest room build look dank alley midsect larg offic build suit comfort plenti spread bathroom attract squeaki clean small comparison gener proport sit sleep area lot comfort seat option good light plenti storag cloth luggag staff friendli effici housekeep staff great job pleasant request respond quickli locat good easi walk pike street market seattl art museum notch shop dine option posit experi'

In [21]:
df1["Cleaned_Review_Lemmatized"]=corpus

In [22]:
df1

Unnamed: 0,Review,Rating,Cleaned_Reviews,Cleaned_Review_Lemmatized
0,nice hotel expensive parking got good deal sta...,4,nice hotel expensive parking got good deal sta...,nice expens park got good deal stay anniversar...
1,ok nothing special charge diamond member hilto...,2,ok nothing special charge diamond member hilto...,ok special charg diamond member hilton decid c...
2,nice rooms not 4* experience hotel monaco seat...,3,nice rooms not experience hotel monaco seattle...,nice room experi monaco seattl good n t level ...
3,"unique, great stay, wonderful time hotel monac...",5,unique great stay wonderful time hotel monaco ...,uniqu great stay wonder time monaco locat exce...
4,"great stay great stay, went seahawk game aweso...",5,great stay great stay went seahawk game awesom...,great stay great stay went seahawk game awesom...
...,...,...,...,...
20486,"best kept secret 3rd time staying charm, not 5...",5,best kept secret rd time staying charm not sta...,best kept secret rd time stay charm star n t b...
20487,great location price view hotel great quick pl...,4,great location price view hotel great quick pl...,great locat price view great quick place sight...
20488,"ok just looks nice modern outside, desk staff ...",2,ok just looks nice modern outside desk staff n...,ok look nice modern outsid desk staff n t part...
20489,hotel theft ruined vacation hotel opened sept ...,1,hotel theft ruined vacation hotel opened sept ...,theft ruin vacat open sept guest week happi st...


In [23]:
#Polarity and subjectivity#
import textblob
from textblob import TextBlob

In [24]:
df1["Polarity"]=df1["Cleaned_Review_Lemmatized"].apply(lambda x:TextBlob(x).sentiment.polarity)

In [25]:
df1["Subjectivity"]=df1["Cleaned_Review_Lemmatized"].apply(lambda x:TextBlob(x).sentiment.subjectivity)

In [26]:
df1

Unnamed: 0,Review,Rating,Cleaned_Reviews,Cleaned_Review_Lemmatized,Polarity,Subjectivity
0,nice hotel expensive parking got good deal sta...,4,nice hotel expensive parking got good deal sta...,nice expens park got good deal stay anniversar...,0.353265,0.749286
1,ok nothing special charge diamond member hilto...,2,ok nothing special charge diamond member hilto...,ok special charg diamond member hilton decid c...,0.311808,0.495012
2,nice rooms not 4* experience hotel monaco seat...,3,nice rooms not experience hotel monaco seattle...,nice room experi monaco seattl good n t level ...,0.294513,0.591511
3,"unique, great stay, wonderful time hotel monac...",5,unique great stay wonderful time hotel monaco ...,uniqu great stay wonder time monaco locat exce...,0.455556,0.640278
4,"great stay great stay, went seahawk game aweso...",5,great stay great stay went seahawk game awesom...,great stay great stay went seahawk game awesom...,0.424351,0.537987
...,...,...,...,...,...,...
20486,"best kept secret 3rd time staying charm, not 5...",5,best kept secret rd time staying charm not sta...,best kept secret rd time stay charm star n t b...,0.195152,0.534444
20487,great location price view hotel great quick pl...,4,great location price view hotel great quick pl...,great locat price view great quick place sight...,0.516667,0.625000
20488,"ok just looks nice modern outside, desk staff ...",2,ok just looks nice modern outside desk staff n...,ok look nice modern outsid desk staff n t part...,0.263874,0.499026
20489,hotel theft ruined vacation hotel opened sept ...,1,hotel theft ruined vacation hotel opened sept ...,theft ruin vacat open sept guest week happi st...,0.119859,0.458975


In [27]:
def sentiment(x):
    if x<0:
        return 'negative'
    elif x==0:
        return 'neutral'
    else:
        return 'positive'
    
df1['polarity_score']=df1['Polarity'].\
   map(lambda x: sentiment(x))


In [28]:
pos = [5,4]
neg = [1,2]
neu=[3]

In [29]:
def sentiment(rating):
  if rating in pos:
    return "positive"
  elif rating in neg:
    return "negative"
  elif rating in neu:
      return "neutral"

In [30]:
df1['Sentiment'] = df1['Rating'].apply(sentiment)

In [31]:
df1

Unnamed: 0,Review,Rating,Cleaned_Reviews,Cleaned_Review_Lemmatized,Polarity,Subjectivity,polarity_score,Sentiment
0,nice hotel expensive parking got good deal sta...,4,nice hotel expensive parking got good deal sta...,nice expens park got good deal stay anniversar...,0.353265,0.749286,positive,positive
1,ok nothing special charge diamond member hilto...,2,ok nothing special charge diamond member hilto...,ok special charg diamond member hilton decid c...,0.311808,0.495012,positive,negative
2,nice rooms not 4* experience hotel monaco seat...,3,nice rooms not experience hotel monaco seattle...,nice room experi monaco seattl good n t level ...,0.294513,0.591511,positive,neutral
3,"unique, great stay, wonderful time hotel monac...",5,unique great stay wonderful time hotel monaco ...,uniqu great stay wonder time monaco locat exce...,0.455556,0.640278,positive,positive
4,"great stay great stay, went seahawk game aweso...",5,great stay great stay went seahawk game awesom...,great stay great stay went seahawk game awesom...,0.424351,0.537987,positive,positive
...,...,...,...,...,...,...,...,...
20486,"best kept secret 3rd time staying charm, not 5...",5,best kept secret rd time staying charm not sta...,best kept secret rd time stay charm star n t b...,0.195152,0.534444,positive,positive
20487,great location price view hotel great quick pl...,4,great location price view hotel great quick pl...,great locat price view great quick place sight...,0.516667,0.625000,positive,positive
20488,"ok just looks nice modern outside, desk staff ...",2,ok just looks nice modern outside desk staff n...,ok look nice modern outsid desk staff n t part...,0.263874,0.499026,positive,negative
20489,hotel theft ruined vacation hotel opened sept ...,1,hotel theft ruined vacation hotel opened sept ...,theft ruin vacat open sept guest week happi st...,0.119859,0.458975,positive,negative


### TF-IDF Vectorizer

In [32]:
#Using TF-IDF#
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(norm="l2", ngram_range=(1,3), analyzer='word', max_features =500)
X = tv.fit_transform(df1["Cleaned_Review_Lemmatized"]).toarray()
X_feat=pd.DataFrame(X)

In [33]:
X_feat

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,0.0,0.00000,0.000000,0.0,0.000000,0.00000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000
1,0.0,0.00000,0.000000,0.0,0.000000,0.00000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000
2,0.0,0.00000,0.000000,0.0,0.000000,0.00000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.063024,0.0,0.0,0.000000,0.000000
3,0.0,0.00000,0.000000,0.0,0.000000,0.00000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.170888,0.0,0.121416,0.125823,0.0,0.0,0.000000,0.000000
4,0.0,0.00000,0.000000,0.0,0.097108,0.00000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.094146,0.0,0.066891,0.069319,0.0,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20486,0.0,0.00000,0.000000,0.0,0.000000,0.00000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.136626,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000
20487,0.0,0.00000,0.000000,0.0,0.000000,0.00000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000
20488,0.0,0.00000,0.000000,0.0,0.000000,0.00000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000
20489,0.0,0.06541,0.031648,0.0,0.000000,0.03335,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.025142,0.104218,0.0,0.0,0.000000,0.084116


In [34]:
X_feat.shape

(20491, 500)

## Data for Model Building

In [35]:
final_df=pd.concat([df1["Sentiment"],X_feat],axis=1) #combine sentiment from df and X_feat

In [36]:
final_df

Unnamed: 0,Sentiment,0,1,2,3,4,5,6,7,8,...,490,491,492,493,494,495,496,497,498,499
0,positive,0.0,0.00000,0.000000,0.0,0.000000,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000
1,negative,0.0,0.00000,0.000000,0.0,0.000000,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000
2,neutral,0.0,0.00000,0.000000,0.0,0.000000,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.063024,0.0,0.0,0.000000,0.000000
3,positive,0.0,0.00000,0.000000,0.0,0.000000,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.170888,0.0,0.121416,0.125823,0.0,0.0,0.000000,0.000000
4,positive,0.0,0.00000,0.000000,0.0,0.097108,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.094146,0.0,0.066891,0.069319,0.0,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20486,positive,0.0,0.00000,0.000000,0.0,0.000000,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.136626,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000
20487,positive,0.0,0.00000,0.000000,0.0,0.000000,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000
20488,negative,0.0,0.00000,0.000000,0.0,0.000000,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000
20489,negative,0.0,0.06541,0.031648,0.0,0.000000,0.03335,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.025142,0.104218,0.0,0.0,0.000000,0.084116


# Model Building
##### Type of the problem : Classification 

### 1. Logistic Regression

In [37]:
from sklearn.model_selection import train_test_split
from sklearn.tree import  DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import classification_report
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression

In [38]:
X=final_df.iloc[:,1:501]
Y=final_df.iloc[:,0] 

In [39]:
X.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.063024,0.0,0.0,0.0,0.0


In [40]:
Y.head(3)

0    positive
1    negative
2     neutral
Name: Sentiment, dtype: object

In [41]:
Y.value_counts() # there are 3 unique target classes

positive    15093
negative     3214
neutral      2184
Name: Sentiment, dtype: int64

In [42]:
# Splitting data into training and testing data set
x_train, x_test,y_train,y_test = train_test_split(X,Y, test_size=0.2,random_state=40)

In [43]:
x_train.count()

0      16392
1      16392
2      16392
3      16392
4      16392
       ...  
495    16392
496    16392
497    16392
498    16392
499    16392
Length: 500, dtype: int64

In [44]:
x_test.count()

0      4099
1      4099
2      4099
3      4099
4      4099
       ... 
495    4099
496    4099
497    4099
498    4099
499    4099
Length: 500, dtype: int64

In [45]:
'''#Logistic regression and fit the model
model_1 = LogisticRegression()
model_1.fit(x_train,y_train) '''

'#Logistic regression and fit the model\nmodel_1 = LogisticRegression()\nmodel_1.fit(x_train,y_train) '

In [46]:
'''#Predict for train dataset
pred_train_LR=model_1.predict(x_train)
np.mean(pred_train_LR==y_train)'''

'#Predict for train dataset\npred_train_LR=model_1.predict(x_train)\nnp.mean(pred_train_LR==y_train)'

In [47]:
#pd.Series(pred_train_LR).value_counts()

In [48]:
#pred_test_LR=model_1.predict(x_test)

In [49]:
#np.mean(pred_test_LR==y_test)

In [50]:
# Confusion Matrix for the model accuracy
'''from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_train,pred_train_LR)
print (confusion_matrix)'''

'from sklearn.metrics import confusion_matrix\nconfusion_matrix = confusion_matrix(y_train,pred_train_LR)\nprint (confusion_matrix)'

##### Test results: Logistic Regression
        Model accuracy on train data --> 85.9%
        Model acuuracy on test data --> 84.1%

### 2. Decision Tree Classifier using Entropy Criteria

In [51]:
from sklearn.tree import  DecisionTreeClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [52]:
'''model_2 = DecisionTreeClassifier(criterion = 'entropy',max_depth=3)
model_2.fit(x_train,y_train)'''

"model_2 = DecisionTreeClassifier(criterion = 'entropy',max_depth=3)\nmodel_2.fit(x_train,y_train)"

In [53]:
#Predicting on test data
'''pred_train_DT1 = model_2.predict(x_train) # predicting on test data set 
pd.Series(pred_train_DT1).value_counts() # getting the count of each category'''

'pred_train_DT1 = model_2.predict(x_train) # predicting on test data set \npd.Series(pred_train_DT1).value_counts() # getting the count of each category'

In [54]:
#np.mean(pred_train_DT1==y_train)

In [55]:
#Predict for test dataset
#pred_test_DT1=model_2.predict(x_test)

In [56]:
#np.mean(pred_test_DT1==y_test)

##### Test results: Decision Tree Using Entropy
        Model accuracy on train data --> 75.7%
        Model acuuracy on test data --> 75.4%

### 3. Gaussian Naive Bayes

In [57]:
from sklearn.naive_bayes import GaussianNB
classifier_NB = GaussianNB()
classifier_NB.fit(x_train, y_train)
pred_train_NB=classifier_NB.predict(x_train)

In [58]:
#np.mean(pred_train_NB==y_train)

In [59]:
'''pred_test_NB=classifier_NB.predict(x_test)
np.mean(pred_test_NB==y_test)'''

'pred_test_NB=classifier_NB.predict(x_test)\nnp.mean(pred_test_NB==y_test)'

##### Test results: Gaussian Naive Bayes
        Model accuracy on train data --> 71.6%
        Model acuuracy on test data --> 70.0%

## Model 4 - Bagged Decision Trees 

In [60]:
'''from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

array = final_df.values

X=array[:,1:501]
Y=array[:,0]

seed = 7

kfold = KFold(n_splits=5) # random_state=seed,shuffle=True --- it was there in original file
cart = DecisionTreeClassifier()
num_trees = 20
model = BaggingClassifier(base_estimator=cart, n_estimators=num_trees, random_state=seed)
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())'''

'from sklearn.model_selection import KFold\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.ensemble import BaggingClassifier\nfrom sklearn.tree import DecisionTreeClassifier\n\narray = final_df.values\n\nX=array[:,1:501]\nY=array[:,0]\n\nseed = 7\n\nkfold = KFold(n_splits=5) # random_state=seed,shuffle=True --- it was there in original file\ncart = DecisionTreeClassifier()\nnum_trees = 20\nmodel = BaggingClassifier(base_estimator=cart, n_estimators=num_trees, random_state=seed)\nresults = cross_val_score(model, X, Y, cv=kfold)\nprint(results.mean())'

##### Test results: Bagged Decision Tree
        Model accuracy(cross validation score) --> 79.8%
        K folds=5, Num_trees=20

## Model 5 - Random Forest Classification

In [61]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [62]:
'''array = final_df.values

X=array[:,1:501]
Y=array[:,0]
num_trees = 50
max_features = 250
kfold = KFold(n_splits=5) #random_state=7, shuffle=True) '''

'array = final_df.values\n\nX=array[:,1:501]\nY=array[:,0]\nnum_trees = 50\nmax_features = 250\nkfold = KFold(n_splits=5) #random_state=7, shuffle=True) '

In [63]:
'''model = RandomForestClassifier(n_estimators=num_trees, max_features=max_features,random_state=seed)
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())'''

'model = RandomForestClassifier(n_estimators=num_trees, max_features=max_features,random_state=seed)\nresults = cross_val_score(model, X, Y, cv=kfold)\nprint(results.mean())'

##### Test results: Random forest
        Model accuracy(cross validation score) --> 80.6%
        K folds=5, Num_trees=20

## Model 6 - AdaBoost Classification

In [64]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier

In [65]:
'''X=array[:,1:501]
Y=array[:,0]
num_trees = 50
max_features = 250
kfold = KFold(n_splits=5) #random_state=7, shuffle=True)'''

'X=array[:,1:501]\nY=array[:,0]\nnum_trees = 50\nmax_features = 250\nkfold = KFold(n_splits=5) #random_state=7, shuffle=True)'

In [66]:
'''model = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())'''

'model = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)\nresults = cross_val_score(model, X, Y, cv=kfold)\nprint(results.mean())'

##### Test results: AdaBoost
        Model accuracy(cross validation score) --> 81.1%
        K folds=5, Num_trees=20

## Model 7 - Stacking Ensemble for Classification

In [67]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

In [68]:
'''X=array[:,1:501]
Y=array[:,0]
num_trees = 50
max_features = 250
kfold = KFold(n_splits=5) #random_state=7, shuffle=True)'''

'X=array[:,1:501]\nY=array[:,0]\nnum_trees = 50\nmax_features = 250\nkfold = KFold(n_splits=5) #random_state=7, shuffle=True)'

In [69]:
# create the sub models
'''estimators = []
model1 = LogisticRegression(max_iter=250)
estimators.append(('logistic', model1))
model2 = DecisionTreeClassifier()
estimators.append(('cart', model2))
model3 = SVC()
estimators.append(('svm', model3))'''

"estimators = []\nmodel1 = LogisticRegression(max_iter=250)\nestimators.append(('logistic', model1))\nmodel2 = DecisionTreeClassifier()\nestimators.append(('cart', model2))\nmodel3 = SVC()\nestimators.append(('svm', model3))"

In [70]:
'''# create the ensemble model
ensemble = VotingClassifier(estimators)
results = cross_val_score(ensemble, X, Y, cv=kfold)
print(results.mean())'''

'# create the ensemble model\nensemble = VotingClassifier(estimators)\nresults = cross_val_score(ensemble, X, Y, cv=kfold)\nprint(results.mean())'

##### Test results: Stacking Ensemble
        Model accuracy(cross validation score) --> 84.2%
        K folds=5, Num_trees=20

## Model 8 - SVM

In [71]:
# SVM Classification
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler

from sklearn import svm
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report


from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score

In [72]:
'''array = final_df.values

X=array[:,1:501]
Y=array[:,0] '''

In [73]:
#X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size = 0.3)

In [74]:
#X_train.shape, y_train.shape, X_test.shape, y_test.shape

((14343, 500), (14343,), (6148, 500), (6148,))

### Grid Search CV

In [None]:
'''clf = SVC()
param_grid = [{'gamma':[50,10,0.5],'C':[15,10,0.1] }]
gsv = GridSearchCV(clf,param_grid,cv=10)
gsv.fit(X_train,y_train)'''

In [None]:
#gsv.best_params_ , gsv.best_score_ 

In [None]:
'''clf = SVC(C= 15, gamma = 50)
clf.fit(X_train , y_train)
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred) * 100
print("Accuracy =", acc)
confusion_matrix(y_test, y_pred)'''

## Model 8 - Neural Network

In [None]:
from keras.models import Sequential
from keras.layers import Dense
import numpy

In [None]:
# fix random seed for reproducibility
#seed = 7
#numpy.random.seed(seed)

In [None]:
# split into input (X) and output (Y) variables
'''array = final_df.values

X=array[:,1:501]
Y=array[:,0]'''

In [None]:
# create model
'''model = Sequential()
model.add(Dense(500, input_dim=500,activation='relu'))
model.add(Dense(500, activation='relu'))
model.add(Dense(1, activation='sigmoid'))'''

In [None]:
'''# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])'''

In [None]:
'''# Fit the model
history_1=model.fit(X, Y, validation_split=0.33, epochs=500, batch_size=5)'''