In [106]:
import numpy as np
import pandas as pd
import plotly.express as px
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
import re

In [107]:
df=pd.read_csv('tripadvisor_hotel_reviews.csv')

df.shape

(20491, 2)

# NLP (bag of words pipeline)
  

- get the corpus
- tokenise 
- Stopwords Removal
- Stemming
- Build a vocab
- Vectorization
- Classify 

In [108]:
en_stopwords=set(stopwords.words('english'))
ps=PorterStemmer()
tokeniser=RegexpTokenizer(r'\w+')

In [109]:
def getStemmmedReview(review):
    review = str(review).lower()
    review=re.sub(r'(?i)\b(?:\d+[a-z]|[a-z]+\d)\w*\b'," ",review)
    review=re.sub('\d+',' ',review)
    review=review.replace("_"," ")
    tokens=tokeniser.tokenize(review)
    review=re.sub(r'(?i)\b(?:\d+[a-z]|[a-z]+\d)\w*\b'," ",review)
    new_tokens=[token for token in tokens if token not in en_stopwords]
    stem_tokens=[ps.stem(token) for token in new_tokens]
    
    
    clean_review='  '.join(stem_tokens)
    
    return clean_review
    
    

In [110]:
df=df.values

In [111]:
X=df[:,0]
y=df[:,1]
y=y.astype('int')

print(X.shape,y.shape)

(20491,) (20491,)


In [112]:
split=int(0.8*X.shape[0])
x_train=X[:split]
y_train=y[:split]

x_test=X[split:]
y_test=y[split:]

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(16392,)
(16392,)
(4099,)
(4099,)


In [113]:
new_cleaned_review=[getStemmmedReview(i) for i in x_train]

In [114]:
new_cleaned_review

['nice  hotel  expens  park  got  good  deal  stay  hotel  anniversari  arriv  late  even  took  advic  previou  review  valet  park  check  quick  easi  littl  disappoint  non  exist  view  room  room  clean  nice  size  bed  comfort  woke  stiff  neck  high  pillow  soundproof  like  heard  music  room  night  morn  loud  bang  door  open  close  hear  peopl  talk  hallway  mayb  noisi  neighbor  aveda  bath  product  nice  goldfish  stay  nice  touch  taken  advantag  stay  longer  locat  great  walk  distanc  shop  overal  nice  experi  pay  park  night',
 'ok  noth  special  charg  diamond  member  hilton  decid  chain  shot  anniversari  seattl  start  book  suit  paid  extra  websit  descript  suit  bedroom  bathroom  standard  hotel  room  took  print  reserv  desk  show  said  thing  like  tv  couch  ect  desk  clerk  told  oh  mix  suit  descript  kimpton  websit  sorri  free  breakfast  got  kid  embassi  suit  sit  room  bathroom  bedroom  unlik  kimpton  call  suit  day  s

In [115]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [116]:
tf=TfidfVectorizer(ngram_range=(1, 3), max_features=10000)

In [117]:
x_vector=tf.fit_transform(new_cleaned_review).toarray()

In [118]:
x_vector.shape

(16392, 10000)

In [119]:
print(tf.get_feature_names())

['aaa', 'abc', 'abil', 'abl', 'abl check', 'abl enjoy', 'abl make', 'abl room', 'abl sleep', 'abl stay', 'abl use', 'abl walk', 'abroad', 'absolut', 'absolut amaz', 'absolut beauti', 'absolut best', 'absolut fantast', 'absolut gorgeou', 'absolut love', 'absolut noth', 'absolut perfect', 'absolut stay', 'absolut wonder', 'absolutli', 'abund', 'ac', 'ac work', 'accademia', 'accept', 'access', 'access avail', 'access free', 'access hotel', 'access internet', 'access lobbi', 'access room', 'accid', 'accomad', 'accomid', 'accommod', 'accomod', 'accompani', 'accord', 'accordingli', 'account', 'accross', 'accur', 'accustom', 'acknowledg', 'act', 'act like', 'action', 'activ', 'activ day', 'activ go', 'actual', 'actual hotel', 'actual quit', 'actual room', 'actual stay', 'ad', 'ad bonu', 'adagio', 'adam', 'adapt', 'add', 'addit', 'addit charg', 'address', 'adequ', 'adequ size', 'adjac', 'adjac hotel', 'adjoin', 'adjoin room', 'adjust', 'admir', 'admit', 'ador', 'adult', 'adult child', 'adult c

In [120]:
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression()

In [121]:
lr.fit(x_vector,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [122]:
lr.score(x_vector,y_train)

0.7985602733040508

In [123]:
new_cleaned_test_review=[getStemmmedReview(i) for i in x_test]

In [124]:
new_cleaned_test_review

['typhoon  earthquak  welcom  respit  grand  hyatt  return  night  grand  hyatt  n  happier  hotel  choic  request  earli  check  room  upper  floor  hotel  deliv  room  standard  room  floor  spaciou  modern  view  citi  fantast  bathroom  eat  hotel  great  restaur  nearbi  particularli  away  main  tourist  strip  tokyo  ate  iron  chef  restaur  experi  fresh  move  sea  urchin  turtl  dumpl  meal  forget  hurri  actual  room  hotel  earthquak  hit  structur  sway  minut  hotel  staff  ensur  kept  inform  modern  build  tokyo  built  withstand  strong  earthquak  felt  comfort  time  experi  definit  stay  grand  hyatt  time  visit  tokyo  locat  hard  beat  staff  room  great',
 'love  grand  hyatt  husband  went  tokyo  half  honeymoon  initi  book  stay  grand  hyatt  talk  expat  friend  live  chang  reserv  happi  echo  previou  poster  comment  say  hotel  true  luxuri  staff  amaz  pretti  pool  workout  room  die  spent  time  day  love  moment  workout  room  high  end  t

In [125]:
x_test_vector=tf.transform(new_cleaned_test_review).toarray()

In [126]:
print(x_test_vector.shape)

(4099, 10000)


In [127]:
lr.score(x_test_vector,y_test)

0.6360087826299098

### Random Forest

In [130]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
rf=RandomForestClassifier()

In [131]:
params={
    'max_features': [500, "sqrt", "log2", "auto"],
    'max_depth': [10, 15, 25]
}

In [133]:
gridsearch=GridSearchCV(estimator = rf, param_grid = params, 
                          cv = 5, n_jobs = -1, verbose = 1, scoring = "neg_mean_squared_error" )

In [None]:
gridsearch.fit(x_vector,y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
