In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [2]:
# Load the rating/review CSV; the path is relative to the notebook's working directory.
data = pd.read_csv('./Dataset/rating_reviews.csv')

In [3]:
# Preview the first rows to confirm the file parsed into the expected columns.
data.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [4]:
# (number of rows, number of columns)
data.shape

(20491, 2)

In [5]:
# Total number of cells in the frame (rows x columns).
data.size

40982

In [6]:
# Column labels available for the analysis.
data.columns

Index(['Review', 'Rating'], dtype='object')

In [7]:
# Count missing values per column before any text processing.
data.isna().sum()

Review    0
Rating    0
dtype: int64

In [8]:
# Distribution of the target labels: how many reviews carry each rating value.
data['Rating'].value_counts()

5    9054
4    6039
3    2184
2    1793
1    1421
Name: Rating, dtype: int64

In [9]:
# Replace every character that is not an ASCII letter or '#' with a space.
# `regex=True` is required: from pandas 2.0 the default is regex=False, in which
# case the pattern would be matched as a literal string and nothing would be cleaned.
data['Review'] = data['Review'].str.replace("[^a-zA-Z#]", " ", regex=True)

In [10]:
# Split each cleaned review on whitespace into a list of word tokens.
tokenized_reviews = data['Review'].map(str.split)
tokenized_reviews.head()

0    [nice, hotel, expensive, parking, got, good, d...
1    [ok, nothing, special, charge, diamond, member...
2    [nice, rooms, not, experience, hotel, monaco, ...
3    [unique, great, stay, wonderful, time, hotel, ...
4    [great, stay, great, stay, went, seahawk, game...
Name: Review, dtype: object

In [11]:
# Inspect the last rows as well, to check that tokenisation ran over the whole series.
tokenized_reviews.tail()

20486    [best, kept, secret, rd, time, staying, charm,...
20487    [great, location, price, view, hotel, great, q...
20488    [ok, just, looks, nice, modern, outside, desk,...
20489    [hotel, theft, ruined, vacation, hotel, opened...
20490    [people, talking, ca, n, t, believe, excellent...
Name: Review, dtype: object

In [12]:
import nltk
from nltk import PorterStemmer

# Reduce every token to its Porter stem so inflected forms of a word share one feature.
# NOTE: this rebinds `tokenized_reviews`, so re-running the cell stems the stems again.
ps = PorterStemmer()
tokenized_reviews = tokenized_reviews.apply(
    lambda tokens: list(map(ps.stem, tokens))
)
tokenized_reviews.head()

0    [nice, hotel, expens, park, got, good, deal, s...
1    [ok, noth, special, charg, diamond, member, hi...
2    [nice, room, not, experi, hotel, monaco, seatt...
3    [uniqu, great, stay, wonder, time, hotel, mona...
4    [great, stay, great, stay, went, seahawk, game...
Name: Review, dtype: object

In [13]:
# Re-join each stemmed token list into one space-separated string, which is the
# input format the vectoriser expects.
#
# Done as a single vectorised pass rather than an index loop:
#   * the column is assigned once, not once per row;
#   * no label-based `series[i]` access, so it does not depend on a 0..n-1 index;
#   * entries that are already strings are left untouched, so re-running this
#     cell does not join the individual characters of an already-joined review.
tokenized_reviews = tokenized_reviews.apply(
    lambda tokens: tokens if isinstance(tokens, str) else ' '.join(tokens)
)
data['Tokenized Review'] = tokenized_reviews
data.head()

Unnamed: 0,Review,Rating,Tokenized Review
0,nice hotel expensive parking got good deal sta...,4,nice hotel expens park got good deal stay hote...
1,ok nothing special charge diamond member hilto...,2,ok noth special charg diamond member hilton de...
2,nice rooms not experience hotel monaco seat...,3,nice room not experi hotel monaco seattl good ...
3,unique great stay wonderful time hotel monac...,5,uniqu great stay wonder time hotel monaco loca...
4,great stay great stay went seahawk game aweso...,5,great stay great stay went seahawk game awesom...


# FEATURE EXTRACTION

# Using Count Vectorizer

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
# Bag-of-words features: the 6000 most frequent 1- to 4-grams, with English
# stop words removed.
count_vector_bow = CountVectorizer(
    max_features=6000,
    stop_words='english',
    ngram_range=(1, 4),
)
# `.toarray()` densifies the sparse result; at 20491 x 6000 int64 values this is
# close to 1 GB of memory.
x_bow = (
    count_vector_bow
    .fit_transform(data['Tokenized Review'])
    .toarray()
)

In [16]:
# (number of reviews, number of n-gram features)
x_bow.shape

(20491, 6000)

In [17]:
# Target vector: the rating attached to each review.
y_bow=data['Rating']

In [18]:
# Must have one label per row of x_bow.
y_bow.shape

(20491,)

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
x_train_bow, x_test_bow, y_train_bow, y_test_bow = train_test_split(
    x_bow,
    y_bow,
    test_size=0.3,
    random_state=7,
)

In [21]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use `get_feature_names_out()` when the installed version has it, and fall back
# to the old method otherwise. `list(...)` keeps the displayed value a plain list.
if hasattr(count_vector_bow, 'get_feature_names_out'):
    feature_names_bow = list(count_vector_bow.get_feature_names_out())
else:
    feature_names_bow = count_vector_bow.get_feature_names()
feature_names_bow

['aaa',
 'abl',
 'abl check',
 'abl use',
 'abl walk',
 'absolut',
 'absolut beauti',
 'absolut gorgeou',
 'absolut love',
 'absolut wonder',
 'abund',
 'ac',
 'accademia',
 'accept',
 'access',
 'access internet',
 'access room',
 'accid',
 'accommod',
 'accomod',
 'accompani',
 'accord',
 'account',
 'accur',
 'acknowledg',
 'act',
 'action',
 'activ',
 'actual',
 'ad',
 'ad bonu',
 'adagio',
 'add',
 'addit',
 'address',
 'adequ',
 'adjac',
 'adjoin',
 'adjoin room',
 'adjust',
 'admit',
 'adult',
 'adult children',
 'advanc',
 'advantag',
 'adventur',
 'advertis',
 'advic',
 'advis',
 'advisor',
 'advisor review',
 'aerob',
 'affect',
 'affinia',
 'affinia dumont',
 'afford',
 'afraid',
 'afternoon',
 'afternoon tea',
 'age',
 'agenc',
 'agent',
 'aggress',
 'ago',
 'agre',
 'agre review',
 'ahead',
 'ahead time',
 'air',
 'air condit',
 'air condit room',
 'air condit work',
 'air condition',
 'aircon',
 'aircondit',
 'airi',
 'airlin',
 'airport',
 'airport bu',
 'airport hotel',

In [22]:
# Show the full vectoriser configuration, including defaults that were not set explicitly.
count_vector_bow.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': 6000,
 'min_df': 1,
 'ngram_range': (1, 4),
 'preprocessor': None,
 'stop_words': 'english',
 'strip_accents': None,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'vocabulary': None}

In [23]:
# Wrap the count matrix in a DataFrame with one named column per n-gram.
# `get_feature_names()` was removed in scikit-learn 1.2, so use
# `get_feature_names_out()` when it exists and fall back on older installs.
if hasattr(count_vector_bow, 'get_feature_names_out'):
    bow_columns = count_vector_bow.get_feature_names_out()
else:
    bow_columns = count_vector_bow.get_feature_names()
dataframe_count = pd.DataFrame(x_bow, columns=bow_columns)

In [24]:
# Preview the first rows of the count matrix with its n-gram column labels.
dataframe_count.head()

Unnamed: 0,aaa,abl,abl check,abl use,abl walk,absolut,absolut beauti,absolut gorgeou,absolut love,absolut wonder,...,young,young children,younger,yr,yr old,yummi,yunqu,zero,zone,zoo
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
