# PREPROCESSING

## GET DATA

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
%matplotlib inline
sns.set_style("whitegrid")

import warnings
warnings.filterwarnings('ignore')

pd.pandas.set_option('display.max_columns', None)

In [20]:
# Read the raw review CSV from the project's Data/ folder and preview it.
# NOTE(review): the file is decoded as latin-1 — confirm this matches the
# encoding the CSV was actually saved with.
reviews_csv = 'Data/tripadvisor_hotel_reviews.csv'
df = pd.read_csv(reviews_csv, encoding='latin-1')
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [21]:
# Pin the column labels explicitly so later cells can rely on them.
expected_columns = ['Review', 'Rating']
df.columns = expected_columns
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


## CLEANING

In [22]:
import string
import re

In [23]:
def clean_text(text):
    """Replace every character that is not an ASCII letter with a space, then lower-case.

    Each replaced character becomes exactly one space, so the result has the
    same length as ``text`` and runs of digits/punctuation turn into runs of
    spaces.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    return letters_only.lower()

In [None]:
# Binary target: ratings 1-3 map to 0, ratings 4-5 map to 1.
# Any rating missing from this mapping becomes NaN in `label`.
rating_to_label = {1.0: 0, 2.0: 0, 3.0: 0, 4.0: 1, 5.0: 1}

df['cleaned_text'] = df['Review'].apply(clean_text)
df['label'] = df['Rating'].map(rating_to_label)

In [24]:
def count_punct(text):
    """Return the percentage of non-space characters in ``text`` that are punctuation.

    Punctuation is any character in ``string.punctuation``. Only the literal
    space character is excluded from the denominator; tabs and newlines still
    count as characters.

    Parameters
    ----------
    text : str
        The raw text to inspect.

    Returns
    -------
    float
        Percentage in the range 0-100, rounded to one decimal place. Returns
        0.0 when ``text`` is empty or consists only of spaces (previously this
        raised ``ZeroDivisionError``).
    """
    non_space_chars = len(text) - text.count(" ")
    if non_space_chars == 0:
        # Nothing to measure; avoid dividing by zero.
        return 0.0
    punct_chars = sum(1 for char in text if char in string.punctuation)
    # Scale to a percentage *before* rounding so the result does not pick up
    # floating-point noise such as 33.300000000000004.
    return round(100 * punct_chars / non_space_chars, 1)

Unnamed: 0,Review,Rating,cleaned_text,label,Review_len,punct
0,nice hotel expensive parking got good deal sta...,4,nice hotel expensive parking got good deal sta...,1,505,2.4
1,ok nothing special charge diamond member hilto...,2,ok nothing special charge diamond member hilto...,0,1438,1.8
2,nice rooms not 4* experience hotel monaco seat...,3,nice rooms not experience hotel monaco seat...,0,1209,2.6
3,"unique, great stay, wonderful time hotel monac...",5,unique great stay wonderful time hotel monac...,1,510,3.1
4,"great stay great stay, went seahawk game aweso...",5,great stay great stay went seahawk game aweso...,1,1089,3.7


In [None]:
# Review_len: number of characters in the raw review, not counting spaces.
df['Review_len'] = df['Review'].apply(
    lambda review: len(review) - review.count(" ")
)
# punct: punctuation share of the raw review, as computed by count_punct.
df['punct'] = df['Review'].apply(count_punct)
df.head()

In [25]:
def tokenize_text(text):
    """Split ``text`` on runs of whitespace and return the pieces as a list.

    Leading/trailing whitespace produces no empty tokens; an empty or
    all-whitespace string yields an empty list.
    """
    return text.split()

Unnamed: 0,Review,Rating,cleaned_text,label,Review_len,punct,tokens
0,nice hotel expensive parking got good deal sta...,4,nice hotel expensive parking got good deal sta...,1,505,2.4,"[nice, hotel, expensive, parking, got, good, d..."
1,ok nothing special charge diamond member hilto...,2,ok nothing special charge diamond member hilto...,0,1438,1.8,"[ok, nothing, special, charge, diamond, member..."
2,nice rooms not 4* experience hotel monaco seat...,3,nice rooms not experience hotel monaco seat...,0,1209,2.6,"[nice, rooms, not, experience, hotel, monaco, ..."
3,"unique, great stay, wonderful time hotel monac...",5,unique great stay wonderful time hotel monac...,1,510,3.1,"[unique, great, stay, wonderful, time, hotel, ..."
4,"great stay great stay, went seahawk game aweso...",5,great stay great stay went seahawk game aweso...,1,1089,3.7,"[great, stay, great, stay, went, seahawk, game..."


In [None]:
# Tokenize the cleaned (letters-only, lower-cased) text into word lists.
df['tokens'] = df['cleaned_text'].apply(tokenize_text)
df.head()

## STOPWORD REMOVAL AND LEMMATIZATION

In [26]:
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK resources used in this section: the stop-word list plus the
# WordNet data (and its omw-1.4 companion package).
for resource_name in ('stopwords', 'wordnet'):
    nltk.download(resource_name)
# Kept as the cell's final expression so the download status is still echoed.
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to C:\Users\Gede
[nltk_data]     Darmawan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Gede
[nltk_data]     Darmawan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Gede
[nltk_data]     Darmawan\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [27]:
# Start from NLTK's English stop-word list, but keep "not": dropping it would
# erase negation ("not good" -> "good").
all_stopwords = list(stopwords.words('english'))
all_stopwords.remove('not')

In [28]:
def lemmatize_text(token_list, stop_words=None, wn_lemmatizer=None):
    """Drop stop words from ``token_list``, lemmatize the rest, and join with spaces.

    Parameters
    ----------
    token_list : iterable of str
        Tokens to process, in order.
    stop_words : iterable of str, optional
        Tokens to discard. Defaults to the notebook-level ``all_stopwords``.
    wn_lemmatizer : object, optional
        Any object exposing ``lemmatize(token) -> str``. Defaults to the
        notebook-level ``lemmatizer``.

    Returns
    -------
    str
        The surviving, lemmatized tokens joined by single spaces (an empty
        string when nothing survives).
    """
    # Fall back to the notebook globals only when no override is supplied, so
    # existing single-argument calls behave exactly as before.
    if stop_words is None:
        stop_words = all_stopwords
    if wn_lemmatizer is None:
        wn_lemmatizer = lemmatizer

    # Build the lookup set once per call; the previous version rebuilt
    # set(all_stopwords) for every single token.
    excluded = set(stop_words)
    return " ".join(
        wn_lemmatizer.lemmatize(token)
        for token in token_list
        if token not in excluded
    )

In [29]:
# Shared lemmatizer instance; lemmatize_text looks this name up at call time.
lemmatizer = nltk.stem.WordNetLemmatizer()

# Stop-word-filtered, lemmatized review text, re-joined into a single string.
df['lemmatized_review'] = df['tokens'].apply(lemmatize_text)
df.head()

Unnamed: 0,Review,Rating,cleaned_text,label,Review_len,punct,tokens,lemmatized_review
0,nice hotel expensive parking got good deal sta...,4,nice hotel expensive parking got good deal sta...,1,505,2.4,"[nice, hotel, expensive, parking, got, good, d...",nice hotel expensive parking got good deal sta...
1,ok nothing special charge diamond member hilto...,2,ok nothing special charge diamond member hilto...,0,1438,1.8,"[ok, nothing, special, charge, diamond, member...",ok nothing special charge diamond member hilto...
2,nice rooms not 4* experience hotel monaco seat...,3,nice rooms not experience hotel monaco seat...,0,1209,2.6,"[nice, rooms, not, experience, hotel, monaco, ...",nice room not experience hotel monaco seattle ...
3,"unique, great stay, wonderful time hotel monac...",5,unique great stay wonderful time hotel monac...,1,510,3.1,"[unique, great, stay, wonderful, time, hotel, ...",unique great stay wonderful time hotel monaco ...
4,"great stay great stay, went seahawk game aweso...",5,great stay great stay went seahawk game aweso...,1,1089,3.7,"[great, stay, great, stay, went, seahawk, game...",great stay great stay went seahawk game awesom...
