# PREPROCESSING

## GET DATA

In [96]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.stem import LancasterStemmer
import seaborn as sns
import nltk
import string
import re
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('omw-1.4')
%matplotlib inline
sns.set_style("whitegrid")

import warnings
# Silence every warning for the rest of the session so cell output stays short.
# Note that this also hides deprecation notices from pandas / nltk.
warnings.filterwarnings('ignore')

# Show all columns when a DataFrame is displayed (None removes the column cap).
# Uses the public `pd.set_option` entry point rather than going through the
# incidental `pd.pandas` attribute.
pd.set_option('display.max_columns', None)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [97]:
# Load the raw TripAdvisor hotel reviews; the file is decoded as latin-1.
raw_reviews_path = 'Data/tripadvisor_hotel_reviews.csv'
df = pd.read_csv(raw_reviews_path, encoding='latin-1')
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [98]:
# Pin the column names used by the rest of the notebook.
# This assignment raises if the frame does not have exactly two columns.
expected_columns = ['Review', 'Rating']
df.columns = expected_columns
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


## CLEANING

In [99]:
def create_label(rating):
    """Map a numeric rating to a sentiment label.

    Parameters
    ----------
    rating : number
        Rating value compared against the midpoint 3.

    Returns
    -------
    str
        'positive' when rating > 3, 'negative' when rating < 3, and
        'neutral' for anything else (a rating of exactly 3, and also values
        such as NaN for which both comparisons are False).
    """
    if rating > 3:
        return 'positive'
    if rating < 3:
        return 'negative'
    # Neither comparison held: treat as the neutral middle.
    return 'neutral'

In [100]:
# Translation table that deletes every character in string.punctuation.
# Built once so each call is a single pass over the text.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Strip ASCII punctuation from ``text`` and lowercase the result.

    Punctuation characters are deleted, not replaced by a space, so
    "non-existent" becomes "nonexistent" and tokens separated only by
    punctuation are merged.

    Parameters
    ----------
    text : str
        The raw review text.

    Returns
    -------
    str
        ``text`` without any character from ``string.punctuation``,
        converted to lower case.
    """
    return text.translate(_PUNCTUATION_TABLE).lower()

In [101]:
# Derive the sentiment label from the rating, and a punctuation-free,
# lower-cased copy of each review. The helpers are passed directly to
# `apply`; wrapping them in a lambda adds nothing.
df['label'] = df['Rating'].apply(create_label)
df['clean_punc'] = df['Review'].apply(remove_punctuation)

In [102]:
# Preview the first rows to check the new `label` and `clean_punc` columns.
df.head()

Unnamed: 0,Review,Rating,label,clean_punc
0,nice hotel expensive parking got good deal sta...,4,positive,nice hotel expensive parking got good deal sta...
1,ok nothing special charge diamond member hilto...,2,negative,ok nothing special charge diamond member hilto...
2,nice rooms not 4* experience hotel monaco seat...,3,neutral,nice rooms not 4 experience hotel monaco seatt...
3,"unique, great stay, wonderful time hotel monac...",5,positive,unique great stay wonderful time hotel monaco ...
4,"great stay great stay, went seahawk game aweso...",5,positive,great stay great stay went seahawk game awesom...


## STOPWORD & LEMMATIZATION

In [103]:
stopwords_list = stopwords.words('english')
def remove_stopwords(text):
    """Drop every whitespace-separated token of ``text`` found in ``stopwords_list``.

    Parameters
    ----------
    text : str
        Text to filter; it is split on whitespace.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.

    Notes
    -----
    Matching is exact and case-sensitive.
    NOTE(review): the NLTK English list contains entries with apostrophes
    (e.g. "don't"); text already passed through ``remove_punctuation`` will
    contain "dont" instead and will not match. Confirm the intended order.
    """
    # Build the lookup set on each call so later edits to `stopwords_list`
    # are still honoured. One set build per text replaces a linear scan of
    # the list for every token.
    stop_set = set(stopwords_list)
    return ' '.join(word for word in text.split() if word not in stop_set)

In [104]:
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(sentence):
    """Lemmatize every whitespace-separated token of ``sentence`` as a verb.

    Each token is passed to the WordNet lemmatizer with part-of-speech 'v',
    whatever its real role in the sentence. Words that merely look like verb
    forms are reduced too; in this notebook's sample output "evening"
    becomes "even" and "parking" becomes "park".

    Parameters
    ----------
    sentence : str
        Text to lemmatize; it is split on whitespace.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces.
    """
    lemmas = [lemmatizer.lemmatize(token, 'v') for token in sentence.split()]
    return " ".join(lemmas)

In [105]:
# Lemmatize the punctuation-free text.
# NOTE(review): `remove_stopwords` is defined earlier in the notebook but is
# not applied here, so stopwords are kept in `lemmatized_review`. Confirm
# that this is intended.
df['lemmatized_review'] = df['clean_punc'].apply(lemmatize_text)

# Show the first review before and after processing as a sanity check.
for caption, column in (
    ('Original Review:\t', 'Review'),
    ('Lemmatized Review:\t', 'lemmatized_review'),
):
    print(caption, df[column][0])

Original Review:	 nice hotel expensive parking got good deal stay hotel anniversary, arrived late evening took advice previous reviews did valet parking, check quick easy, little disappointed non-existent view room room clean nice size, bed comfortable woke stiff neck high pillows, not soundproof like heard music room night morning loud bangs doors opening closing hear people talking hallway, maybe just noisy neighbors, aveda bath products nice, did not goldfish stay nice touch taken advantage staying longer, location great walking distance shopping, overall nice experience having pay 40 parking night,  
Lemmatized Review:	 nice hotel expensive park get good deal stay hotel anniversary arrive late even take advice previous review do valet park check quick easy little disappoint nonexistent view room room clean nice size bed comfortable wake stiff neck high pillow not soundproof like hear music room night morning loud bang doors open close hear people talk hallway maybe just noisy neigh

In [107]:
# Preview the first rows to check the new `lemmatized_review` column.
df.head()

Unnamed: 0,Review,Rating,label,clean_punc,lemmatized_review
0,nice hotel expensive parking got good deal sta...,4,positive,nice hotel expensive parking got good deal sta...,nice hotel expensive park get good deal stay h...
1,ok nothing special charge diamond member hilto...,2,negative,ok nothing special charge diamond member hilto...,ok nothing special charge diamond member hilto...
2,nice rooms not 4* experience hotel monaco seat...,3,neutral,nice rooms not 4 experience hotel monaco seatt...,nice room not 4 experience hotel monaco seattl...
3,"unique, great stay, wonderful time hotel monac...",5,positive,unique great stay wonderful time hotel monaco ...,unique great stay wonderful time hotel monaco ...
4,"great stay great stay, went seahawk game aweso...",5,positive,great stay great stay went seahawk game awesom...,great stay great stay go seahawk game awesome ...


In [106]:
# Save the processed frame (all derived columns) without the row index.
cleaned_reviews_path = 'Data/cleaned_hotel_reviews.csv'
df.to_csv(cleaned_reviews_path, index=False)