# Solution to the Hotel Reviews exercise

All Rights Reserved © <a href="http://www.louisdorard.com" style="color: #6D00FF;">Louis Dorard</a>

<img src="http://s3.louisdorard.com.s3.amazonaws.com/ML_icon.png">


## Prepare data

Load from CSV

In [None]:
import pandas as pd
# Read the reviews from CSV, using the file's first column as the row index.
data = pd.read_csv('/data/hotel-reviews.csv', index_col=0)
# Preview the first rows to check that the load worked.
data.head()

Inspect

In [None]:
from pandas_profiling import ProfileReport
# Build a profiling report of the DataFrame; leaving `pfr` as the cell's last
# expression lets the notebook render it.
# NOTE(review): the pandas_profiling package has since been renamed
# (ydata-profiling) — confirm which one the environment provides.
pfr = ProfileReport(data)
pfr

Pre-process

In [None]:
from nltk.tokenize.simple import SpaceTokenizer
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer
from nltk import download

# Shared tokenizer and stemmer instances used by the preprocessing functions.
tknzr = SpaceTokenizer()
stemmer = LancasterStemmer()
# Fetch the NLTK stopwords corpus that stopwords.words() reads from.
download('stopwords')

def filter_stopwords(text, stopword_list):
    '''Lowercase the tokens and drop stopwords and non-word tokens.

    Parameters:
        text: iterable of string tokens (not a raw string, which would be
            iterated character by character).
        stopword_list: iterable of stopwords to remove. Tokens are compared
            after lowercasing, so the stopwords should be lowercase.

    Returns:
        A list, in the original order, of the lowercased tokens that are not
        stopwords, are purely alphabetic, and are longer than one character.
    '''
    # Build the lookup set once: each membership test is then constant-time
    # instead of a scan of the whole stopword list for every token, and a
    # one-shot iterable of stopwords is not exhausted by the first lookup.
    stopword_set = frozenset(stopword_list)
    lowered = (w.lower() for w in text)
    return [
        word for word in lowered
        if word not in stopword_set and word.isalpha() and len(word) > 1
    ]

def stem_words(words):
    '''Return a list holding the stem of every word, in input order.

    Stemming is delegated to the module-level ``stemmer`` object.
    '''
    return [stemmer.stem(token) for token in words]

def concatenate(words):
    '''Join words into one string, each word followed by a single space.

    The result keeps a trailing space after the last word and is the empty
    string when `words` is empty.
    '''
    # str.join builds the result in one pass; the previous `s = s + ...` loop
    # could copy the growing string on every iteration.
    return "".join(word + " " for word in words)

_STOPWORD_CACHE = {}

def _english_stopwords():
    '''Return the English stopwords as a frozenset, loading them only once.'''
    # stopwords.words() goes back to the corpus file on each call, so calling
    # it for every row is wasteful; cache the result on first use.
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']

def process_strings(row):
    '''Build the preprocessed text for one row of the reviews data.

    Parameters:
        row: mapping-like row (e.g. a pandas Series) with a 'text' entry.

    Returns:
        The tokens of row['text'] after stopword filtering and stemming,
        joined into one string by concatenate().
    '''
    tokens = tknzr.tokenize(row['text'])
    filtered_words = filter_stopwords(tokens, _english_stopwords())
    return concatenate(stem_words(filtered_words))

# Preprocess every row's text (axis = 1 hands each row to process_strings) and
# store the result in a new column; %time reports how long the statement takes.
%time data['text_preprocessed'] = data.apply(process_strings, axis = 1)

Define X (inputs) and y (outputs)

In [None]:
# Inputs are the preprocessed texts and outputs are the labels, both as plain lists.
X = data['text_preprocessed'].tolist()
y = data['label'].tolist()

Split into train and test

In [None]:
test_size = 0.3 # ratio of data to have in test
seed = 8 # to be used to initialize random number generator, for reproducibility
from sklearn.model_selection import train_test_split
# Hold out `test_size` of the examples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)

## Create model

### Featurize training inputs

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Token-count featurizer; min_df=1 keeps every term that occurs in at least one document.
vectorizer = CountVectorizer(min_df=1)
# Learn the vocabulary from the training texts only.
vectorizer.fit(X_train)

See how many features we have:

In [None]:
# Count the learned vocabulary entries (one per feature). This avoids
# get_feature_names(), which newer scikit-learn releases no longer provide.
len(vectorizer.vocabulary_)

Apply the transformation:

In [None]:
# Turn each training text into a row of token counts using the fitted vocabulary.
X_train_vect = vectorizer.transform(X_train)

### Select best features

We could make our models lighter and faster to train by selecting the `k` most "promising" features. One method for this is the $\chi^2$ (chi-squared) test, which scores each feature by how strongly it depends on the label.

In [None]:
from sklearn.feature_selection import SelectKBest, chi2
# Keep the 50 features with the highest chi-squared score against the labels.
selector = SelectKBest(chi2, k=50)
# Scores are computed from the training data only.
selector.fit(X_train_vect, y_train)

Apply feature selection:

In [None]:
# Reduce the training matrix to the columns chosen by the fitted selector.
X_train_vect_select = selector.transform(X_train_vect)

See what we end up with:

In [None]:
# Convert to a dense array so the notebook displays the actual counts.
X_train_vect_select.toarray()

### Fit model

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Seed the forest with the same `seed` used for the train/test split, so the
# fitted model and the accuracy reported for it are reproducible between runs.
model = RandomForestClassifier(n_estimators=100, random_state=seed)

In [None]:
# Train the forest on the selected training features; %time reports the duration.
%time model.fit(X_train_vect_select, y_train)

## Compute performance metric

In [None]:
# Push the test texts through the already-fitted vectorizer and selector
# (transform only, no re-fitting), then predict their labels.
y_pred = model.predict(selector.transform(vectorizer.transform(X_test)))

In [None]:
from sklearn import metrics
# Fraction of test examples whose predicted label matches the true label.
metrics.accuracy_score(y_test, y_pred)

## Make single prediction

In [None]:
x = "The Chicago Hotel was the best"
import textutils
# NOTE(review): textutils is not shown here — presumably preprocess() applies
# the same steps as process_strings; confirm, since the model was trained on
# text preprocessed that way.
x_preprocessed = textutils.preprocess(x)
print(x_preprocessed)

In [None]:
# Wrap the single text in a list: the vectorizer expects a collection of documents.
model.predict(selector.transform(vectorizer.transform([x_preprocessed])))

In [None]:
# Class probabilities for the single text: one row, one column per class.
probas = model.predict_proba(selector.transform(vectorizer.transform([x_preprocessed])))
print(probas)

Probability that it's fake:

In [None]:
# First row, first column: the probability of the class listed first in model.classes_.
# NOTE(review): assumes that first class is the "fake" label — confirm against model.classes_.
probas[0][0]

## Export model

In [None]:
import pickle

# Persist the three fitted objects needed at prediction time. Each file is
# opened in a `with` block so it is flushed and closed before anything tries
# to read it back.
for filename, artifact in (
    ('model.pkl', model),
    ('vectorizer.pkl', vectorizer),
    ('selector.pkl', selector),
):
    with open(filename, 'wb') as f:
        pickle.dump(artifact, f)

Check it worked:

In [None]:
import hotel_reviews

# NOTE(review): hotel_reviews is not shown here — presumably predict_fakeness()
# loads the pickled model, vectorizer and selector; confirm.
hotel_reviews.predict_fakeness("The Chicago Hotel was the best")

## Tune model

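WARNING: The following is incorrect. `X_train_vect_select` was produced by a vectorizer and a feature selector that were already fitted on the whole training set, and it is passed as the data on which to run the grid search. Grid search performs cross-validation, which makes its own train/validation splits; such splits should be made before any "fit" is done, otherwise information from the validation folds leaks into the features.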

The right way is to define a pipeline first and then pass this pipeline to GridSearchCV.

In [None]:
# GridSearch RandomForest with AUC
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from numpy import arange
# Candidate values for max_features, as fractions of the available features: 0.1 up to 0.9.
params = arange(0.1, 1.0, 0.1)
# Seed the forests with the notebook's `seed` so that differences in score
# between candidates come from max_features and not from random variation.
grid_search = GridSearchCV(RandomForestClassifier(random_state=seed), {"max_features": params}, scoring="roc_auc", cv=5, verbose=1)

In [None]:
# Run the 5-fold cross-validated search over max_features on the selected training features.
grid_search.fit(X_train_vect_select, y_train)