In [None]:
### SAMPLE RESTAURANT NLP - RATING MODEL

In [1]:
import pandas as pd

In [2]:
# Load the restaurant reviews CSV; the path is relative to the notebook's working directory.
REVIEWS_CSV_PATH = '../data/reviews_restaurant.csv'
raw = pd.read_csv(REVIEWS_CSV_PATH)

In [3]:
# (rows, columns) of the loaded frame.
raw.shape

(4223903, 14)

In [4]:
# Preview the first rows to check the column layout.
raw.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,categories,user_id,review_stars,text,date
0,pQeaRpvuhoEqudo3uymHIQ,The Empanadas House,404 E Green St,Champaign,IL,61820,40.110446,-88.233073,4.5,Restaurants,eSQ3z93DlzkpXK_H6MFEMw,5,I love the empanadas from the Empanadas House!...,2013-04-11 18:36:15
1,lu7vtrp_bE9PnxWfA8g4Pg,Banzai Sushi,300 John Street,Thornhill,ON,L3T 5W4,43.820492,-79.398466,4.5,Restaurants,dSTRQSeCqMTbs7l8KF_xJg,4,Been coming here since I was in grade 9 so abo...,2015-04-16 05:23:15
2,LoRef3ChgZKbxUio-sHgQg,Amir,5252 Rue Jean Talon O,MontrÃ©al,QC,H4P 2A7,45.49487,-73.651904,3.0,Restaurants,schfOYW71VamTTdfzqDlXQ,2,"Good food, terrible customer service. For me ...",2013-05-29 16:24:17
3,LoRef3ChgZKbxUio-sHgQg,Amir,5252 Rue Jean Talon O,MontrÃ©al,QC,H4P 2A7,45.49487,-73.651904,3.0,Restaurants,qKpkRCPk4ycbllTfFcRbNw,3,Just another nice Amir food nothing more but r...,2016-05-21 01:17:33
4,ZkzutF0P_u0C0yTulwaHkA,Lelulos Pizzeria,311 Unity Center Rd,Plum,PA,15239,40.489996,-79.779288,4.0,Restaurants,HzeJmgXaaWxReb9scgjv5A,5,"Stopped here with my wife due to the 18"" pizza...",2017-09-21 02:19:14


In [9]:
# Keep a small prototyping subset: the review rating is the target (y) and the review text
# is the input (X).
# Positional slicing keeps exactly N_SAMPLE_ROWS rows. The previous label-based
# `.loc[:10000]` is end-inclusive (10,001 rows) and did not match the "5000 rows" note.
N_SAMPLE_ROWS = 10_000

y = raw['review_stars'].iloc[:N_SAMPLE_ROWS]
X = raw['text'].iloc[:N_SAMPLE_ROWS]

In [6]:
import spacy
import spacy_lookups_data
import re

# Load the small English pipeline with the NER and parser components switched off.
DISABLED_PIPES = ['ner', 'parser']
nlp = spacy.load('en_core_web_sm', disable=DISABLED_PIPES)
# Raise the maximum number of characters spaCy accepts in a single document.
nlp.max_length = 33_000_000

In [None]:
# Functions to clean text and obtain lemmas

In [10]:
# TO EDIT
def clean_url(str_text_raw):
    '''Remove URL-like placeholders and links from a text.

    Removed, in this order: every run that starts with ``url_``, ``email_`` or
    ``phone_`` and continues up to the next whitespace, then every
    ``http://`` / ``https://`` link up to the next whitespace.
    Surrounding whitespace is left untouched.

    :param str_text_raw: text to clean.
    :return: the text with the matched runs removed.
    '''
    # Raw strings keep `\S` as a regex class instead of an invalid string escape.
    str_text = re.sub(r'url_\S+', '', str_text_raw)
    str_text = re.sub(r'email_\S+', '', str_text)
    str_text = re.sub(r'phone_\S+', '', str_text)
    return re.sub(r'https?://\S+', '', str_text)
    
def clean_punctuation(str_text_raw):
    '''Replace a fixed set of punctuation characters with a space.

    Each occurrence of one of ``$ ( ) / | { } # ~ [ ] ^ ; : ! ? ¿`` becomes a single
    space. Other punctuation (dots, commas, quotes, ...) is left as is.

    :param str_text_raw: text to clean.
    :return: the text with the listed characters replaced by spaces.
    '''
    # Raw string: avoids invalid string escapes. `^` is not first in the class, so it is literal.
    return re.sub(r'[$()/|{}#~\[\]^;:!?¿]', ' ', str_text_raw)

def clean_unicode(str_text_raw):
    '''Strip the ``&amp;`` HTML entity and replace non-ASCII characters.

    ``&amp;`` is removed outright (no space is put in its place, so the text on
    either side of it is joined). Each run of characters outside the 7-bit ASCII
    range is then replaced by a single space.
    '''
    amp_entity = re.compile('&amp;')
    non_ascii_run = re.compile(r'[^\x00-\x7F]+')
    without_entity = amp_entity.sub('', str_text_raw)
    return non_ascii_run.sub(' ', without_entity)
                      
def clean_dot_words(str_text_raw):
    '''Replace dots that sit directly between two word characters with a space.

    A run of one or more dots is replaced by a single space when it has a word
    character immediately before and after it. Dots next to whitespace or at the
    end of the text are kept.

    :param str_text_raw: text to clean.
    :return: the text with word-internal dots turned into spaces.
    '''
    # Lookarounds do not consume the neighbouring words, so every dot in a chain such as
    # "a.b.c" is replaced; consuming groups would skip every second dot.
    return re.sub(r'(?<=\w)\.+(?=\w)', ' ', str_text_raw)

def clean_text(str_text_raw):
    '''Lower-case a text and run it through every cleaning step.

    Order of the steps: lower-casing, clean_url, clean_unicode,
    clean_punctuation, clean_dot_words.
    '''
    lowered = str_text_raw.lower()
    without_urls = clean_url(lowered)
    ascii_only = clean_unicode(without_urls)
    without_punctuation = clean_punctuation(ascii_only)
    return clean_dot_words(without_punctuation)

####

# Token texts that string_to_token removes from its result table.
tokens_to_drop=['+']

def string_to_token(string, str_pickle = None):
    '''
    Tokenise a sentence with the module-level spaCy pipeline and return one row per kept token.

    A token is dropped when it is punctuation, whitespace, a stop word, a digit, URL-like,
    number-like, or both e-mail-like and out of vocabulary. Rows whose text is listed in
    ``tokens_to_drop`` are removed afterwards.

    Columns of the returned DataFrame:
    * text: The original text of the token.
    * lemme: Lemma of the token.
    * orth: The hash value of the token text.
    * is_alpha: Does the token consist of alphabetic characters?
    * is_digit: Does the token consist of digits?
    * is_title: Is the token in titlecase?
    * language: Language of the token.
    * tag: Fine-grained part-of-speech. The complete list is in:
    https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html, also using: spacy.explain("RB")
    * part_of_speech: Coarse-grained part-of-speech.
    * has_vector: A boolean value indicating whether a word vector is associated with the token.
    * vector_norm: The L2 norm of the token's vector representation.
    * is_oov: Is the token out of vocabulary?
    * text_to_singular: The lemma for plural nouns (tags NNS / NNPS), the original text otherwise.

    :param string: sentence to tokenise.
    :param str_pickle: optional file stem; when given, the table is also pickled to
        ``data/pickles/<str_pickle>.pkl``.
    :return: pandas DataFrame with the columns listed above.
    '''
    doc = nlp(string)
    l_token = [[token.text, token.lemma_, token.orth, token.is_alpha, token.is_digit, token.is_title, token.lang_,
        token.tag_, token.pos_, token.has_vector, token.vector_norm, token.is_oov]
        for token in doc
        # `&` binds tighter than `|`: the out-of-vocabulary test applies to e-mail-like tokens only.
        if not (token.is_punct | token.is_space | token.is_stop | token.is_digit | token.like_url
                | token.like_num | (token.like_email & token.is_oov))]
    pd_token = pd.DataFrame(l_token, columns=['text', 'lemme', 'orth', 'is_alpha', 'is_digit', 'is_title', 'language',
                                          'tag', 'part_of_speech', 'has_vector', 'vector_norm', 'is_oov'])
    # Drop problematic tokens. The copy makes the result an independent frame, so the column
    # added below is not written to a slice of the unfiltered table (SettingWithCopyWarning).
    pd_token = pd_token[~pd_token['text'].isin(tokens_to_drop)].copy()
    # Convert plural text to singular: plural nouns take their lemma, everything else keeps its text.
    is_plural = pd_token['tag'].isin(['NNPS', 'NNS'])
    pd_token['text_to_singular'] = pd_token['text'].mask(is_plural, pd_token['lemme'])
    if str_pickle is not None:
        pd_token.to_pickle(f'data/pickles/{str_pickle}.pkl')
    return pd_token

def apply_cleaning(string):
    '''
    Clean a sentence and return the texts of the kept tokens joined by single spaces.

    The sentence goes through clean_text and the spaCy pipeline. A token is skipped when it
    is punctuation, whitespace, a stop word, a digit, URL-like, number-like, or both
    e-mail-like and out of vocabulary.
    '''
    kept_texts = []
    for token in nlp(clean_text(string)):
        is_noise = (token.is_punct | token.is_space | token.is_stop | token.is_digit
                    | token.like_url | token.like_num | (token.like_email & token.is_oov))
        if not is_noise:
            kept_texts.append(token.text)
    return ' '.join(kept_texts)

def apply_lemma(string):
    '''
    Clean a sentence and return the lemmas of the kept tokens joined by single spaces.

    The sentence goes through clean_text and the spaCy pipeline. A token is skipped when it
    is punctuation, whitespace, a digit, URL-like, number-like, or both e-mail-like and out
    of vocabulary. Stop words are not filtered here.
    '''
    kept_lemmas = []
    for token in nlp(clean_text(string)):
        is_noise = (token.is_punct | token.is_space | token.is_digit
                    | token.like_url | token.like_num | (token.like_email & token.is_oov))
        if not is_noise:
            kept_lemmas.append(token.lemma_)
    return ' '.join(kept_lemmas)

def list_to_bow(l_words):
    '''
    Build a bag of words from a list of words and return the word frequencies.

    Prints the shape of the fitted count matrix and the number of distinct items.

    :param l_words: list of strings.
    :return: dict mapping each distinct item of ``l_words`` to its number of occurrences,
        ordered from most to least frequent.
    '''
    # Imported here because no visible cell of the notebook imports these names.
    from collections import Counter
    from sklearn.feature_extraction.text import CountVectorizer

    # The corpus is passed to fit_transform only: the constructor's first parameter is
    # `input` (how documents are read), not the documents themselves.
    cv = CountVectorizer()
    count_vector = cv.fit_transform(l_words)
    word_freq = Counter(l_words)
    print(f'Bag of words size: {count_vector.shape}\nUnique words size: {len(word_freq)}')
    # most_common() yields the pairs in descending count order; dict keeps that order.
    dict_word_freq = dict(word_freq.most_common())
    return dict_word_freq

In [11]:
# Replace every review text by its cleaned, lemmatised version.
X = X.apply(apply_lemma)

In [None]:
# Preview the lemmatised texts up to and including index label 20.
X.loc[:20].to_list()

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF matrix of the lemmatised reviews, with scikit-learn's English stop-word list removed.
tfidf_vectorizer = TfidfVectorizer(stop_words="english")
tfidf = tfidf_vectorizer.fit_transform(X)

In [None]:
# (documents, vocabulary terms) of the TF-IDF matrix.
tfidf.shape

In [None]:
import umap

# Reduce the TF-IDF matrix with UMAP using the Hellinger metric and a fixed seed.
reducer = umap.UMAP(metric='hellinger', random_state=42)
embedding = reducer.fit_transform(tfidf)

In [None]:
# Smallest coordinate of the embedding.
embedding.min()

In [None]:
# Shift the embedding so that no coordinate is negative.
# The shift is the original constant 5 unless the observed minimum is below -5; in that
# case it grows just enough to bring the minimum to zero, so a different UMAP run cannot
# leave negative values behind.
shift = max(5, -float(embedding.min()))
embedding_positive = embedding + shift
embedding_positive

In [None]:
# Wrap the shifted embedding in a DataFrame to use it as the feature table.
text_umap = pd.DataFrame(embedding_positive)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(text_umap, y, random_state=42, test_size=0.2)

In [None]:
def baseline_report(model, X_train, X_test, y_train, y_test, name):
    '''Fit ``model`` and report its mean cross-validated accuracy on the training data.

    ``model`` is fitted in place on the training data. ``X_test`` and ``y_test`` are
    accepted but not used.

    :return: one-row DataFrame with the columns ``model`` (the given name) and ``accuracy``.
    '''
    model.fit(X_train, y_train)
    fold_scores = cross_val_score(model, X_train, y_train, scoring='accuracy')
    summary = {'model': [name], 'accuracy': [np.mean(fold_scores)]}
    return pd.DataFrame(summary)

In [None]:
import numpy as np
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import ShuffleSplit

# Machine Learning Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

In [None]:
# Candidate classifiers keyed by a short label, all with the settings shown here.
# NOTE: the 'xgboost' entry is scikit-learn's GradientBoostingClassifier, not the XGBoost library.
models = {'gnb': GaussianNB(),
          'bnb': BernoulliNB(),
          'mnb': MultinomialNB(),
          'logit': LogisticRegression(multi_class='auto', solver='lbfgs'),
          'knn': KNeighborsClassifier(),
          'decisiontree': DecisionTreeClassifier(),
          'randomforest': RandomForestClassifier(),
          'svc': SVC(probability=True),
          'linearsvc': LinearSVC(),
          'xgboost': GradientBoostingClassifier(),
          'NN': MLPClassifier()
         }

In [None]:
# Classifiers compared with k-fold cross-validation.
# The list has its own name so that it does not overwrite the `models` dict of baseline
# candidates: a list has no `.items()`, which the baseline comparison cell relies on.
cv_models = [
    RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]
# Cross-validation
CV = 5
# One (model name, fold index, accuracy) record per model and fold.
entries = []
for model in cv_models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))

cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
cv_df

In [None]:
# Fit a random forest on the training split and evaluate it on the held-out test split.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Per-class precision / recall / F1, followed by the confusion matrix.
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

In [None]:
# One baseline_report row per (name, model) pair, stacked into a single table.
# `models` must be a mapping here, since `.items()` is called on it.
models_df = pd.concat([baseline_report(model, X_train, X_test, y_train, y_test, name) 
                       for (name, model) in models.items()])
models_df

In [None]:
# Random forest with default settings; the score below is accuracy on the training data itself.
rf = RandomForestClassifier().fit(X_train, y_train)
rf.score(X_train, y_train)

In [None]:
# Mean cross-validated accuracy of the random forest on the training split.
rf_fold_scores = cross_val_score(rf, X_train, y_train, scoring='accuracy')
accuracy = np.mean(rf_fold_scores)