In [1]:
# Load libraries
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import pickle
from sklearn.model_selection import train_test_split

In [2]:
# Load data
reviews_df = pd.read_csv('../data/processed/data_reviews.csv')

  reviews_df = pd.read_csv('../data/processed/data_reviews.csv')


In [3]:
# The first thing we do is remove the variable state, since we only wanted it for geoespacial visualizations.
# In this project we do not need it for the model.
reviews_df = reviews_df.drop(columns=['state'])

La siguiente transformación que vamos a realizar es la de agrupar los valores de alguno de los atributos de las variables categoricas que estaban repetidas.

In [4]:
reviews_df.dtypes

delivery                 object
outdoor_seating          object
credit_cards             object
bike_parking             object
price_range              object
take_out                 object
wifi                     object
alcohol                  object
caters                   object
wheelchair_accessible    object
good_for_kids            object
attire                   object
reservations             object
table_service            object
good_for_groups          object
tv                       object
noise_level              object
stars                     int64
useful                    int64
funny                     int64
cool                      int64
dtype: object

In [5]:
# We see all the unique values for each variable
for i in reviews_df.select_dtypes(include=['object', 'float']):
    print(i, reviews_df[i].unique())

delivery ['False' 'True' 'None' nan]
outdoor_seating ['False' 'True' 'None' nan]
credit_cards ['False' 'True' nan 'None' True False]
bike_parking ['True' 'False' nan 'None' True False]
price_range [1.0 2.0 3.0 4.0 nan '2' '3' '1' '4' 'None' '2.0' '1.0' '3.0' '4.0']
take_out ['True' 'None' nan 'False']
wifi ["u'free'" "u'no'" "'free'" "'no'" nan "u'paid'" "'paid'" 'None']
alcohol ["u'none'" "u'full_bar'" "'none'" "'full_bar'" "u'beer_and_wine'" nan
 "'beer_and_wine'" 'None']
caters ['True' 'False' nan 'None' True False]
wheelchair_accessible [nan True False 'True' 'False' 'None']
good_for_kids [nan 'True' 'False' 'None' True False]
attire [nan "u'casual'" "'casual'" "u'dressy'" "'dressy'" "u'formal'" 'None'
 "'formal'"]
reservations [nan 'False' 'True' 'None']
table_service [nan True False 'True' 'False' 'None']
good_for_groups [nan 'True' 'False' 'None' True False]
tv [nan 'True' 'False' 'None' True False]
noise_level [nan "u'quiet'" "u'average'" "u'loud'" "'quiet'" "'average'"
 "u'ver

Vamos a transformar los datos para poder convertirlos a 1 y 0 y poder trabajar con ellos. Lo haremos de forma distinta dependiendo de como vienen los datos y que necesitan.

In [6]:
# We create a list with the variables that we want to transform in this way, 1 if is True and 0 if is False and nan, None to nan. But first we transform the variables to string.

lista1 = ['credit_cards', 'bike_parking', 'caters', 'wheelchair_accessible', 'good_for_kids', 'table_service', 
          'good_for_groups', 'tv']

for i in lista1:
    reviews_df[i] = reviews_df[i].astype(str)
    reviews_df[i] = reviews_df[i].replace({'True': 1, 'False': 0, 'nan': np.nan, 'None': np.nan})

In [7]:
# We create a list with the variables that we want to transform in this other way, 1 if is True and 0 if is False and nan, None to nan. 

lista2 = ['delivery', 'outdoor_seating', 'take_out', 'reservations']

for i in lista2:
    reviews_df[i] = reviews_df[i].replace({'True': 1, 'False': 0, 'nan': np.nan, 'None': np.nan})

In [8]:
# Finally, we create a list with the variables that we want to transform in this other way, We eliminate the u' and ' and nan, None to nan.

lista_u = ['wifi', 'alcohol', 'attire', 'noise_level']

for i in reviews_df[lista_u]:
    reviews_df[i] = reviews_df[i].str.replace("u'", "'")
    reviews_df[i] = reviews_df[i].str.replace("'", "")
    reviews_df[i] = reviews_df[i].replace("None", np.nan)
    reviews_df[i] = reviews_df[i].replace("none", np.nan)

In [9]:
# This variable we have to transformate in a float type
reviews_df['price_range'] = reviews_df['price_range'].replace({'1.0': 1, '2.0': 2, '3.0': 3, '4.0': 4, 1.0: 1, 2.0: 2, 
                                                               3.0: 3, 4.0: 4, '1': 1, '2': 2, '3': 3,
                                                               '4': 4, 'None':np.nan})

In [10]:
# We look other time the unique values for each variable
for i in reviews_df.select_dtypes(include=['object', 'float']):
    print(i, reviews_df[i].unique())

delivery [ 0.  1. nan]
outdoor_seating [ 0.  1. nan]
credit_cards [ 0.  1. nan]
bike_parking [ 1.  0. nan]
price_range [ 1.  2.  3.  4. nan]
take_out [ 1. nan  0.]
wifi ['free' 'no' nan 'paid']
alcohol [nan 'full_bar' 'beer_and_wine']
caters [ 1.  0. nan]
wheelchair_accessible [nan  1.  0.]
good_for_kids [nan  1.  0.]
attire [nan 'casual' 'dressy' 'formal']
reservations [nan  0.  1.]
table_service [nan  1.  0.]
good_for_groups [nan  1.  0.]
tv [nan  1.  0.]
noise_level [nan 'quiet' 'average' 'loud' 'very_loud']


In [11]:
# we need to fill all tha nan values with 0
lista_na = ['delivery', 'outdoor_seating', 'credit_cards', 'bike_parking', 'price_range', 'take_out', 'caters', 
            'wheelchair_accessible', 'good_for_kids', 'reservations', 'table_service', 'good_for_groups', 'tv']

reviews_df[lista_na] = reviews_df[lista_na].fillna(0)

In [12]:
# We define other time the categorical and numerical variables
cat_var = ['delivery', 'outdoor_seating', 'credit_cards', 'bike_parking', 'price_range', 'take_out', 'wifi', 'alcohol', 
           'caters', 'wheelchair_accessible', 'good_for_kids', 'attire', 'reservations', 'table_service', 'good_for_groups',
           'tv', 'noise_level']

num_var = ['useful', 'funny', 'cool']

In [13]:
# We make the pipeline to transform the numerical variables
num_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

In [14]:
# We make the pipeline to transform the categorical variables
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unkown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [15]:
# We make the preprocessor, where we define the transformers and the variables that we want to transform
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_var),
        ('num', num_transformer, num_var)
    ]
)

In [16]:
# We save the preprocessor
with open('../modelos/preprocessor.pickle', 'wb') as f:
    pickle.dump(preprocessor, f)

In [17]:
#We split the dataset in train and test. We put stratify because the data is unbalanced and select the size of the test
#of 20%
X_train, X_test, y_train, y_test = train_test_split(reviews_df.drop('stars',axis=1), 
                                                   reviews_df['stars'], 
                                                   stratify=reviews_df['stars'], 
                                                   test_size=0.2, random_state=12345)


In [18]:
X_train_val, X_val, y_train_val, y_val = train_test_split(X_train, y_train,
                                                   stratify= y_train, 
                                                   test_size=0.2, random_state=12345)

In [19]:
# We save the train and test datasets
X_train.to_csv("../data/processed/X_train.csv")
y_train.to_csv("../data/processed/y_train.csv")

X_test.to_csv("../data/processed/X_test.csv")
y_test.to_csv("../data/processed/y_test.csv")

X_train_val.to_csv("../data/processed/X_train_val.csv")
y_train_val.to_csv("../data/processed/y_train_val.csv")

X_val.to_csv("../data/processed/X_val.csv")
y_val.to_csv("../data/processed/y_val.csv")
