In [61]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import datetime
warnings.filterwarnings('ignore')

# CoderHouse - Data Science

#### Data Wrangling - Marco Ormello - Comision: 32845

### Data Acquisition

In [62]:
df = pd.read_csv('/workspaces/data-science/data/raw/Cuisine_rating.csv')
df


Unnamed: 0,User ID,Area code,Location,Gender,YOB,Marital Status,Activity,Budget,Cuisines,Alcohol,Smoker,Food Rating,Service Rating,Overall Rating,Often A S
0,1,153,"Upper East Side,NY",Female,2006,Single,Professional,3,Japanese,Never,Never,5,4,4.5,No
1,2,123,"St. George,NY",Female,1991,Married,Student,3,Indian,Never,Socially,1,1,1.0,No
2,3,122,"Upper West Side,NY",Male,1977,Single,Student,5,Seafood,Often,Often,5,5,5.0,Yes
3,4,153,"Upper East Side,NY",Female,1956,Married,Professional,5,Japanese,Never,Socially,3,1,2.0,No
4,5,129,"Central Park,NY",Male,1997,Single,Student,4,Filipino,Socially,Never,2,4,3.0,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,196,175,"St. George,NY",Female,1982,Single,Professional,4,French,Never,Socially,1,2,1.5,No
196,197,170,"Upper West Side,NY",Female,2000,Married,Student,4,Chinese,Never,Often,1,2,1.5,No
197,198,160,"St. George,NY",Female,2006,Single,Professional,5,Japanese,Never,Often,5,2,3.5,No
198,199,130,"St. George,NY",Male,2002,Married,Student,3,Filipino,Never,Socially,3,2,2.5,No


### Limpieza de Datos

#### Nulls y Duplicados

In [63]:
print(df.isna().sum().sort_values())
print('Duplicados: ' ,df.duplicated().sum())

User ID           0
Area code         0
Location          0
Gender            0
YOB               0
Marital Status    0
Activity          0
Budget            0
Cuisines          0
Alcohol           0
Smoker            0
Food Rating       0
Service Rating    0
Overall Rating    0
Often A S         0
dtype: int64
Duplicados:  0


#### Data types

In [64]:
df.dtypes

User ID             int64
Area code           int64
Location           object
Gender             object
YOB                 int64
Marital Status     object
Activity           object
Budget              int64
Cuisines           object
Alcohol            object
Smoker             object
Food Rating         int64
Service Rating      int64
Overall Rating    float64
Often A S          object
dtype: object

### Datos de tipo:

#### Categorico:

In [65]:
categoric_col = []
for col in df.columns:
	if (df[col].nunique()<10) and (df[col].dtype=="object"):
		categoric_col.append(col) 
  
print("Columnas con datos categoricos {}".format(categoric_col))

Columnas con datos categoricos ['Gender', 'Marital Status', 'Activity', 'Cuisines', 'Alcohol ', 'Smoker', 'Often A S']


#### Continuo:

In [66]:
Numeric=list(set(df.columns)- set(categoric_col))
print("Columnas con datos continuos {}".format(Numeric))

Columnas con datos continuos ['Budget', 'Service Rating', 'Overall Rating', 'Area code', 'YOB', 'User ID', 'Location', 'Food Rating']


#### Conteo de Variables unicas:

In [67]:
df.nunique().sort_values()

Gender              2
Activity            2
Often A S           2
Marital Status      3
Alcohol             3
Smoker              3
Budget              5
Food Rating         5
Service Rating      5
Cuisines            7
Overall Rating      9
Location           10
YOB                46
Area code          65
User ID           200
dtype: int64

##### Reemplazamos la columna "YOB" por la edad de los participantes y la llamamos "Age"

In [68]:

now = datetime.datetime.now()
df['Age'] = now.year - df['YOB']
df = df.drop('YOB', axis=1)
df.head(5)

Unnamed: 0,User ID,Area code,Location,Gender,Marital Status,Activity,Budget,Cuisines,Alcohol,Smoker,Food Rating,Service Rating,Overall Rating,Often A S,Age
0,1,153,"Upper East Side,NY",Female,Single,Professional,3,Japanese,Never,Never,5,4,4.5,No,17
1,2,123,"St. George,NY",Female,Married,Student,3,Indian,Never,Socially,1,1,1.0,No,32
2,3,122,"Upper West Side,NY",Male,Single,Student,5,Seafood,Often,Often,5,5,5.0,Yes,46
3,4,153,"Upper East Side,NY",Female,Married,Professional,5,Japanese,Never,Socially,3,1,2.0,No,67
4,5,129,"Central Park,NY",Male,Single,Student,4,Filipino,Socially,Never,2,4,3.0,No,26


##### Eliminamos columnas que no poseen informacion relevante a para el modelo.

In [69]:
df.drop(labels=["User ID", "Area code", "Often A S"], axis=1, inplace=True)
df.head(5)

Unnamed: 0,Location,Gender,Marital Status,Activity,Budget,Cuisines,Alcohol,Smoker,Food Rating,Service Rating,Overall Rating,Age
0,"Upper East Side,NY",Female,Single,Professional,3,Japanese,Never,Never,5,4,4.5,17
1,"St. George,NY",Female,Married,Student,3,Indian,Never,Socially,1,1,1.0,32
2,"Upper West Side,NY",Male,Single,Student,5,Seafood,Often,Often,5,5,5.0,46
3,"Upper East Side,NY",Female,Married,Professional,5,Japanese,Never,Socially,3,1,2.0,67
4,"Central Park,NY",Male,Single,Student,4,Filipino,Socially,Never,2,4,3.0,26


##### Se codifican los datos categoricos.

In [70]:
from sklearn.preprocessing import LabelEncoder

variables=[]
for i in df.columns:
    if df[i].dtype == 'O':
        variables.append(i)
        le = LabelEncoder()
        df[i] = le.fit_transform(df[i].astype(str))
    elif df[i].dtype == 'float64':
        df[i]=(df[i]-df[i].mean())/(df[i].std())
    else:
        pass
df.head()

Unnamed: 0,Location,Gender,Marital Status,Activity,Budget,Cuisines,Alcohol,Smoker,Food Rating,Service Rating,Overall Rating,Age
0,8,0,2,0,3,5,0,0,5,4,1.181163,17
1,7,0,1,1,3,3,0,2,1,1,-2.061245,32
2,9,1,2,1,5,6,1,1,5,5,1.644364,46
3,8,0,1,0,5,5,0,2,3,1,-1.134843,67
4,1,1,2,1,4,1,2,0,2,4,-0.20844,26
