# Bootcamp Data Science y MLOps

<img src="https://i.ibb.co/5RM26Cw/LOGO-COLOR2.png" width="500px">

---

## Data Preparation

In [1]:
import numpy as np
import pandas as pd

import funpymodeling

In [2]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [3]:
# Load the water-potability dataset; a comma is read_csv's default delimiter.
df_data = pd.read_csv("data/water_potability.csv")

In [4]:
# Column labels followed by the (rows, columns) shape, one per line.
print(df_data.columns, df_data.shape, sep="\n")

Index(['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
       'Organic_carbon', 'Trihalomethanes', 'Turbidity', 'Potability'],
      dtype='object')
(3276, 10)


In [5]:
# Display the loaded frame (the rendered output shows its first and last rows).
df_data

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.990970,2.963135,0
1,3.716080,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.546600,310.135738,398.410813,11.558279,31.997993,4.075075,0
...,...,...,...,...,...,...,...,...,...,...
3271,4.668102,193.681735,47580.991603,7.166639,359.948574,526.424171,13.894419,66.687695,4.435821,1
3272,7.808856,193.553212,17329.802160,8.061362,,392.449580,19.903225,,2.798243,1
3273,9.419510,175.762646,33155.578218,7.350233,,432.044783,11.039070,69.845400,3.298875,1
3274,5.126763,230.603758,11983.869376,6.303357,,402.883113,11.168946,77.488213,4.708658,1


# Preparamos la data

In [6]:
# Per-column summary: NaN count/share, zero count/share, unique values and dtype.
funpymodeling.status(df_data)

Unnamed: 0,variable,q_nan,p_nan,q_zeros,p_zeros,unique,type
0,ph,491,0.149878,1,0.000305,2785,float64
1,Hardness,0,0.0,0,0.0,3276,float64
2,Solids,0,0.0,0,0.0,3276,float64
3,Chloramines,0,0.0,0,0.0,3276,float64
4,Sulfate,781,0.2384,0,0.0,2495,float64
5,Conductivity,0,0.0,0,0.0,3276,float64
6,Organic_carbon,0,0.0,0,0.0,3276,float64
7,Trihalomethanes,162,0.049451,0,0.0,3114,float64
8,Turbidity,0,0.0,0,0.0,3276,float64
9,Potability,0,0.0,1998,0.60989,2,int64


In [7]:
# Bin pH into 5 quantile-based intervals; rows with a missing pH stay NaN.
ph_cat = pd.qcut(x=df_data["ph"], q=5)

In [8]:
# Inspect the interval assigned to each row and the resulting bin edges.
ph_cat

0                   NaN
1       (-0.001, 5.822]
2        (7.437, 8.311]
3         (8.311, 14.0]
4         (8.311, 14.0]
             ...       
3271    (-0.001, 5.822]
3272     (7.437, 8.311]
3273      (8.311, 14.0]
3274    (-0.001, 5.822]
3275     (7.437, 8.311]
Name: ph, Length: 3276, dtype: category
Categories (5, interval[float64, right]): [(-0.001, 5.822] < (5.822, 6.702] < (6.702, 7.437] < (7.437, 8.311] < (8.311, 14.0]]

In [9]:
# Bin Sulfate into 5 quantile-based intervals; rows with a missing value stay NaN.
sulfate_cat = pd.qcut(x=df_data["Sulfate"], q=5)

In [10]:
# Inspect the interval assigned to each row and the resulting bin edges.
sulfate_cat

0       (367.369, 481.031]
1                      NaN
2                      NaN
3       (342.095, 367.369]
4       (301.074, 323.531]
               ...        
3271    (342.095, 367.369]
3272                   NaN
3273                   NaN
3274                   NaN
3275                   NaN
Name: Sulfate, Length: 3276, dtype: category
Categories (5, interval[float64, right]): [(128.999, 301.074] < (301.074, 323.531] < (323.531, 342.095] < (342.095, 367.369] < (367.369, 481.031]]

In [11]:
# Bin Trihalomethanes into 5 quantile-based intervals; rows with a missing value stay NaN.
trihalomethanes_cat = pd.qcut(x=df_data["Trihalomethanes"], q=5)

In [12]:
# Inspect the interval assigned to each row and the resulting bin edges.
trihalomethanes_cat

0        (79.701, 124.0]
1       (53.107, 62.656]
2       (62.656, 70.446]
3        (79.701, 124.0]
4        (0.737, 53.107]
              ...       
3271    (62.656, 70.446]
3272                 NaN
3273    (62.656, 70.446]
3274    (70.446, 79.701]
3275    (70.446, 79.701]
Name: Trihalomethanes, Length: 3276, dtype: category
Categories (5, interval[float64, right]): [(0.737, 53.107] < (53.107, 62.656] < (62.656, 70.446] < (70.446, 79.701] < (79.701, 124.0]]

In [13]:
# Overwrite the three numeric columns that contain NaN with their binned versions.
for column_name, binned_values in (
    ("ph", ph_cat),
    ("Sulfate", sulfate_cat),
    ("Trihalomethanes", trihalomethanes_cat),
):
    df_data[column_name] = binned_values

In [14]:
# Re-check the summary after binning: the three columns are now categorical
# and still carry their NaN rows.
funpymodeling.status(df_data)

Unnamed: 0,variable,q_nan,p_nan,q_zeros,p_zeros,unique,type
0,ph,491,0.149878,0,0.0,5,category
1,Hardness,0,0.0,0,0.0,3276,float64
2,Solids,0,0.0,0,0.0,3276,float64
3,Chloramines,0,0.0,0,0.0,3276,float64
4,Sulfate,781,0.2384,0,0.0,5,category
5,Conductivity,0,0.0,0,0.0,3276,float64
6,Organic_carbon,0,0.0,0,0.0,3276,float64
7,Trihalomethanes,162,0.049451,0,0.0,5,category
8,Turbidity,0,0.0,0,0.0,3276,float64
9,Potability,0,0.0,1998,0.60989,2,int64


In [15]:
# Register 'desconocido' ("unknown") as an allowed category on each binned
# column so missing values can later be filled with it.
# add_categories raises ValueError when the category already exists, so the
# membership guard keeps this cell safe to re-run.
for binned_col in ("ph", "Sulfate", "Trihalomethanes"):
    if "desconocido" not in df_data[binned_col].cat.categories:
        df_data[binned_col] = df_data[binned_col].cat.add_categories("desconocido")

In [16]:
# Replace every NaN in the binned columns with the 'desconocido' category.
for binned_col in ("ph", "Sulfate", "Trihalomethanes"):
    df_data[binned_col] = df_data[binned_col].fillna(value="desconocido")

In [17]:
# Confirm that no column has NaN left after the fill.
funpymodeling.status(df_data)

Unnamed: 0,variable,q_nan,p_nan,q_zeros,p_zeros,unique,type
0,ph,0,0.0,0,0.0,6,category
1,Hardness,0,0.0,0,0.0,3276,float64
2,Solids,0,0.0,0,0.0,3276,float64
3,Chloramines,0,0.0,0,0.0,3276,float64
4,Sulfate,0,0.0,0,0.0,6,category
5,Conductivity,0,0.0,0,0.0,3276,float64
6,Organic_carbon,0,0.0,0,0.0,3276,float64
7,Trihalomethanes,0,0.0,0,0.0,6,category
8,Turbidity,0,0.0,0,0.0,3276,float64
9,Potability,0,0.0,1998,0.60989,2,int64


In [19]:
# One-hot encode the categorical columns; numeric columns pass through unchanged.
df_data_2 = pd.get_dummies(data=df_data)

In [20]:
# Display the encoded frame: one indicator column per interval plus 'desconocido'.
df_data_2

Unnamed: 0,Hardness,Solids,Chloramines,Conductivity,Organic_carbon,Turbidity,Potability,"ph_(-0.001, 5.822]","ph_(5.822, 6.702]","ph_(6.702, 7.437]","ph_(7.437, 8.311]","ph_(8.311, 14.0]",ph_desconocido,"Sulfate_(128.999, 301.074]","Sulfate_(301.074, 323.531]","Sulfate_(323.531, 342.095]","Sulfate_(342.095, 367.369]","Sulfate_(367.369, 481.031]",Sulfate_desconocido,"Trihalomethanes_(0.737, 53.107]","Trihalomethanes_(53.107, 62.656]","Trihalomethanes_(62.656, 70.446]","Trihalomethanes_(70.446, 79.701]","Trihalomethanes_(79.701, 124.0]",Trihalomethanes_desconocido
0,204.890455,20791.318981,7.300212,564.308654,10.379783,2.963135,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0
1,129.422921,18630.057858,6.635246,592.885359,15.180013,4.500656,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0
2,224.236259,19909.541732,9.275884,418.606213,16.868637,3.055934,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0
3,214.373394,22018.417441,8.059332,363.266516,18.436524,4.628771,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0
4,181.101509,17978.986339,6.546600,398.410813,11.558279,4.075075,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3271,193.681735,47580.991603,7.166639,526.424171,13.894419,4.435821,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
3272,193.553212,17329.802160,8.061362,392.449580,19.903225,2.798243,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1
3273,175.762646,33155.578218,7.350233,432.044783,11.039070,3.298875,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0
3274,230.603758,11983.869376,6.303357,402.883113,11.168946,4.708658,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0


In [21]:
# Summary of the encoded frame (indicator columns take only two distinct values).
funpymodeling.status(df_data_2)

Unnamed: 0,variable,q_nan,p_nan,q_zeros,p_zeros,unique,type
0,Hardness,0,0.0,0,0.0,3276,float64
1,Solids,0,0.0,0,0.0,3276,float64
2,Chloramines,0,0.0,0,0.0,3276,float64
3,Conductivity,0,0.0,0,0.0,3276,float64
4,Organic_carbon,0,0.0,0,0.0,3276,float64
5,Turbidity,0,0.0,0,0.0,3276,float64
6,Potability,0,0.0,1998,0.60989,2,int64
7,"ph_(-0.001, 5.822]",0,0.0,2719,0.829976,2,uint8
8,"ph_(5.822, 6.702]",0,0.0,2719,0.829976,2,uint8
9,"ph_(6.702, 7.437]",0,0.0,2719,0.829976,2,uint8


## Separación de X de Y para entrenamiento

In [38]:
# Features are every column except the target; the target is 'Potability'.
data_y = df_data_2["Potability"]
data_x = df_data_2.drop(columns="Potability")

In [39]:
# Persist the feature column labels, in training order, to disk.

import pickle

feature_columns = data_x.columns
with open('pickle_files/categories_ohe.pkl', 'wb') as columns_file:
    pickle.dump(feature_columns, columns_file, protocol=pickle.HIGHEST_PROTOCOL)

In [41]:
from sklearn.model_selection import train_test_split

In [42]:
# Hold out 30% of the rows for testing. random_state pins the shuffle so the
# split, and every metric computed from it, is the same on each run.
x_train, x_test, y_train, y_test = train_test_split(
    data_x, data_y, test_size=0.3, random_state=99,
)

In [43]:
# Display the training features (row labels are the shuffled original index).
x_train

Unnamed: 0,Hardness,Solids,Chloramines,Conductivity,Organic_carbon,Turbidity,"ph_(-0.001, 5.822]","ph_(5.822, 6.702]","ph_(6.702, 7.437]","ph_(7.437, 8.311]","ph_(8.311, 14.0]",ph_desconocido,"Sulfate_(128.999, 301.074]","Sulfate_(301.074, 323.531]","Sulfate_(323.531, 342.095]","Sulfate_(342.095, 367.369]","Sulfate_(367.369, 481.031]",Sulfate_desconocido,"Trihalomethanes_(0.737, 53.107]","Trihalomethanes_(53.107, 62.656]","Trihalomethanes_(62.656, 70.446]","Trihalomethanes_(70.446, 79.701]","Trihalomethanes_(79.701, 124.0]",Trihalomethanes_desconocido
913,198.213405,11451.312217,5.308990,401.063764,13.243329,2.784713,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0
1228,186.174841,18085.158841,4.951277,347.986686,15.715546,3.068919,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0
3089,217.896772,6785.145303,6.028898,432.332406,12.365538,4.179198,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0
1511,159.168926,21217.158596,5.298877,305.243600,16.865453,3.271339,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
910,197.517093,19557.993724,8.311349,379.507739,18.240712,4.180389,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2173,204.258263,26321.689195,5.937496,487.273628,20.407809,3.305588,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0
687,198.310020,21148.241905,9.124900,312.388586,13.042948,4.899815,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0
3177,180.453871,34451.984214,7.930651,429.859999,7.665923,3.365397,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0
800,197.646951,22535.085158,9.288200,353.062907,6.039669,4.679169,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0


In [44]:
# Display the held-out test features.
x_test

Unnamed: 0,Hardness,Solids,Chloramines,Conductivity,Organic_carbon,Turbidity,"ph_(-0.001, 5.822]","ph_(5.822, 6.702]","ph_(6.702, 7.437]","ph_(7.437, 8.311]","ph_(8.311, 14.0]",ph_desconocido,"Sulfate_(128.999, 301.074]","Sulfate_(301.074, 323.531]","Sulfate_(323.531, 342.095]","Sulfate_(342.095, 367.369]","Sulfate_(367.369, 481.031]",Sulfate_desconocido,"Trihalomethanes_(0.737, 53.107]","Trihalomethanes_(53.107, 62.656]","Trihalomethanes_(62.656, 70.446]","Trihalomethanes_(70.446, 79.701]","Trihalomethanes_(79.701, 124.0]",Trihalomethanes_desconocido
432,199.677784,25396.011782,4.960558,444.678579,16.115432,5.048834,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0
2601,244.461342,17940.099080,7.149053,337.713569,23.667667,3.749009,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0
2527,139.955350,31421.144495,6.627705,378.771076,17.544485,4.536181,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0
1775,170.732289,10925.360104,8.200973,508.851053,12.244622,3.668089,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0
2896,184.688395,20857.908219,6.462946,395.785623,18.443786,3.776100,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
942,204.812711,20555.559572,9.775004,389.638228,12.298753,3.130045,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
2055,166.269757,32471.456324,7.342535,498.313062,12.779885,4.566254,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0
2655,141.300305,16017.611896,8.214714,580.870759,10.603280,3.676246,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
1269,190.992873,26895.257956,5.629536,660.254946,18.125202,3.584202,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0


## Creación y entrenamiento del modelo

In [45]:
from sklearn.ensemble import RandomForestClassifier 

In [46]:
# Random forest made of 1000 decision trees, seeded with a fixed random_state.
rf_model = RandomForestClassifier(random_state=99, n_estimators=1000)

In [47]:
%%time

# Fit the forest on the training split; the %%time magic reports the cell's run time.
rf_model.fit(x_train, y_train)

CPU times: user 2.62 s, sys: 33.4 ms, total: 2.65 s
Wall time: 2.72 s


## Calculamos algunas métricas del modelo entrenado

In [48]:
# Predictions on the training split (by default a 0.5 cut-off is assumed).
pred_tr = rf_model.predict(X=x_train)

In [49]:
# Predictions on the test split (by default a 0.5 cut-off is assumed).
pred_ts = rf_model.predict(X=x_test)

In [50]:
from pprint import pprint

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [51]:
# Score the training predictions with each metric; accuracy_score already
# returns the fraction of correct predictions by default.
train_metrics = {
    metric_name: scorer(y_train, pred_tr)
    for metric_name, scorer in (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1_score', f1_score),
    )
}

pprint(train_metrics)

{'accuracy': 1.0, 'f1_score': 1.0, 'precision': 1.0, 'recall': 1.0}


In [52]:
# Score the held-out test predictions with each metric; accuracy_score already
# returns the fraction of correct predictions by default.
test_metrics = {
    metric_name: scorer(y_test, pred_ts)
    for metric_name, scorer in (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1_score', f1_score),
    )
}

pprint(test_metrics)

{'accuracy': 0.646998982706002,
 'f1_score': 0.37924865831842575,
 'precision': 0.585635359116022,
 'recall': 0.2804232804232804}


## Guardamos el modelo entrenado

In [53]:
# Persist the fitted model to disk. The context manager closes the file
# handle, so the pickle is flushed and released before the cell finishes.
filename = 'rf.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(rf_model, model_file)