# Laboratorio 1 - Clasificación

### Importaciones

In [1]:
#Librerías necesarias
import pandas as pd
pd.set_option('display.max_columns', 25) # Número máximo de columnas a mostrar
pd.set_option('display.max_rows', 50) # Numero máximo de filas a mostar
# Ranom seed
import numpy as np
np.random.seed(3301)

# Seaborn
import seaborn as sn

# Matplolib
%matplotlib inline
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

### Carga de datos

In [2]:
#Ubicacion de la base de datos
db_location = "data/202120_Laboratorio_1_datos_SaludAlpes_diagnosticos_dataset.csv"

In [3]:
# Leer los datos
df = pd.read_csv(db_location, sep=";")
df

Unnamed: 0,Hair color,Pregnancies,Glucose,City,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,Red,6,148,New York,72,35,0,336,627,50,1
1,Black,1,85,New York,66,29,0,266,351,31,0
2,Red,8,183,New York,64,0,0,233,672,32,1
3,Black,1,89,New York,66,23,94,281,167,21,0
4,Black,0,137,New York,40,35,168,431,2288,33,1
...,...,...,...,...,...,...,...,...,...,...,...
763,Black,10,101,New York,76,48,180,329,171,63,0
764,Black,2,122,New York,70,27,0,368,34,27,0
765,Red,5,121,New York,72,23,112,262,245,30,0
766,Black,1,126,New York,60,0,0,301,349,47,1


In [4]:
# Dimensiones de los datos
df.shape

(768, 11)

In [5]:
df.dtypes

Hair color                  object
Pregnancies                 object
Glucose                     object
City                        object
BloodPressure               object
SkinThickness               object
Insulin                     object
BMI                          int64
DiabetesPedigreeFunction    object
Age                          int64
Outcome                     object
dtype: object

### Limpieza de datos

In [6]:
df["Pregnancies"].unique()

array(['6', '1', '8', '0', '5', '3', '10', '2', '4', '7', '9', '11', '13',
       '15', '17', '12', '14', '-'], dtype=object)

Nos dimos cuenta que en las columnas, hay valores que deberían ser numéricos pero aparecen como objetos. En el caso de Pregnancies es porque hay valores que son '-' que en este caso asumiremos que son lo mismo que 0. 

In [7]:
df["Pregnancies"] = df["Pregnancies"].replace("-",0)
df["Pregnancies"] = pd.to_numeric(df["Pregnancies"])
df["Pregnancies"].unique()

array([ 6,  1,  8,  0,  5,  3, 10,  2,  4,  7,  9, 11, 13, 15, 17, 12, 14])

In [8]:
df["Glucose"].unique()

array(['148', '85', '183', '89', '137', '116', '78', '115', '197', '125',
       '110', '168', '139', '189', '166', '100', '118', '107', '103',
       '126', '99', '196', '119', '143', '147', '97', '145', '117', '109',
       '158', '88', '92', '122', '138', '102', '90', '111', '180', '133',
       '106', '171', '159', '146', '71', '105', '101', '176', '150', '73',
       '187', '84', '44', '141', '114', '95', '129', '79', '0', '62',
       '131', '112', '113', '74', '83', '136', '80', '123', '81', '134',
       '142', '144', '93', '163', '151', '96', '155', '76', '160', '124',
       '162', '132', '120', '173', '170', '128', '108', '154', '57',
       '156', '153', '188', '152', '104', '87', '75', '179', '130', '194',
       '181', '135', '184', '140', '177', '164', '91', '165', '86', '193',
       '191', '161', '167', '77', '182', '157', '178', '61', '98', '127',
       '82', '72', '172', '94', '175', '195', '68', '186', '198', '121',
       '-', '67', '174', '199', '56', '169', '149

Después de darnos 

In [9]:
df["Glucose"] = pd.to_numeric(df["Glucose"], errors = "coerce")

In [10]:
df["City"].value_counts()

New York    767
-             1
Name: City, dtype: int64

Tomamos la decisión de eliminar la columna de ciudad, ya que todos los valores son New York, y por lo tanto no es una columna con valores relevantes para el diagnóstico de pacientes con diabetes. 

In [11]:
del df["City"]

In [12]:
df["BloodPressure"].unique()

array(['72', '66', '64', '40', '74', '50', '0', '70', '96', '92', '80',
       '60', '84', '30', '88', '90', '94', '76', '82', '75', '58', '78',
       '68', '110', '56', '62', '85', '86', '48', '44', '65', '108', '55',
       '122', '54', '52', '98', '104', '95', '46', '102', '100', '61',
       '-', '24', '38', '106', '114'], dtype=object)

In [13]:
df["BloodPressure"] = pd.to_numeric(df["BloodPressure"], errors = "coerce")
df["BloodPressure"] = df["BloodPressure"].replace(0,np.nan)

In [14]:
df.loc[df['SkinThickness'] <= 0] = np.nan
df["SkinThickness"].value_counts()

TypeError: '<=' not supported between instances of 'str' and 'int'

In [None]:
df["Insulin"].value_counts()

In [None]:
df["Insulin"] = pd.to_numeric(df["Insulin"], errors = "coerce")

In [None]:
df["Insulin"].unique()

In [None]:
df["BMI"].unique()

In [None]:
df.loc[df['BMI'] == 0.] = np.nan
df["BMI"].value_counts()

In [None]:
df

In [None]:
df["DiabetesPedigreeFunction"] = pd.to_numeric(df["DiabetesPedigreeFunction"], errors = "coerce")

In [None]:
df["Age"] = pd.to_numeric(df["Age"], errors = "coerce")
df.loc[df['Age'] > 81] = np.nan

In [None]:
df = df.dropna()

In [None]:
le = preprocessing.LabelEncoder()
df["Hair color"]=le.fit_transform(df["Hair color"])

In [None]:
df

In [None]:
df.describe()

In [None]:
min_max_scaler = preprocessing.MinMaxScaler()
df_scaled = min_max_scaler.fit_transform(df)
clases=df.columns
df = pd.DataFrame(df_scaled)
df.columns=clases
df

In [None]:
sn.set_style("whitegrid");
sn.pairplot(df, hue="Outcome", height=2);
plt.show()

### Separación de datos

In [None]:
Y = df["Outcome"]
X = df.drop(columns=["Outcome"])

X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, Y, test_size=0.2, random_state=53, shuffle=True)

### KNN 

### Construcción del modelo

In [None]:
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_train, y_train )

In [None]:
predicted=neigh.predict(X_test)
cm =confusion_matrix(y_test, predicted)
plt.figure(figsize = (10,4))
sn.heatmap(cm, annot=True, cmap="YlGnBu")

In [None]:
print(classification_report(y_test, predicted))

### Calibración del modelo

In [None]:
params = {}
params["n_neighbors"]=[2,3,5,7,10,15,20,25,30,35]
params["weights"]=["uniform","distance"]
params["algorithm"]=["ball_tree", "kd_tree", "brute"]


print(params)
best_model = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=params, cv= 3, n_jobs=-1)
best_model.fit(X_train, y_train)

In [None]:
best_model.best_params_

In [None]:
predicted= best_model.predict(X_test)
cm =confusion_matrix(y_test, predicted)
plt.figure(figsize = (10,4))
sn.heatmap(cm, annot=True, cmap="YlGnBu")

In [None]:
print("Accuracy",accuracy_score(y_test, predicted).round(3))
print(classification_report(y_test, predicted))

### Random Forest Classifier

### Construccion del modelo

In [None]:
randomForest = RandomForestClassifier(max_depth=2, random_state = 0)
randomForest.fit(X_train,y_train, sample_weight=None)

In [None]:
predictedRandomForest = randomForest.predict(X_test)
cm2 = confusion_matrix(y_test, predicted)
plt.figure(figsize = (10,4))
sn.heatmap(cm2, annot=True, cmap="YlGnBu")

In [None]:
print("Accuracy",accuracy_score(y_test, predictedRandomForest).round(3))
print(classification_report(y_test, predictedRandomForest))

### Calibración del modelo

In [None]:
n_estimators = [100, 300, 500, 800, 1200]
max_depth = [5, 8, 15, 25, 30]
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10] 

hyperF = dict(n_estimators = n_estimators, max_depth = max_depth,  
              min_samples_split = min_samples_split, 
             min_samples_leaf = min_samples_leaf)

gridF = GridSearchCV(randomForest, hyperF, cv = 3, verbose = 1, n_jobs = -1)
gridF.fit(X_train, y_train)

In [None]:
gridF.best_params_

In [None]:
predictedRF = gridF.predict(X_test)
cmRF =confusion_matrix(y_test, predictedRF)
plt.figure(figsize = (10,4))
sn.heatmap(cm, annot=True, cmap="YlGnBu")

In [None]:
print("Accuracy",accuracy_score(y_test, predictedRF).round(3))
print(classification_report(y_test, predictedRF))