<a href="https://colab.research.google.com/github/Dr-Carlos-Villasenor/ReconocimientoPatrones/blob/master/PR_L08.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Reconocimiento de Patrones
##Dr. Carlos Villaseñor
##Lección 8 - Búsqueda de hiperparámetros

## Presentación de los Datos

In [1]:
import numpy as np
import pandas as pd
import time
import warnings
from sklearn import metrics, preprocessing, tree
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
warnings.filterwarnings("ignore")

In [2]:
from google.colab import files
uploaded = files.upload()
df = pd.read_csv('loan_prediction.csv')
df.head()

Saving loan_prediction.csv to loan_prediction.csv


Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
0,5849,0.0,0.0,360.0,1.0,1
1,4583,1508.0,128.0,360.0,1.0,0
2,3000,0.0,66.0,360.0,1.0,1
3,2583,2358.0,120.0,360.0,1.0,1
4,6000,0.0,141.0,360.0,1.0,1


In [3]:
from sklearn.tree import DecisionTreeClassifier as dt
model = dt()

## Train-Test Split

In [34]:
x = np.asanyarray(df.drop(columns=['Loan_Status']))
y = np.asanyarray(df[['Loan_Status']])
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2)
model = dt(min_samples_leaf=4)
model.fit(xtrain, ytrain)
print('Train: ', model.score(xtrain, ytrain))
print('Test: ', model.score(xtest, ytest))

Train:  0.8553971486761711
Test:  0.6991869918699187


## Validación cruzada

In [45]:
model = dt()
scores = cross_val_score(model, xtrain, ytrain, cv=64, scoring='f1_macro')
print(scores)

[0.85454545 0.79487179 0.85454545 0.2        0.27272727 0.56363636
 0.27272727 0.33333333 0.56363636 0.66666667 0.79487179 0.85454545
 0.73333333 0.66666667 0.66666667 0.73333333 0.56363636 0.33333333
 0.73333333 0.33333333 0.61904762 0.46666667 0.61904762 0.46666667
 0.61904762 0.46666667 0.38461538 0.73333333 0.73333333 0.66666667
 0.85454545 0.73333333 0.61904762 0.46666667 0.46666667 0.27272727
 0.5        0.66666667 0.36507937 0.5        0.33333333 0.85454545
 0.56363636 0.78787879 0.65       0.53333333 0.36363636 0.65
 0.57142857 0.22222222 0.78787879 0.65       0.41666667 0.78787879
 1.         0.22222222 0.41666667 0.36363636 0.65       0.78787879
 1.         0.53333333 0.84444444 0.53333333]


In [46]:
print(scores.mean(), scores.std())

0.5864612817737818 0.19801892314391029


## KFold

In [None]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=5)

train_scores = []
dev_scores = []

for train_index, test_index in kf.split(xtrain):
  train, dev = xtrain[train_index], xtrain[test_index]
  y_train, y_dev = ytrain[train_index], ytrain[test_index]
  model = dt()
  model.fit(train, y_train)
  train_scores.append(model.score(train,y_train))
  dev_scores.append(model.score(dev, y_dev))


print(train_scores)
print(np.mean(train_scores))
print(dev_scores)
print(np.mean(dev_scores))

[1.0, 1.0, 1.0, 1.0, 1.0]
1.0
[0.6868686868686869, 0.6224489795918368, 0.5612244897959183, 0.6836734693877551, 0.6632653061224489]
0.6434961863533293


## Grid Search

In [None]:
parameters = {'max_depth':[1,2,3,4,5],
              'min_samples_leaf':[1,2,3,4,5],
              'min_samples_split':[2,3,4,5],
              'criterion' : ['gini','entropy']}

In [None]:
search_obj = GridSearchCV(model, parameters, cv=5, scoring='f1_macro')
fit_obj = search_obj.fit(x, y)
print(fit_obj.cv_results_['mean_test_score'])
best_model = fit_obj.best_estimator_

[0.70741357 0.70741357 0.70741357 0.70741357 0.70741357 0.70741357
 0.70741357 0.70741357 0.70741357 0.70741357 0.70741357 0.70741357
 0.70741357 0.70741357 0.70741357 0.70741357 0.70741357 0.70741357
 0.70741357 0.70741357 0.686559   0.686559   0.686559   0.686559
 0.686559   0.686559   0.686559   0.686559   0.68503533 0.68503533
 0.68503533 0.68503533 0.68503533 0.68503533 0.68736277 0.68503533
 0.6880986  0.6880986  0.6880986  0.6880986  0.686559   0.686559
 0.686559   0.686559   0.686559   0.6895075  0.686559   0.6895075
 0.68586884 0.68843179 0.68843179 0.68586884 0.68819627 0.69075923
 0.68843179 0.69075923 0.67941612 0.67448555 0.67941612 0.67941612
 0.68005905 0.67697512 0.67697512 0.67697512 0.67976772 0.67697512
 0.67697512 0.68005905 0.67726496 0.67575011 0.67879778 0.67726496
 0.67969941 0.68202684 0.67969941 0.68121553 0.67366259 0.67517744
 0.67136939 0.67136939 0.66108016 0.66108016 0.66108016 0.66063377
 0.66596589 0.65876345 0.66304737 0.66304737 0.65302425 0.65982071


In [None]:
print(best_model)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=1, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')


In [None]:
best_model.fit(xtrain, ytrain)
# Make predictions using the new model.
print('Train: ', best_model.score(xtrain, ytrain))
print('Test: ', best_model.score(xtest, ytest))

Train:  0.7678207739307535
Test:  0.7804878048780488


## Ramdonized Search

In [None]:
search_obj = RandomizedSearchCV(model, parameters, cv=5, 
                                scoring='f1_macro', n_iter=15)
fit_obj = search_obj.fit(xtrain, ytrain)
print(fit_obj.cv_results_['mean_test_score'])
best_model = fit_obj.best_estimator_

[0.6721033  0.67702883 0.70637494 0.678073   0.69916128 0.61968833
 0.70637494 0.69262019 0.633026   0.66271876 0.70637494 0.66975593
 0.68097192 0.68380785 0.70637494]


In [None]:
best_model.fit(xtrain, ytrain)
# Make predictions using the new model.
print('Train: ', best_model.score(xtrain, ytrain))
print('Test: ', best_model.score(xtest, ytest))

Train:  0.7678207739307535
Test:  0.7804878048780488
