In [1]:
import pandas as pd
import numpy as np

In [2]:
# The raw file ships without a header row, so the column names are supplied by hand.
_headers = [
    'buying', 'maint', 'doors', 'persons',
    'lug_boot', 'safety', 'car',
]
# Load the cars dataset; index_col=None means no column is promoted to the index.
df = pd.read_csv('Datos/car.data', index_col=None, names=_headers)
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,car
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1728 non-null   object
 1   maint     1728 non-null   object
 2   doors     1728 non-null   object
 3   persons   1728 non-null   object
 4   lug_boot  1728 non-null   object
 5   safety    1728 non-null   object
 6   car       1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [5]:
# One-hot encode the six feature columns; the 'car' label column is not listed,
# so it passes through unchanged.
_feature_cols = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']
_df = pd.get_dummies(df, columns=_feature_cols)
_df.head()

Unnamed: 0,car,buying_high,buying_low,buying_med,buying_vhigh,maint_high,maint_low,maint_med,maint_vhigh,doors_2,...,doors_5more,persons_2,persons_4,persons_more,lug_boot_big,lug_boot_med,lug_boot_small,safety_high,safety_low,safety_med
0,unacc,0,0,0,1,0,0,0,1,1,...,0,1,0,0,0,0,1,0,1,0
1,unacc,0,0,0,1,0,0,0,1,1,...,0,1,0,0,0,0,1,0,0,1
2,unacc,0,0,0,1,0,0,0,1,1,...,0,1,0,0,0,0,1,1,0,0
3,unacc,0,0,0,1,0,0,0,1,1,...,0,1,0,0,0,1,0,0,1,0
4,unacc,0,0,0,1,0,0,0,1,1,...,0,1,0,0,0,1,0,0,0,1


In [6]:
# Split the encoded frame into a feature matrix (everything except 'car')
# and a 1-D label vector taken from the 'car' column.
X = _df.drop(columns=['car']).to_numpy()
y = _df['car'].to_numpy()

# LogisticRegression with Cross Validation

In [7]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression classifier left at its default hyperparameters.
_lr = LogisticRegression()

In [9]:
from sklearn.model_selection import cross_val_score

In [11]:
# Evaluate _lr with 5-fold cross-validation. No `scoring` argument is given,
# so each fold is scored with the estimator's own default score method.
_scores = cross_val_score(_lr, X, y, cv=5)

In [12]:
print(_scores) # per-fold accuracy (the default score of a classifier), not R^2

[0.70231214 0.84971098 0.74566474 0.75652174 0.76231884]


# Grid Search with Cross Validation

In [13]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np

In [14]:
# Decision-tree classifier to be tuned by the grid search below.
# random_state is fixed so repeated runs of the notebook give the same tree
# (and therefore the same best_params_/best_score_); the value mirrors the
# seed used for the random forest in the leave-one-out section.
clf = DecisionTreeClassifier(random_state=1)

In [15]:
# Search space for the tree: candidate max_depth values 1 through 7.
params = dict(max_depth=np.arange(1, 8))

In [16]:
np.arange(1,8)

array([1, 2, 3, 4, 5, 6, 7])

In [17]:
# Exhaustive search over every value in `params`, each evaluated with 5-fold cross-validation.
clf_cv = GridSearchCV(clf, param_grid=params, cv=5)

In [19]:
# Run the grid search on the full cars feature matrix and labels.
clf_cv.fit(X, y)

# Report the parameter setting that obtained the best cross-validated score.
print('Mejor Parametro para el árbol de decisión: {}'.format(clf_cv.best_params_))

In [20]:
model = clf_cv.best_estimator_
model

In [21]:
score = clf_cv.best_score_
score

0.7778822149618833

In [23]:
param = clf_cv.best_params_
param

{'max_depth': 2}

## Randomized Search with Cross Validation

In [25]:
_df.head()

Unnamed: 0,car,buying_high,buying_low,buying_med,buying_vhigh,maint_high,maint_low,maint_med,maint_vhigh,doors_2,...,doors_5more,persons_2,persons_4,persons_more,lug_boot_big,lug_boot_med,lug_boot_small,safety_high,safety_low,safety_med
0,unacc,0,0,0,1,0,0,0,1,1,...,0,1,0,0,0,0,1,0,1,0
1,unacc,0,0,0,1,0,0,0,1,1,...,0,1,0,0,0,0,1,0,0,1
2,unacc,0,0,0,1,0,0,0,1,1,...,0,1,0,0,0,0,1,1,0,0
3,unacc,0,0,0,1,0,0,0,1,1,...,0,1,0,0,0,1,0,0,1,0
4,unacc,0,0,0,1,0,0,0,1,1,...,0,1,0,0,0,1,0,0,0,1


In [28]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [29]:
# Random-forest classifier to be tuned by the randomized search below.
# Seeded so the forest (bootstrap samples, feature sub-sampling) is repeatable
# across notebook runs, matching the seeded forest used in the leave-one-out section.
clf = RandomForestClassifier(random_state=1)

In [30]:
# Search space for the forest: three ensemble sizes crossed with depths 1 through 7.
params = dict(
    n_estimators=[500, 1000, 2000],
    max_depth=np.arange(1, 8),
)

In [31]:
# Randomized search: draws a subset of parameter settings from `params`
# (n_iter is left at its default) and scores each with 5-fold cross-validation.
# random_state pins which settings are drawn so the reported best parameters
# are reproducible; same seed as the KNN randomized search later in the notebook.
clf_cv = RandomizedSearchCV(clf, param_distributions=params, cv=5, random_state=0)

In [32]:
clf_cv.fit(X, y)

In [34]:
print('Tuned Random Forest Parameters: {}'.format(clf_cv.best_params_))

Tuned Random Forest Parameters: {'n_estimators': 2000, 'max_depth': 5}


In [36]:
print('Best score is: {}'.format(clf_cv.best_score_))

Best score is: 0.7627896456396079


In [37]:
model = clf_cv.best_estimator_
model

## Leave One Out Cross-Validation

In [41]:
import pandas as pd
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [53]:
# Load the dataset; header=None because the file has no header row.
# NOTE: `url` is a relative local path, not a remote URL.
url = 'Datos/sonar.csv'
dataframe = pd.read_csv(url, header=None)
# Work with the underlying NumPy array from here on.
data = dataframe.to_numpy()

In [54]:
data.shape

(208, 61)

In [51]:
# Split into X and y: every column but the last is a feature, the last column is the label.
X, y = data[:, :-1], data[:, -1]

In [52]:
X.shape

(208, 60)

In [55]:
y.shape

(208,)

In [56]:
cv = LeaveOneOut()

In [57]:
model = RandomForestClassifier(random_state=1)

In [59]:
# Leave-one-out evaluation: `cv` is a LeaveOneOut splitter, so the model is refit
# once per sample and each returned score is the accuracy on a single held-out row.
# n_jobs=1 runs the fits sequentially.
scores = cross_val_score(model, X, y, scoring='accuracy', cv = cv, n_jobs=1)

In [62]:

# Mean and standard deviation of the per-fold scores. With one test sample per
# fold every score is either 0 or 1, which is why the standard deviation is large.
print('Accuracy: ', np.mean(scores), np.std(scores))

Accuracy:  0.8221153846153846 0.38241558414123605


## GridSearchCV en el dataset: Breast Cancer Wisconsin

In [63]:
import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [64]:
from sklearn.datasets import load_breast_cancer
# Load the bundled breast-cancer dataset; the cells below read its 'data',
# 'target' and 'feature_names' entries.
# NOTE: this rebinds `data`, which previously held the sonar array.
data = load_breast_cancer()

In [69]:
data['data']

array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]])

In [70]:
data['target']

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,

In [71]:
# Assemble one frame holding the features plus the label as a final 'target' column.
# Stacking the label next to the float feature matrix makes 'target' a float column too.
df = pd.DataFrame(
    np.column_stack((data['data'], data['target'])),
    columns=data['feature_names'].tolist() + ['target'],
)
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0.0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0.0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0.0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0.0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0.0


In [72]:
df.shape

(569, 31)

In [74]:
# Keep only the first ten columns (the "mean ..." measurements) as model features.
features = df.columns[:10].tolist()
features

['mean radius',
 'mean texture',
 'mean perimeter',
 'mean area',
 'mean smoothness',
 'mean compactness',
 'mean concavity',
 'mean concave points',
 'mean symmetry',
 'mean fractal dimension']

In [75]:
# Reduced frame: the selected feature columns followed by the label column.
# NOTE: this rebinds `_df`, which earlier held the one-hot encoded cars data.
_df = df[[*features, 'target']]
_df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,0.0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.0


In [77]:
# Feature frame (all columns except the label) and the label series.
X = _df.drop(columns=['target'])
y = _df.loc[:, 'target']

In [83]:
# Hold out 20% of the rows as a test set; random_state=1 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.2)

In [86]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 114 entries, 421 to 442
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   mean radius             114 non-null    float64
 1   mean texture            114 non-null    float64
 2   mean perimeter          114 non-null    float64
 3   mean area               114 non-null    float64
 4   mean smoothness         114 non-null    float64
 5   mean compactness        114 non-null    float64
 6   mean concavity          114 non-null    float64
 7   mean concave points     114 non-null    float64
 8   mean symmetry           114 non-null    float64
 9   mean fractal dimension  114 non-null    float64
dtypes: float64(10)
memory usage: 9.8 KB


In [87]:
# Base k-nearest-neighbours classifier; its hyperparameters are tuned by the searches below.
# NOTE(review): the features are passed in unscaled although their magnitudes differ
# widely (e.g. 'mean area' vs 'mean smoothness') - consider standardizing before KNN.
knn = KNeighborsClassifier()

## GridSearch CV

In [92]:
# Grid for KNN: 9 neighbour counts x 2 weightings x 5 leaf sizes x 2 algorithms
# = 180 candidate settings.
params = dict(
    n_neighbors=np.arange(1, 10),
    weights=['uniform', 'distance'],
    leaf_size=[1, 3, 5, 7, 10],
    algorithm=['auto', 'kd_tree'],
)

In [93]:
# Exhaustive search over every combination in `params`, scored with 5-fold cross-validation.
# NOTE: this rebinds `model`, which earlier cells used for other estimators.
model = GridSearchCV(knn, param_grid=params, cv=5)

In [105]:
model.fit(X_train, y_train)

¿Cómo sabemos cuáles son los mejores hiperparámetros?
* best_params_
* best_score_
* cv_results_

In [111]:
print('Mejores parámetros: '+ str(model.best_params_))

Mejores parámetros: {'algorithm': 'auto', 'leaf_size': 1, 'n_neighbors': 9, 'weights': 'distance'}


In [112]:
print('Mejor score: '+ str(model.best_score_))

Mejor score: 0.8989010989010989


In [113]:
# Tabulate the cross-validation results: one row per candidate parameter setting.
# NOTE: this rebinds `scores`, which earlier held the leave-one-out score array.
scores = pd.DataFrame(model.cv_results_)
scores.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algorithm,param_leaf_size,param_n_neighbors,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.008407,0.006248,0.009764,0.00106,auto,1,1,uniform,"{'algorithm': 'auto', 'leaf_size': 1, 'n_neigh...",0.824176,0.813187,0.813187,0.901099,0.846154,0.83956,0.03304,141
1,0.006583,0.004123,0.005802,0.003735,auto,1,1,distance,"{'algorithm': 'auto', 'leaf_size': 1, 'n_neigh...",0.824176,0.813187,0.813187,0.901099,0.846154,0.83956,0.03304,141
2,0.004354,0.000214,0.008339,0.001153,auto,1,2,uniform,"{'algorithm': 'auto', 'leaf_size': 1, 'n_neigh...",0.813187,0.758242,0.824176,0.89011,0.846154,0.826374,0.043068,171
3,0.005075,0.001896,0.005103,0.001037,auto,1,2,distance,"{'algorithm': 'auto', 'leaf_size': 1, 'n_neigh...",0.824176,0.813187,0.813187,0.901099,0.846154,0.83956,0.03304,141
4,0.004162,7.3e-05,0.008512,0.001544,auto,1,3,uniform,"{'algorithm': 'auto', 'leaf_size': 1, 'n_neigh...",0.89011,0.879121,0.901099,0.923077,0.868132,0.892308,0.018906,21


In [114]:
# Predict labels for the held-out test set with the fitted search object.
# NOTE: `predition` is a misspelling of "prediction"; the name is kept because the
# following cells (display, accuracy_score, confusion_matrix) reference it as spelled.
predition = model.predict(X_test)

In [115]:
predition

array([1., 1., 1., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0., 1., 0., 1., 1.,
       1., 1., 0., 1., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1.,
       1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 0.,
       0., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0.,
       1., 1., 1., 1., 1., 0., 1., 0., 0., 1., 1., 0., 1., 0., 1., 0., 1.,
       1., 1., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 0., 1., 1., 1., 0., 0., 1., 1., 1., 1.])

In [116]:
accuracy_score(y_test, predition)

0.868421052631579

In [117]:
confusion_matrix(y_test, predition)

array([[31, 11],
       [ 4, 68]])

## Randomized Search

In [132]:
from sklearn.model_selection import RandomizedSearchCV

In [139]:
import scipy as sp
# Import the stats submodule explicitly: `import scipy` alone does not guarantee
# that `sp.stats` is available, so relying on it only works if some other
# library happened to load scipy.stats first.
from scipy import stats

# Distributions/lists to sample from in the randomized search.
# stats.randint(1, 20) draws integers from 1 to 19 (the upper bound is exclusive).
params = {'n_neighbors': stats.randint(1, 20),
          'weights': ['uniform', 'distance'],
          'leaf_size': stats.randint(1, 20),
          'algorithm': ['auto', 'kd_tree']}

In [140]:
# Randomized search: evaluates 100 parameter settings sampled from `params`
# with 5-fold cross-validation; random_state=0 makes the sampling repeatable.
model = RandomizedSearchCV(knn, params, n_iter=100, random_state=0, cv=5)

In [141]:
model.fit(X_train, y_train)

In [142]:
print('Mejores parámetros: '+ str(model.best_params_))

Mejores parámetros: {'algorithm': 'kd_tree', 'leaf_size': 14, 'n_neighbors': 9, 'weights': 'distance'}


In [143]:
print('Mejor score: '+ str(model.best_score_))

Mejor score: 0.8989010989010989


In [126]:
# Tabulate the cross-validation results: one row per sampled parameter setting.
scores = pd.DataFrame(model.cv_results_)
scores.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algorithm,param_leaf_size,param_n_neighbors,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.004802,0.000831,0.005518,0.00163,auto,6,1,distance,"{'algorithm': 'auto', 'leaf_size': 6, 'n_neigh...",0.824176,0.813187,0.813187,0.901099,0.846154,0.83956,0.03304,83
1,0.006731,0.003012,0.005068,0.001649,kd_tree,4,8,distance,"{'algorithm': 'kd_tree', 'leaf_size': 4, 'n_ne...",0.901099,0.89011,0.901099,0.923077,0.857143,0.894505,0.021534,6
2,0.003651,0.000307,0.008421,0.000702,kd_tree,6,3,uniform,"{'algorithm': 'kd_tree', 'leaf_size': 6, 'n_ne...",0.89011,0.879121,0.901099,0.923077,0.868132,0.892308,0.018906,14
3,0.003527,0.000348,0.008702,0.000755,kd_tree,7,9,uniform,"{'algorithm': 'kd_tree', 'leaf_size': 7, 'n_ne...",0.89011,0.868132,0.923077,0.901099,0.868132,0.89011,0.02085,24
4,0.004347,0.001259,0.004571,0.000806,auto,2,7,distance,"{'algorithm': 'auto', 'leaf_size': 2, 'n_neigh...",0.89011,0.89011,0.901099,0.923077,0.857143,0.892308,0.021308,20


In [127]:
prediction = model.predict(X_test)

In [128]:
prediction

array([1., 1., 1., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0., 1., 0., 1., 1.,
       1., 1., 0., 1., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1.,
       1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 0.,
       0., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0.,
       1., 1., 1., 1., 1., 0., 1., 0., 0., 1., 1., 0., 1., 0., 1., 0., 1.,
       1., 1., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 0., 1., 1., 1., 0., 0., 1., 1., 1., 1.])

In [144]:
accuracy_score(y_test, prediction)

0.868421052631579

In [145]:
confusion_matrix(y_test, prediction)

array([[31, 11],
       [ 4, 68]])