In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

# Ejercicio 1

In [4]:
df = pd.read_csv('data/data.csv')

In [5]:
df['target'].value_counts()

target
0    343
1    157
Name: count, dtype: int64

Notamos que estamos ante un dataset desbalanceado, lo tendremos en cuenta para separar los datos de modo estratificado

In [6]:
def train_test_split(df,train_size=0.8):
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    n = df.shape[0]
    cantNeg, cantPos = df['target'].value_counts()
    proporcion = cantPos / n

    cant_en_train = int(n*train_size)
    cant_en_validacion = n - cant_en_train

    res = []
    cant_positivas = int(proporcion * cant_en_validacion)
    cant_negativas = cant_en_validacion - cant_positivas

    contador_positivos, contador_negativos = 0, 0
    for index, value in enumerate(df['target']):
        if value == 0:
            contador_negativos += 1
            res.append(index)

        if contador_negativos == cant_negativas:
            break
    for index, value in enumerate(df['target']):

        if value == 1:
            contador_positivos += 1
            res.append(index)

        if contador_positivos == cant_positivas:
            break

    df_dev = df.loc[~df.index.isin(res)].reset_index(drop=True)
    df_eval = df.loc[res].reset_index(drop=True)
    return df_dev,df_eval

In [7]:
df_dev , df_eval =  train_test_split(df)


# Ejercicio 2

In [8]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score, roc_auc_score , average_precision_score
from sklearn.model_selection import cross_val_predict


In [9]:
X_train = df_dev.drop(columns=['target']).values

y_train = df_dev['target'].values

X_eval   = df_eval.drop(columns=['target']).values

y_eval   =  df_eval['target'].values

In [10]:
tree_clf = DecisionTreeClassifier(max_depth=3, random_state=51)
tree_clf.fit(X_train, y_train)

In [30]:

scoring = ['accuracy', 'roc_auc', 'average_precision']

skf = StratifiedKFold(n_splits=5)

cv_results = cross_validate(tree_clf, X_train, y_train, cv = skf, scoring = scoring, return_train_score = True)

predicted_labels = cross_val_predict(tree_clf, X_train, y_train, cv = skf)

metricas_globales = {'Accuracy Global' : accuracy_score(y_train, predicted_labels),
                     'Auc Global' : roc_auc_score(y_train, predicted_labels),
                      'Average Precision Global' : average_precision_score(y_train, predicted_labels)
}

cv_results.update(metricas_globales)

df_resultados_ej_2_2 = pd.DataFrame({key: cv_results[key] for key in list(cv_results.keys())[2:]})

df_resultados_ej_2_2.index = df_resultados_ej_2_2.index + 1 # asi el indice del df se correponde con el k


In [31]:
df_resultados_ej_2_2

Unnamed: 0,test_accuracy,train_accuracy,test_roc_auc,train_roc_auc,test_average_precision,train_average_precision,Accuracy Global,Auc Global,Average Precision Global
1,0.65,0.834375,0.58547,0.770614,0.432637,0.656625,0.695,0.599467,0.386459
2,0.7,0.81875,0.732,0.852706,0.484188,0.713912,0.695,0.599467,0.386459
3,0.675,0.825,0.613455,0.796826,0.404476,0.657074,0.695,0.599467,0.386459
4,0.6875,0.809375,0.596,0.854492,0.394547,0.713593,0.695,0.599467,0.386459
5,0.7625,0.8375,0.705455,0.836928,0.516564,0.733381,0.695,0.599467,0.386459


In [33]:
alturas = [3, 5, None]
criterios = ["gini", "entropy"]
resultados = {}

for altura in alturas:
    for criterio in criterios:

        tree = DecisionTreeClassifier(max_depth = altura, criterion = criterio, random_state = 51)

        cv_results_grid = cross_validate(tree, X_train, y_train, cv = skf, scoring = 'accuracy', return_train_score = True)

        accuracy_train = cv_results_grid['train_score'].mean()

        accuracy_test =  cv_results_grid['test_score'].mean()



        resultados[(altura, criterio)] = (accuracy_train,accuracy_test)

results_df = pd.DataFrame(resultados.values(), index=resultados.keys(), columns=['Accuracy (training)', 'Accuracy (testing)'])

results_df

Unnamed: 0,Unnamed: 1,Accuracy (training),Accuracy (testing)
3.0,gini,0.825,0.695
3.0,entropy,0.78625,0.67
5.0,gini,0.93,0.69
5.0,entropy,0.920625,0.655
,gini,1.0,0.68
,entropy,1.0,0.645


Notamos que en Test el criterio de corte Gini da siempre mejor que entropy para cualquier altura.
Sera porque entropy tiende a producir arboles mas equilibrados y nuestro data set esta desbalanceado?

Asimismo tambien vemos que a medida que aumenta la altura del arbol, decae el rendimiento en el test set y aumenta en el train set, independientemente del criterio de corte utilizado.

# Ejercicio 3

In [16]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
import scipy

# Hay que revisar en la documentacion del modelo cada parametro y usar una distribucion con los valores mas probables
# Las listas son interpretadas como distribuciones uniformes discretas

# Comento los parámetros que generan errores
dist_params = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'splitter': ['best', 'random'],
    'max_depth': scipy.stats.randint(1,40), #puede ser float
    'min_samples_split': scipy.stats.randint(2,20), #puede ser float
    'min_samples_leaf': scipy.stats.randint(1,20),
    #'min_weight_fraction_leaf':scipy.stats.uniform(0,1) ,
    'max_features': scipy.stats.randint(1,20),
    'random_state': [42],
    'max_leaf_nodes': scipy.stats.randint(2,len(X_train)),
    #'min_impurity_decrease': scipy.stats.uniform(0,0.1),
    #'class_weight': {i: scipy.stats.uniform(0,1) for i in range(len(X_train[0]))},
    #'ccp_alpha': scipy.stats.uniform(0,0.1),
}

tree = DecisionTreeClassifier()
n_iter_search = 50    # Esto es la cantidad de configuraciones que pruebo
random_search = RandomizedSearchCV(estimator=tree, param_distributions=dist_params, scoring='roc_auc', n_iter=n_iter_search)
random_search.fit(X_train, y_train)

mejor_modelo = random_search.best_estimator_
mejor_score = random_search.best_score_

print("Mejor modelo:", mejor_modelo)
print("Mejor score AUCROC:", mejor_score)

Mejor modelo: DecisionTreeClassifier(criterion='log_loss', max_depth=13, max_features=14,
                       max_leaf_nodes=201, min_samples_leaf=6,
                       min_samples_split=7, random_state=42, splitter='random')
Mejor score AUCROC: 0.6177628075628075


In [17]:
from sklearn.neighbors import KNeighborsClassifier

dist_params = {
    'n_neighbors': scipy.stats.randint(1, 30),
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': scipy.stats.randint(1, 50),
    'p': scipy.stats.randint(1, 5),
    'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski'],
    #'metric_params':,
    #'n_jobs':,
}

knn = KNeighborsClassifier()
n_iter_search = 50
random_search = RandomizedSearchCV(estimator=knn, param_distributions=dist_params, scoring='roc_auc', n_iter=n_iter_search)

# Normalizo los datos para knn
from sklearn import preprocessing
X_train_normalized = preprocessing.normalize(X_train.T).T
random_search.fit(X_train_normalized, y_train)

mejor_modelo = random_search.best_estimator_
mejor_score = random_search.best_score_

print("Mejor modelo:", mejor_modelo)
print("Mejor score AUCROC:", mejor_score)


Mejor modelo: KNeighborsClassifier(algorithm='kd_tree', leaf_size=22, n_neighbors=25)
Mejor score AUCROC: 0.7755982387982389


In [18]:
from sklearn.svm import SVC

dist_params = {
    'C': scipy.stats.uniform(0, 10),
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': scipy.stats.randint(1, 6),
    'gamma': ['scale', 'auto' ],
    'coef0': scipy.stats.uniform(0, 1),
    'shrinking': [True, False],
    'probability': [True, False],
    'tol': scipy.stats.uniform(0, 0.1),
    'cache_size': scipy.stats.randint(1, 1000),
    #'class_weight': [{0: x, 1: 1 - x} for x in np.linspace(0, 1, 10)],
}

svm = SVC()
n_iter_search = 50
random_search = RandomizedSearchCV(estimator=svm, param_distributions=dist_params, scoring='roc_auc', n_iter=n_iter_search)
random_search.fit(X_train, y_train)

mejor_modelo = random_search.best_estimator_
mejor_score = random_search.best_score_

print("Mejor modelo:", mejor_modelo)
print("Mejor score AUCROC::", mejor_score)

Mejor modelo: SVC(C=4.302200282983497, cache_size=219, coef0=0.5954582858481083, degree=2,
    kernel='poly', tol=0.0041216155604131615)
Mejor score AUCROC:: 0.8897176897176898


In [19]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Hubo bastantes errores con los parámetros
dist_params = {
    #'solver': ['svd','lsqr', 'eigen'],
    #'shrinkage': ['auto',None], #scipy.stats.uniform(0, 1),
    #'priors': [],
    #'n_components': scipy.stats.randint(1, min(len(X_train[0,:]), len(X_train[0]-1))),
    'tol': scipy.stats.uniform(0, 0.1)
    #'covariance_estimator':
}

lda = LinearDiscriminantAnalysis()
n_iter_search = 50
random_search = RandomizedSearchCV(estimator=lda, param_distributions=dist_params, scoring='roc_auc', n_iter=n_iter_search)
random_search.fit(X_train, y_train)

mejor_modelo = random_search.best_estimator_
mejor_score = random_search.best_score_

print("Mejor modelo:", mejor_modelo)
print("Mejor score AUCROC::", mejor_score)

Mejor modelo: LinearDiscriminantAnalysis(tol=0.05331761009039276)
Mejor score AUCROC:: 0.7554754726754727


In [20]:
from sklearn.naive_bayes import GaussianNB

dist_params = {
    'var_smoothing': scipy.stats.uniform(0, 1),
    'priors': [None, [0.5, 0.5]],
}

nb = GaussianNB()
n_iter_search = 50
random_search = RandomizedSearchCV(estimator=nb, param_distributions=dist_params, scoring='roc_auc', n_iter=n_iter_search)
random_search.fit(X_train, y_train)

mejor_modelo = random_search.best_estimator_
mejor_score = random_search.best_score_

print("Mejor modelo:", mejor_modelo)
print("Mejor score AUCROC:", mejor_score)

Mejor modelo: GaussianNB(priors=[0.5, 0.5], var_smoothing=0.014665457319101183)
Mejor score AUCROC: 0.8601668997668999


In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
import scipy.stats

# Definir los parámetros de distribución
dist_params = {
    'n_estimators': scipy.stats.randint(100, 500),
    'max_depth': scipy.stats.randint(1, 100),
    'min_samples_split': scipy.stats.uniform(0.01, 0.99),
    'min_samples_leaf': scipy.stats.uniform(0.01, 0.49),
    'max_features': ['auto', 'sqrt', 'log2', None],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy']
}

# Definir el clasificador
rf = RandomForestClassifier()

# Definir el número de iteraciones
n_iter_search = 50

# Aplicar RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=rf, param_distributions=dist_params, scoring='roc_auc', n_iter=n_iter_search)
random_search.fit(X_train, y_train)

# Obtener el mejor modelo y el mejor score
mejor_modelo = random_search.best_estimator_
mejor_score = random_search.best_score_

print("Mejor modelo:", mejor_modelo)
print("Mejor score AUCROC:", mejor_score)

75 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
75 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Administrator\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Administrator\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "c:\Users\Administrator\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\Administrator\AppData\Loc

Mejor modelo: RandomForestClassifier(criterion='entropy', max_depth=51, max_features='log2',
                       min_samples_leaf=0.09348461478008974,
                       min_samples_split=0.13890532539746944, n_estimators=413)
Mejor score AUCROC: 0.8179375291375293
