In [1]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import make_classification

def get_datasets():
    """Build the synthetic classification dataset used by the KNN search.

    Returns:
        tuple: The ``(X, y)`` pair produced by ``make_classification`` for
        1000 samples and 20 features (15 informative, 5 redundant), generated
        with ``random_state=1``.
    """
    return make_classification(
        n_samples=1000,
        n_features=20,
        n_informative=15,
        n_redundant=5,
        random_state=1,
    )

In [2]:
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

In [3]:
## define the model and its hyperparameter search space
model = KNeighborsClassifier()
# odd neighbour counts from 1 to 19
n_neighbors = range(1, 21, 2)
weights = ['uniform', 'distance']
# 'minkowski' is intentionally not listed: the grid never varies `p`, and with
# KNeighborsClassifier's default p=2 the Minkowski metric is the Euclidean
# distance, so it only re-evaluated every 'euclidean' candidate under a
# second name.
distance = ['euclidean', 'manhattan']

In [4]:
## build and run the grid search
X, y = get_datasets()
grid = {
    'n_neighbors': n_neighbors,
    'weights': weights,
    'metric': distance,
}
cv = KFold(n_splits=5)
# NOTE(review): the name `grid_search` is rebound further down the notebook by
# `def grid_search(...)`, so this search object is only reachable afterwards
# through `grid_result`.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
)
grid_result = grid_search.fit(X, y)

In [6]:
import pandas as pd
# Load the training features, the training target and the test features,
# in that order, from the datasets/ folder.
X_oversample, y_oversample, Test = map(pd.read_csv, (
    'datasets/X_oversample.csv',
    'datasets/y_oversample.csv',
    'datasets/TestEncoded.csv',
))

In [7]:
from xgboost import XGBClassifier

In [8]:
# model = XGBClassifier(booster='gbtree', objective='binary:logistic', random_state=2)
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

def grid_search(params, random=False, X=None, y=None):
    """Tune an XGBoost classifier with 5-fold cross-validation and report the winner.

    Prints the best parameter combination and its cross-validated score, and
    returns the fitted search so the caller can reuse ``best_params_`` /
    ``best_estimator_`` instead of retyping the winning values by hand.

    Args:
        params: Search space mapping parameter names to candidate values.
        random: If true, evaluate 5 sampled candidates with
            ``RandomizedSearchCV``; otherwise evaluate the whole grid with
            ``GridSearchCV``.
        X: Training features. Defaults to the notebook-level ``X_oversample``.
        y: Training target. Defaults to the notebook-level ``y_oversample``.
            A single-column 2-D target is flattened to 1-D before fitting.

    Returns:
        The fitted ``GridSearchCV`` / ``RandomizedSearchCV`` object.
    """
    seed = 2

    # Fall back to the notebook globals so existing calls keep working.
    if X is None:
        X = X_oversample
    if y is None:
        y = y_oversample

    # A target read with pd.read_csv is a one-column DataFrame; estimators
    # expect a 1-D target, and the truncated warning in the recorded output is
    # consistent with the column-vector conversion warning.
    if getattr(y, "ndim", 1) == 2 and y.shape[1] == 1:
        y = y.iloc[:, 0] if hasattr(y, "iloc") else y[:, 0]

    ## baseline XGBoost classifier
    xgb = XGBClassifier(booster='gbtree', objective='binary:logistic', random_state=seed)

    ## cross-validation splitter
    # NOTE(review): no shuffling is requested; if the oversampled rows are
    # ordered by class or origin the folds may be unbalanced — confirm.
    kfold = KFold(n_splits=5)

    ## randomized or exhaustive search
    if random:
        # Seeded so the sampled candidates are the same on every run.
        search = RandomizedSearchCV(xgb, params, cv=kfold, n_iter=5, n_jobs=-1,
                                    random_state=seed)
    else:
        search = GridSearchCV(xgb, params, cv=kfold, n_jobs=-1)

    ## fit every candidate
    search.fit(X, y)

    ## report the best parameters
    best_params = search.best_params_
    print(best_params)

    ## report the best cross-validated score
    best_score = search.best_score_
    print("best score", best_score)

    return search

In [9]:
# Stage 1: tune n_estimators on its own.
# The trailing semicolon keeps the cell output limited to the printed summary.
grid_search(params={
    'n_estimators': [2, 25, 50, 75, 100],
});

  return f(*args, **kwargs)


{'n_estimators': 100}
best score 0.8082176541749503


In [10]:
# Stage 2: tune max_depth (1 through 8) with n_estimators held at 100,
# the value reported as best by the recorded n_estimators search.
# The trailing semicolon keeps the cell output limited to the printed summary.
grid_search(params=dict(
    max_depth=list(range(1, 9)),
    n_estimators=[100],
));

{'max_depth': 8, 'n_estimators': 100}
best score 0.8108818364990361


In [11]:
# Stage 3: tune learning_rate with max_depth=8 and n_estimators=100 held fixed.
# The trailing semicolon keeps the cell output limited to the printed summary.
grid_search(params=dict(
    learning_rate=[0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5],
    max_depth=[8],
    n_estimators=[100],
));

{'learning_rate': 0.5, 'max_depth': 8, 'n_estimators': 100}
best score 0.8127356999894658


In [12]:
# Stage 4: tune min_child_weight with the other three parameters held fixed.
# NOTE(review): learning_rate is pinned to 0.2 here although the recorded
# output of the learning_rate search reported 0.5 as best — confirm this is
# intentional.
# The trailing semicolon keeps the cell output limited to the printed summary.
grid_search(params=dict(
    min_child_weight=[1, 2, 3, 4, 5],
    learning_rate=[0.2],
    max_depth=[8],
    n_estimators=[100],
));

{'learning_rate': 0.2, 'max_depth': 8, 'min_child_weight': 3, 'n_estimators': 100}
best score 0.8104180015982276


In [13]:
# Final model: tuned hyperparameters first, then the fixed settings shared
# with the searches above.
# NOTE(review): the recorded min_child_weight search reported 3 as best, but
# 1 is used here — confirm this is intentional.
model = XGBClassifier(
    n_estimators=100,
    max_depth=8,
    learning_rate=0.2,
    min_child_weight=1,
    booster='gbtree',
    objective='binary:logistic',
    random_state=2,
)

In [14]:
# Train the final model on the oversampled training set.
# y_oversample comes from pd.read_csv and is therefore a one-column DataFrame;
# it is flattened to a 1-D array so the estimator receives the target shape it
# expects instead of a column vector.
model.fit(X_oversample, y_oversample.values.ravel())



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.2, max_delta_step=0, max_depth=8,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=2,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [15]:
# Predict with the fitted model on the test frame loaded from
# datasets/TestEncoded.csv.
y_pred = model.predict(Test)

In [18]:
def results_to_csv(file_name, y_pred, output_dir="datasets"):
    """Save predictions as ``<output_dir>/<file_name>.csv`` and return them.

    Args:
        file_name: Base name of the CSV file, without the ``.csv`` extension.
        y_pred: Predicted values, one per row; must support ``len()``.
        output_dir: Directory that receives the file. Defaults to
            ``"datasets"`` and is created when it does not exist yet.

    Returns:
        pandas.DataFrame: Two columns, ``id`` (1-based row number) and
        ``rating`` (the values of ``y_pred``), exactly as written to disk.
    """
    import os  # local import: the notebook does not import os elsewhere

    # Named `submission` rather than `model` so the local does not read like
    # the notebook-level estimator of the same name.
    submission = pd.DataFrame({
        'id': list(range(1, len(y_pred) + 1)),
        'rating': y_pred,
    })

    # Writing into a missing directory would otherwise fail after all the
    # training and prediction work has already been done.
    os.makedirs(output_dir, exist_ok=True)
    submission.to_csv(os.path.join(output_dir, "{}.csv".format(file_name)), index=False)
    return submission

In [19]:
# Write the predictions to datasets/ and display the resulting frame.
results_to_csv(
    file_name="XGBoost-HyperparameterTunning-1",
    y_pred=y_pred,
)

Unnamed: 0,id,rating
0,1,1.0
1,2,1.0
2,3,0.0
3,4,1.0
4,5,1.0
...,...,...
1443,1444,1.0
1444,1445,1.0
1445,1446,1.0
1446,1447,0.0
