# Hyperparameter Tuning

Scikit-learn documentation:
* [GridSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)
* [Pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html)

[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/murilogustineli/hype-tuning/blob/main/gridsearch.ipynb)

In [7]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

# Load scikit-learn and imbalanced-learn libraries
from sklearn import tree
from sklearn.datasets import load_breast_cancer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.model_selection import (
    train_test_split, learning_curve, validation_curve,
    StratifiedKFold, GridSearchCV, RandomizedSearchCV)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    make_scorer, classification_report)

### Helper functions

In [9]:
# Rename columns to lower case, replacing spaces with underscores
def rename_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize the column labels of ``df`` and return the same frame.

    Every label has its spaces replaced by underscores and is then
    lower-cased. The labels are reassigned on ``df`` itself, so the
    caller's DataFrame is modified in place.
    """
    normalized = []
    for label in df.columns:
        normalized.append(label.replace(" ", "_").lower())

    df.columns = normalized
    return df


# Load breast cancer dataset from sklearn
def load_breast_cancer_data():
    """Return the sklearn breast cancer dataset as a ``(features, target)`` pair.

    The data is requested with ``return_X_y=True`` and ``as_frame=True``.
    """
    return load_breast_cancer(return_X_y=True, as_frame=True)


# Preprocess data
def preprocess_data(
    load_data, test_size=0.2, oversample=False, random_state=42
):
    """Split a dataset into stratified train/test sets, optionally oversampling.

    Args:
        load_data: Either an ``(X, y)`` pair of features and target, or a
            zero-argument callable that returns such a pair.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        oversample: When true, the training split is resampled with SMOTE.
            The test split is never resampled.
        random_state: Seed used for both the split and SMOTE. Defaults to
            42, the value that was previously hard-coded.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """
    # Accept a loader function as well as an already-loaded (X, y) pair
    if callable(load_data):
        load_data = load_data()
    X, y = load_data

    # Train/Test Split using Stratified Sampling
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state)

    # Oversample only the training split so the test split stays untouched
    if oversample:
        sm = SMOTE(random_state=random_state)
        X_train, y_train = sm.fit_resample(X_train, y_train)

    return X_train, X_test, y_train, y_test

### Load data from sklearn

In [12]:
# Load the Breast Cancer dataset through the helper defined above
X, y = load_breast_cancer_data()

# Display the dimensions of the features and the target
X.shape, y.shape

((569, 30), (569,))

In [11]:
# Split the dataset (20% held out for testing) and oversample the training split
splits = preprocess_data(
    load_data=load_breast_cancer_data(),
    test_size=0.2,
    oversample=True,
)
X_train, X_test, y_train, y_test = splits

# Report the shapes of the resulting train and test sets
print(f"Train data: {X_train.shape, y_train.shape}")
print(f"Test data:  {X_test.shape, y_test.shape}")

Train data: ((570, 30), (570,))
Test data:  ((114, 30), (114,))
