<a href="https://colab.research.google.com/github/nicolezk/pet-adoption-prediction/blob/main/notebooks/model_tunning_split_dogs_and_cats.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Notebook Configuration

In [None]:
# Mount Google Drive so the dataset stored there can be read from /content/drive.
# Colab-only: the google.colab package is not available in a regular Jupyter kernel.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Import libraries
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Ridge, Lasso, LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.utils import resample

from xgboost import XGBClassifier

# Shared random seed: passed as random_state to resample, train_test_split
# and the RandomForestClassifier grid so runs are repeatable.
seed = 123

In [None]:
def balance_classes(df):
  """Return a frame with the same number of rows for each AdoptionSpeed label 0-4.

  The per-class size is the smallest row count among labels 1-4, rounded down
  to a multiple of 1000. If that rounding would give 0 (the smallest class has
  fewer than 1000 rows), the un-rounded minimum is used instead so the result
  is never empty.

  Label 0 is treated as the minority class and is drawn *with* replacement;
  labels 1-4 are drawn *without* replacement. Every draw uses the
  notebook-level ``seed`` as ``random_state``.

  NOTE(review): in this notebook the function runs before train_test_split
  (it is called from ``preprocessing``), so repeated copies of a label-0 row
  can end up in both the training and the test part.

  Args:
    df: DataFrame with an ``AdoptionSpeed`` column coded 0-4.

  Returns:
    A new DataFrame made of the resampled label-0 rows followed by the
    resampled rows of labels 1, 2, 3 and 4 (the index is not reset).
  """

  def round_down(x, k=3):
    # Largest multiple of 10**k that does not exceed x
    n = 10**k
    return x // n * n

  majority_counts = df[df.AdoptionSpeed != 0].AdoptionSpeed.value_counts(dropna=False)
  smallest_majority = int(majority_counts.min())

  balance_number = int(round_down(smallest_majority, 3))
  if balance_number == 0:
    # Rounding down to the nearest thousand would ask resample for zero rows
    # per class; fall back to the exact size of the smallest class.
    balance_number = smallest_majority

  balanced_parts = []
  for label in range(5):
    class_rows = df[df.AdoptionSpeed == label]
    balanced_parts.append(
        resample(class_rows,
                 replace=(label == 0),        # only label 0 is sampled with replacement
                 n_samples=balance_number,
                 random_state=seed))          # reproducible results

  # Combine the per-class samples back into one frame, label 0 first
  return pd.concat(balanced_parts)

def feature_selection(df):
  """Split ``df`` into a feature frame and the target column.

  Every column except the target ``AdoptionSpeed`` and the free-text
  ``Description`` is kept as a feature, in the original column order.

  Returns:
    ``(X, y)`` where ``X`` is the feature DataFrame and ``y`` is the
    ``AdoptionSpeed`` Series.
  """
  target_feature = 'AdoptionSpeed'
  excluded = (target_feature, 'Description')

  # at first use all remaining columns
  feature_names = list(filter(lambda name: name not in excluded, df.columns))

  return df[feature_names], df[target_feature]

def preprocessing(split=None,
                  data_path='/content/drive/MyDrive/ML - Project/data/pets_feature_engineering.csv'):
  """Load the feature-engineered pets table, optionally keep one species, and balance the classes.

  Args:
    split: ``'dogs'`` keeps rows with ``Type == 1`` and ``'cats'`` keeps rows
      with ``Type == 2``. Any other value, including the default ``None``,
      keeps all rows.
    data_path: CSV file to read. Defaults to the Google Drive location used
      by this notebook.

  Returns:
    The DataFrame returned by ``balance_classes``, built from the (optionally
    filtered) table after ``Color1Name`` and ``Color2Name`` were converted to
    the pandas ``category`` dtype.
  """
  # Read data
  df = pd.read_csv(data_path)

  if split == 'dogs':
    df = df[df['Type'] == 1]
  elif split == 'cats':
    df = df[df['Type'] == 2]

  # astype returns a new frame, so the conversion never assigns into a
  # filtered slice of the original table (which raises SettingWithCopyWarning).
  df = df.astype({'Color1Name': 'category', 'Color2Name': 'category'})

  return balance_classes(df)

def apply_best_model(X_train, X_test, y_train=None):
  """Grid-search a random forest on the training data and return the fitted search.

  The estimator is a pipeline: columns of pandas ``category`` dtype are
  one-hot encoded, every other column is passed through unchanged, and the
  result feeds a ``RandomForestClassifier``.

  Args:
    X_train: Training features (DataFrame; categorical columns must use the
      ``category`` dtype to be one-hot encoded).
    X_test: Unused. Kept so existing two-argument calls keep working.
    y_train: Training target. When omitted, the notebook-level ``y_train``
      variable is used, which is what earlier versions of this function
      always did.

  Returns:
    The fitted ``GridSearchCV`` object. Because scoring is
    ``'neg_root_mean_squared_error'``, ``best_score_`` is a negative RMSE over
    the numeric class labels: values closer to 0 are better.

  Raises:
    NameError: if ``y_train`` is neither passed nor defined at notebook level.
  """
  if y_train is None:
    # Backward compatibility with callers that rely on the global y_train.
    try:
      y_train = globals()['y_train']
    except KeyError:
      raise NameError("name 'y_train' is not defined") from None

  # Pipeline pre-processing
  encoder_options = dict(dtype='int', handle_unknown='error')
  try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`
    categorical_transformer = OneHotEncoder(sparse_output=False, **encoder_options)
  except TypeError:
    # older scikit-learn releases only accept `sparse`
    categorical_transformer = OneHotEncoder(sparse=False, **encoder_options)

  preprocessor = ColumnTransformer(
      transformers=[  ('num', 'passthrough', selector(dtype_exclude="category")),
                      ('cat', categorical_transformer, selector(dtype_include="category"))
                  ]
  )

  model_pipeline = Pipeline(
      steps=[('preprocessor', preprocessor),
            ('classifier', RandomForestClassifier())]
  )

  # Hyperparameter grid; every combination is evaluated by cross-validation.
  param_grid = {
      'classifier__n_estimators':  [300],
      'classifier__max_depth': [5, 10, 25],
      'classifier__min_samples_split': [2, 5, 10],
      'classifier__min_samples_leaf': [2, 5, 10],
      # 'auto' meant the same as 'sqrt' for a forest classifier and was removed
      # in newer scikit-learn, so listing both only doubled the search.
      'classifier__max_features': ['sqrt'],
      'classifier__bootstrap': [True, False],
      'classifier__random_state': [seed]   # fixed seed for reproducible forests
  }

  # Exhaustive search over param_grid, using all available cores
  grid_search = GridSearchCV(model_pipeline, param_grid, n_jobs=-1,
                             scoring='neg_root_mean_squared_error')

  # fit() returns the search object itself
  return grid_search.fit(X_train, y_train)

In [None]:
# Baseline: dogs and cats together (no species filter)
df = preprocessing()
X, y = feature_selection(df)

# Stratified hold-out split: 25% of the rows go to the test part.
# NOTE(review): preprocessing() has already balanced the classes (label 0 is
# sampled with replacement), so copies of one row can land on both sides.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=seed)

# apply_best_model also reads the notebook-level y_train assigned just above
best_model = apply_best_model(X_train, X_test)

# best_score_ comes from scoring='neg_root_mean_squared_error': closer to 0 is better
print(best_model.best_estimator_)
print('Base model score: ' + str(best_model.best_score_))

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', 'passthrough',
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f56dbe1f090>),
                                                 ('cat',
                                                  OneHotEncoder(dtype='int',
                                                                sparse=False),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f56dbe1f450>)])),
                ('classifier',
                 RandomForestClassifier(bootstrap=False, max_depth=25,
                                        min_samples_leaf=2, min_samples_split=5,
                                        n_estimators=300, random_state=123))])
Base model score: -1.271226088442211


Base model score: -1.256761476922338

Parameters:

```python
{
      'classifier__n_estimators':  [300],
      'classifier__max_depth': [25],
      'classifier__bootstrap': [False],
      'classifier__random_state': [seed]
}
```

In [None]:
# Cats only: preprocessing('cats') keeps the rows with Type == 2
df = preprocessing('cats')
X, y = feature_selection(df)

# Stratified hold-out split: 25% of the rows go to the test part.
# NOTE(review): preprocessing() has already balanced the classes (label 0 is
# sampled with replacement), so copies of one row can land on both sides.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=seed)

# apply_best_model also reads the notebook-level y_train assigned just above
best_model = apply_best_model(X_train, X_test)

# best_score_ comes from scoring='neg_root_mean_squared_error': closer to 0 is better
print(best_model.best_estimator_)
print('Only-cats model score: ' + str(best_model.best_score_))

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', 'passthrough',
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f56dbf7ded0>),
                                                 ('cat',
                                                  OneHotEncoder(dtype='int',
                                                                sparse=False),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f56dbf7df10>)])),
                ('classifier',
                 RandomForestClassifier(bootstrap=False, max_depth=25,
                                        min_samples_split=5, n_estimators=300,
                                        random_state=123))])
Only-cats model score: -1.3305087962896056


Only-cats model score: -1.3359151974406172

Parameters:

```python
{
      'classifier__n_estimators':  [300],
      'classifier__max_depth': [25],
      'classifier__bootstrap': [False],
      'classifier__random_state': [seed]
}
```

In [None]:
# Dogs only: preprocessing('dogs') keeps the rows with Type == 1
df = preprocessing('dogs')
X, y = feature_selection(df)

# Stratified hold-out split: 25% of the rows go to the test part.
# NOTE(review): preprocessing() has already balanced the classes (label 0 is
# sampled with replacement), so copies of one row can land on both sides.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=seed)

# apply_best_model also reads the notebook-level y_train assigned just above
best_model = apply_best_model(X_train, X_test)

# best_score_ comes from scoring='neg_root_mean_squared_error': closer to 0 is better
print(best_model.best_estimator_)
print('Only-dogs model score: ' + str(best_model.best_score_))

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', 'passthrough',
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f56db2b9350>),
                                                 ('cat',
                                                  OneHotEncoder(dtype='int',
                                                                sparse=False),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f56db2b9050>)])),
                ('classifier',
                 RandomForestClassifier(bootstrap=False, max_depth=25,
                                        n_estimators=300, random_state=123))])
Only-dogs model score: -1.2126444186955354


Only-dogs model score: -1.2126444186955354

Parameters:

```python
{
      'classifier__n_estimators':  [300],
      'classifier__max_depth': [25],
      'classifier__bootstrap': [False],
      'classifier__random_state': [seed]
}
```