## Classification template

### 1. Import libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
from tqdm.notebook import tqdm

In [2]:
# Reproducibility: seed NumPy's and the standard library's global RNGs.
np.random.seed(42)
random.seed(42)

# Display: how many rows / columns pandas shows before truncating a frame.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 50)

# Plot styling (seaborn defaults) and tqdm's pandas integration.
sns.set()
tqdm.pandas()

In [4]:
from sklearn.model_selection import train_test_split, cross_validate, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, \
                            accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

In [None]:
from helper_funcs import *

### 2. Reading data

In [None]:
# TODO: fill in the CSV path (left empty in this template).
# index_col=0 uses the first column of the file as the row index.
df = pd.read_csv('',index_col=0)

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:
# Data type pandas assigned to each column.
df.dtypes

In [None]:
# Number of distinct values per column (missing values are not counted by default).
df.nunique()

### 3. Exploring data

In [None]:
# Count of missing values in each column.
df.isna().sum()

In [None]:
# Show where values are missing: one heatmap cell per (row, column) of the frame,
# coloured by whether that entry is null.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(df.isnull(), cmap='YlGnBu', cbar=False, ax=ax)
ax.set_title('Missing Values Heatmap')
plt.show()

### 4. Feature engineering

In [18]:
# Name of the column to predict.
target = 'Transported'
# Feature column names grouped by kind; all empty until filled in for the dataset.
# NOTE(review): the groupings (numeric, boolean, high-/low-cardinality categorical)
# are inferred from the variable names — confirm when populating them.
feats_num = []
feats_bool = []
feats_cat_high_card = []
feats_cat_low_card = []

In [19]:
# Columns fed to the models. Note that feats_cat_high_card is not part of this list.
feats = feats_num + feats_bool + feats_cat_low_card

**Dealing with missing values**

In [None]:
# Drop rows with gaps only in the columns the models actually consume (the selected
# features plus the target). A bare dropna() would also discard rows whose only
# missing values sit in columns that are never used downstream.
cols_required = feats + [target]
print(f'Shape before {df.shape}')
df_drop_na = df.dropna(subset=cols_required)
rows_dropped = df.shape[0] - df_drop_na.shape[0]
# max(..., 1) avoids a ZeroDivisionError when the frame has no rows.
print(f'Shape after {df_drop_na.shape}, reduction of {100*rows_dropped/max(df.shape[0], 1):.2f}%')

In [43]:
# Frame used for modelling from here on; currently just the rows kept by the dropna step.
df_final = df_drop_na

### 5. Preparing X,y

In [44]:
# Split the prepared frame into the feature matrix and the target column.
X, y = df_final[feats], df_final[target]

In [None]:
# One-hot encode the features with pandas' get_dummies (default settings).
X_hot = pd.get_dummies(X)

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_hot,
    y,
    test_size=0.20,
    random_state=42,
)

### 6. Models with default parameters

#### 6.1 Random Forest

In [None]:
# Instantiate and train the Random Forest classifier
# No arguments are passed, so every hyperparameter stays at the library default (baseline model).
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

In [None]:
# Score the Random Forest classifier
# NOTE(review): score_classification_model is not defined in this notebook; presumably it
# comes from the `helper_funcs` star import — confirm what it computes and returns.
rf_scores = score_classification_model(rf, X_train, y_train, X_test, y_test)

In [None]:
# 5-fold cross-validation of a default Random Forest on the training split.
# Note: `rf` is rebound to a brand-new estimator instance here.
rf = RandomForestClassifier()
cv_metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
cv_results = cross_validate(rf, X_train, y_train, cv=5, scoring=cv_metrics)

# One summary line per metric: mean and standard deviation of the per-fold test scores.
for label, metric in zip(['accuracy', 'precision', 'recall', 'F1', 'AUC'], cv_metrics):
    fold_scores = cv_results[f'test_{metric}']
    print(f'Test scores {label} mean:', fold_scores.mean(), f'Test scores {label} std:', fold_scores.std())

#### 6.2 XGBoost

In [None]:
# Instantiate and train the XGBoost classifier
# No arguments are passed, so every hyperparameter stays at the library default (baseline model).
xgb_clf = xgb.XGBClassifier()
xgb_clf.fit(X_train, y_train)

In [None]:
# Score the XGBoost classifier
# NOTE(review): score_classification_model is not defined in this notebook; presumably it
# comes from the `helper_funcs` star import — confirm what it computes and returns.
xgb_scores = score_classification_model(xgb_clf, X_train, y_train, X_test, y_test)

In [None]:
# 5-fold cross-validation of a default XGBoost classifier on the training split.
# Note: `xgb_clf` is rebound to a brand-new estimator instance here.
xgb_clf = xgb.XGBClassifier()
cv_metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
cv_results = cross_validate(xgb_clf, X_train, y_train, cv=5, scoring=cv_metrics)

# One summary line per metric: mean and standard deviation of the per-fold test scores.
for label, metric in zip(['accuracy', 'precision', 'recall', 'F1', 'AUC'], cv_metrics):
    fold_scores = cv_results[f'test_{metric}']
    print(f'Test scores {label} mean:', fold_scores.mean(), f'Test scores {label} std:', fold_scores.std())

### 7. Randomized search

#### 7.1 Random Forest

In [2]:
# Candidate values for each Random Forest hyperparameter.
n_estimators = [1, 5, 10, 50, 100, 200, 500]
max_depth = [1, 5, 10, 20, 30, 50]
# 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3; for classifiers it was
# only an alias of 'sqrt'. On current versions any candidate that draws it fails to
# fit, so list valid, distinct options instead.
max_features = ['sqrt', 'log2']
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap
}

# Seed the estimator itself: the search runs in parallel workers (n_jobs=-1), which do
# not share the NumPy global seed set at the top of the notebook.
rf_grid = RandomForestClassifier(random_state=42)

# Sample 100 parameter combinations, each evaluated with 5-fold CV on accuracy.
rf_random = RandomizedSearchCV(
    estimator=rf_grid,
    param_distributions=random_grid,
    scoring='accuracy',
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
rf_random.fit(X_train, y_train)

rf_best_params = rf_random.best_params_
print("Best parameters for Random Forest:", rf_best_params)

# best_estimator_ is the winning configuration as exposed by the fitted search.
rf_best = rf_random.best_estimator_
rf_best_scores = score_classification_model(rf_best, X_train, y_train, X_test, y_test, name='best_rf')

UsageError: Line magic function `%%time` not found.


#### 7.2 XGBoost

In [None]:
# Search space: each key is an XGBClassifier hyperparameter, each list its candidate values.
param_grid = dict(
    n_estimators=[1, 5, 10, 50, 100, 200, 500],
    max_depth=[1, 5, 10, 20, 30, 50],
    learning_rate=[0.01, 0.1, 0.2, 0.3],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
    gamma=[0, 0.1, 0.2, 0.3],
)

# Base estimator handed to the search.
xgb_clf = xgb.XGBClassifier()

# Sample 100 parameter combinations, each evaluated with 5-fold CV on accuracy.
xgb_random = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=param_grid,
    n_iter=100,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    random_state=42,
    verbose=2,
).fit(X_train, y_train)

xgb_best_params = xgb_random.best_params_
print("Best parameters for XGBoost:", xgb_best_params)

# Score the winning configuration exposed by the fitted search.
xgb_best = xgb_random.best_estimator_
xgb_best_scores = score_classification_model(xgb_best, X_train, y_train, X_test, y_test, name='best_xgb')