In [None]:
# Random forest baseline and grid-search tuning for the `Transported` target

## Tuning RandomForest with GridSearchCV

In [1]:
import pandas as pd
from ml.data import basic_preprocess


In [2]:
# Load the raw training data; the path is relative to the notebook's working directory.
train = pd.read_csv(filepath_or_buffer='./data/train.csv')

In [3]:
# Run the project's preprocessing helper on the raw frame. The call is unpacked
# into the processed frame, the categorical and numerical column lists, and the
# target column name that the cells below index with.
df, cat_cols, num_cols, target = basic_preprocess(
    train,
    train=True,
    target='Transported',
)

2022-10-28 09:51:34 INFO Replacing " " and "-" with "_" in and  column names and converting to lower case
2022-10-28 09:51:34 INFO Converting CryoSleep to binary
2022-10-28 09:51:34 INFO Converting VIP to binary
2022-10-28 09:51:34 INFO Splitting "cabin" from `deck/num/side` to `deck` `number` and `side`
2022-10-28 09:51:34 INFO Found 7 categorical columns
2022-10-28 09:51:34 INFO Categorical columns: ['passengerid', 'homeplanet', 'destination', 'name', 'cabin_deck', 'cabin_number', 'cabin_side']
2022-10-28 09:51:34 INFO Found 9 numerical columns
2022-10-28 09:51:34 INFO Numerical columns: ['cryosleep', 'age', 'vip', 'roomservice', 'foodcourt', 'shoppingmall', 'spa', 'vrdeck', 'transported']
2022-10-28 09:51:34 INFO Converting str to lower case
2022-10-28 09:51:34 INFO Converting target to binary
2022-10-28 09:51:34 INFO Removing target from cat_cols or num_cols
2022-10-28 09:51:34 INFO Filling categorical columns with mode
2022-10-28 09:51:34 INFO Filling numerical columns with mean


In [4]:
# Preview the first five preprocessed rows as a quick sanity check.
df.head(n=5)

Unnamed: 0,passengerid,homeplanet,cryosleep,destination,age,vip,roomservice,foodcourt,shoppingmall,spa,vrdeck,name,transported,cabin_deck,cabin_number,cabin_side
0,0001_01,europa,0.0,trappist-1e,39.0,0.0,0.0,0.0,0.0,0.0,0.0,maham ofracculy,0,b,0,p
1,0002_01,earth,0.0,trappist-1e,24.0,0.0,109.0,9.0,25.0,549.0,44.0,juanna vines,1,f,0,s
2,0003_01,europa,0.0,trappist-1e,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,altark susent,0,a,0,s
3,0003_02,europa,0.0,trappist-1e,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,solam susent,0,a,0,s
4,0004_01,earth,0.0,trappist-1e,16.0,0.0,303.0,70.0,151.0,565.0,2.0,willy santantines,1,f,1,s


In [5]:
# Feature set: every categorical column except the identifier-like columns
# `passengerid` and `name`, followed by all numerical columns.
# The list is derived from `cat_cols` (rather than retyped by hand) so it stays
# in sync with whatever `basic_preprocess` reports; column order is preserved.
id_like_cols = {'passengerid', 'name'}
final_cols = [col for col in cat_cols if col not in id_like_cols] + num_cols

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

# Hold out 20% of the rows for validation; the seed keeps the split repeatable
df_train, df_val = train_test_split(df, test_size=0.2, random_state=1)

# Training part: labels and the remaining columns
y_train = df_train[target]
X_train_df = df_train.drop(columns=target)

# Validation part: labels and the remaining columns
y_val = df_val[target]
X_val_df = df_val.drop(columns=target)

# One dict per row, restricted to the selected feature columns
train_dicts = X_train_df[final_cols].to_dict(orient='records')
val_dicts = X_val_df[final_cols].to_dict(orient='records')

# Learn the feature mapping from the training records only, then reuse the
# fitted vectorizer unchanged on the validation records
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [9]:
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Baseline forest: 100 trees, fixed seed, all cores
model = RandomForestClassifier(n_estimators=100, random_state=1, n_jobs=-1, oob_score=True)
model.fit(X_train, y_train)

# Second probability column on the validation rows, thresholded at 0.5
# to obtain hard labels
y_pred_prob = model.predict_proba(X_val)[:, 1]
y_pred = y_pred_prob > 0.5

# AUC is scored on the probabilities; the remaining metrics on the hard labels
auc = roc_auc_score(y_val, y_pred_prob)
precision, recall, f1, accuracy = (
    score_fn(y_val, y_pred)
    for score_fn in (precision_score, recall_score, f1_score, accuracy_score)
)

# Emit the whole report in one write
report = [
    '***** Validation metrics *****',
    f"AUC: {auc:.3f}",
    f"Precision: {precision:.3f}",
    f"Recall: {recall:.3f}",
    f"F1: {f1:.3f}",
    f"Accuracy: {accuracy:.3f}",
    '******************************',
]
print('\n'.join(report))

***** Validation metrics *****
AUC: 0.874
Precision: 0.816
Recall: 0.772
F1: 0.794
Accuracy: 0.793
******************************


In [11]:
# Tuning Randomforest model
from sklearn.model_selection import GridSearchCV

# Search space: 2 * 3 * 3 * 3 = 54 candidates, i.e. 270 fits with 5-fold CV
params = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [2, 3, 4]
}

# Base estimator for the search.
# n_jobs=1 here on purpose: GridSearchCV below already fans the fits out over
# all cores, and also asking every forest for all cores nests thread pools
# inside each worker and can oversubscribe the CPU. n_jobs does not influence
# the fitted trees, so scores and the selected parameters are unaffected.
# Trade-off: the final refit on the best parameters runs single-threaded.
model = RandomForestClassifier(random_state=1, n_jobs=1, oob_score=True)

# Create grid search with f1 scoring with verbose=1; parallelism lives here only
grid = GridSearchCV(model, params, scoring='f1', cv=5, n_jobs=-1, verbose=1)

# Fit grid search (cross-validated on the training split only)
grid.fit(X_train, y_train)

# Print best parameters
print('***** Best parameters *****')
print(grid.best_params_)
print('***************************')

# Print best score (mean cross-validated F1 of the best candidate)
print('***** Best score *****')
print(grid.best_score_)
print('***************************')

Fitting 5 folds for each of 54 candidates, totalling 270 fits


KeyboardInterrupt: 