In [17]:
import pandas as pd
from ml.data import basic_preprocess

In [18]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [19]:
df, cat_cols, num_cols, target =  basic_preprocess(train, train=True, target='Transported')
df_test, _, _ , _ = basic_preprocess(test, train=False, target='Transported')

2022-11-01 07:03:27 INFO Replacing " " and "-" with "_" in and  column names and converting to lower case
2022-11-01 07:03:27 INFO Converting CryoSleep to binary
2022-11-01 07:03:27 INFO Converting VIP to binary
2022-11-01 07:03:27 INFO Splitting "cabin" from `deck/num/side` to `deck` `number` and `side`
2022-11-01 07:03:28 INFO Found 6 categorical columns
2022-11-01 07:03:28 INFO Categorical columns: ['passengerid', 'homeplanet', 'destination', 'name', 'cabin_deck', 'cabin_side']
2022-11-01 07:03:28 INFO Found 10 numerical columns
2022-11-01 07:03:28 INFO Numerical columns: ['cryosleep', 'age', 'vip', 'roomservice', 'foodcourt', 'shoppingmall', 'spa', 'vrdeck', 'transported', 'cabin_number']
2022-11-01 07:03:28 INFO Converting str to lower case
2022-11-01 07:03:28 INFO Converting target to binary
2022-11-01 07:03:28 INFO Removing target from cat_cols or num_cols
2022-11-01 07:03:28 INFO Filling categorical columns with mode
2022-11-01 07:03:29 INFO Filling numerical columns with mean


In [20]:
df.head()

Unnamed: 0,passengerid,homeplanet,cryosleep,destination,age,vip,roomservice,foodcourt,shoppingmall,spa,vrdeck,name,transported,cabin_deck,cabin_number,cabin_side
0,0001_01,europa,0.0,trappist-1e,39.0,0.0,0.0,0.0,0.0,0.0,0.0,maham ofracculy,0,b,0.0,p
1,0002_01,earth,0.0,trappist-1e,24.0,0.0,109.0,9.0,25.0,549.0,44.0,juanna vines,1,f,0.0,s
2,0003_01,europa,0.0,trappist-1e,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,altark susent,0,a,0.0,s
3,0003_02,europa,0.0,trappist-1e,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,solam susent,0,a,0.0,s
4,0004_01,earth,0.0,trappist-1e,16.0,0.0,303.0,70.0,151.0,565.0,2.0,willy santantines,1,f,1.0,s


In [21]:
cat_cols

['passengerid',
 'homeplanet',
 'destination',
 'name',
 'cabin_deck',
 'cabin_side']

## Convert to dict_vectorizer

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

# Final features
cat_cols_to_use = ['homeplanet', 'cabin_deck', 'cabin_side', 'destination']
final_cols = cat_cols_to_use + list(num_cols)

# Split data into train and validation
df_train, df_val = train_test_split(df, test_size=0.2, random_state=1)

# Split data into X and y
X_train_df = df_train.drop(target, axis=1)
y_train = df_train[target]

X_val_df = df_val.drop(target, axis=1)
y_val = df_val[target]

# convert to dicts
train_dicts = X_train_df[final_cols].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
dv.fit(train_dicts)

# transform dicts
X_train = dv.transform(train_dicts)
val_dicts = X_val_df[final_cols].to_dict(orient='records')
X_val = dv.transform(val_dicts)

## Train Randon Forest

In [23]:
from sklearn.ensemble import RandomForestClassifier

# Train model
model = RandomForestClassifier(n_estimators=100, random_state=1, n_jobs=-1, oob_score=True)
model.fit(X_train, y_train)

# Predict on validation data
y_pred_prob = model.predict_proba(X_val)[:, 1]
y_pred = y_pred_prob > 0.5

from ml.model import print_metrics
print_metrics(y_val, y_pred, y_pred_prob)


Accuracy: 0.802
Precision: 0.834
Recall: 0.769
F1: 0.800
AUC: 0.882


## Train XGBoost

In [26]:
import xgboost as xgb

# convert data to DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

# Train model
params = {
    'max_depth': 3,
    'eta': 0.1,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'seed': 1,
}

model = xgb.train(params, dtrain, num_boost_round=100, evals=[(dval, 'val')], early_stopping_rounds=10, verbose_eval=100)

# Predict on validation data
y_pred_prob = model.predict(dval)
y_pred = y_pred_prob > 0.5

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
accuracy = accuracy_score(y_val, y_pred)

print(f'****** Accuracy: {accuracy}')
print('******************************')

[0]	val-auc:0.83651
[55]	val-auc:0.90397
****** Accuracy: 0.8223116733755031
******************************


In [None]:
# predict on test data
X_test_df = df_test[final_cols]
test_dicts = X_test_df.to_dict(orient='records')
X_test = dv.transform(test_dicts)
dtest = xgb.DMatrix(X_test)
y_pred_prob = model.predict(dtest)
y_pred = y_pred_prob > 0.5

# save predictions
df_test['Transported'] = y_pred
#rename columns
df_test.rename(columns={'passengerid': 'PassengerId'}, inplace=True)
df_test[['PassengerId', 'Transported']].to_csv('xgb_predictions.csv', index=False)



## Tuning XGB hyperparameters:

In [None]:
# XGB Hyperparameters tuning:
eta = [0.1, 0.2, 0.3, 0.4, 0.5]
max_depth = [3, 4, 5, 6, 7, 8, 9, 10]
for e in eta:
    for m in max_depth:
        # Train model
        params = {
            'max_depth': m,
            'eta': e,
            'objective': 'binary:logistic',
            'eval_metric': 'auc',
            'seed': 1,
        }

        model = xgb.train(params, dtrain, num_boost_round=100, evals=[(dval, 'val')], early_stopping_rounds=10, verbose_eval=100)

        # Predict on validation data
        y_pred_prob = model.predict(dval)
        y_pred = y_pred_prob > 0.5

        from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
        accuracy = accuracy_score(y_val, y_pred)

        print(f'****** eta: {e} - max_depth: {m} ******  --- Accuracy: {accuracy}')
        print('****************************************************************************************************') 