In [1]:
import sys
import os

# Put the parent of the current working directory on the import path
# (presumably so the `utils` package imported below resolves — confirm).
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(parent_dir) == 0:
    # Only append once so re-running the cell does not grow sys.path
    sys.path.append(parent_dir)

In [2]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from utils.cleanTransform import cleanTransform as ct

---
---
## 1. Set-Up

In [3]:
# Column holding the label the models are trained to predict
target = 'Transported'

# Read the raw training split and the split to generate predictions for
train, predict = map(pd.read_csv, ('../data/raw/train.csv', '../data/raw/test.csv'))

In [4]:
def process_dataframe(df):
    """Run ``df`` through the ``cleanTransform`` steps in a fixed order.

    Every step is called as ``step(df, **options)`` and its return value is
    fed to the next step, so the order of the table below is significant.

    Parameters
    ----------
    df
        The frame handed to the first step (``ct.transform_passengerId``).

    Returns
    -------
    The value returned by the last step (``ct.drop_cols``).
    """
    # (step, keyword options) pairs, applied top to bottom
    pipeline = (
        (ct.transform_passengerId, {}),
        (ct.transform_Cabin, {}),
        (ct.impute_homePlanet, {}),
        (ct.proportional_imputer, {
            'impute_cols': ['Destination', 'Deck', 'Side', 'CabinPosition', 'VIP', 'CryoSleep'],
        }),
        (ct.knn_imputer, {
            'columns': ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'],
        }),
        (ct.create_totalSpent, {}),
        (ct.oneHot, {
            'oh_cols': ['HomePlanet', 'Destination', 'Deck', 'Side', 'CabinPosition', 'GroupSize'],
        }),
        (ct.numPipe, {
            'num_cols': ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'TotalSpent'],
        }),
        (ct.convert_to_int, {}),
        (ct.drop_cols, {
            'drop_cols': ['PassengerNumber', 'GroupId', 'Cabin', 'CabinNumber', 'Name'],
        }),
    )

    for step, options in pipeline:
        df = step(df, **options)
    return df

In [5]:
# Run both splits through the `process_dataframe` pipeline defined in this
# notebook, so the steps applied are exactly the ones listed in that cell.
# NOTE(review): this cell used to call `ct.process_dataframe`, which is not
# shown anywhere in this notebook — confirm the two are meant to be identical.
# NOTE(review): each split is processed independently; if any `ct` step fits
# statistics (imputation, scaling), they are fit per split — confirm intended.
train = process_dataframe(train)
predict = process_dataframe(predict)

In [6]:
# Separate the label from the features. 'PassengerId' is excluded from both
# feature matrices; it is only used later as the key column of the submissions.
y_train = train[target]
X_train = train.drop(columns=['PassengerId', target])
X_pred = predict.drop(columns='PassengerId')

---
---
## 2. Train and Predict

In [8]:
# Build the three classifiers with fixed hyper-parameters; every model is
# given random_state=13.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    min_samples_split=10,
    min_samples_leaf=4,
    random_state=13,
)
lgbm = LGBMClassifier(
    n_estimators=300,
    learning_rate=0.01,
    num_leaves=31,
    max_depth=-1,
    random_state=13,
    verbose=-1,
)
xgbm = XGBClassifier(
    n_estimators=300,
    learning_rate=0.1,
    max_depth=3,
    subsample=0.8,
    random_state=13,
    verbosity=0,
)

# Short name -> estimator; the names are reused in the submission file names
algs = {
    'rf': rf,
    'lgbm': lgbm,
    'xgbm': xgbm,
}

In [16]:
# Fitted estimators and their predictions, both keyed by the short model name
models, predictions = {}, {}

# Fit every algorithm on the training data, then predict on the held-out split
for name, alg in algs.items():
    fitted = alg.fit(X_train, y_train)
    models[name] = fitted
    # Cast to bool so the stored predictions are True/False values
    predictions[name] = fitted.predict(X_pred).astype(bool)

In [17]:
# Replace each model's raw predictions with a two-column frame that pairs
# them with the 'PassengerId' column of `predict`.
passenger_ids = predict['PassengerId']
for model in list(predictions):
    predictions[model] = pd.DataFrame({
        'PassengerId': passenger_ids,
        'Transported': predictions[model],
    })

In [18]:
# Save each dataframe for submission to Kaggle, one CSV per model, without
# the index column.
# `to_csv` does not create missing directories, so make sure the output
# folder exists before writing.
os.makedirs('../submissions', exist_ok=True)
for model, df in predictions.items():
    df.to_csv(f'../submissions/predictions_{model}.csv', index=False)