In [None]:
# standard libraries
import pandas as pd
import numpy as np

# visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# tensorflow
import tensorflow as tf
import tensorflow_decision_forests as tfdf

# scikit-learn
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Draw all figure text (titles, axis labels, tick values) in white
plt.rcParams.update({
    'text.color': 'white',
    'axes.labelcolor': 'white',
    'axes.titlecolor': 'white',
    'xtick.color': 'white',  # x-axis tick values
    'ytick.color': 'white',  # y-axis tick values
})

# Read the three CSV inputs: training set, test set and the sample submission
train, test, submission = map(
    pd.read_csv,
    ('data/train.csv', 'data/test.csv', 'data/sample_submission.csv'),
)

In [None]:
# preview the first rows of the training frame
train.head()

In [None]:
# column dtypes and non-null counts of the training frame
train.info()

In [None]:
# preview the first rows of the test frame
test.head()

In [None]:
# column dtypes and non-null counts of the test frame
test.info()

In [None]:
# preview the sample submission to see the expected output layout
submission.head()

In [None]:
# column dtypes and non-null counts of the sample submission
submission.info()

In [None]:
# the label is taken to be the last column of the training frame
target = train.columns[-1]
# store the label as 0/1 integers instead of booleans, both as a standalone
# series and back in the frame
train_target = train[target].astype(int)
train[target] = train_target

In [None]:
# bar chart of how many training rows fall into each target class
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(data=train, x=target, color='g', ax=ax)
ax.set_title('Class distribution')
plt.show()

In [None]:
# remember the passenger ids of both sets before the frames are stacked
train_id, test_id = train['PassengerId'], test['PassengerId']
# the label must not be part of the feature frame
train.drop(columns=[target], inplace=True)

In [None]:
# combine over rows, keeping id for now as group number might be useful
# ignore_index gives the stacked frame a fresh 0..n-1 index; without it the
# row labels of train and test repeat, which makes index-aligned operations
# on `combined` ambiguous (or an error when pandas has to reindex)
combined = pd.concat([train, test], axis=0, ignore_index=True)

### Feature engineering

In [None]:
# column dtypes and non-null counts of the stacked train+test frame
combined.info()

In [None]:
# display the stacked frame (the notebook shows its rich repr)
combined

In [None]:
# the first four characters of PassengerId are used as the group number;
# the id column itself is removed from the features
passenger_ids = combined.pop('PassengerId')
combined['Group'] = passenger_ids.str[:4]

In [None]:
# categorical columns first
col_cat_NA = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']


def _fill_with_mode(values):
    """Fill missing entries of `values` with its most frequent value.

    Returns `values` unchanged when it has no non-missing entry, because no
    mode exists in that case (indexing the empty mode would raise).
    """
    modes = values.mode()
    if modes.empty:
        return values
    return values.fillna(modes.iloc[0])


# fill with Group first
for col in col_cat_NA:
    combined[col] = combined.groupby('Group')[col].transform(_fill_with_mode)

# need to do this, so we can use homeplanet to fill values for destination:
# groupby drops rows whose key is NaN, so missing home planets temporarily
# get the placeholder ''. Results are assigned back to the column instead of
# calling fillna/replace with inplace=True on `combined[...]`, which is
# chained assignment and does not update the frame under Copy-on-Write.
combined['HomePlanet'] = combined['HomePlanet'].fillna('')
combined['Destination'] = combined.groupby('HomePlanet')['Destination'].transform(
    _fill_with_mode)
# reverse what we did earlier
combined['HomePlanet'] = combined['HomePlanet'].replace('', np.nan)

# now fill homeplanet with destination
combined['HomePlanet'] = combined.groupby('Destination')['HomePlanet'].transform(
    _fill_with_mode)

combined['CryoSleep'] = combined.groupby('Destination')['CryoSleep'].transform(
    _fill_with_mode)
# any VIP flag still missing is treated as "not VIP"
combined['VIP'] = combined['VIP'].fillna(False)

In [None]:
# numerical columns now
col_num_NA = ['RoomService', 'FoodCourt', 'ShoppingMall',
              'Spa', 'VRDeck', 'Age']

# impute in two passes: mean within the travel group first, then mean within
# the destination for whatever is still missing
for group_key in ('Group', 'Destination'):
    for num_col in col_num_NA:
        combined[num_col] = combined.groupby(group_key)[num_col].transform(
            lambda s: s.fillna(s.mean()))

# take log due to skewed distribution (log1p keeps zeros at zero)
col_num_log = col_num_NA
combined[col_num_log] = np.log1p(combined[col_num_log])

# adding new numerical features
# NOTE(review): this sums the already log-transformed amounts, so it is not
# the log of the total amount spent - confirm this is intended
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
combined['TotalSpent'] = combined[spend_cols].sum(axis=1)

# since the distribution is so skewed to 0s, this might help:
# 1 if the passenger spent anything at all, else 0
combined['moneySpent'] = combined['TotalSpent'].gt(0).astype(int)

In [None]:
# break Cabin into its three '/'-separated parts; a missing Cabin yields
# missing values in all three parts, which are filled below
cabin_parts = combined['Cabin'].str.split('/', expand=True)
combined[['deck', 'cabin_num', 'side']] = cabin_parts
combined = combined.drop(columns='Cabin')

col_cat_na = ['deck', 'cabin_num', 'side']

# fill each part with the most frequent value among passengers with the
# same destination
for part in col_cat_na:
    by_destination = combined.groupby('Destination')[part]
    combined[part] = by_destination.transform(lambda s: s.fillna(s.mode()[0]))

# the split produces strings, so the cabin number is converted to an integer
combined['cabin_num'] = combined['cabin_num'].astype(int)

In [None]:
#combined[['Name']].info()

In [None]:
#combined['lastName'] = combined['Name'].str.split(' ', expand=True)[1]

In [None]:
# # next two statements don't do anything, I guess no mode values...
# combined['lastName'] = combined.groupby('Group')['lastName'].transform(
#     lambda x: x.fillna(x.mode() if not x.mode().empty else np.nan))

# combined['lastName'] = combined.groupby('Destination')['lastName'].transform(
#     lambda x: x.fillna(x.mode()))

# combined['lastName'].fillna('lastName', inplace=True)

In [None]:
# the Name column is not used as a feature
combined = combined.drop(columns=['Name'])

In [None]:
# check which distinct values VIP holds after imputation
combined['VIP'].unique()

In [None]:
col = 'TotalSpent'
# displot is a figure-level function: it always creates its own figure, so a
# preceding plt.figure(figsize=...) only leaves an empty extra figure behind.
# The size is set through height/aspect instead (8 in tall, 8 * 1.5 = 12 in wide).
sns.displot(data=combined, x=col, color='g', kde=True, height=8, aspect=1.5)
plt.show()

In [None]:
col_bool = ['CryoSleep', 'VIP']

# these columns hold booleans stored as object dtype: cast to bool, then to 0/1
combined = combined.assign(**{
    flag: combined[flag].astype(bool).astype(int) for flag in col_bool
})

In [None]:
# check dtypes and non-null counts before encoding
combined.info()

In [None]:
# one hot encoding of the remaining categorical features
onehot_cols = ['HomePlanet', 'Destination', 'deck', 'side']
combined = pd.get_dummies(combined, columns=onehot_cols)

# Group is still a digit string; it is used as a plain integer feature for now
combined = combined.astype({'Group': 'int'})

In [None]:
# change dtypes of the one-hot indicator columns to int.
# They are picked by dtype (bool in current pandas, uint8 in older releases)
# rather than by a hard-coded "last 16 columns", which silently breaks as
# soon as the number of categories changes. astype returns a new frame, which
# also avoids writing ints into bool columns through iloc (an
# incompatible-dtype assignment).
dummy_cols = combined.select_dtypes(include=['bool', 'uint8']).columns
combined = combined.astype({name: int for name in dummy_cols})

In [None]:
# confirm every column is numeric after encoding
combined.info()

In [None]:
# not even sure why do this
#combined = pd.get_dummies(combined, columns=['lastName'])
#combined.iloc[:, -2407:] = combined.iloc[:, -2407:].astype(int).info()
#combined.info()

### Model design

In [None]:
# split the engineered frame back into its train and test parts by position:
# the training rows were stacked first, the test rows last
n_train = train_id.shape[0]
n_test = test_id.shape[0]
# .copy() makes these independent frames, so adding the label column below
# does not write into a slice of `combined` (SettingWithCopyWarning)
train = combined.iloc[:n_train].copy()
test = combined.iloc[-n_test:].copy()
# attach the labels positionally; rows are still in the original training
# order, so this does not depend on how the index is labelled
train[target] = train_target.to_numpy()
print('Train shape', train.shape)
print('Test shape', test.shape)

In [None]:
# let's split train data into train/test
def train_test_split(data, test_ratio=0.3, seed=None):
    """Randomly partition the rows of `data` into a training and a hold-out part.

    Every row is assigned to the hold-out part independently with probability
    `test_ratio`, so the two sizes are only approximately proportional to it.

    Args:
        data: object with `.shape[0]` rows that supports boolean-mask
            indexing (e.g. a pandas DataFrame).
        test_ratio: probability that a row goes to the hold-out part.
        seed: optional integer seed for a reproducible split. When None
            (the default) the draw comes from NumPy's global random state,
            as before.

    Returns:
        Tuple `(train_part, test_part)`; together they contain every row of
        `data` exactly once.
    """
    n_rows = data.shape[0]
    if seed is None:
        draws = np.random.rand(n_rows)
    else:
        # private generator: reproducible without touching the global state
        draws = np.random.RandomState(seed).rand(n_rows)
    test_indices = draws < test_ratio
    return data[~test_indices], data[test_indices]
# seed NumPy's global generator so the random split - and therefore the
# evaluation score below - is the same on every run
np.random.seed(42)
train_data, test_data = train_test_split(train)
print('{} samples in training, {} samples in testing data'.format(train_data.shape[0], 
                                                            test_data.shape[0]))

In [None]:
# display the training split
train_data

In [None]:
# column dtypes and non-null counts of the training split
train_data.info()

Model: Logistic Regression

In [None]:
#lr = LogisticRegression(max_iter=10000)
#lr.fit(train_data.iloc[:, :-1], train_data[target])

In [None]:
#prediction = lr.predict(test_data.iloc[:, :-1])

In [None]:
#accuracy = accuracy_score(test_data[target], prediction)
#print('Accuracy of {}'.format(accuracy))

Model: SVM (the grid search takes a while to run)

In [None]:
# param_grid = {
#     'C': [0.1, 1, 10], # regularization parameter
#     'kernel': ['linear', 'rbf'], # kernel type
#     'gamma': [0.1, 1, 10] # kernel coefficient
# }

# svm = SVC()
# grid_search = GridSearchCV(svm, param_grid, cv=5)
# grid_search.fit(train_data.iloc[:, :-1], train_data[target])

# grid_search.best_params_

Model: TFDF

In [None]:
# wrap both splits as TensorFlow datasets, with the target column as label
train_data, test_data = (
    tfdf.keras.pd_dataframe_to_tf_dataset(frame, label=target)
    for frame in (train_data, test_data)
)

In [None]:
# train a TF-DF random forest on the training split; no hyperparameters are
# passed, so the library defaults apply
rf = tfdf.keras.RandomForestModel()
rf.fit(train_data)

In [None]:
# print the summary of the fitted model
rf.summary()

In [None]:
# register accuracy as the metric reported by evaluate()
rf.compile(metrics=['accuracy'])
# score the model on the held-out split; return_dict=True returns the results as a dict
evaluation = rf.evaluate(test_data, return_dict=True)
display(evaluation)

In [None]:
# tfdf.model_plotter.plot_model_in_colab(rf, tree_idx=0, max_depth=3)

In [None]:
# let's try bootstrapping here

In [None]:
# bs_train = []
# for i in range(100):
#     sample = train.sample(frac=0.3)
#     bs_train.append(sample)
    
# bs_train = pd.concat(bs_train, axis=0)
# train = bs_train

In [None]:
# prior to prediction, let's train the model on entire train data
# The dataset gets its own name instead of overwriting the DataFrame `train`,
# so this cell can be re-run without feeding an already converted dataset
# back into the converter.
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train, label=target)

rf = tfdf.keras.RandomForestModel()
rf.fit(train_ds)

In [None]:
# print the summary of the model refitted on the full training data
rf.summary()

Prediction

In [None]:
# The dataset gets its own name instead of overwriting the DataFrame `test`,
# so this cell can be re-run.
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test)
predictions = rf.predict(test_ds)
# convert the model scores to boolean with a 0.5 cut-off
transported = predictions.squeeze() >= 0.5
# The column is named by the same literal throughout, rather than creating it
# as 'Transported' and then re-reading it through `target`.
output = pd.DataFrame({'PassengerId': test_id, 'Transported': transported})

In [None]:
# check row count and dtypes of the submission frame
output.info()

In [None]:
# Write the predictions to their own file. data/sample_submission.csv is read
# as an input at the top of the notebook, so overwriting it would change what
# a later run loads.
output.to_csv('data/submission.csv', index=False)