#  Demo

In [None]:
# standard libraries
import pandas as pd
import numpy as np

# visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# decision tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

# Draw every piece of figure text (generic text, axis labels, titles and
# x/y tick values) in white -- presumably for a dark notebook theme.
plt.rcParams.update({
    'text.color': 'white',
    'axes.labelcolor': 'white',
    'axes.titlecolor': 'white',
    'xtick.color': 'white',
    'ytick.color': 'white',
})

# Load the training table, the test table and the sample submission
# from the local data/ directory.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/train.csv',
        'data/test.csv',
        'data/sample_submission.csv',
    )
)

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Print the column summary (dtypes, non-null counts) of the training table.
train.info()

In [None]:
# Preview the first rows of the test table.
test.head()

In [None]:
# Print the column summary (dtypes, non-null counts) of the test table.
test.info()

In [None]:
# Preview the first rows of the sample submission file.
submission.head()

In [None]:
# Print the column summary of the sample submission file.
submission.info()

In [None]:
# The label is taken to be the last column of the training table.
# Keep its values aside: the column is removed from `train` before the
# train and test tables are concatenated.
target = train.columns[-1]
train_target = train[target]

In [None]:
# Bar chart of how many training rows fall into each target class.
_, class_ax = plt.subplots(figsize=(8, 4))
sns.countplot(data=train, x=target, color='g', ax=class_ax)
class_ax.set_title('Class distribution')
plt.show()

In [None]:
# Remove the label so train and test share the same columns, and keep the
# passenger ids of both tables (their lengths are used later to split the
# combined frame back apart).
train.drop(columns=target, inplace=True)
train_id, test_id = train['PassengerId'], test['PassengerId']

In [None]:
# Stack train on top of test so that imputation and encoding are applied
# to both at once. PassengerId is kept for now because its group prefix
# is turned into a feature further down.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without
# it the train and test row labels overlap, and duplicate labels break
# label-aligned operations. The first len(train) rows keep the same labels
# as before, so re-attaching the target by index still lines up.
combined = pd.concat([train, test], axis=0, ignore_index=True)

### Feature engineering

In [None]:
# Column summary of the combined train+test frame before any feature work.
combined.info()

In [None]:
# Display the combined frame as the cell output.
combined

In [None]:
# The first four characters of PassengerId are used as the travel-group
# key; the raw id column itself is removed from the feature frame.
passenger_ids = combined.pop('PassengerId')
combined['Group'] = passenger_ids.str[:4]

In [None]:
# Categorical columns that contain missing values.
col_cat_NA = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']


def _fill_with_mode(values):
    """Fill the NaNs of one group with that group's most frequent value.

    When the group has no non-missing value its mode is empty; NaN is then
    used as the fill value, which leaves the group unchanged.
    """
    modes = values.mode()
    replacement = np.nan if modes.empty else modes[0]
    return values.fillna(replacement)


# First pass: borrow the most common value from members of the same Group.
for col in col_cat_NA:
    combined[col] = combined.groupby('Group')[col].transform(_fill_with_mode)

In [None]:
# Re-check non-null counts after the per-Group categorical fill.
combined.info()

In [None]:
# Second pass for the categorical columns, using HomePlanet <-> Destination.
#
# Every step assigns the result back to the column instead of calling
# fillna/replace with inplace=True on `combined[col]`: that chained
# in-place form acts on a temporary selection, emits a warning on recent
# pandas and does not modify `combined` at all under Copy-on-Write.

# groupby() drops rows whose key is NaN, so missing HomePlanet values get a
# temporary '' label; this lets those rows receive a Destination too.
combined['HomePlanet'] = combined['HomePlanet'].fillna('')
# NOTE: x.mode()[0] assumes every group has at least one known value.
combined['Destination'] = combined.groupby('HomePlanet')['Destination'].transform(
    lambda x: x.fillna(x.mode()[0]))
# Undo the temporary label so HomePlanet is missing again where it was.
combined['HomePlanet'] = combined['HomePlanet'].replace('', np.nan)

# Destination is complete now: use it to fill HomePlanet and CryoSleep with
# the most common value among passengers heading to the same destination.
combined['HomePlanet'] = combined.groupby('Destination')['HomePlanet'].transform(
    lambda x: x.fillna(x.mode()[0]))

combined['CryoSleep'] = combined.groupby('Destination')['CryoSleep'].transform(
    lambda x: x.fillna(x.mode()[0]))

# Remaining unknown VIP flags are treated as "not VIP".
combined['VIP'] = combined['VIP'].fillna(False)

In [None]:
# Numerical columns that contain missing values.
col_num_NA = [
    'RoomService', 'FoodCourt', 'ShoppingMall',
    'Spa', 'VRDeck', 'Age',
]


def _fill_with_mean(values):
    """Fill the NaNs of one group with the mean of that group's values."""
    return values.fillna(values.mean())


# First pass: use the mean over members of the same Group.
for col in col_num_NA:
    combined[col] = combined.groupby('Group')[col].transform(_fill_with_mean)

In [None]:
# Re-check non-null counts after the per-Group numerical fill.
combined.info()

In [None]:
# Second pass for the numerical columns: whatever the per-Group mean could
# not fill is filled with the mean over passengers sharing a Destination.
for col in col_num_NA:
    per_destination = combined.groupby('Destination')[col]
    combined[col] = per_destination.transform(
        lambda values: values.fillna(values.mean()))

In [None]:
# Distribution of RoomService before the log transform.
col = 'RoomService'
# displot is a figure-level function: it always creates its own figure, so
# a preceding plt.figure(figsize=...) only leaves an empty extra figure.
# The 12x8 size is requested through height/aspect instead
# (width = height * aspect = 8 * 1.5 = 12).
sns.displot(data=combined, x=col, color='g', kde=True, height=8, aspect=1.5)
plt.show()

In [None]:
# The numerical columns are heavily skewed, so replace each with log(1 + x).
col_num_log = col_num_NA
for col in col_num_log:
    combined[col] = np.log1p(combined[col])

# Extra numerical feature: row-wise sum over the five spending columns.
# NOTE(review): this runs after the log transform above, so TotalSpent is
# a sum of log1p values rather than the log of the total -- confirm intent.
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
combined['TotalSpent'] = combined[spend_cols].sum(axis=1)

# Many rows are exactly zero, so also expose a 0/1 "spent anything" flag.
combined['moneySpent'] = combined['TotalSpent'].gt(0).astype(int)

In [None]:
# Distribution of RoomService after the log transform.
col = 'RoomService'
# displot is a figure-level function: it always creates its own figure, so
# a preceding plt.figure(figsize=...) only leaves an empty extra figure.
# The 12x8 size is requested through height/aspect instead
# (width = height * aspect = 8 * 1.5 = 12).
sns.displot(data=combined, x=col, color='g', kde=True, height=8, aspect=1.5)
plt.show()

In [None]:
# Column summary after the log transform and the new spending features.
combined.info()

In [None]:
# Cabin is a single '/'-separated string; break it into three columns
# (deck, cabin number, side) so each part can be imputed and encoded on
# its own, then discard the original column.
cabin_parts = combined['Cabin'].str.split('/', expand=True)
combined[['deck', 'cabin_num', 'side']] = cabin_parts
combined.drop(columns='Cabin', inplace=True)

# Fill the missing cabin parts with the most common value among
# passengers heading to the same Destination.
col_cat_na = ['deck', 'cabin_num', 'side']
for col in col_cat_na:
    per_destination = combined.groupby('Destination')[col]
    combined[col] = per_destination.transform(
        lambda values: values.fillna(values.mode()[0]))

# The cabin number comes out of the split as text; store it as an integer.
combined['cabin_num'] = combined['cabin_num'].astype(int)

In [None]:
# The passenger name is not used as a feature; remove the column.
combined.drop(columns='Name', inplace=True)

In [None]:
# Show the distinct values left in VIP after the missing-value fill.
combined['VIP'].unique()

In [None]:
# Column summary before one-hot encoding.
combined.info()

In [None]:
# One-hot encode the remaining categorical columns.
one_hot_cols = ['HomePlanet', 'Destination', 'deck', 'side']
combined = pd.get_dummies(combined, columns=one_hot_cols)

# Group was sliced out of a string id; store it as an integer.
combined['Group'] = combined['Group'].astype(int)

In [None]:
# Column summary after one-hot encoding.
combined.info()

In [None]:
# Preview the first rows of the encoded feature frame.
combined.head()

In [None]:
# Cast the VIP flag to integers (False -> 0, True -> 1).
combined['VIP'] = combined['VIP'].astype(int)

### Model design

In [None]:
# Split the engineered frame back into its train and test parts by position:
# the first len(train_id) rows came from train, the rest from test.
n_train = train_id.shape[0]
# .copy() makes both parts independent frames, so adding the target column
# below does not write into a slice of `combined` (SettingWithCopy).
train = combined.iloc[:n_train].copy()
# Slicing from n_train onward (instead of iloc[-n_test:]) stays correct
# even when there are zero test rows, where iloc[-0:] returns every row.
test = combined.iloc[n_train:].copy()
# Re-attach the label as the last column; alignment is by index label.
train[target] = train_target
print('Train shape', train.shape)
print('Test shape', test.shape)

In [None]:
# Hold out part of the labelled data for local evaluation.
def train_test_split(data, test_ratio=0.3, random_state=None):
    """Randomly split `data` row-wise into a training and a held-out part.

    Each row is assigned to the held-out part independently with probability
    `test_ratio`, so the held-out size is only approximately
    `test_ratio * len(data)`.

    Note: this shares its name with sklearn.model_selection.train_test_split
    but has a different signature and return value.

    Parameters
    ----------
    data : object supporting `.shape[0]` and boolean-mask indexing
        For example a pandas DataFrame or a NumPy array.
    test_ratio : float, default 0.3
        Probability of a row landing in the held-out part.
    random_state : int or None, default None
        Seed for a reproducible split. With None the global NumPy random
        state is used, exactly as before this parameter existed.

    Returns
    -------
    tuple
        `(train_part, test_part)`, two disjoint selections of `data` that
        together contain every row.
    """
    n_rows = data.shape[0]
    if random_state is None:
        draws = np.random.rand(n_rows)
    else:
        draws = np.random.default_rng(random_state).random(n_rows)
    test_indices = draws < test_ratio
    return data[~test_indices], data[test_indices]
# Split the labelled rows with the default held-out ratio and report sizes.
train_data, test_data = train_test_split(train)
n_fit_rows, n_eval_rows = train_data.shape[0], test_data.shape[0]
print('{} samples in training, {} samples in testing data'.format(
    n_fit_rows, n_eval_rows))

In [None]:
# Display the training part of the local split as the cell output.
train_data

In [None]:
# Column summary of the training part (all columns should be numeric/bool
# by now -- verify in the output).
train_data.info()

Model: Decision Tree

In [None]:
# Base decision tree with default hyper-parameters; it serves as the
# estimator handed to the grid search.
# NOTE(review): no random_state is set, so fitted trees may differ between
# runs -- confirm whether reproducible results are needed.
dt_classifier = DecisionTreeClassifier()

In [None]:
# Hyper-parameter grid: every combination of tree depth 1..10 and
# leaf-count limit 2..20 is evaluated.
param_grid = dict(
    max_depth=np.arange(1, 11),
    max_leaf_nodes=np.arange(2, 21),
)

# Exhaustive search scored with 5-fold cross-validation on the training part.
# The features are "every column but the last", which relies on the target
# having been appended as the final column.
grid_search = GridSearchCV(estimator=dt_classifier, param_grid=param_grid, cv=5)
grid_search.fit(train_data.iloc[:, :-1], train_data[target])

In [None]:
# Parameter combination with the best cross-validated score in the grid
# search; the bare name on the last line displays it as the cell output.
best_params = grid_search.best_params_
best_params

In [None]:
# Fit a fresh decision tree on the training part using the winning
# hyper-parameters. best_params holds exactly the searched keys
# (max_depth, max_leaf_nodes), so it can be unpacked as keyword arguments.
best_dt_classifier = DecisionTreeClassifier(**best_params)
best_dt_classifier.fit(train_data.iloc[:, :-1], train_data[target])

In [None]:
# Predict labels for the held-out part; the last column (the target) is
# excluded from the features.
held_out_features = test_data.iloc[:, :-1]
y_pred = best_dt_classifier.predict(held_out_features)

In [None]:
# Share of held-out rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=test_data[target], y_pred=y_pred)
print(f'Accuracy on the test set: {accuracy}')