# Team 2 - Spaceship Titanic

**Members** 

- Dejanire Bernal
- Claudia Contreras
- Carla Forte 
- Nadeen Ilayan
- Zeeshan Siddiqui 


In [252]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


import warnings
warnings.filterwarnings("ignore") 

In [253]:
# Get the current working directory
cwd = Path.cwd()

# Construct the path to the data directory
datapath = cwd / 'data'

# Build the path to the train data file
train_path = datapath / 'train.csv'
test_path = datapath / 'test.csv'

# Load the train data
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Data Cleaning

In [254]:
# Checking for missing values in both datasets
missing_values_train = train_df.isnull().sum()
missing_values_test = test_df.isnull().sum()

missing_values_train, missing_values_test

(PassengerId       0
 HomePlanet      201
 CryoSleep       217
 Cabin           199
 Destination     182
 Age             179
 VIP             203
 RoomService     181
 FoodCourt       183
 ShoppingMall    208
 Spa             183
 VRDeck          188
 Name            200
 Transported       0
 dtype: int64,
 PassengerId       0
 HomePlanet       87
 CryoSleep        93
 Cabin           100
 Destination      92
 Age              91
 VIP              93
 RoomService      82
 FoodCourt       106
 ShoppingMall     98
 Spa             101
 VRDeck           80
 Name             94
 dtype: int64)

In [255]:
# Fill missing values with 0 for specified columns when CryoSleep is True
specified_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
for col in specified_cols:
    train_df.loc[train_df['CryoSleep'] == True, col] = train_df.loc[train_df['CryoSleep'] == True, col].fillna(0)
    test_df.loc[test_df['CryoSleep'] == True, col] = test_df.loc[test_df['CryoSleep'] == True, col].fillna(0)

# Re-check missing values after this operation
missing_values_train_after = train_df.isnull().sum()
missing_values_test_after = test_df.isnull().sum()

missing_values_train_after, missing_values_test_after

(PassengerId       0
 HomePlanet      201
 CryoSleep       217
 Cabin           199
 Destination     182
 Age             179
 VIP             203
 RoomService     113
 FoodCourt       113
 ShoppingMall    112
 Spa             118
 VRDeck          126
 Name            200
 Transported       0
 dtype: int64,
 PassengerId       0
 HomePlanet       87
 CryoSleep        93
 Cabin           100
 Destination      92
 Age              91
 VIP              93
 RoomService      57
 FoodCourt        67
 ShoppingMall     63
 Spa              59
 VRDeck           51
 Name             94
 dtype: int64)

In [256]:
# Filling missing values for numerical columns with the median
for col in specified_cols:
    train_df[col].fillna(train_df[col].median(), inplace=True)
    test_df[col].fillna(test_df[col].median(), inplace=True)

# For categorical columns, we use the mode; for numerical columns, we use the median.
categorical_cols = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP', 'Age']

for col in categorical_cols:
    if col != 'Age':  # Age is a special case, will be handled separately for age groups
        train_df[col].fillna(train_df[col].mode()[0], inplace=True)
        test_df[col].fillna(test_df[col].mode()[0], inplace=True)

# Re-check missing values after filling
missing_values_train_filled = train_df.isnull().sum()
missing_values_test_filled = test_df.isnull().sum()

# Dividing Age into age groups
train_df['AgeGroup'] = pd.cut(train_df['Age'], bins=[0, 12, 18, 60, 100], labels=['Child', 'Teen', 'Adult', 'Senior'])
test_df['AgeGroup'] = pd.cut(test_df['Age'], bins=[0, 12, 18, 60, 100], labels=['Child', 'Teen', 'Adult', 'Senior'])

# Filling missing 'Age' and 'AgeGroup' based on median age
median_age_train = train_df['Age'].median()
median_age_test = test_df['Age'].median()
train_df['Age'].fillna(median_age_train, inplace=True)
test_df['Age'].fillna(median_age_test, inplace=True)
train_df['AgeGroup'].fillna(pd.cut([median_age_train], bins=[0, 12, 18, 60, 100], labels=['Child', 'Teen', 'Adult', 'Senior'])[0], inplace=True)
test_df['AgeGroup'].fillna(pd.cut([median_age_test], bins=[0, 12, 18, 60, 100], labels=['Child', 'Teen', 'Adult', 'Senior'])[0], inplace=True)


In [257]:
# Creating new feature: TotalSpending
train_df['TotalSpending'] = train_df[specified_cols].sum(axis=1)
test_df['TotalSpending'] = test_df[specified_cols].sum(axis=1)

In [258]:
missing_values_train_filled, missing_values_test_filled, train_df[['Age', 'AgeGroup', 'TotalSpending']].head(), test_df[['Age', 'AgeGroup', 'TotalSpending']].head()

(PassengerId       0
 HomePlanet        0
 CryoSleep         0
 Cabin             0
 Destination       0
 Age             179
 VIP               0
 RoomService       0
 FoodCourt         0
 ShoppingMall      0
 Spa               0
 VRDeck            0
 Name            200
 Transported       0
 dtype: int64,
 PassengerId      0
 HomePlanet       0
 CryoSleep        0
 Cabin            0
 Destination      0
 Age             91
 VIP              0
 RoomService      0
 FoodCourt        0
 ShoppingMall     0
 Spa              0
 VRDeck           0
 Name            94
 dtype: int64,
     Age AgeGroup  TotalSpending
 0  39.0    Adult            0.0
 1  24.0    Adult          736.0
 2  58.0    Adult        10383.0
 3  33.0    Adult         5176.0
 4  16.0     Teen         1091.0,
     Age AgeGroup  TotalSpending
 0  27.0    Adult            0.0
 1  19.0    Adult         2832.0
 2  31.0    Adult            0.0
 3  38.0    Adult         7418.0
 4  20.0    Adult          645.0)

In [259]:
# Identifying numerical columns for Min-Max scaling (excluding 'Age' which will be represented by 'AgeGroup')
numerical_cols_for_scaling = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'TotalSpending']

# Initializing MinMaxScaler
scaler = MinMaxScaler()

# Applying Min-Max scaling to numerical columns in both datasets
train_df[numerical_cols_for_scaling] = scaler.fit_transform(train_df[numerical_cols_for_scaling])
test_df[numerical_cols_for_scaling] = scaler.transform(test_df[numerical_cols_for_scaling])

# Creating additional features
# Feature indicating if the passenger spent any money on services
train_df['HasSpent'] = (train_df[numerical_cols_for_scaling] > 0).any(axis=1).astype(int)
test_df['HasSpent'] = (test_df[numerical_cols_for_scaling] > 0).any(axis=1).astype(int)

# Feature indicating the diversity of spending across different categories
train_df['SpendingDiversity'] = (train_df[numerical_cols_for_scaling] > 0).sum(axis=1) / len(numerical_cols_for_scaling)
test_df['SpendingDiversity'] = (test_df[numerical_cols_for_scaling] > 0).sum(axis=1) / len(numerical_cols_for_scaling)

# Displaying a sample of the updated datasets to verify changes
train_df[['TotalSpending', 'HasSpent', 'SpendingDiversity']].head(), test_df[['TotalSpending', 'HasSpent', 'SpendingDiversity']].head()

(   TotalSpending  HasSpent  SpendingDiversity
 0       0.000000         0           0.000000
 1       0.020452         1           1.000000
 2       0.288521         1           0.833333
 3       0.143830         1           0.833333
 4       0.030317         1           1.000000,
    TotalSpending  HasSpent  SpendingDiversity
 0       0.000000         0           0.000000
 1       0.078695         1           0.500000
 2       0.000000         0           0.000000
 3       0.206130         1           0.666667
 4       0.017923         1           0.500000)

In [260]:
# Re-checking for any remaining missing values
remaining_missing_values_train = train_df.isnull().sum()
remaining_missing_values_test = test_df.isnull().sum()

remaining_missing_values_train, remaining_missing_values_test

(PassengerId            0
 HomePlanet             0
 CryoSleep              0
 Cabin                  0
 Destination            0
 Age                    0
 VIP                    0
 RoomService            0
 FoodCourt              0
 ShoppingMall           0
 Spa                    0
 VRDeck                 0
 Name                 200
 Transported            0
 AgeGroup               0
 TotalSpending          0
 HasSpent               0
 SpendingDiversity      0
 dtype: int64,
 PassengerId           0
 HomePlanet            0
 CryoSleep             0
 Cabin                 0
 Destination           0
 Age                   0
 VIP                   0
 RoomService           0
 FoodCourt             0
 ShoppingMall          0
 Spa                   0
 VRDeck                0
 Name                 94
 AgeGroup              0
 TotalSpending         0
 HasSpent              0
 SpendingDiversity     0
 dtype: int64)

In [261]:
train_df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,AgeGroup,TotalSpending,HasSpent,SpendingDiversity
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.000000,0.000000,0.000000,0.000000,0.000000,Maham Ofracculy,False,Adult,0.000000,0,0.000000
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,0.007608,0.000302,0.001064,0.024500,0.001823,Juanna Vines,True,Adult,0.020452,1,1.000000
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,0.003001,0.119948,0.000000,0.299670,0.002030,Altark Susent,False,Adult,0.288521,1,0.833333
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.000000,0.043035,0.015793,0.148563,0.007997,Solam Susent,False,Adult,0.143830,1,0.833333
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,0.021149,0.002348,0.006428,0.025214,0.000083,Willy Santantines,True,Teen,0.030317,1,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.000000,0.228726,0.000000,0.073322,0.003066,Gravior Noxnuther,False,Adult,0.237197,1,0.666667
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.000000,0.000000,0.000000,0.000000,0.000000,Kurta Mondalley,False,Teen,0.000000,0,0.000000
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.000000,0.000000,0.079687,0.000045,0.000000,Fayey Connon,True,Adult,0.052047,1,0.500000
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.000000,0.035186,0.000000,0.015753,0.134049,Celeon Hontichre,False,Adult,0.128852,1,0.666667


In [262]:
# Converting 'CryoSleep', 'VIP', 'Transported' into binary (0 or 1)
train_df['CryoSleep'] = train_df['CryoSleep'].astype(int)
test_df['CryoSleep'] = test_df['CryoSleep'].astype(int)
train_df['VIP'] = train_df['VIP'].astype(int)
test_df['VIP'] = test_df['VIP'].astype(int)
train_df['Transported'] = train_df['Transported'].astype(int)

# Verifying the conversion by displaying a sample of the updated data
train_df[['CryoSleep', 'VIP', 'Transported']].head(), test_df[['CryoSleep', 'VIP']].head()

(   CryoSleep  VIP  Transported
 0          0    0            0
 1          0    0            1
 2          0    1            0
 3          0    0            0
 4          0    0            1,
    CryoSleep  VIP
 0          1    0
 1          0    0
 2          1    0
 3          0    0
 4          0    0)

In [263]:
# Normalize 'AgeGroup' using the oldest age as 1 and the youngest as 0
# Mapping AgeGroups to a normalized scale
age_group_mapping = {'Child': 0, 'Teen': 0.33, 'Adult': 0.67, 'Senior': 1}
train_df['AgeGroupNorm'] = train_df['AgeGroup'].map(age_group_mapping)
test_df['AgeGroupNorm'] = test_df['AgeGroup'].map(age_group_mapping)

In [264]:
train_df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,AgeGroup,TotalSpending,HasSpent,SpendingDiversity,AgeGroupNorm
0,0001_01,Europa,0,B/0/P,TRAPPIST-1e,39.0,0,0.000000,0.000000,0.000000,0.000000,0.000000,Maham Ofracculy,0,Adult,0.000000,0,0.000000,0.67
1,0002_01,Earth,0,F/0/S,TRAPPIST-1e,24.0,0,0.007608,0.000302,0.001064,0.024500,0.001823,Juanna Vines,1,Adult,0.020452,1,1.000000,0.67
2,0003_01,Europa,0,A/0/S,TRAPPIST-1e,58.0,1,0.003001,0.119948,0.000000,0.299670,0.002030,Altark Susent,0,Adult,0.288521,1,0.833333,0.67
3,0003_02,Europa,0,A/0/S,TRAPPIST-1e,33.0,0,0.000000,0.043035,0.015793,0.148563,0.007997,Solam Susent,0,Adult,0.143830,1,0.833333,0.67
4,0004_01,Earth,0,F/1/S,TRAPPIST-1e,16.0,0,0.021149,0.002348,0.006428,0.025214,0.000083,Willy Santantines,1,Teen,0.030317,1,1.000000,0.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,0,A/98/P,55 Cancri e,41.0,1,0.000000,0.228726,0.000000,0.073322,0.003066,Gravior Noxnuther,0,Adult,0.237197,1,0.666667,0.67
8689,9278_01,Earth,1,G/1499/S,PSO J318.5-22,18.0,0,0.000000,0.000000,0.000000,0.000000,0.000000,Kurta Mondalley,0,Teen,0.000000,0,0.000000,0.33
8690,9279_01,Earth,0,G/1500/S,TRAPPIST-1e,26.0,0,0.000000,0.000000,0.079687,0.000045,0.000000,Fayey Connon,1,Adult,0.052047,1,0.500000,0.67
8691,9280_01,Europa,0,E/608/S,55 Cancri e,32.0,0,0.000000,0.035186,0.000000,0.015753,0.134049,Celeon Hontichre,0,Adult,0.128852,1,0.666667,0.67


In [265]:
# Applying One-Hot Encoding to 'HomePlanet' and 'Destination'
train_df = pd.get_dummies(train_df, columns=['HomePlanet', 'Destination'])
test_df = pd.get_dummies(test_df, columns=['HomePlanet', 'Destination'])

In [266]:
train_df

Unnamed: 0,PassengerId,CryoSleep,Cabin,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,...,TotalSpending,HasSpent,SpendingDiversity,AgeGroupNorm,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
0,0001_01,0,B/0/P,39.0,0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0,0.000000,0.67,0,1,0,0,0,1
1,0002_01,0,F/0/S,24.0,0,0.007608,0.000302,0.001064,0.024500,0.001823,...,0.020452,1,1.000000,0.67,1,0,0,0,0,1
2,0003_01,0,A/0/S,58.0,1,0.003001,0.119948,0.000000,0.299670,0.002030,...,0.288521,1,0.833333,0.67,0,1,0,0,0,1
3,0003_02,0,A/0/S,33.0,0,0.000000,0.043035,0.015793,0.148563,0.007997,...,0.143830,1,0.833333,0.67,0,1,0,0,0,1
4,0004_01,0,F/1/S,16.0,0,0.021149,0.002348,0.006428,0.025214,0.000083,...,0.030317,1,1.000000,0.33,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,0,A/98/P,41.0,1,0.000000,0.228726,0.000000,0.073322,0.003066,...,0.237197,1,0.666667,0.67,0,1,0,1,0,0
8689,9278_01,1,G/1499/S,18.0,0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0,0.000000,0.33,1,0,0,0,1,0
8690,9279_01,0,G/1500/S,26.0,0,0.000000,0.000000,0.079687,0.000045,0.000000,...,0.052047,1,0.500000,0.67,1,0,0,0,0,1
8691,9280_01,0,E/608/S,32.0,0,0.000000,0.035186,0.000000,0.015753,0.134049,...,0.128852,1,0.666667,0.67,0,1,0,1,0,0


In [267]:
# Columns to convert to integers
columns_to_convert = ['Destination_55 Cancri e', 'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e']

# Convert each specified column to integer
for column in columns_to_convert:
    train_df[column] = train_df[column].astype(int)


In [268]:
train_df

Unnamed: 0,PassengerId,CryoSleep,Cabin,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,...,TotalSpending,HasSpent,SpendingDiversity,AgeGroupNorm,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
0,0001_01,0,B/0/P,39.0,0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0,0.000000,0.67,0,1,0,0,0,1
1,0002_01,0,F/0/S,24.0,0,0.007608,0.000302,0.001064,0.024500,0.001823,...,0.020452,1,1.000000,0.67,1,0,0,0,0,1
2,0003_01,0,A/0/S,58.0,1,0.003001,0.119948,0.000000,0.299670,0.002030,...,0.288521,1,0.833333,0.67,0,1,0,0,0,1
3,0003_02,0,A/0/S,33.0,0,0.000000,0.043035,0.015793,0.148563,0.007997,...,0.143830,1,0.833333,0.67,0,1,0,0,0,1
4,0004_01,0,F/1/S,16.0,0,0.021149,0.002348,0.006428,0.025214,0.000083,...,0.030317,1,1.000000,0.33,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,0,A/98/P,41.0,1,0.000000,0.228726,0.000000,0.073322,0.003066,...,0.237197,1,0.666667,0.67,0,1,0,1,0,0
8689,9278_01,1,G/1499/S,18.0,0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0,0.000000,0.33,1,0,0,0,1,0
8690,9279_01,0,G/1500/S,26.0,0,0.000000,0.000000,0.079687,0.000045,0.000000,...,0.052047,1,0.500000,0.67,1,0,0,0,0,1
8691,9280_01,0,E/608/S,32.0,0,0.000000,0.035186,0.000000,0.015753,0.134049,...,0.128852,1,0.666667,0.67,0,1,0,1,0,0


In [269]:
# Columns to convert to integers
columns_to_convert = ['HomePlanet_Earth', 'HomePlanet_Europa', 'HomePlanet_Mars']

# Convert each specified column to integer
for column in columns_to_convert:
    train_df[column] = train_df[column].astype(int)


In [270]:
train_df

Unnamed: 0,PassengerId,CryoSleep,Cabin,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,...,TotalSpending,HasSpent,SpendingDiversity,AgeGroupNorm,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
0,0001_01,0,B/0/P,39.0,0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0,0.000000,0.67,0,1,0,0,0,1
1,0002_01,0,F/0/S,24.0,0,0.007608,0.000302,0.001064,0.024500,0.001823,...,0.020452,1,1.000000,0.67,1,0,0,0,0,1
2,0003_01,0,A/0/S,58.0,1,0.003001,0.119948,0.000000,0.299670,0.002030,...,0.288521,1,0.833333,0.67,0,1,0,0,0,1
3,0003_02,0,A/0/S,33.0,0,0.000000,0.043035,0.015793,0.148563,0.007997,...,0.143830,1,0.833333,0.67,0,1,0,0,0,1
4,0004_01,0,F/1/S,16.0,0,0.021149,0.002348,0.006428,0.025214,0.000083,...,0.030317,1,1.000000,0.33,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,0,A/98/P,41.0,1,0.000000,0.228726,0.000000,0.073322,0.003066,...,0.237197,1,0.666667,0.67,0,1,0,1,0,0
8689,9278_01,1,G/1499/S,18.0,0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0,0.000000,0.33,1,0,0,0,1,0
8690,9279_01,0,G/1500/S,26.0,0,0.000000,0.000000,0.079687,0.000045,0.000000,...,0.052047,1,0.500000,0.67,1,0,0,0,0,1
8691,9280_01,0,E/608/S,32.0,0,0.000000,0.035186,0.000000,0.015753,0.134049,...,0.128852,1,0.666667,0.67,0,1,0,1,0,0


In [271]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 23 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   PassengerId                8693 non-null   object  
 1   CryoSleep                  8693 non-null   int64   
 2   Cabin                      8693 non-null   object  
 3   Age                        8693 non-null   float64 
 4   VIP                        8693 non-null   int64   
 5   RoomService                8693 non-null   float64 
 6   FoodCourt                  8693 non-null   float64 
 7   ShoppingMall               8693 non-null   float64 
 8   Spa                        8693 non-null   float64 
 9   VRDeck                     8693 non-null   float64 
 10  Name                       8493 non-null   object  
 11  Transported                8693 non-null   int64   
 12  AgeGroup                   8693 non-null   category
 13  TotalSpending              8693 n

In [272]:
# Columns to convert to integers
columns_to_convert = ['AgeGroupNorm']

# Convert each specified column to integer
for column in columns_to_convert:
    train_df[column] = train_df[column].astype(int)


In [273]:
mode = train_df['Cabin'].value_counts().idxmax()
mode


'G/734/S'

In [274]:
modetest = train_df['Cabin'].value_counts().idxmax()
modetest


'G/734/S'

In [275]:
# Substituting NAs with mode

train_df['Cabin'] = train_df['Cabin'].fillna(mode)
nas = train_df['Cabin'].isna().sum()
na_perc = nas / len(train_df)
na_perc

0.0

In [276]:
# Substituting NAs with mode

test_df['Cabin'] = test_df['Cabin'].fillna(mode)
nas = test_df['Cabin'].isna().sum()
na_perc = nas / len(test_df)
na_perc

0.0

In [277]:
# Extracting the first letter from the string
train_df['CabinChar_1'] = train_df['Cabin'].str.extract(r'^([A-Za-z])')
train_df['CabinChar_1']

0       B
1       F
2       A
3       A
4       F
       ..
8688    A
8689    G
8690    G
8691    E
8692    E
Name: CabinChar_1, Length: 8693, dtype: object

In [278]:
# Extracting the first letter from the string
test_df['CabinChar_1'] = test_df['Cabin'].str.extract(r'^([A-Za-z])')
test_df['CabinChar_1']

0       G
1       F
2       C
3       C
4       F
       ..
4272    G
4273    G
4274    D
4275    D
4276    G
Name: CabinChar_1, Length: 4277, dtype: object

In [279]:
# Extracting the numbers from middle from the string
train_df['CabinChar_2'] = train_df['Cabin'].str.extract(r'/(\d+)/')
train_df['CabinChar_2'].value_counts()

734     208
82       28
86       22
19       22
56       21
       ... 
1644      1
1515      1
1639      1
1277      1
1894      1
Name: CabinChar_2, Length: 1817, dtype: int64

In [280]:
# Extracting the numbers from middle from the string
test_df['CabinChar_2'] = test_df['Cabin'].str.extract(r'/(\d+)/')
test_df['CabinChar_2'].value_counts()

160     113
4        21
31       18
197      16
294      16
       ... 
1170      1
904       1
1174      1
356       1
1503      1
Name: CabinChar_2, Length: 1505, dtype: int64

In [281]:
# Extracting the last letter from the string
train_df['CabinChar_3'] = train_df['Cabin'].str[-1]
train_df['CabinChar_3'].value_counts()

S    4487
P    4206
Name: CabinChar_3, dtype: int64

In [282]:
# Extracting the last letter from the string
test_df['CabinChar_3'] = test_df['Cabin'].str[-1]
test_df['CabinChar_3'].value_counts()

P    2184
S    2093
Name: CabinChar_3, dtype: int64

In [283]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 26 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   PassengerId                8693 non-null   object  
 1   CryoSleep                  8693 non-null   int64   
 2   Cabin                      8693 non-null   object  
 3   Age                        8693 non-null   float64 
 4   VIP                        8693 non-null   int64   
 5   RoomService                8693 non-null   float64 
 6   FoodCourt                  8693 non-null   float64 
 7   ShoppingMall               8693 non-null   float64 
 8   Spa                        8693 non-null   float64 
 9   VRDeck                     8693 non-null   float64 
 10  Name                       8493 non-null   object  
 11  Transported                8693 non-null   int64   
 12  AgeGroup                   8693 non-null   category
 13  TotalSpending              8693 n

In [284]:
# Changing dtypes
## New cols
train_df = train_df.astype({'CabinChar_1': 'category', 'CabinChar_2': 'int', 'CabinChar_3': 'category'})
test_df = test_df.astype({'CabinChar_1': 'category', 'CabinChar_2': 'int', 'CabinChar_3': 'category'})


In [285]:
# Define the custom mapping
custom_mapping = {'A': 1, 'T': 0}
step_size = 1 / 7  # Since we have 7 intervals between A and T

# Assigning values for B through G
for i, letter in enumerate('BCDEFG', start=1):
    custom_mapping[letter] = 1 - (step_size * i)

# Apply the mapping
train_df['CabinChar_Scaled'] = train_df['CabinChar_1'].map(custom_mapping)

In [286]:
# Define the custom mapping
custom_mapping = {'A': 1, 'T': 0}
step_size = 1 / 7  # Since we have 7 intervals between A and T

# Assigning values for B through G
for i, letter in enumerate('BCDEFG', start=1):
    custom_mapping[letter] = 1 - (step_size * i)

# Apply the mapping
test_df['CabinChar_Scaled'] = test_df['CabinChar_1'].map(custom_mapping)

In [287]:
train_df['CabinChar3_Scaled'] = train_df['CabinChar_3'].map({'S': 1, 'P': 0})

In [288]:
test_df['CabinChar3_Scaled'] = test_df['CabinChar_3'].map({'S': 1, 'P': 0})

In [289]:
train_df

Unnamed: 0,PassengerId,CryoSleep,Cabin,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,...,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,CabinChar_1,CabinChar_2,CabinChar_3,CabinChar_Scaled,CabinChar3_Scaled
0,0001_01,0,B/0/P,39.0,0,0.000000,0.000000,0.000000,0.000000,0.000000,...,1,0,0,0,1,B,0,P,0.857143,0
1,0002_01,0,F/0/S,24.0,0,0.007608,0.000302,0.001064,0.024500,0.001823,...,0,0,0,0,1,F,0,S,0.285714,1
2,0003_01,0,A/0/S,58.0,1,0.003001,0.119948,0.000000,0.299670,0.002030,...,1,0,0,0,1,A,0,S,1.000000,1
3,0003_02,0,A/0/S,33.0,0,0.000000,0.043035,0.015793,0.148563,0.007997,...,1,0,0,0,1,A,0,S,1.000000,1
4,0004_01,0,F/1/S,16.0,0,0.021149,0.002348,0.006428,0.025214,0.000083,...,0,0,0,0,1,F,1,S,0.285714,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,0,A/98/P,41.0,1,0.000000,0.228726,0.000000,0.073322,0.003066,...,1,0,1,0,0,A,98,P,1.000000,0
8689,9278_01,1,G/1499/S,18.0,0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,0,1,0,G,1499,S,0.142857,1
8690,9279_01,0,G/1500/S,26.0,0,0.000000,0.000000,0.079687,0.000045,0.000000,...,0,0,0,0,1,G,1500,S,0.142857,1
8691,9280_01,0,E/608/S,32.0,0,0.000000,0.035186,0.000000,0.015753,0.134049,...,1,0,1,0,0,E,608,S,0.428571,1


In [290]:
test_df

Unnamed: 0,PassengerId,CryoSleep,Cabin,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,...,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,CabinChar_1,CabinChar_2,CabinChar_3,CabinChar_Scaled,CabinChar3_Scaled
0,0013_01,1,G/3/S,27.0,0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,1,G,3,S,0.142857,1
1,0018_01,0,F/4/S,19.0,0,0.000000,0.000302,0.000000,0.125982,0.000000,...,0,0,0,0,1,F,4,S,0.285714,1
2,0019_01,1,C/0/S,31.0,0,0.000000,0.000000,0.000000,0.000000,0.000000,...,1,0,1,0,0,C,0,S,0.714286,1
3,0021_01,0,C/1/S,38.0,0,0.000000,0.223124,0.000000,0.008077,0.024241,...,1,0,0,0,1,C,1,S,0.714286,1
4,0023_01,0,F/5/S,20.0,0,0.000698,0.000000,0.027030,0.000000,0.000000,...,0,0,0,0,1,F,5,S,0.285714,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,1,G/1496/S,34.0,0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,1,G,1496,S,0.142857,1
4273,9269_01,0,G/160/P,42.0,0,0.000000,0.028410,0.000724,0.000446,0.005967,...,0,0,0,0,1,G,160,P,0.142857,0
4274,9271_01,1,D/296/P,26.0,0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,1,1,0,0,D,296,P,0.571429,0
4275,9273_01,0,D/297/P,26.0,0,0.000000,0.089894,0.000000,0.000000,0.021672,...,1,0,0,0,1,D,297,P,0.571429,0


# Model

In [291]:
# Assuming 'train_df' is your DataFrame and 'Transported' is the target variable
features = [
    'CryoSleep', 'SpendingDiversity', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa',
    'VRDeck', 'HomePlanet_Europa', 'HomePlanet_Mars', 'Destination_55 Cancri e','CabinChar3_Scaled'
]
X = train_df[features]
y = train_df['Transported']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## Voting Classifier with Gradient Boosting, Random Forest and Logistic Regression

In [292]:
# Define individual models
clf1 = GradientBoostingClassifier(n_estimators=100, random_state=42)
clf2 = RandomForestClassifier(n_estimators=100, random_state=42)
clf3 = LogisticRegression(random_state=42)

# Combine them into a VotingClassifier
# Using 'soft' voting to leverage the probability predictions
voting_clf = VotingClassifier(estimators=[
    ('gb', clf1), ('rf', clf2), ('lr', clf3)],
    voting='soft')

# Train the Voting Classifier
voting_clf.fit(X_train, y_train)

In [293]:
# Make predictions
y_pred = voting_clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy of the Voting Classifier: {accuracy:.4f}')

Accuracy of the Voting Classifier: 0.7861


# Testing Model

In [294]:
# Assuming 'test_df' is your DataFrame containing the test data and has the same features as the training data
X_test_subset = test_df[features]

# Make predictions on the test data using the trained Voting Classifier model
y_pred_test_subset = voting_clf.predict(X_test_subset)

# Optionally, convert predictions to a more interpretable format (e.g., 'True' for transported, 'False' for not transported)
# This step is optional and depends on how you want to use or display the predictions
predictions_test = ['True' if pred else 'False' for pred in y_pred_test_subset]

# Create a DataFrame with relevant identifiers (e.g., PassengerId) and predicted Transported labels for the test data
predictions_df = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],  # Replace 'PassengerId' with the actual identifier column name if different
    'Transported': predictions_test
})

# Display the first few rows of the predictions DataFrame to verify
print(predictions_df.head())

# Optionally, save the predictions to a CSV file for further use or submission
predictions_df.to_csv('Team_2-Spaceship_Titanic.csv', index=False)

print("Predictions for the test dataset have been saved to 'voting_classifier_predictions.csv'")


  PassengerId Transported
0     0013_01        True
1     0018_01       False
2     0019_01        True
3     0021_01        True
4     0023_01        True
Predictions for the test dataset have been saved to 'voting_classifier_predictions.csv'
