## Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier

## Data

### Load Data

In [None]:
# Read both competition splits from the local data directory.
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

# Report the dimensions of each split, then preview the first training rows.
print(
    f"Train shape: {train_data.shape}",
    f"Test shape: {test_data.shape}",
    '',
    sep='\n',
)
print(train_data.head())

### Display missing values

In [None]:
# Per-column NaN counts for each split, separated by a blank line.
print(
    'Missing values (train set):',
    train_data.isna().sum(),
    '',
    'Missing values (test set):',
    test_data.isna().sum(),
    sep='\n',
)

### Data types

In [None]:
# Column dtypes of each split, separated by a blank line.
print(
    'Train set data types:',
    train_data.dtypes,
    '',
    'Test set data types:',
    test_data.dtypes,
    sep='\n',
)

We need to convert some features to the appropriate data types (int64 or float64) for the model to work properly.

## Data Exploration

### Transported distribution

In [None]:
# Pie chart of the target balance in the training set.
plt.figure(figsize=(5, 5))

# value_counts() sorts by frequency, so the wedge order depends on which class
# is larger. Pin the order to [True, False] so every wedge is always paired
# with the label written for it below.
counts = train_data['Transported'].value_counts().reindex([True, False], fill_value=0)
labels = [
    f"Transported ({counts.loc[True]})",
    f"Not Transported ({counts.loc[False]})",
]
plt.pie(counts, labels=labels, autopct='%1.1f%%')

plt.axis('equal')
plt.show()

### Expenses distribution

In [None]:
# Spending columns; this list is reused by the cleaning helpers further down.
exp_features = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

fig = plt.figure(figsize=(10, 20))
for i, feature in enumerate(exp_features):
    # One row per feature. Only the left column (odd subplot slots) is drawn,
    # so the right column of the grid stays empty.
    ax = fig.add_subplot(len(exp_features), 2, 2 * i + 1)
    # seaborn's keyword for the target Axes is `ax` (not `axes`).
    sns.histplot(data=train_data, x=feature, ax=ax, bins=30, kde=False, hue='Transported')
    ax.set_title(feature)

fig.tight_layout()
plt.show()

We notice that most passengers have no expenses. We will create a new feature indicating whether a passenger has expenses or not.

### CryoSleep distribution

In [None]:
# Count of passengers per CryoSleep value, split by the target.
ax = plt.figure(figsize=(8, 5)).add_subplot(1, 1, 1)
sns.countplot(data=train_data, x='CryoSleep', hue='Transported', ax=ax)
ax.set_title('CryoSleep distribution')
plt.show()

CryoSleep seems to have an impact on the target variable since most passengers in CryoSleep were transported.

### HomePlanet & Destination distribution

In [None]:
# One count plot per categorical column, each split by the target.
for column, title in (
    ('HomePlanet', 'HomePlanet distribution'),
    ('Destination', 'Destination distribution'),
):
    ax = plt.figure(figsize=(8, 5)).add_subplot(1, 1, 1)
    sns.countplot(data=train_data, x=column, hue='Transported', ax=ax)
    ax.set_title(title)
    plt.show()

HomePlanet & Destination do not seem to have a significant impact on the target variable (especially Destination). However, we will keep them for now as they might interact with other features.

### VIP distribution

In [None]:
# Count of passengers per VIP value, split by the target.
fig = plt.figure(figsize=(8, 5))
ax = fig.add_subplot(1, 1, 1)
sns.countplot(data=train_data, x='VIP', hue='Transported', ax=ax)
ax.set_title('VIP distribution')
plt.show()

VIP does not seem to be a significant feature, so it is a candidate for dropping. (Note: it is still included in the feature list used for model training below.)

### Qualitative features

We can't plot all qualitative features yet because they have too many unique values. We will handle them later.

In [None]:
# Free-text / identifier columns that are decomposed into features later on.
qualitative_features = ['PassengerId', 'Name', 'Cabin']

# Preview the first rows of just those columns.
print(train_data.loc[:, qualitative_features].head())

- We can extract the group of a passenger from the PassengerId feature. We will create a new feature indicating the group of a passenger.
- We can extract the deck, num and side of a passenger from the Cabin feature. We will create three new features indicating the deck, num and side of a passenger.
- We can extract the last name of a passenger from the Name feature. We will create a new feature indicating the last name of a passenger.

## Data Cleaning

### Display Missing values

In [None]:
def _missing_summary(frame):
    """Return (columns containing NaN, table of their missing count and percentage)."""
    na_cols = frame.columns[frame.isna().any()].tolist()
    summary = frame[na_cols].isna().sum().to_frame('Number_missing')
    summary['Percentage_missing'] = (100 * summary['Number_missing'] / len(frame)).round(2)
    return na_cols, summary


# Missing values summary for train set
print("Train Set Missing Values:")
na_train_cols, mv_train = _missing_summary(train_data)
print(mv_train)

print("\n" + "="*50 + "\n")

# Missing values summary for test set
print("Test Set Missing Values:")
na_test_cols, mv_test = _missing_summary(test_data)
print(mv_test)

Only 2% of values are missing. However, almost every feature has missing values. We need to handle them.

### Missing values per passenger

In [None]:
# Number of NaN cells in each row, kept as a standalone Series so that
# train_data itself never carries a temporary helper column.
missing_per_passenger = train_data.isna().sum(axis=1).rename('MissingPerPassenger')

plt.figure(figsize=(10, 4))
sns.countplot(data=train_data, x=missing_per_passenger, hue='Transported')
plt.title('Missing values per passenger')

We're only missing around 2% of the values, but around 25% of passengers have at least one missing value.
The easiest way to handle them could be to use the median for numerical features and the mode for categorical features.
However, we will look at the joint distribution of missing values to see if there are patterns.

### Features creation

#### Compute spendings

In [None]:
def compute_spendings(data: pd.DataFrame):
    """Add total-spending features to ``data`` in place.

    Adds two columns:
      * ``TotalSpendings`` - row-wise sum over the columns listed in the
        module-level ``exp_features`` (pandas' default sum skips NaN).
      * ``HasNotSpent`` - 1 when ``TotalSpendings`` equals 0, otherwise 0.

    Returns the same frame that was passed in.
    """
    total = data[exp_features].sum(axis=1)
    data['TotalSpendings'] = total
    data['HasNotSpent'] = total.eq(0).astype(int)
    return data

#### Passenger groups

In [None]:
def create_groups(data: pd.DataFrame):
    """Derive travel-group features from ``PassengerId`` in place.

    Adds three columns:
      * ``Group`` - the part of ``PassengerId`` before the first ``_``, as int.
      * ``GroupSize`` - number of rows sharing that ``Group``.
      * ``IsAlone`` - 1 when ``GroupSize`` is 1, otherwise 0.

    Returns the same frame that was passed in.
    """
    group_ids = data['PassengerId'].str.split('_').str[0].astype(int)
    data['Group'] = group_ids

    members_per_group = data['Group'].value_counts()
    data['GroupSize'] = data['Group'].map(members_per_group)

    data['IsAlone'] = data['GroupSize'].eq(1).astype(int)
    return data

#### Split Cabin features

In [None]:
def split_cabin(data: pd.DataFrame):
    """Split ``Cabin`` ("deck/num/side") into three separate columns.

    Adds ``CabinDeck``, ``CabinNum`` (numeric) and ``CabinSide`` to ``data``
    in place, then returns a new frame without the original ``Cabin`` column.

    Missing cabins simply propagate as NaN in all three new columns. No
    placeholder value is used, so a genuine cabin number (e.g. 9999) or deck
    letter can never be mistaken for "missing", and no NaN has to be written
    into an integer column afterwards.
    """
    # astype(object) keeps the .str accessor usable even if the column was
    # read as all-NaN floats; reindex guarantees three part-columns exist.
    parts = (
        data['Cabin']
        .astype(object)
        .str.split('/', expand=True)
        .reindex(columns=range(3))
    )

    data['CabinDeck'] = parts[0]
    data['CabinNum'] = pd.to_numeric(parts[1])
    data['CabinSide'] = parts[2]

    return data.drop('Cabin', axis=1)

#### Compute family size

In [None]:
def compute_family_size(data: pd.DataFrame):
    """Derive ``LastName`` and ``FamilySize`` from ``Name``.

    Adds to ``data`` in place:
      * ``LastName`` - last whitespace-separated token of ``Name``
        (NaN when ``Name`` is missing).
      * ``FamilySize`` - number of rows sharing that ``LastName``
        (NaN when ``LastName`` is missing).

    Returns a new frame without the ``Name`` column.

    Missing names are propagated as NaN directly instead of being grouped
    under a placeholder surname and detected with a size threshold; that
    threshold only works when more than 100 names are missing.
    """
    # astype(object) keeps the .str accessor usable on an all-NaN column.
    data['LastName'] = data['Name'].astype(object).str.split().str[-1]

    # value_counts() ignores NaN, so passengers without a surname map to NaN.
    surname_counts = data['LastName'].value_counts()
    data['FamilySize'] = data['LastName'].map(surname_counts)

    return data.drop('Name', axis=1)

### Handle missing values

#### Fill HomePlanet missing values

In [None]:
def fill_homeplanet_values(data: pd.DataFrame):
    """Impute missing ``HomePlanet`` values in place and return ``data``.

    Rules are applied in order; each one only touches rows still missing:
      1. Most frequent HomePlanet among the passenger's ``Group``.
      2. ``CabinDeck`` A, B, C or T -> 'Europa'; deck G -> 'Earth'.
      3. Most frequent HomePlanet among passengers with the same ``LastName``.
      4. Otherwise 'Mars' when on deck D, else 'Earth'.

    Requires the columns ``Group``, ``HomePlanet``, ``CabinDeck`` and
    ``LastName``. Rows are selected with boolean masks and ``.loc`` so the
    function works with any index, not only a default RangeIndex.
    """
    # 1) Majority HomePlanet within each travel group.
    group_counts = data.groupby(['Group', 'HomePlanet'])['HomePlanet'].size().unstack().fillna(0)
    if not group_counts.empty:
        # Computed once here rather than once per imputed row.
        group_planet = group_counts.idxmax(axis=1)
        from_group = data['HomePlanet'].isna() & data['Group'].isin(group_planet.index)
        data.loc[from_group, 'HomePlanet'] = data.loc[from_group, 'Group'].map(group_planet)

    # 2) Deck-based rules.
    # Decks A, B, C or T are from Europa
    data.loc[data['HomePlanet'].isna() & data['CabinDeck'].isin(['A', 'B', 'C', 'T']), 'HomePlanet'] = 'Europa'
    # Deck G is from Earth
    data.loc[data['HomePlanet'].isna() & (data['CabinDeck'] == 'G'), 'HomePlanet'] = 'Earth'

    # 3) Majority HomePlanet within each surname (uses the values filled above).
    surname_counts = data.groupby(['LastName', 'HomePlanet'])['HomePlanet'].size().unstack().fillna(0)
    if not surname_counts.empty:
        surname_planet = surname_counts.idxmax(axis=1)
        from_surname = data['HomePlanet'].isna() & data['LastName'].isin(surname_planet.index)
        data.loc[from_surname, 'HomePlanet'] = data.loc[from_surname, 'LastName'].map(surname_planet)

    # 4) No one from deck D is from Earth, so the remaining rows become
    #    Earth (if not on deck D) or Mars (if on deck D).
    on_deck_d = data['CabinDeck'] == 'D'
    data.loc[data['HomePlanet'].isna() & ~on_deck_d, 'HomePlanet'] = 'Earth'
    data.loc[data['HomePlanet'].isna() & on_deck_d, 'HomePlanet'] = 'Mars'

    return data

#### Fill Destination missing values

In [None]:
def fill_destination_values(data: pd.DataFrame):
    """Replace missing ``Destination`` values with 'TRAPPIST-1e' in place.

    Rationale kept from the original author: most passengers are going to
    TRAPPIST-1e. Returns the same frame that was passed in.
    """
    destination_unknown = data['Destination'].isna()
    data.loc[destination_unknown, 'Destination'] = 'TRAPPIST-1e'
    return data

#### Fill LastName missing values

In [None]:
def fill_lastname_values(data: pd.DataFrame):
    """Impute missing ``LastName`` from the passenger's travel group, in place.

    Only groups with ``GroupSize > 1`` are used as a source. A passenger with
    a missing surname receives the most frequent surname of their ``Group``
    when that group has at least one known surname; other rows stay NaN.

    Rows are selected with a boolean mask and ``.loc`` so the function works
    with any index, not only a default RangeIndex. Returns ``data``.
    """
    shared_groups = data[data['GroupSize'] > 1]
    surname_counts = shared_groups.groupby(['Group', 'LastName'])['LastName'].size().unstack().fillna(0)
    if surname_counts.empty:
        # No multi-passenger group has a known surname: nothing to copy from.
        return data

    # Computed once here rather than once per imputed row.
    group_surname = surname_counts.idxmax(axis=1)

    # Passengers with missing LastName but whose group has a known LastName
    fillable = data['LastName'].isna() & data['Group'].isin(group_surname.index)
    data.loc[fillable, 'LastName'] = data.loc[fillable, 'Group'].map(group_surname)

    return data

#### Fill family size missing values

In [None]:
def fill_family_size_values(data: pd.DataFrame):
    """Recompute ``FamilySize`` after surnames were imputed, in place.

    * Missing ``LastName`` values are replaced by the placeholder 'Unknown'.
    * ``FamilySize`` becomes the number of rows sharing the same ``LastName``.
    * Rows with the 'Unknown' placeholder get ``FamilySize`` 0, because an
      unknown last name is treated as "no family".

    Previously the placeholder rows were set to NaN first, so the follow-up
    rule that was supposed to zero them never matched and they stayed NaN.
    Returns ``data``.
    """
    data['LastName'] = data['LastName'].fillna('Unknown')

    # Count every surname once instead of re-counting for each row.
    surname_counts = data['LastName'].value_counts()
    data['FamilySize'] = data['LastName'].map(surname_counts)

    # Unknown last name means no family
    data.loc[data['LastName'] == 'Unknown', 'FamilySize'] = 0

    return data

#### Fill Cabin Side missing values

In [None]:
def fill_cabin_side_values(data: pd.DataFrame):
    """Impute missing ``CabinSide`` and drop ``LastName``.

    Rules are applied in order; each one only touches rows still missing:
      1. Most frequent side within the passenger's ``Group``
         (source rows: ``GroupSize > 1``).
      2. Most frequent side among passengers with the same ``LastName``
         (source rows: ``GroupSize > 1``). The placeholder surname 'Unknown'
         is excluded, since those passengers are not one real family.
      3. Anything left becomes 'Z' (unknown).

    All fills are written to ``data`` in place *before* ``LastName`` is
    dropped, so they are visible whether or not the caller uses the returned
    frame. Rows are selected with boolean masks and ``.loc`` so any index
    works. Returns a new frame without ``LastName``.
    """
    # 1) Majority side within each multi-passenger group.
    group_counts = (
        data[data['GroupSize'] > 1]
        .groupby(['Group', 'CabinSide'])['CabinSide'].size().unstack().fillna(0)
    )
    if not group_counts.empty:
        group_side = group_counts.idxmax(axis=1)
        from_group = data['CabinSide'].isna() & data['Group'].isin(group_side.index)
        data.loc[from_group, 'CabinSide'] = data.loc[from_group, 'Group'].map(group_side)

    # 2) Majority side within each real surname (uses the values filled above).
    has_real_surname = data['LastName'] != 'Unknown'
    surname_counts = (
        data[(data['GroupSize'] > 1) & has_real_surname]
        .groupby(['LastName', 'CabinSide'])['CabinSide'].size().unstack().fillna(0)
    )
    if not surname_counts.empty:
        surname_side = surname_counts.idxmax(axis=1)
        from_surname = data['CabinSide'].isna() & data['LastName'].isin(surname_side.index)
        data.loc[from_surname, 'CabinSide'] = data.loc[from_surname, 'LastName'].map(surname_side)

    # 3) Fill remaining CabinSide missing values with 'Z' (unknown)
    data.loc[data['CabinSide'].isna(), 'CabinSide'] = 'Z'

    return data.drop('LastName', axis=1)

#### Fill Cabin Deck missing values

In [None]:
def fill_cabin_deck_values(data: pd.DataFrame):
    """Impute missing ``CabinDeck`` values in place and return ``data``.

    Rules are applied in order; each one only touches rows still missing:
      1. Most frequent deck within the passenger's ``Group``
         (source rows: ``GroupSize > 1``).
      2. Most frequent deck among passengers with the same
         (``HomePlanet``, ``Destination``, ``IsAlone``) combination.
      3. If that combination has no known deck at all, the most frequent
         deck of the whole frame (this case previously raised an error).

    Rows are selected with boolean masks and ``.loc`` so the function works
    with any index, not only a default RangeIndex.
    """
    # 1) Majority deck within each multi-passenger group.
    group_counts = (
        data[data['GroupSize'] > 1]
        .groupby(['Group', 'CabinDeck'])['CabinDeck'].size().unstack().fillna(0)
    )
    if not group_counts.empty:
        group_deck = group_counts.idxmax(axis=1)
        from_group = data['CabinDeck'].isna() & data['Group'].isin(group_deck.index)
        data.loc[from_group, 'CabinDeck'] = data.loc[from_group, 'Group'].map(group_deck)

    def _fill_with_mode(decks):
        # Leave the segment untouched when it has no known deck to copy.
        modes = decks.mode()
        return decks if modes.empty else decks.fillna(modes.iloc[0])

    # 2) Mode of the (HomePlanet, Destination, IsAlone) segment.
    still_missing = data['CabinDeck'].isna()
    if still_missing.any():
        by_segment = data.groupby(['HomePlanet', 'Destination', 'IsAlone'])['CabinDeck'].transform(_fill_with_mode)
        data.loc[still_missing, 'CabinDeck'] = by_segment[still_missing]

    # 3) Fallback: overall mode for segments without any known deck.
    still_missing = data['CabinDeck'].isna()
    if still_missing.any():
        overall_modes = data['CabinDeck'].mode()
        if not overall_modes.empty:
            data.loc[still_missing, 'CabinDeck'] = overall_modes.iloc[0]

    return data

#### Fill VIP missing values

In [None]:
def fill_vip_values(data: pd.DataFrame):
    """Replace missing ``VIP`` values with ``False`` in place.

    Rationale kept from the original author: most passengers are not VIP.
    Returns the same frame that was passed in.
    """
    vip_unknown = data['VIP'].isna()
    data.loc[vip_unknown, 'VIP'] = False
    return data

#### Fill CryoSleep missing values

In [None]:
def fill_cryosleep_values(data: pd.DataFrame):
    """Impute missing ``CryoSleep`` from the ``HasNotSpent`` flag, in place.

    A passenger with an unknown ``CryoSleep`` is set to ``True`` when
    ``HasNotSpent`` is 1 and to ``False`` when ``HasNotSpent`` is 0.
    Returns the same frame that was passed in.
    """
    cryo_unknown = data['CryoSleep'].isna()
    spent_nothing = data['HasNotSpent'] == 1
    spent_something = data['HasNotSpent'] == 0

    # Passengers who did not spend anything are likely in CryoSleep
    data.loc[cryo_unknown & spent_nothing, 'CryoSleep'] = True
    data.loc[cryo_unknown & spent_something, 'CryoSleep'] = False
    return data

#### Fill Spendings missing values

In [None]:
def fill_spendings_values(data: pd.DataFrame):
    """Impute missing values of every column in ``exp_features``, in place.

    For each spending column:
      * rows with ``CryoSleep == True`` are filled with 0 (passengers in
        CryoSleep are not expected to have spendings);
      * rows with ``CryoSleep == False`` are filled with the median of that
        column over the ``CryoSleep == False`` passengers.

    Returns the same frame that was passed in.
    """
    # CryoSleep is not modified below, so both masks can be built once.
    asleep = data['CryoSleep'].eq(True)
    awake = data['CryoSleep'].eq(False)

    for column in exp_features:
        data.loc[data[column].isna() & asleep, column] = 0
        awake_median = data.loc[awake, column].median()
        data.loc[data[column].isna() & awake, column] = awake_median
    return data

#### Clean function

In [None]:
def clean(data: pd.DataFrame):
    """Run the full feature-engineering and imputation pipeline.

    Steps, in order: create features (spendings, groups, cabin parts, family
    size), then fill missing values (HomePlanet, Destination, LastName,
    FamilySize, CabinSide, CabinDeck, VIP, CryoSleep, spendings), then drop
    the helper columns ``PassengerId``, ``Group`` and ``GroupSize``.

    The input frame is copied first, so the caller's frame is left untouched
    and the function can be re-run on the same input. The result of every
    step is carried forward, because some steps return a *new* frame (for
    example after dropping a column) rather than modifying their argument.

    Returns the cleaned frame.
    """
    # Work on a copy: several steps add or overwrite columns in place.
    data = data.copy()

    pipeline = (
        # Create features
        compute_spendings,
        create_groups,
        split_cabin,
        compute_family_size,
        # Fill missing values
        fill_homeplanet_values,
        fill_destination_values,
        fill_lastname_values,
        fill_family_size_values,
        fill_cabin_side_values,
        fill_cabin_deck_values,
        fill_vip_values,
        fill_cryosleep_values,
        fill_spendings_values,
    )
    for step in pipeline:
        # Always rebind: discarding a returned frame silently loses its changes.
        data = step(data)

    # Drop unneeded features
    return data.drop(['PassengerId', 'Group', 'GroupSize'], axis=1)

# Run the cleaning pipeline on each split separately; the results are the
# frames used for model training and prediction below.
cleaned_train_data = clean(train_data)
# print(cleaned_train_data.head())
cleaned_test_data = clean(test_data)

## Model Training

In [None]:
# Target vector.
y = cleaned_train_data['Transported']

# Model inputs. Each column is listed exactly once: repeating a name would
# feed the same column to the model several times.
features = [
    'CryoSleep',
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
    'TotalSpendings',
    'HomePlanet',
    'Destination',
    'Age',
    'CabinDeck',
    'FamilySize',
    'IsAlone',
    'CabinSide',
    'HasNotSpent',
    'VIP',
]

# One-hot encode the categorical columns.
X = pd.get_dummies(cleaned_train_data[features])
# Align the test matrix with the training columns: a category present in only
# one split would otherwise give the two matrices different columns/order.
X_test = pd.get_dummies(cleaned_test_data[features]).reindex(columns=X.columns, fill_value=0)

# NOTE(review): nothing in this notebook imputes 'Age', so X may still contain
# NaN; confirm the installed scikit-learn accepts missing values in
# RandomForestClassifier, or impute beforehand.
model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
model.fit(X, y)
predictions = model.predict(X_test)

# Write the submission file in the (PassengerId, Transported) format.
output = pd.DataFrame({ 'PassengerId': test_data.PassengerId, 'Transported': predictions })
output.to_csv('submission.csv', index=False)

## Model Simulation

In [None]:
# from sklearn.model_selection import cross_val_score

# # Kaggle Score Simulation
# print("\n" + "="*50)
# print("KAGGLE SCORE SIMULATION")
# print("="*50)

# # Separate features and target
# y = train_data['Transported']
# X = train_data.drop('Transported', axis=1)

# # Clean the data
# X_cleaned = clean(X.copy())

# # Handle remaining missing values and encode categorical variables
# X_cleaned = X_cleaned.fillna(X_cleaned.median(numeric_only=True))
# X_cleaned = pd.get_dummies(X_cleaned, drop_first=True)

# # Cross-validation
# model = RandomForestClassifier(n_estimators=1000, max_depth=10, random_state=42)
# cv_scores = cross_val_score(model, X_cleaned, y, cv=5, scoring='accuracy')

# print(f"\nCross-Validation Scores: {cv_scores}")
# print(f"Mean CV Score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
# print(f"\n📊 Expected Kaggle Score Range: {cv_scores.mean() - cv_scores.std():.4f} - {cv_scores.mean() + cv_scores.std():.4f}")