In [None]:
# Disabled one-off preprocessing cell (kept for reference only).
# It loads train.csv, takes the first word of each passenger's Name, asks a
# zero-shot classifier to label that first name "male" or "female", stores the
# label in a "Gender" column and writes the result to train_augmented.csv.
# Rows whose Name is not a string get Gender = None.
# NOTE: to_csv is called without index=False, so the row index is written as an
# extra leading column of the augmented file.
# df = pd.read_csv("../data/spaceship-titanic/train.csv")
# df['FirstName'] = df['Name'].apply(lambda name: name.split(' ')[0] if type(name) == str else None)

# from transformers import pipeline
# from tqdm import tqdm

# tqdm.pandas()

# classifier = pipeline("zero-shot-classification", model="valhalla/distilbart-mnli-12-1", device="mps")
# def predict_gender(name):
#     result = None
#     if name is not None:
#         result = classifier(name, ["male", "female"])['labels'][0]
#     return result

# df['Gender'] = df['FirstName'].progress_apply(predict_gender)

# print(df['Gender'])

# df = df.drop(columns=['FirstName'])
# df.to_csv('../data/spaceship-titanic/train_augmented.csv')

In [None]:
# Disabled one-off preprocessing cell (kept for reference only).
# Same procedure as the train augmentation, applied to test.csv: the first word
# of each Name is labelled "male" or "female" by a zero-shot classifier, stored
# in a "Gender" column, and the result is written to test_augmented.csv.
# Rows whose Name is not a string get Gender = None.
# NOTE: to_csv is called without index=False, so the row index is written as an
# extra leading column of the augmented file.
# df = pd.read_csv("../data/spaceship-titanic/test.csv")
# df['FirstName'] = df['Name'].apply(lambda name: name.split(' ')[0] if type(name) == str else None)

# from transformers import pipeline
# from tqdm import tqdm

# tqdm.pandas()

# classifier = pipeline("zero-shot-classification", model="valhalla/distilbart-mnli-12-1", device="mps")
# def predict_gender(name):
#     result = None
#     if name is not None:
#         result = classifier(name, ["male", "female"])['labels'][0]
#     return result

# df['Gender'] = df['FirstName'].progress_apply(predict_gender)

# print(df['Gender'])

# df = df.drop(columns=['FirstName'])
# df.to_csv('../data/spaceship-titanic/test_augmented.csv')

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, TensorDataset
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer


In [None]:
def _to_bool_imputed(series):
    """Cast a boolean-like column to ``bool`` after filling missing entries.

    A plain ``astype(bool)`` converts NaN to ``True``. Here missing entries
    receive the most frequent observed value instead, or ``False`` when the
    column has no observed value at all.
    """
    observed = series.dropna().astype(bool)
    fill_value = bool(observed.mode().iloc[0]) if not observed.empty else False
    # The cast happens first so that ``where`` works on a real bool column.
    return series.astype(bool).where(series.notna(), fill_value)


def prepare_df(df):
    """Engineer model features from a Spaceship Titanic passenger frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``PassengerId``, ``Cabin``, ``HomePlanet``,
        ``Destination``, ``CryoSleep``, ``VIP``, ``Name`` and the spend columns
        ``RoomService``, ``FoodCourt``, ``ShoppingMall``, ``Spa``, ``VRDeck``.
        ``Gender`` is one-hot encoded when present. Other columns (for example
        ``Age`` or ``Transported``) pass through. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame in which:

        * ``Cabin`` ("deck/number/side") is replaced by one-hot ``CabinDeck_*``
          and ``CabinSide_*`` columns; the cabin number is discarded.
        * ``PassengerId`` ("group_number") is kept and ``PassengerNumber`` is
          added as a float.
        * ``Single`` / ``Couple`` / ``SmallGroup`` / ``LargeGroup`` flag the
          size of the passenger's group within this frame
          (1 / 2 / 3-5 / 6 or more).
        * ``HomePlanet``, ``Destination`` (and ``Gender``) are one-hot encoded.
          Only categories present in this frame produce columns.
        * ``CryoSleep`` and ``VIP`` are ``bool`` with missing values filled by
          the most frequent value.
        * ``Poor`` / ``Average`` / ``Rich`` bucket total spend against the
          25th percentile and ``Q3 + 2 * IQR``.
        * ``Name`` is dropped and missing values in float columns are replaced
          by the column median.
    """
    # Work on a copy so the caller's frame is never modified.
    df = df.copy()

    # A CSV saved with its index and read back without ``index_col`` carries
    # the old row numbers as an "Unnamed: 0" column; it must not be a feature.
    df = df.drop(columns=[c for c in df.columns if str(c).startswith("Unnamed:")])

    # --- Cabin: "deck/number/side" -------------------------------------
    cabin = pd.DataFrame(
        [c.split("/") if isinstance(c, str) else [None, None, None] for c in df["Cabin"]],
        index=df.index,
        columns=["CabinDeck", "CabinNumber", "CabinSide"],
    )
    df = df.drop(columns=["Cabin"])
    df["CabinDeck"] = cabin["CabinDeck"]
    df["CabinSide"] = cabin["CabinSide"]

    # --- PassengerId: "group_number" -----------------------------------
    ids = pd.DataFrame(
        [
            [float(part) for part in pid.split("_")] if isinstance(pid, str) else [None, None]
            for pid in df["PassengerId"]
        ],
        index=df.index,
        columns=["PassengerGroup", "PassengerNumber"],
    ).astype(float)
    df["PassengerNumber"] = ids["PassengerNumber"]

    # Group size = number of rows sharing the same group id. Rows without a
    # parsable id get NaN, which compares False in every bucket below.
    group_size = ids["PassengerGroup"].map(ids["PassengerGroup"].value_counts())
    df["Single"] = group_size == 1
    df["Couple"] = group_size == 2
    df["SmallGroup"] = group_size.between(3, 5)
    df["LargeGroup"] = group_size >= 6

    # --- One-hot encode the categorical columns -----------------------
    for column in ("HomePlanet", "Destination", "CabinDeck", "CabinSide", "Gender"):
        # "Gender" only exists in the augmented CSVs; tolerate its absence.
        if column == "Gender" and column not in df.columns:
            continue
        dummies = pd.get_dummies(df[column], prefix=column)
        df = pd.concat([df.drop(columns=[column]), dummies], axis=1)

    # --- Boolean flags: impute before casting (NaN would become True) --
    for column in ("CryoSleep", "VIP"):
        df[column] = _to_bool_imputed(df[column])

    # --- Spending buckets ----------------------------------------------
    # ``sum`` skips NaN, so a missing amount counts as zero spend.
    spends = df[["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]].sum(axis=1)
    q25 = spends.quantile(q=0.25)
    q75 = spends.quantile(q=0.75)
    rich_threshold = q75 + 2 * (q75 - q25)
    # Inclusive bounds make the three buckets cover every passenger.
    df["Rich"] = spends > rich_threshold
    df["Poor"] = spends <= q25
    df["Average"] = (spends > q25) & (spends <= rich_threshold)

    df = df.drop(columns=["Name"])

    # --- Median-impute whatever is still missing in float columns ------
    for column in df.columns:
        if pd.api.types.is_float_dtype(df[column]) and df[column].isna().any():
            df[column] = df[column].fillna(df[column].median())

    return df

In [None]:
def prepare_X_y(X, y=None, scaler=None):
    """Standardise the float columns of ``X`` and convert ``X``/``y`` to tensors.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame. Columns of dtype ``float64`` are standardised; all
        columns are then cast to float. The caller's frame is not modified.
    y : pandas.Series, optional
        Target values; converted to a ``float32`` tensor when given.
    scaler : object with a ``transform`` method, optional
        An already fitted scaler (for example a ``StandardScaler`` fitted on
        the training features). When omitted, a new ``StandardScaler`` is
        fitted on ``X`` itself, so each call is scaled with its own statistics.

    Returns
    -------
    tuple
        ``(X_tensor, y_tensor)``; ``y_tensor`` is ``None`` when ``y`` is None.
    """
    # Copy first: the scaled values are written back with ``iloc`` and must not
    # leak into the frame owned by the caller.
    X = X.copy()
    float_mask = (X.dtypes == float).values

    # Skip scaling when there is nothing to scale (the scaler would reject an
    # input with zero columns).
    if float_mask.any():
        float_block = X.iloc[:, float_mask]
        if scaler is None:
            scaled = StandardScaler().fit_transform(float_block)
        else:
            scaled = scaler.transform(float_block)
        X.iloc[:, float_mask] = scaled
    X = X.astype(float)

    # Convert data to PyTorch tensors
    X_tensor = torch.tensor(X.values, dtype=torch.float32)
    y_tensor = None
    if y is not None:
        y_tensor = torch.tensor(y.values, dtype=torch.float32)

    return X_tensor, y_tensor

In [None]:
# Fix the random seeds so the train/hold-out split, the weight initialisation
# and the batch shuffling are repeatable across kernel restarts.
SEED = 42
torch.manual_seed(SEED)

# The augmented file is train.csv plus the "Gender" column that prepare_df
# one-hot encodes; the raw file is kept here for reference.
# df = pd.read_csv("../data/spaceship-titanic/train.csv")
df = pd.read_csv("../data/spaceship-titanic/train_augmented.csv")

df = prepare_df(df)

# Hold out 10% of the rows for monitoring; PassengerId is an identifier and
# Transported is the target, so neither is a feature.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=["Transported", "PassengerId"]),
    df["Transported"],
    train_size=0.9,
    random_state=SEED,
)

# NOTE: prepare_X_y is called separately for each split, so each call
# standardises the float columns using that split's own statistics.
X_train_tensor, y_train_tensor = prepare_X_y(X=X_train, y=y_train)
X_test_tensor, y_test_tensor = prepare_X_y(X=X_test, y=y_test)

# Create DataLoader for batch processing
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

class BinaryClassifier(nn.Module):
    """Small fully connected network that outputs one probability per sample.

    Architecture: ``input_size -> 64 -> 32 -> 1``. Both hidden layers use ReLU
    followed by dropout (p=0.5); the final layer is passed through a sigmoid,
    so ``forward`` returns values in (0, 1) with shape ``(batch, 1)``.
    """

    def __init__(self, input_size):
        super().__init__()
        wide, narrow = 64, 32
        self.fc1 = nn.Linear(input_size, wide)
        self.fc2 = nn.Linear(wide, narrow)
        self.fc3 = nn.Linear(narrow, 1)
        self.dropout = nn.Dropout(0.5)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid output of the network for the batch ``x``."""
        hidden = x
        # Each hidden layer: linear -> ReLU -> dropout.
        for layer in (self.fc1, self.fc2):
            hidden = self.dropout(torch.relu(layer(hidden)))
        return self.sigmoid(self.fc3(hidden))

# Initialize the model; its input width is the number of feature columns.
input_size = X_train.shape[1]
model = BinaryClassifier(input_size)

# Define loss function and optimizer
criterion = nn.BCELoss()  # Binary Cross Entropy Loss (expects probabilities)
optimizer = optim.AdamW(model.parameters(), lr=1e-3)

# The learning rate is multiplied by 0.9 after every epoch.
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)  # Decay rate of 0.9

# Training the model
num_epochs = 100

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for inputs, labels in train_loader:
        optimizer.zero_grad()
        # squeeze(-1) removes only the trailing size-1 output dimension. A bare
        # squeeze() would also collapse a final batch of one sample to a 0-d
        # tensor, which BCELoss rejects against the 1-element label tensor.
        outputs = model(inputs).squeeze(-1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    scheduler.step()

    # Mean of the per-batch losses for this epoch.
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(train_loader):.4f}')

    # Evaluate the model on the held-out split (dropout disabled, no gradients).
    model.eval()
    with torch.no_grad():
        outputs = model(X_test_tensor).squeeze(-1)
        predictions = (outputs >= 0.5).float()
        accuracy = (predictions == y_test_tensor).float().mean()
        print(f'Test Accuracy: {accuracy:.4f}')

In [None]:
# Retrain from scratch on every labelled row (train + hold-out) so the model
# used for the submission has seen all available data.
X_tensor, y_tensor = prepare_X_y(df.drop(columns=["Transported", "PassengerId"]), df["Transported"])
X_dataset = TensorDataset(X_tensor, y_tensor)
X_loader = DataLoader(X_dataset, batch_size=32, shuffle=True)

input_size = X_tensor.shape[1]
model = BinaryClassifier(input_size)

# Define loss function and optimizer
criterion = nn.BCELoss()  # Binary Cross Entropy Loss (expects probabilities)
optimizer = optim.Adam(model.parameters(), lr=3e-2)

# The learning rate is multiplied by 0.9 after every epoch.
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)  # Decay rate of 0.9

# Training the model
num_epochs = 100

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for inputs, labels in X_loader:
        optimizer.zero_grad()
        # squeeze(-1) removes only the trailing size-1 output dimension, so a
        # final batch of one sample keeps the same shape as its labels.
        outputs = model(inputs).squeeze(-1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    scheduler.step()

    # Average over the batches of the loader actually iterated above.
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(X_loader):.4f}')

    # Evaluate the model
    # NOTE: X_test_tensor comes from the earlier hold-out split, whose rows are
    # part of the data this model trains on, so this figure is a sanity check
    # rather than an out-of-sample estimate.
    model.eval()
    with torch.no_grad():
        outputs = model(X_test_tensor).squeeze(-1)
        predictions = (outputs >= 0.5).float()
        accuracy = (predictions == y_test_tensor).float().mean()
        print(f'Test Accuracy: {accuracy:.4f}')

In [None]:
# Build the competition submission from the augmented test set.
df_comp = pd.read_csv("../data/spaceship-titanic/test_augmented.csv")
df_comp = prepare_df(df_comp)

passenger_id = df_comp.PassengerId.values

# One-hot columns are derived from the categories present in each file, so
# align the test features to the exact training columns (same set, same
# order). A category missing from the test file becomes an all-zero column and
# a category unseen in training is dropped.
feature_columns = df.drop(columns=["Transported", "PassengerId"]).columns
X_comp = df_comp.drop(columns=["PassengerId"]).reindex(columns=feature_columns, fill_value=0)

X_tensor, _ = prepare_X_y(X=X_comp)

model.eval()
with torch.no_grad():
    # squeeze(-1) keeps a 1-D result even when the file has a single row.
    outputs = model(X_tensor).squeeze(-1)
    predictions = (outputs >= 0.5).float()

# Build the frame column by column so ids stay strings and labels stay bool.
df_sub = pd.DataFrame({
    "PassengerId": passenger_id,
    "Transported": predictions.numpy().astype(bool),
})
df_sub.to_csv("../data/spaceship-titanic/submission.csv", index=False)