# Notes
- Each PassengerId takes the form gggg_pp, where gggg identifies the group the passenger is travelling with and pp is their number within that group. A suffix of _02 or higher means the passenger belongs to a group with more than one traveller.
- Cabin takes the form deck/num/side
- RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - Monetary values: the amount the passenger spent on each amenity


Machine Learning Task
- Predict Transported(True/False) (Binary Classification)

# Setup the Dependencies

In [None]:
!pip install kaggle

In [None]:
!mkdir  ~/.kaggle

In [None]:
!cp kaggle.json ~/.kaggle/kaggle.json

In [None]:
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
!kaggle competitions download -c spaceship-titanic

In [None]:
!unzip spaceship-titanic.zip

# Load Data and EDA

In [None]:
!pip install ydata-profiling

In [None]:
import pandas as pd
from ydata_profiling import ProfileReport

In [None]:
df = pd.read_csv('train.csv')
df.head()

In [None]:
df.info()

In [None]:
profile = ProfileReport(df=df, title='Spaceship Titanic Report')

In [None]:
profile.to_notebook_iframe()

In [None]:
profile.to_file('Spaceship_Titanic_Report.html')

In [None]:
def split_cabin(x):
  """Split a cabin value into its '/'-separated parts.

  The value is converted to a string first, so non-string inputs such as
  NaN are accepted. If the split yields fewer than three parts, a
  three-element list of 'Missing' placeholders is returned instead.
  """
  parts = str(x).split('/')
  return parts if len(parts) >= 3 else ['Missing'] * 3

In [None]:
df.columns

In [None]:
# Create a preprocessing function to transform our dataset
def preprocessing(df):
  """Clean a raw passenger DataFrame in place.

  Fills missing values, derives the ``Deck`` and ``Side`` columns from
  ``Cabin``, and drops the ``Cabin`` and ``Name`` columns. The frame that is
  passed in is modified directly; nothing is returned.

  Args:
    df: DataFrame containing the columns HomePlanet, CryoSleep, Cabin,
      Destination, Age, VIP, RoomService, FoodCourt, ShoppingMall, Spa,
      VRDeck and Name.
  """
  # NOTE: every fill is assigned back to the column instead of calling
  # ``df[col].fillna(..., inplace=True)``. That chained form operates on a
  # temporary column object; pandas warns about it and, with Copy-on-Write
  # enabled, it leaves ``df`` unchanged.

  # Categorical columns - keep missing values as their own 'Missing' category
  for col in ('HomePlanet', 'CryoSleep', 'Destination', 'VIP'):
    df[col] = df[col].fillna('Missing')

  # Cabin preprocessing - extract Deck and Side (the cabin number is not kept)
  cabin_parts = df['Cabin'].apply(split_cabin)
  df['Deck'] = cabin_parts.apply(lambda parts: parts[0])
  df['Side'] = cabin_parts.apply(lambda parts: parts[2])

  # Age - impute with the mean of the frame being processed
  df['Age'] = df['Age'].fillna(df['Age'].mean())

  # Monetary spending columns - a missing amount is treated as no spending
  for col in ('RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'):
    df[col] = df[col].fillna(0)

  # Drop the raw cabin string (replaced by Deck/Side) and Name (high cardinality)
  df.drop(columns=['Cabin', 'Name'], inplace=True)

In [None]:
analytical_base_table = df.copy()

In [None]:
df.shape

In [None]:
preprocessing(analytical_base_table)

In [None]:
analytical_base_table.shape

In [None]:
analytical_base_table.head()

In [None]:
analytical_base_table.info()

In [None]:
# Unique Value
abt = analytical_base_table.drop(columns=['PassengerId', 'Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], axis=1)
for i in abt:
  print(f"Unique value of {i} : {abt[i].unique()}")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Check whether dataset balanced or not
sns.countplot(x='Transported', data=df)
plt.show()

# Modelling
- Feature and target values - X, y
- One-hot encode any categorical columns
- Train/holdout split
- Train a bunch of algorithms

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
# Feature columns
X = analytical_base_table.drop(columns=['Transported', 'PassengerId'], axis=1)

# Target column
y = analytical_base_table['Transported']

In [None]:
# One hot encoding
X = pd.get_dummies(X)

In [None]:
X.columns

In [None]:
# Train & Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train.head()

# Setup ML Pipeline

In [None]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [None]:
# One pipeline per candidate algorithm, keyed by a short name. Each pipeline
# gets its own StandardScaler step followed by the classifier.
pipelines = {
    algo_name: make_pipeline(StandardScaler(), classifier)
    for algo_name, classifier in [
        ('rf', RandomForestClassifier(random_state=1234)),
        ('gb', GradientBoostingClassifier(random_state=1234)),
        ('svm', SVC()),
        ('logistic', LogisticRegression()),
    ]
}

In [None]:
LogisticRegression().get_params()

In [None]:
for algo, pipeline in pipelines.items():
  print(f"Algorithm : {algo} and Pipeline : {pipeline}")

In [None]:
# Hyperparameter search space for each pipeline, keyed by the same short names
# used in `pipelines`. Parameter names follow scikit-learn's
# `<step name>__<parameter>` convention, where make_pipeline names each step
# after its lowercased class name.
grid = {
    # Random forest: number of trees and features considered per split
    'rf': {
        'randomforestclassifier__n_estimators': [100, 200, 300, 400],
        'randomforestclassifier__max_features': ["sqrt", "log2"]
    },
    # Gradient boosting: number of stages, split criterion and learning rate
    'gb': {
        'gradientboostingclassifier__n_estimators': [100, 200, 300],
        'gradientboostingclassifier__criterion': ["friedman_mse", "squared_error"],
        'gradientboostingclassifier__learning_rate': [0.1, 0.01, 0.001]
    },
    # SVM: kernel type and kernel coefficient
    'svm': {
        'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'svc__gamma': ['scale', 'auto']
    },
    # Logistic regression: 20 values of C log-spaced from 1e-4 to 1e4, and
    # the iteration cap for the solver
    'logistic': {
        'logisticregression__C' : np.logspace(-4, 4, 20),
        'logisticregression__max_iter': [100, 1000, 2500, 5000]
    }
}

In [None]:
pipelines['rf']

In [None]:
for algo, pipeline in pipelines.items():
  print(f"Algorithm : {algo} and Pipeline : {pipeline}")

In [None]:
# Fitted grid searches, keyed by algorithm name
fit_models = {}

# Run a 10-fold cross-validated grid search for every pipeline
for algo, pipeline in pipelines.items():
  print(f"Training the {algo} model.")

  # Search this algorithm's parameter grid using all available cores
  model = GridSearchCV(estimator=pipeline, param_grid=grid[algo], n_jobs=-1, cv=10)

  # fit() returns the fitted search object itself, so store it directly
  fit_models[algo] = model.fit(X_train, y_train)

# Evaluate Performance on Test Partition
- Evaluate each fitted model on the holdout partition (X_test, y_test) that was split off from train.csv

In [None]:
for algo, model in fit_models.items():
  print(algo,'\n',model)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [None]:
# Score every fitted model on the holdout partition
for algo, model in fit_models.items():
  yhat = model.predict(X_test)
  # Apply each metric to the same (truth, prediction) pair
  accuracy, precision, recall = [
      score(y_test, yhat)
      for score in (accuracy_score, precision_score, recall_score)
  ]
  print(f"Metrics for {algo}: Accuracy={accuracy}, Precision={precision}, Recall={recall}")

# Save Best Model

In [None]:
import pickle

In [None]:
with open('gradientBoosted.pkl', 'wb') as f:
  pickle.dump(fit_models['gb'], f)

In [None]:
with open('gradientBoosted.pkl', 'rb') as f:
  reloaded_model = pickle.load(f)

In [None]:
reloaded_model

# Predict on Test Data

In [None]:
# Read test.csv
test_df = pd.read_csv('test.csv')

In [None]:
# Deep Copy
abt_test = test_df.copy()

In [None]:
# Preprocessing
preprocessing(abt_test)

In [None]:
print(test_df.shape, abt_test.shape)

In [None]:
# One hot encoding for categorical variables
abt_test = pd.get_dummies(abt_test.drop('PassengerId', axis=1))

# Align with the training feature matrix. get_dummies only creates columns for
# the categories present in the frame it is given, so the test columns can
# differ from the training ones in both set and order. Add any missing
# indicator columns as 0, drop unseen ones, and match the column order the
# pipelines were fitted on.
abt_test = abt_test.reindex(columns=X.columns, fill_value=0)

In [None]:
len(test_df.columns), len(abt_test.columns)

In [None]:
yhat_test = fit_models['gb'].predict(abt_test)

In [None]:
# Build the submission column-wise so each column keeps its own dtype
# (stacking the two sequences as rows and transposing yields object columns).
submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Transported': yhat_test,
})
submission

# Submit to Kaggle

In [None]:
submission.to_csv("kaggle_submission.csv", index=False)

In [None]:
!kaggle competitions submit -c spaceship-titanic -m "initial gb model" -f "kaggle_submission.csv"