# Libraries

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

import warnings
warnings.filterwarnings("ignore")

# Load Data

In [2]:
# Read the training and test splits from CSV files in the working directory.
train_data, test_data = (pd.read_csv(path) for path in ("train.csv", "test.csv"))

# Prepare Data

## Handle Missing Values

In [3]:
def _columns_with_missing(frame):
    """Return the names of the columns of `frame` holding at least one null."""
    has_null = frame.isnull().any()
    return frame.columns[has_null].tolist()


cols_with_missing_train = _columns_with_missing(train_data)
cols_with_missing_test = _columns_with_missing(test_data)
print('train columns with missing data:', cols_with_missing_train)
print('test columns with missing data:', cols_with_missing_test)

train columns with missing data: ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Name']
test columns with missing data: ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Name']


We fill missing values with the mean for continuous-valued features and with the mode for categorical features:

In [4]:
# Impute every column that has gaps in either split (not only those with gaps
# in the training data). The fill value is always computed from the training
# data, so the same value is used for both frames.
cols_to_fill = list(dict.fromkeys(cols_with_missing_train + cols_with_missing_test))
for col in cols_to_fill:
    train_col = train_data[col]
    # Decide by dtype instead of hard-coding 'Age', so that every numeric
    # column is treated as continuous. Booleans count as numeric in pandas
    # but are categorical here, hence the explicit exclusion.
    is_continuous = (
        pd.api.types.is_numeric_dtype(train_col)
        and not pd.api.types.is_bool_dtype(train_col)
    )
    if is_continuous:
        fill = train_col.mean()
    else:
        # Most frequent value (mode): value_counts() sorts by count, descending.
        fill = train_col.value_counts().index[0]
    train_data[col] = train_col.fillna(fill)
    # Guard for columns that only exist in the training frame.
    if col in test_data.columns:
        test_data[col] = test_data[col].fillna(fill)

## Split Cabin
We extract the deck, num and side features from the cabin string:


In [5]:
def extract_deck(s):
    """Return the first '/'-separated field of the cabin string `s`."""
    fields = s.split('/')
    return fields[0]

def extract_num(s):
    """Return the second '/'-separated field of the cabin string `s`.

    The value is returned as a string; no numeric conversion is done.
    Raises IndexError when `s` contains no '/'.
    """
    fields = s.split('/')
    return fields[1]

def extract_side(s):
    """Return the third '/'-separated field of the cabin string `s`.

    Raises IndexError when `s` has fewer than three fields.
    """
    fields = s.split('/')
    return fields[2]

# New column name -> function that pulls that part out of the Cabin string.
cabin_extractors = {
    'Deck': extract_deck,
    'Num': extract_num,
    'Side': extract_side,
}

# Add the three cabin-derived columns to both frames, in the same order.
for frame in (train_data, test_data):
    for new_col, extractor in cabin_extractors.items():
        frame[new_col] = frame['Cabin'].apply(extractor)

## One-hot encoding
We encode the following categorical features (note that this is another way to do this compared to the acceptor site prediction notebook):

In [6]:
features_cat = ['HomePlanet', 'Destination', 'Deck', 'Side']

# Encode train and test together so both frames get an identical set of dummy
# columns, even when a category occurs in only one of the two files.
n_train = len(train_data)
combined_cat = pd.concat(
    [train_data[features_cat], test_data[features_cat]], ignore_index=True
)
# One call for all features; column names are '<feature>_<category>'.
dummies = pd.get_dummies(combined_cat, columns=features_cat, prefix=features_cat)


def _attach_dummies(frame, encoded):
    """Return a copy of `frame` with the rows of `encoded` added as columns."""
    encoded = encoded.copy()
    # Rows are matched by position, so give the slice the frame's own index.
    encoded.index = frame.index
    # Removing dummy columns left by an earlier run keeps this cell re-runnable.
    base = frame.drop(columns=dummies.columns, errors='ignore')
    return pd.concat([base, encoded], axis=1)


# The model is fitted on train_data and predicts on test_data, so the dummy
# columns have to be written back to those frames; storing them only in
# `data` would leave the categorical information unused once the raw
# categorical columns are dropped.
train_data = _attach_dummies(train_data, dummies.iloc[:n_train])
test_data = _attach_dummies(test_data, dummies.iloc[n_train:])

# Combined frame (restricted to the test columns), kept under its original name.
data = pd.concat([train_data[test_data.columns], test_data])

## Drop Features

We drop the raw categorical columns together with the `Name`, `PassengerId` and `Cabin` columns:

In [7]:
# Display the list of categorical feature names defined in the encoding cell.
features_cat

['HomePlanet', 'Destination', 'Deck', 'Side']

In [8]:
# Columns to remove: the raw categorical features plus Name, PassengerId and Cabin.
features_drop = features_cat + ['Name', 'PassengerId', 'Cabin']

# Remove them in place from both frames.
for frame in (train_data, test_data):
    frame.drop(columns=features_drop, inplace=True)

# Fit model and make predictions



In [9]:
from sklearn.linear_model import LogisticRegression

# Split the training frame into target and features without mutating it.
# Popping the column from an alias of train_data would remove it from
# train_data itself, and the cell would raise a KeyError on a second run.
y = train_data['Transported']
X = train_data.drop(columns='Transported')

# Logistic regression with scikit-learn's default hyperparameters.
model = LogisticRegression()

model.fit(X, y)

LogisticRegression()

In [10]:
# Predict the target for every row of the preprocessed test frame using the
# model fitted above.
predictions = model.predict(test_data)

In [11]:
# Display the array of predicted labels.
predictions

array([ True, False,  True, ...,  True,  True,  True])

# Write submission file

In [12]:
# Load the sample submission and set its Transported column to the predictions.
# NOTE(review): assumes the rows of sample_submission.csv are in the same order
# as the rows of test.csv - confirm.
submission = pd.read_csv("sample_submission.csv").assign(Transported=predictions)
submission

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
...,...,...
4272,9266_02,True
4273,9269_01,True
4274,9271_01,True
4275,9273_01,True


In [13]:
# Write the submission to disk; index=False keeps the row index out of the CSV.
submission.to_csv('submission.csv', index=False)