In [358]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk /kaggle/input and print the full path of every file found,
# so the available data files are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Note this is a pure classification task. There are many different options out there, as listed below:
* Neural Networks (Cross-Entropy would work well here)
* Clustering (KMeans, DBSCAN, Hierarchical, GMM, etc.)
* Ensembles with SOTA
* Etc, etc.

For this notebook, because I'm currently learning more about basic ML, I will try using a decision tree and evaluate performance. 

First, I'm going to analyze the data to see if there are any issues with it

In [359]:
# Load the training split and take a first look at it:
# summary statistics, the first rows, and the overall (rows, columns) shape.
df = pd.read_csv("../input/titanic/train.csv")

display(df.describe(), df.head())
print(df.shape)

In [360]:
# Split the column labels into two groups by dtype: columns stored as
# 'object' are treated as categorical, everything else as numeric.
is_object_col = df.dtypes == 'object'
num_vars = df.columns[~is_object_col]
cat_vars = df.columns[is_object_col]

print(num_vars)
print(cat_vars)

In [361]:
# Fraction of missing values per numeric column, largest first.
df.loc[:, num_vars].isnull().sum().sort_values(ascending=False).div(len(df))

In [362]:
# Fraction of missing values per categorical (object) column, largest first.
df.loc[:, cat_vars].isnull().sum().sort_values(ascending=False).div(len(df))

Evidently, most of the data is present. The `Cabin` column, however, is missing a large share of its values, so we will remove it. We will keep `Age`, since about 80% of the rows have a value, and for now we will settle for filling the remaining 20% with the mean of the existing values. For `Embarked`, we will simply fill the missing entries with the most common letter in the column.

In [363]:
def _fill_with_mean(col):
    """Return `col` with its missing entries replaced by the column mean."""
    return col.fillna(col.mean())


def _fill_with_mode(col):
    """Return `col` with its missing entries replaced by its most frequent value."""
    return col.fillna(col.mode()[0])


# Numeric gaps are imputed first (using the column groups computed earlier),
# then the columns we do not model on are removed.
df[num_vars] = df[num_vars].apply(_fill_with_mean)
df = df.drop(columns=['Cabin', 'PassengerId'])

# The column set changed, so rebuild both groups before touching the
# categorical columns.
is_object_col = df.dtypes == 'object'
num_vars = df.columns[~is_object_col]
cat_vars = df.columns[is_object_col]
df[cat_vars] = df[cat_vars].apply(_fill_with_mode)

# Sanity check: remaining fraction of missing values per column, shown for the
# numeric group first and then the categorical group.
display(*(
    df[cols].isnull().sum().sort_values(ascending=False)/len(df)
    for cols in (num_vars, cat_vars)
))

# One-hot encode the three categorical features; drop_first removes one level
# per feature so the encoded columns are not redundant.
sex, embark, pclass = (
    pd.get_dummies(df[name], drop_first=True)
    for name in ('Sex', 'Embarked', 'Pclass')
)

# Append the encoded columns, then discard their sources together with the
# free-text columns that are not used as features.
df = pd.concat([df, sex, embark, pclass], axis=1).drop(
    columns=['Sex', 'Embarked', 'Pclass', "Ticket", "Name"]
)

df.head()

Now that the data has been cleaned, we can train our tree-based classifier (the code below fits a random forest, which is an ensemble of decision trees).

In [364]:
# Convert the cleaned frame into one numeric matrix.
# The dtype is given explicitly because pd.get_dummies produces boolean columns
# on pandas >= 2.0; mixing those with the int/float columns would otherwise
# make to_numpy() fall back to an object array, so the target column sliced
# from it would no longer be numeric.
dataset = df.to_numpy(dtype=float)
print(dataset.shape)

In [365]:
# Column 0 of the matrix is the target; every remaining column is a feature.
Y, X = dataset[:, 0], dataset[:, 1:]

In [366]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import MiniBatchKMeans

# Fit the forest on the training matrix. A fixed seed makes the fitted model,
# and therefore the submission file, reproducible across runs.
clf = RandomForestClassifier(random_state=42)
clf = clf.fit(X, Y)

test_df = pd.read_csv("../input/titanic/test.csv")
# Keep the real ids for the submission before the column is dropped, instead of
# assuming they start at a hard-coded value and increase by one.
passenger_ids = test_df['PassengerId'].to_numpy()
test_df = test_df.drop(columns=['Cabin', 'PassengerId'])
num_vars = test_df.columns[test_df.dtypes != 'object']
cat_vars = test_df.columns[test_df.dtypes == 'object']
# Same imputation rules as the training cell: column mean for numeric gaps,
# most frequent value for categorical gaps.
test_df[num_vars] = test_df[num_vars].apply(lambda col: col.fillna(col.mean()))
test_df[cat_vars] = test_df[cat_vars].apply(lambda col: col.fillna(col.mode()[0]))
# Missing ratios are relative to the test frame's own length, not the train frame's.
display(test_df[num_vars].isnull().sum().sort_values(ascending=False)/len(test_df))
display(test_df[cat_vars].isnull().sum().sort_values(ascending=False)/len(test_df))

sex = pd.get_dummies(test_df['Sex'], drop_first=True)
embark = pd.get_dummies(test_df['Embarked'], drop_first=True)
pclass = pd.get_dummies(test_df['Pclass'], drop_first=True)

test_df = pd.concat([test_df, sex, embark, pclass], axis=1)
test_df = test_df.drop(columns=['Sex', 'Embarked', 'Pclass', "Ticket", "Name"])

# Align the test features with the training features. X was built from every
# column of the cleaned training frame except the first (the target), so the
# same labels in the same order are required here. A dummy column absent from
# the test split is added as all zeros; an unexpected extra column is dropped.
feature_columns = df.columns[1:]
test_df = test_df.reindex(columns=feature_columns, fill_value=0)
display(test_df.head())

# Explicit float dtype: boolean dummy columns would otherwise turn the matrix
# into an object array on pandas >= 2.0.
test_X = test_df.to_numpy(dtype=float)
predictions = clf.predict(test_X)

# Build the submission in one step from the preserved ids and integer labels.
output = pd.DataFrame({
    "PassengerId": passenger_ids,
    "Survived": predictions.astype(int),
})
output.to_csv('predictions_fixed.csv', index=False)
