# Machine Learning & The Titanic
https://www.kaggle.com/c/titanic/overview

In [278]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [281]:
# Read the labelled training set from the working directory
training_data = pd.read_csv("train.csv")

# Quick overview: the column labels first, then the (rows, columns) shape
for overview in (training_data.columns, training_data.shape):
    print(overview)

# Count missing entries per column and report only the columns that have any
missing_val_count_by_column = training_data.isnull().sum()
columns_with_gaps = missing_val_count_by_column[missing_val_count_by_column > 0]
print("missing values: ", "\n", columns_with_gaps)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
(891, 12)
missing values:  
 Age         177
Cabin       687
Embarked      2
dtype: int64


In [290]:
# Predictor columns the model is trained on
features = [
    "Pclass",
    "SibSp",
    "Parch",
    "Age",
]

# Target (y) and feature matrix (x) taken from the training frame
y = training_data["Survived"]
x = training_data[features]
print(x)

     Pclass  SibSp  Parch   Age
0         3      1      0  22.0
1         1      1      0  38.0
2         3      0      0  26.0
3         1      1      0  35.0
4         3      0      0  35.0
5         3      0      0   NaN
6         1      0      0  54.0
7         3      3      1   2.0
8         3      0      2  27.0
9         2      1      0  14.0
10        3      1      1   4.0
11        1      0      0  58.0
12        3      0      0  20.0
13        3      1      5  39.0
14        3      0      0  14.0
15        2      0      0  55.0
16        3      4      1   2.0
17        2      0      0   NaN
18        3      1      0  31.0
19        3      0      0   NaN
20        2      0      0  35.0
21        2      0      0  34.0
22        3      0      0  15.0
23        1      0      0  28.0
24        3      3      1   8.0
25        3      1      5  38.0
26        3      0      0   NaN
27        1      3      2  19.0
28        3      0      0   NaN
29        3      0      0   NaN
..      

In [299]:
# Hold out part of the labelled data so candidate models can be validated
train_x, val_x, train_y, val_y = train_test_split(x, y, random_state=1)

# Imputation: learn the per-column medians on the training split only, then
# apply those same medians to the validation split
my_imputer = SimpleImputer(strategy="median")

# The imputer hands back a plain array, which loses both the column names and
# the (shuffled) row index. Restore BOTH so the imputed frames stay aligned
# with train_y / val_y for any index-based pandas operation.
imputed_x_train = pd.DataFrame(
    my_imputer.fit_transform(train_x),
    index=train_x.index,
    columns=train_x.columns,
)
imputed_x_valid = pd.DataFrame(
    my_imputer.transform(val_x),
    index=val_x.index,
    columns=val_x.columns,
)

# function to test model validity based on mean absolute error
def score_model(split_model, x_t=imputed_x_train, x_v=imputed_x_valid, y_t=train_y, y_v=val_y):
    """Fit ``split_model`` and return its mean absolute error on held-out data.

    Parameters
    ----------
    split_model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place on ``(x_t, y_t)``.
    x_t, y_t : training features / training target
    x_v, y_v : validation features / validation target

    Returns
    -------
    The value of ``mean_absolute_error(y_v, predictions on x_v)``; lower is better.
    NOTE(review): if labels and predictions are both 0/1 this equals the
    fraction of misclassified rows -- confirm the target coding.

    The default arguments are captured when this ``def`` runs, so re-run this
    cell after re-creating the split / imputed frames.
    """
    split_model.fit(x_t, y_t)
    validation_predictions = split_model.predict(x_v)
    validation_error = mean_absolute_error(y_v, validation_predictions)
    return validation_error

# Find the number of estimators that gives the lowest mean absolute error on
# the validation split. One forest is fitted per candidate size.
scores = []
n_est = range(1, 100)
for n_trees in n_est:
    candidate = RandomForestClassifier(n_estimators=n_trees, random_state=1)
    scores.append(score_model(candidate))

# np.argmin returns the position of the FIRST minimum, so ties are resolved in
# favour of the smallest forest (same choice as taking the first match of the minimum).
best_n_estimators = n_est[int(np.argmin(scores))]

# Define the model with the selected size and report its validation error
model = RandomForestClassifier(n_estimators=best_n_estimators, random_state=1)
score_model(model)

0.3183856502242152

In [300]:
# Final fit: train the selected model on every labelled row.
# Missing values are filled by my_imputer as it was fitted earlier; the array it
# returns is wrapped back into a frame carrying the original column names.
final_x = pd.DataFrame(my_imputer.transform(x), columns=x.columns)

model.fit(final_x, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=8, n_jobs=None,
            oob_score=False, random_state=1, verbose=0, warm_start=False)

In [301]:
# Predict on the test file using the fitted model
test_data = pd.read_csv("test.csv")

# Same feature columns and same imputer as used for the training data
test_x = test_data[features]
final_test_x = pd.DataFrame(my_imputer.transform(test_x), columns=test_x.columns)
test_preds = model.predict(final_test_x)

In [302]:
# Build the output table (passenger id alongside its prediction) and write it
# to disk without the pandas row index
submission_columns = {
    'PassengerId': test_data["PassengerId"],
    'Survived': test_preds,
}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)