# Machine Learning & The Titanic
https://www.kaggle.com/c/titanic/overview

In [14]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder


In [45]:
# load the Kaggle training set
training_data = pd.read_csv("train.csv")

# quick look at the schema
print(training_data.columns)

# columns stored with the generic object dtype
object_cols = [
    name for name in training_data.columns
    if training_data[name].dtype == "object"
]

# null count per column; only the columns with at least one null are printed
missing_val_count_by_column = training_data.isnull().sum()
print(
    "missing values: ", "\n",
    missing_val_count_by_column[missing_val_count_by_column > 0], "\n",
    "object cols: ", "\n",
    object_cols,
)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
missing values:  
 Age         177
Cabin       687
Embarked      2
dtype: int64 
 object cols:  
 ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']


In [50]:
# columns used as model inputs
features = ["Pclass", "SibSp", "Parch", "Age", "Sex", "Fare"]

# split the frame into the predictor columns and the target column
y = training_data["Survived"]
x = training_data.loc[:, features]
print(x)

     Pclass  SibSp  Parch   Age     Sex      Fare
0         3      1      0  22.0    male    7.2500
1         1      1      0  38.0  female   71.2833
2         3      0      0  26.0  female    7.9250
3         1      1      0  35.0  female   53.1000
4         3      0      0  35.0    male    8.0500
5         3      0      0   NaN    male    8.4583
6         1      0      0  54.0    male   51.8625
7         3      3      1   2.0    male   21.0750
8         3      0      2  27.0  female   11.1333
9         2      1      0  14.0  female   30.0708
10        3      1      1   4.0  female   16.7000
11        1      0      0  58.0  female   26.5500
12        3      0      0  20.0    male    8.0500
13        3      1      5  39.0    male   31.2750
14        3      0      0  14.0  female    7.8542
15        2      0      0  55.0  female   16.0000
16        3      4      1   2.0    male   29.1250
17        2      0      0   NaN    male   13.0000
18        3      1      0  31.0  female   18.0000


In [53]:
# preprocessing data

def score_dataset(X_train, X_valid, y_train, y_valid):
    """Return the validation mean absolute error of a small random forest.

    A RandomForestRegressor with 10 trees and random_state=1 is fitted on
    (X_train, y_train); its predictions for X_valid are compared with y_valid.
    """
    forest = RandomForestRegressor(n_estimators=10, random_state=1)
    forest.fit(X_train, y_train)
    return mean_absolute_error(y_valid, forest.predict(X_valid))

train_x, val_x, train_y, val_y = train_test_split(x, y, random_state = 1)

# accounting for categorical variables-------------------------------------------------------------------------
# Categorical columns are detected on the feature frame itself. `object_cols` was
# built from the full CSV and also lists Name, Ticket, Cabin and Embarked, which
# are not among the selected features, so it must not be used to index or drop here.
split_object_cols = [col for col in train_x.columns if train_x[col].dtype == "object"]

# Apply one-hot encoder to each column with categorical data.
# The encoder is left at its default (sparse) output and densified with .toarray(),
# because the keyword that requests dense output was renamed between scikit-learn releases.
OH_encoder = OneHotEncoder(handle_unknown='ignore')
OH_train_values = OH_encoder.fit_transform(train_x[split_object_cols]).toarray()
OH_valid_values = OH_encoder.transform(val_x[split_object_cols]).toarray()

# Name every encoded column "<feature>_<category>" so that all column labels are
# strings; a frame mixing string and integer labels is rejected by newer scikit-learn.
OH_col_names = ["{}_{}".format(col, cat)
                for col, cats in zip(split_object_cols, OH_encoder.categories_)
                for cat in cats]

# One-hot encoding returns a bare array; rebuild frames on the original row index
# so the concat below aligns row by row
OH_cols_train = pd.DataFrame(OH_train_values, index=train_x.index, columns=OH_col_names)
OH_cols_valid = pd.DataFrame(OH_valid_values, index=val_x.index, columns=OH_col_names)

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = train_x.drop(split_object_cols, axis=1)
num_X_valid = val_x.drop(split_object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

# accounting for missing values-----------------------------------------------------------------------------------
# Medians are learned from the training split only and then applied to the validation split
my_imputer = SimpleImputer(strategy = "median")
final_x_train = pd.DataFrame(my_imputer.fit_transform(OH_X_train))
final_x_valid = pd.DataFrame(my_imputer.transform(OH_X_valid))

score_dataset(final_x_train, final_x_valid, train_y, val_y)

0.26171419805025714

In [52]:
# testing model

def score_model(split_model, x_t=None, x_v=None, y_t=None, y_v=None):
    """Fit ``split_model`` on a training split and return its validation MAE.

    Parameters
    ----------
    split_model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
        It is fitted in place.
    x_t, x_v, y_t, y_v : training features, validation features, training
        target and validation target. Any argument left as ``None`` falls back
        to the notebook-level ``final_x_train``, ``final_x_valid``, ``train_y``
        or ``val_y`` respectively.

    Returns
    -------
    The mean absolute error between ``y_v`` and the model's predictions for ``x_v``.

    The fallbacks are looked up on every call instead of being written as
    default values in the signature: defaults are evaluated once, when the
    ``def`` runs, so they would keep pointing at stale splits after the
    preprocessing cell is re-run.
    """
    x_t = final_x_train if x_t is None else x_t
    x_v = final_x_valid if x_v is None else x_v
    y_t = train_y if y_t is None else y_t
    y_v = val_y if y_v is None else y_v

    split_model.fit(x_t, y_t)
    split_preds = split_model.predict(x_v)
    return mean_absolute_error(y_v, split_preds)

# sweep the forest size and keep the value with the lowest mean absolute error on the split data
n_est = range(1,100)
scores = [
    score_model(RandomForestClassifier(n_estimators=n_trees, random_state=1))
    for n_trees in n_est
]

# argmin returns the first position of the minimum, i.e. the smallest forest among ties
best_n_estimators = n_est[int(np.argmin(scores))]

# build the classifier with the selected size and report its score on the split
model = RandomForestClassifier(n_estimators=best_n_estimators, random_state=1)
score_model(model)

0.20179372197309417

In [56]:
# actual model fit
# Categorical columns are detected on `x` itself. `object_cols` was computed on the
# full CSV and lists columns that `x` does not contain, so indexing or dropping with
# it raises a KeyError.
x_object_cols = [col for col in x.columns if x[col].dtype == "object"]

# Refit the encoder on every training row. Default (sparse) output is densified with
# .toarray(), which avoids the dense-output keyword that was renamed between
# scikit-learn releases.
OH_encoder = OneHotEncoder(handle_unknown='ignore')
OH_values = OH_encoder.fit_transform(x[x_object_cols]).toarray()

# "<feature>_<category>" labels keep every column name a string; newer scikit-learn
# rejects frames whose labels mix strings and integers.
OH_names = ["{}_{}".format(col, cat)
            for col, cats in zip(x_object_cols, OH_encoder.categories_)
            for cat in cats]
OH_cols = pd.DataFrame(OH_values, index=x.index, columns=OH_names)

num_X = x.drop(x_object_cols, axis=1)
OH_X = pd.concat([num_X, OH_cols], axis=1)

# my_imputer is only applied here, not refitted: missing values are filled with the
# medians it learned when it was fitted earlier in the notebook
final_x = pd.DataFrame(my_imputer.transform(OH_X))

model.fit(final_x, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=None,
            oob_score=False, random_state=1, verbose=0, warm_start=False)

In [61]:
# using model to make predictions off of test data
test_data = pd.read_csv("test.csv")
x_test = test_data[features]

# The categorical columns are taken from the training features `x`, so the test
# frame is encoded on exactly the columns the encoder was fitted on. (`object_cols`
# lists columns of the full CSV that are not among the features.)
test_object_cols = [col for col in x.columns if x[col].dtype == "object"]

# Reuse the encoder fitted on the training data instead of fitting a new one on the
# test set: a fresh fit derives its categories from the test rows, so the encoded
# column layout is not guaranteed to match what the model was trained on.
OH_test_values = OH_encoder.transform(x_test[test_object_cols])
if hasattr(OH_test_values, "toarray"):
    # the encoder may be configured for sparse output; DataFrame needs a dense array
    OH_test_values = OH_test_values.toarray()

# same "<feature>_<category>" labels, in the encoder's own category order
OH_test_names = ["{}_{}".format(col, cat)
                 for col, cats in zip(test_object_cols, OH_encoder.categories_)
                 for cat in cats]
OH_cols_test = pd.DataFrame(OH_test_values, index=x_test.index, columns=OH_test_names)

num_X_test = x_test.drop(test_object_cols, axis=1)
OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)

# kept under its own name so the training matrix `final_x` is not overwritten
final_x_test = pd.DataFrame(my_imputer.transform(OH_X_test))

test_preds = model.predict(final_x_test)

In [62]:
# pair each test passenger id with its predicted label and write the submission CSV
output = pd.DataFrame(
    {
        'PassengerId': test_data["PassengerId"],
        'Survived': test_preds,
    }
)
output.to_csv('submission.csv', index=False)