# Machine Learning & The Titanic
https://www.kaggle.com/c/titanic/overview

In [56]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

In [2]:
# Load the raw Kaggle training file for a first look
training_data = pd.read_csv("train.csv")

# Quick inspection: column names first (head/shape checks kept for reference)
# print(training_data.head())
print(training_data.columns)
# print(training_data.shape)

# Columns stored with the generic "object" dtype
object_cols = [
    col
    for col, col_dtype in training_data.dtypes.items()
    if col_dtype == "object"
]

# Per-column count of missing entries; only columns with at least one gap are printed
missing_val_count_by_column = training_data.isnull().sum()
print(
    "missing values: ",
    "\n",
    missing_val_count_by_column[missing_val_count_by_column > 0],
    "\n",
    "object cols: ",
    "\n",
    object_cols,
)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
missing values:  
 Age         177
Cabin       687
Embarked      2
dtype: int64 
 object cols:  
 ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']


In [59]:
# Load the labelled training file and the unlabelled test file, indexed by passenger id
X_full = pd.read_csv("train.csv", index_col='PassengerId')
X_test_full = pd.read_csv('test.csv', index_col='PassengerId')

# Keep only rows that carry a label, then peel the target column off the features
X_full = X_full.dropna(axis=0, subset=['Survived'])
y = X_full['Survived']
X_full = X_full.drop(columns=['Survived'])

# Hold out a validation split from the training rows (seeded for repeatability)
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X_full, y, random_state=1
)

# Categorical features: "object" columns with fewer than 10 distinct values
# (the cut-off is convenient but arbitrary)
categorical_cols = [
    cname
    for cname in X_full.columns
    if X_full[cname].dtype == "object" and X_full[cname].nunique() < 10
]

# Numerical features: every int64 / float64 column
numerical_cols = [
    cname
    for cname in X_full.columns
    if X_full[cname].dtype in ('int64', 'float64')
]

# Restrict every frame to the selected feature columns.
# Manual alternative that was tried:
# categorical_cols = ["Sex"]
# numerical_cols = ["Pclass", "SibSp", "Parch", "Age", "Fare"]
my_cols = categorical_cols + numerical_cols

X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

X = X_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()

In [77]:
# Numerical features: fill missing entries with the column median
numerical_transformer = SimpleImputer(strategy='median')

# Categorical features: fill missing entries with the most frequent value,
# then one-hot encode (handle_unknown='ignore' so unseen categories do not raise)
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route each group of columns through its own transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ]
)

# Optimize n_estimators
def score_pipeline(n, optim_pre = preprocessor):
    """Cross-validate a random forest with ``n`` trees and return its mean error.

    Parameters
    ----------
    n : int
        Passed to ``RandomForestClassifier`` as ``n_estimators``.
    optim_pre : transformer, optional
        Preprocessing step placed in front of the model. Defaults to the
        ``preprocessor`` object that exists when this function is defined.

    Returns
    -------
    float
        Mean absolute error averaged over 10 cross-validation folds on the
        module-level ``X`` and ``y``.
        NOTE(review): if ``y`` is coded 0/1 this equals the misclassification
        rate -- confirm the label coding.
    """
    forest = RandomForestClassifier(n_estimators=n, random_state=1)
    candidate = Pipeline(steps=[('preprocessor', optim_pre), ('model', forest)])
    # The scorer reports negated MAE; flip the sign to get a positive error
    fold_errors = -1 * cross_val_score(
        candidate,
        X,
        y,
        cv=10,
        scoring='neg_mean_absolute_error',
    )
    return fold_errors.mean()
# Candidate forest sizes: 1 through 99 trees
n_est = range(1,100)

# Cross-validated error for every candidate size, in the same order as n_est
score_list = [score_pipeline(n_trees) for n_trees in n_est]

# Pick the size with the lowest error. np.argmin returns the index of the first
# minimum, so ties resolve to the smallest forest. This replaces comparing the
# Python list against a NumPy scalar inside np.where.
best_n_estimators = n_est[int(np.argmin(score_list))]
print(best_n_estimators)

# Final estimator: a random forest using the selected number of trees
model = RandomForestClassifier(n_estimators=best_n_estimators, random_state=1)

# Chain the shared preprocessing with the final model
my_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ]
)

# Fit on every labelled row; this fitted pipeline is what later cells predict with
my_pipeline.fit(X, y)

# 10-fold cross-validation of the same pipeline; the scorer returns negated MAE,
# so multiply by -1 to report a positive error per fold
scores = -1 * cross_val_score(
    my_pipeline,
    X,
    y,
    cv=10,
    scoring='neg_mean_absolute_error',
)

# print("MAE scores:\n", scores)
print("Average MAE score (across experiments):", scores.mean(), sep="\n")

# Preprocessing of validation data, get predictions
# preds = my_pipeline.predict(X_valid)
# print('MAE:', mean_absolute_error(y_valid, preds))

10
Average MAE score (across experiments):
0.175060152082624


In [78]:
# Run the already-fitted pipeline on the test features (no refitting happens here)
preds_test = my_pipeline.predict(X_test)

# Build the submission table: the test frame's index paired with its prediction,
# written without the DataFrame's own row index
output = pd.DataFrame(
    data={
        'PassengerId': X_test.index,
        'Survived': preds_test,
    }
)
output.to_csv('submission.csv', index=False)

In [38]:
# # features to fit model off of
# features = ["Pclass", "SibSp", "Parch", "Age", "Sex", "Fare"]

# # model data
# x = training_data[features]
# y = training_data.Survived
# print(x)

In [39]:
# # preprocessing data

# def score_dataset(X_train, X_valid, y_train, y_valid):
#     model = RandomForestRegressor(n_estimators=10, random_state=1)
#     model.fit(X_train, y_train)
#     preds = model.predict(X_valid)
#     return mean_absolute_error(y_valid, preds)

# train_x, val_x, train_y, val_y = train_test_split(x, y, random_state = 1)

# # accounting for categorical variables-------------------------------------------------------------------------
# split_object_cols = [col for col in train_x.columns if train_x[col].dtype == "object"]

# # Apply one-hot encoder to each column with categorical data
# OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
# OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(train_x[split_object_cols]))
# OH_cols_valid = pd.DataFrame(OH_encoder.transform(val_x[split_object_cols]))

# # One-hot encoding removed index; put it back
# OH_cols_train.index = train_x.index
# OH_cols_valid.index = val_x.index

# # Remove categorical columns (will replace with one-hot encoding)
# num_X_train = train_x.drop(split_object_cols, axis=1)
# num_X_valid = val_x.drop(split_object_cols, axis=1)

# # Add one-hot encoded columns to numerical features
# OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
# OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

# # accounting for missing values-----------------------------------------------------------------------------------
# my_imputer = SimpleImputer(strategy = "median")
# final_x_train = pd.DataFrame(my_imputer.fit_transform(OH_X_train))
# final_x_valid = pd.DataFrame(my_imputer.transform(OH_X_valid))

# score_dataset(final_x_train, final_x_valid, train_y, val_y)

In [40]:
# # testing model

# def score_model(split_model, x_t=final_x_train, x_v=final_x_valid, y_t=train_y, y_v=val_y):
#     split_model.fit(x_t, y_t)
#     split_preds = split_model.predict(x_v)
#     return mean_absolute_error(y_v, split_preds)

# # find the optimal number of estimators based on the minimum mean absolute error of predictions on the validation split
# scores = []
# n_est = range(1,100)
# for i in range(len(n_est)):
#     scores.append(score_model(RandomForestClassifier(n_estimators = n_est[i], random_state=1)))
    
# best_n_estimators = n_est[np.where(scores == np.amin(scores))[0][0]]

# # define model, test validity
# model = RandomForestClassifier(n_estimators = best_n_estimators, random_state=1)
# score_model(model)

In [41]:
# # actual model fit
# OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
# OH_cols = pd.DataFrame(OH_encoder.fit_transform(x[split_object_cols]))
# OH_cols.index = x.index
# num_X = x.drop(split_object_cols, axis=1)
# OH_X = pd.concat([num_X, OH_cols], axis=1)

# final_x = pd.DataFrame(my_imputer.transform(OH_X))

# model.fit(final_x, y)

In [42]:
# # using model to make predictions off of test data
# test_data = pd.read_csv("test.csv")
# x_test = test_data[features]

# OH_encoder_test = OneHotEncoder(handle_unknown='ignore', sparse=False)
# OH_cols_test = pd.DataFrame(OH_encoder_test.fit_transform(x_test[split_object_cols]))
# OH_cols_test.index = x_test.index
# num_X_test = x_test.drop(split_object_cols, axis=1)
# OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)

# final_x = pd.DataFrame(my_imputer.transform(OH_X_test))

# test_preds = model.predict(final_x)
# # print(test_preds)

In [43]:
# # creating output file
# output = pd.DataFrame({'PassengerId': test_data.PassengerId,
#                       'Survived': test_preds})
# output.to_csv('submission.csv', index=False)