# Modeling

In [1]:
# Imports
import pandas as pd
import pickle

from IPython.display import display, Markdown as md
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV

## Read data

In [2]:
df = pd.read_csv("../data/processed/titanic.csv", sep=";")

In [3]:
display(md("**Rows**: {} - **Columns**: {}".format(df.shape[0], df.shape[1])))
display(df.head())

**Rows**: 183 - **Columns**: 12

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
1,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
2,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
3,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7,G6,S
4,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.55,C103,S


In [4]:
display(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183 entries, 0 to 182
Data columns (total 12 columns):
PassengerId    183 non-null int64
Survived       183 non-null int64
Pclass         183 non-null int64
Name           183 non-null object
Sex            183 non-null object
Age            183 non-null float64
SibSp          183 non-null int64
Parch          183 non-null int64
Ticket         183 non-null object
Fare           183 non-null float64
Cabin          183 non-null object
Embarked       183 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 17.2+ KB


None

## Train and test split

In [5]:
LABEL = "Survived"
EXCLUDE_FEATURES = ["Survived", "PassengerId", "Name", "Sex", "Ticket", "Cabin", "Embarked"]
FEATURES = [col for col in df.columns.values if col not in EXCLUDE_FEATURES]

X = df.loc[:, FEATURES].values
y = df.loc[:, LABEL].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23)

## Modeling

In [6]:
clf = RandomForestClassifier()

parameters = {'n_estimators': [4, 5], 
              'max_depth': [2, 10], 
              'min_samples_split': [2, 3],
              'min_samples_leaf': [1, 5, 8]
             }

grid_obj = GridSearchCV(clf, parameters)
grid_obj = grid_obj.fit(X_train, y_train)

# Set the clf to the best combination of parameters
clf = grid_obj.best_estimator_

# Fit the best algorithm to the data
clf.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [7]:
predictions = clf.predict(X_test)
print("Accuracy: {}".format(accuracy_score(y_test, predictions)))

Accuracy: 0.5675675675675675


## Export predictions

In [8]:
predictions_df = pd.concat([df["PassengerId"], pd.Series(predictions, name="prediction")], axis=1)
display(predictions_df.head())

predictions_df.to_csv("../data/predictions/predictions.csv", index=False, sep=";")

Unnamed: 0,PassengerId,prediction
0,2,1.0
1,4,1.0
2,7,1.0
3,11,0.0
4,12,1.0


## Export model

In [9]:
pickle.dump(clf, open("../models/random_forest_clf.pkl", "wb"))