# Model Training

In [None]:
import os

import numpy as np
import pandas as pd

## Read in the cleaned data

In [None]:
# Cleaned training and test tables; the training table still contains the
# "Survived" target column, which is split off further down.
train_df = pd.read_csv("./data/cleaned_data/train_cleaned.csv")
test_df = pd.read_csv("./data/cleaned_data/test_cleaned.csv")

# Additional test-set file; a later cell reads its "PassengerId" column when
# building the submission frame.
test_label = pd.read_csv("./data/cleaned_data/test.csv")

In [None]:
# Preview the first five rows of the training table as a quick import sanity check.
train_df.head(n=5)

In [None]:
# Preview the first five rows of the test table as a quick import sanity check.
test_df.head(n=5)

Now that the data has been read in, it is good practice to look at it and confirm that it was imported correctly. `head()` shows the first five rows of the dataset; its counterpart, `tail()`, shows the last five rows.

In [None]:
# Target vector: the "Survived" column of the training table.
Y_train = train_df["Survived"]
# Predictors: every remaining training column.
X_train = train_df.drop(columns=["Survived"])

# The test table is used as the test feature matrix; copying keeps test_df
# itself untouched by anything done to X_test later.
X_test = test_df.copy()

# Display the shapes of the three pieces side by side.
tuple(part.shape for part in (X_train, Y_train, X_test))

In [None]:
# Preview the first five rows of the training features (target column removed).
X_train.head(n=5)

In [None]:
# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Decision Tree

# random_state is fixed so the fitted tree, and therefore the score and the
# predictions below, are identical on every run. The value itself is arbitrary.
decision_tree = DecisionTreeClassifier(random_state=1)
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)

# NOTE: this is accuracy (in percent) on the same data the tree was fitted on,
# not a held-out estimate.
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
acc_decision_tree

In [None]:
# Perceptron

# fit() returns the fitted estimator, so construction and training are chained.
perceptron = Perceptron().fit(X_train, Y_train)
Y_pred = perceptron.predict(X_test)

# Accuracy on the training data, as a percentage rounded to two decimals.
acc_perceptron = round(100 * perceptron.score(X_train, Y_train), 2)
acc_perceptron

In [None]:
# Random Forest

# random_state is fixed so the forest, and therefore the score and the
# predictions below, are identical on every run. The value itself is arbitrary.
random_forest = RandomForestClassifier(n_estimators=100, random_state=1)
random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)

# Score once (the previous extra score() call discarded its result).
# NOTE: this is accuracy (in percent) on the same data the forest was fitted on,
# not a held-out estimate.
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
acc_random_forest

In [None]:
# One row per model with the training accuracy computed for it in the cells above.
models = pd.DataFrame(
    [
        ('Random Forest', acc_random_forest),
        ('Perceptron', acc_perceptron),
        ('Decision Tree', acc_decision_tree),
    ],
    columns=['Model', 'Score'],
)

# Show the table ordered from highest to lowest score.
models.sort_values(by='Score', ascending=False)

## Cross-validation

### Decision tree w/ cross-validation

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of a decision tree. random_state is fixed so the
# reported scores are identical on every run. The value itself is arbitrary.
scores = cross_val_score(DecisionTreeClassifier(random_state=1), X_train, Y_train, cv=10)

# Mean score across the folds, followed by the fold-to-fold standard deviation.
msg = "%s: %0.2f (+/- %0.2f)" % ('Decision tree', scores.mean(), scores.std())
print(msg)

### Random forest w/ cross-validation

In [None]:
# 10-fold cross-validation of a 100-tree random forest. random_state is fixed so
# the reported scores are identical on every run. The value itself is arbitrary.
scores = cross_val_score(
    RandomForestClassifier(n_estimators=100, random_state=1), X_train, Y_train, cv=10
)

# Mean score across the folds, followed by the fold-to-fold standard deviation.
msg = "%s: %0.2f (+/- %0.2f)" % ('Random forest', scores.mean(), scores.std())
print(msg)

### Perceptron w/ cross-validation

In [None]:
# 10-fold cross-validation of a default Perceptron.
scores = cross_val_score(Perceptron(), X_train, Y_train, cv=10)

# Mean score across the folds, followed by the fold-to-fold standard deviation.
msg = "%s: %0.2f (+/- %0.2f)" % ('Perceptron', np.mean(scores), np.std(scores))
print(msg)

In [None]:
# Y_pred holds the predictions of whichever model cell assigned it last; when the
# notebook is run top to bottom, that is the random forest.
#
# Passenger ids are taken from test_label, as in the final submission cell.
# test_df is used directly as the feature matrix (X_test = test_df.copy()), so it
# is not guaranteed to carry a "PassengerId" column.
submission = pd.DataFrame({
        "PassengerId": test_label["PassengerId"],
        "Survived": Y_pred
    })
# Writing this intermediate submission is disabled; the path matches the
# ./output directory used by the final submission cell.
# submission.to_csv('./output/submission.csv', index=False)

## Neural network w/ multi-layer perceptron

In [None]:
from sklearn import preprocessing

# Fit the scaler on the training features only, so the test features are
# transformed with the training statistics.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)

# NOTE: X_train and X_test are rebound to their scaled versions from here on;
# any earlier cell that is re-run afterwards will see the scaled data.
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
from sklearn.neural_network import MLPClassifier

# Alternative configuration (disabled):
# clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
#                      hidden_layer_sizes=(50,50), random_state=1)

# Default network settings, but with a fixed random_state (the same value as in
# the disabled configuration above) so training gives the same model on every run.
clf = MLPClassifier(random_state=1)
clf.fit(X_train, Y_train)

In [None]:
# Mean accuracy of the fitted network on the data it was trained on.
clf.score(X=X_train, y=Y_train)

In [None]:
# Predict labels for the test features with the fitted network, then display them.
Y_pred = clf.predict(X=X_test)
Y_pred

In [None]:
# Pair each passenger id from test_label with the prediction currently held in
# Y_pred (the MLP predictions when the notebook is run top to bottom).
submission = pd.DataFrame({
        "PassengerId": test_label["PassengerId"],
        "Survived": Y_pred
    })

# Create the output directory first: to_csv does not create missing parent
# directories, so a fresh checkout without ./output would otherwise fail here.
submission_path = './output/submission.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)