In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Loading our training data
import pandas as pd
import numpy as np

# Keep the raw CSV untouched in `org_train_data`; `train_data` is a separate
# working frame without the PassengerId column.
org_train_data = pd.read_csv('../input/titanic/train.csv')
train_data = org_train_data.copy().drop(columns=['PassengerId'])

In [None]:
# Replace each listed categorical column with the integer codes produced by
# pd.factorize (the Name column is encoded separately in the next cell).
# NOTE(review): pd.factorize encodes missing values as -1 by default, so NaNs in
# Cabin/Embarked will no longer register as missing later on - confirm this is intended.
for column in ('Sex', 'Ticket', 'Cabin', 'Embarked'):
    codes, _uniques = pd.factorize(train_data[column])
    train_data[column] = codes
train_data

In [None]:
# Encode the Name column by the honorific it contains.
# The table is checked in order and the first title found in the name wins;
# names that contain none of the listed titles get code '0'.
title_codes = (
    ('Mr.', '1'),
    ('Mrs.', '2'),
    ('Miss.', '3'),
    ('Rev.', '4'),
    ('Master.', '5'),
    ('Dr.', '6'),
    ('Don.', '7'),
)

name_list = train_data['Name']
new_name_list = []
for x in name_list:
    matched_code = next((code for title, code in title_codes if title in x), '0')
    new_name_list.append(matched_code)

# The codes are collected as strings, so convert the column to integers afterwards.
train_data['Name'] = new_name_list
train_data['Name'] = train_data['Name'].astype(int)



In [None]:
# Balance the two classes: keep every survivor row and only as many
# non-survivor rows (taken from the top of the frame) as there are survivors.
is_survivor = train_data['Survived'] == 1
is_dead = train_data['Survived'] == 0
survived_list = train_data[is_survivor]
dead_list = train_data[is_dead]

new_dead_list = dead_list.head(len(survived_list))
balanced_train_data = pd.concat([survived_list, new_dead_list])
balanced_train_data
    

In [None]:
# Dealing with missing values: impute them, and extend the dataset with a column indicating whether each value was missing
from sklearn.impute import SimpleImputer

# Find every column that still contains at least one null and record, per row,
# whether the value was missing before imputation.
has_missing = balanced_train_data.isnull().any()
cols_w_missing = list(has_missing[has_missing].index)
for col in cols_w_missing:
    balanced_train_data[col + "_was_missing"] = balanced_train_data[col].isnull()

# Impute with SimpleImputer's default settings, restore the column names on the
# resulting array, and cast every column to int (fractional values are truncated).
imputer = SimpleImputer()
imputed_data = pd.DataFrame(
    imputer.fit_transform(balanced_train_data),
    columns=balanced_train_data.columns,
).astype(int)


In [None]:
# Feature engineering - Mutual Information
# After encoding all categorical data, we measure how much each feature reduces the uncertainty about our target column, using scikit-learn's mutual information (MI) estimator.
# That way we can select the best features to use for our classification models.

from sklearn.feature_selection import mutual_info_regression
import matplotlib.pyplot as plt

# Target column and candidate feature matrix for the mutual-information ranking.
target_name = 'Survived'
comparison_feature = imputed_data[target_name]
compared_features = imputed_data.drop(columns=[target_name])

# Boolean mask over the feature columns: True where the column dtype is int,
# so that those columns are treated as discrete by the MI estimator.
discrete_features = compared_features.dtypes == int

def mi_score(X, y, discrete_features, random_state=0):
    """Rank the columns of ``X`` by their mutual information with the class label ``y``.

    The target used in this notebook (``Survived``) is a discrete class label, so
    the classification estimator ``mutual_info_classif`` is used rather than
    ``mutual_info_regression``, which is meant for continuous targets.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its column names become the index of the result.
    y : array-like
        Discrete target labels, one per row of ``X``.
    discrete_features : bool, 'auto' or array-like
        Forwarded unchanged to ``mutual_info_classif``.
    random_state : int, optional
        Seed forwarded to the estimator so repeated calls return the same scores.

    Returns
    -------
    pandas.Series
        MI scores named ``"MI scores"``, indexed by feature name and sorted from
        most to least informative.
    """
    # Imported locally so the function works regardless of which estimator the
    # surrounding cell imported.
    from sklearn.feature_selection import mutual_info_classif

    raw_scores = mutual_info_classif(
        X, y, discrete_features=discrete_features, random_state=random_state
    )
    mi_scores = pd.Series(raw_scores, name="MI scores", index=X.columns)
    return mi_scores.sort_values(ascending=False)


mi_scores = mi_score(compared_features, comparison_feature, discrete_features)


def plot_mi_scores(scores):
    """Draw a horizontal bar chart of MI scores, smallest at the bottom.

    ``scores`` is a pandas Series whose index holds the feature names; the
    chart is drawn on the current matplotlib figure.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    plt.barh(positions, ordered)
    # Label each bar with its feature name.
    plt.yticks(positions, list(ordered.index))
    plt.title("MI Scores")


plot_mi_scores(mi_scores)

In [None]:
# Per the author's reading of the MI plot, these three features rank last and
# contribute very little, so they are removed before modelling.
weak_features = ['Age_was_missing', 'SibSp', 'Parch']
imputed_data = imputed_data.drop(columns=weak_features)
imputed_data

In [None]:
# One-Hot encoding for Name, Embarked columns
from sklearn.model_selection import train_test_split



# Human-readable labels for the integer title codes stored in `Name`.
title_labels = {0: 'NoStatus', 1: 'Mr', 2: 'Mrs', 3: 'Miss', 4: 'Rev',
                5: 'Master', 6: 'Dr', 7: 'Don'}

# `Name` and `Embarked` both hold small integer codes. Un-prefixed dummies would
# therefore get integer column labels, which (a) collide between the two sets
# (0, 1, 2 occur in both) and (b) are never matched by a rename keyed on the
# strings '0'..'7'. Map the title codes to their labels before encoding and
# prefix the Embarked dummies, so every resulting column has a unique string name.
name_train_dummies = pd.get_dummies(imputed_data['Name'].map(title_labels), dtype=int)
emb_train_dummies = pd.get_dummies(imputed_data['Embarked'], prefix='Embarked', dtype=int)

# Swap the two encoded source columns for their one-hot expansions.
imputed_data = pd.concat(
    [imputed_data.drop(["Name", "Embarked"], axis=1), name_train_dummies, emb_train_dummies],
    axis=1,
)


In [None]:
# Separate the target from the predictors, then hold out 20% of the rows for
# validation (fixed seed so the split is repeatable).
y = imputed_data['Survived']
X = imputed_data.drop(columns=['Survived'])

X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)


In [None]:
# Model & Training - Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Choosing the optimal n_estimators value: fit one forest per candidate size and
# record its accuracy on the validation split. A fixed random_state keeps the
# curve identical from run to run, so the chosen value can be reproduced.
n_estimators_grid = range(1, 200)
scores = []
for k in n_estimators_grid:
    rand_forest = RandomForestClassifier(n_estimators=k, random_state=0)
    rand_forest.fit(X_train, y_train)
    preds = rand_forest.predict(X_valid)
    scores.append(accuracy_score(y_valid, preds))

plt.plot(n_estimators_grid, scores)
plt.xlabel('n_estimators')  # fixed typo: `plt.xlable` raised AttributeError
plt.ylabel('Prediction accuracy')

In [None]:
# As read off the sweep above by the author, the best n_estimators is roughly 15.
# The forest is seeded so the reported accuracy does not change between runs.
rand_forest = RandomForestClassifier(n_estimators=15, random_state=0)
rand_forest.fit(X_train, y_train)
preds = rand_forest.predict(X_valid)
score = accuracy_score(y_valid, preds)

In [None]:
# Start the model-comparison table with the random forest's validation accuracy;
# later cells append one column per additional model.
score_list = {}
score_list['Random Forest Classifier'] = [score]
score_matrix = pd.DataFrame(data=score_list)
score_matrix

In [None]:
# Model & Training - Naive Bayes
from sklearn.naive_bayes import GaussianNB

# Fit Gaussian Naive Bayes on the training split and record its validation
# accuracy as a new column of the comparison table.
gnb = GaussianNB().fit(X_train, y_train)
gnb_preds = gnb.predict(X_valid)
nb = [accuracy_score(y_valid, gnb_preds)]
score_matrix['NaiveBayes'] = nb

In [None]:
# Model & Training - Logistic Regression
from sklearn.linear_model import LogisticRegression
# Fit a logistic regression with default settings, score it on the validation
# split, and add the accuracy to the comparison table.
logreg = LogisticRegression().fit(X_train, y_train)
logreg_preds = logreg.predict(X_valid)
lr = [accuracy_score(y_valid, logreg_preds)]
score_matrix['LogisticREgression'] = lr
score_matrix

In [None]:
# Model & Training - K Nearest Neighbor
# Finding the optimal value for k
from sklearn.neighbors import KNeighborsClassifier

# Fit one KNN classifier per neighbourhood size from 1 to 199 and collect the
# validation accuracy of each, then plot accuracy against k.
k_values = range(1, 200)
scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    knn_preds = knn.predict(X_valid)
    scores.append(accuracy_score(y_valid, knn_preds))

plt.plot(k_values, scores)
plt.xlabel('K neighbors')
plt.ylabel('Predictions accuracy')


In [None]:
# The author reads the best k off the sweep as roughly 15; refit with that value
# and store the validation accuracy in the comparison table.
knn = KNeighborsClassifier(n_neighbors=15).fit(X_train, y_train)
knn_preds = knn.predict(X_valid)
knn_mat = accuracy_score(y_valid, knn_preds)
score_matrix['KNN'] = knn_mat


In [None]:
# Model & Training - Support Vector Machine
from sklearn import svm
# Fit a support vector classifier with default settings and record its
# validation accuracy in the comparison table.
# NOTE: this rebinds the name `svm` from the imported sklearn module to the
# fitted estimator, so the module is no longer reachable through `svm` afterwards.
svm = svm.SVC().fit(X_train, y_train)
svm_preds = svm.predict(X_valid)
svm_mat = accuracy_score(y_valid, svm_preds)
score_matrix['SVM'] = svm_mat


In [None]:
# Display the accumulated validation-accuracy table (one column per model fitted above).
score_matrix