In [None]:
# pandas
import pandas as pd
from pandas import Series,DataFrame

# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's "whitegrid" style for the figures drawn by this notebook.
sns.set_style('whitegrid')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Notebook-wide switch: the cells below only preview the frames and call the
# titanic_plotting helpers when this is True.
visualize_data = False

In [None]:
import titanic_age_regression
import titanic_classifiers
import titanic_feature_creation
import titanic_feature_transformation
import titanic_plotting

# Read the training and test CSV files into DataFrames.
# "Age" is forced to float64 in both frames.
titanic_df = pd.read_csv("../input/train.csv", dtype={"Age": np.float64})
test_df = pd.read_csv("../input/test.csv", dtype={"Age": np.float64})

# Preview the data. A bare expression inside an `if` body is evaluated and
# thrown away (the notebook only echoes a cell's last top-level expression),
# so the preview has to be displayed explicitly.
if visualize_data:
    from IPython.display import display
    display(titanic_df.head())

In [None]:
# Optional schema dump: column dtypes / non-null counts for the training frame,
# a separator line, then the same for the test frame.
if visualize_data:
    for position, frame in enumerate((titanic_df, test_df)):
        if position:
            print("----------------------------")
        frame.info()

In [None]:
# Run the same title-class feature helper over the training frame and then the
# test frame, rebinding both names to whatever the helper returns.
titanic_df, test_df = (
    titanic_feature_creation.create_title_classes(frame)
    for frame in (titanic_df, test_df)
)

In [None]:
# Discard columns that are not used for analysis or prediction.
# The test frame keeps "PassengerId": it is needed later for the submission file.
titanic_df = titanic_df.drop(labels=["PassengerId", "Name", "Ticket"], axis="columns")
test_df = test_df.drop(labels=["Name", "Ticket"], axis="columns")

In [None]:
# Embarked

# Only the training frame is patched here: rows with no "Embarked" value
# (two, per the original note) get "S", the most frequent value.
titanic_df.loc[titanic_df["Embarked"].isnull(), "Embarked"] = "S"

if visualize_data:
    titanic_plotting.make_embarked_survival_plot(titanic_df)

# One-hot encode the embarkation port in both frames.
# Original author's note: the "S" dummy variable is removed and "C" & "Q" are
# kept, since they seem to have a good rate for Survival (that happens inside
# the helper, which is not visible from this notebook).
titanic_df, test_df = (
    titanic_feature_creation.one_hot_encode_embarked_variables(frame)
    for frame in (titanic_df, test_df)
)

In [None]:
# Fare

# Only test_df needs this: fill its missing "Fare" value(s) with the median
# fare of the test set.
# The result is assigned back to the column. Calling fillna(inplace=True) on
# the Series returned by test_df["Fare"] is chained assignment: newer pandas
# versions warn about it and, with Copy-on-Write, it no longer updates test_df.
test_df["Fare"] = test_df["Fare"].fillna(test_df["Fare"].median())

if visualize_data:
    titanic_plotting.make_fare_plot(titanic_df)

In [None]:

# Cabin: remove the column from both frames. The frames are rebound rather
# than mutated with inplace=True, matching how the other columns are dropped
# earlier in this notebook.
titanic_df = titanic_df.drop("Cabin", axis=1)
test_df = test_df.drop("Cabin", axis=1)

# Gender: apply the project's one-hot encoding helper to both frames.
titanic_df = titanic_feature_creation.one_hot_encode_gender_variables(titanic_df)
test_df = titanic_feature_creation.one_hot_encode_gender_variables(test_df)

In [None]:
# Family
# Optional visualisation only: when plotting is enabled, hand the training
# frame to the Parch/SibSp survival plotting helper. Neither frame is
# reassigned in this cell.
if visualize_data:
    titanic_plotting.make_parch_sibsp_survival_plots(titanic_df)


In [None]:
# Pclass

# Optional plot of the training frame, drawn before the column is encoded.
if visualize_data:
    titanic_plotting.make_class_survival_plot(titanic_df)

# Apply the passenger-class one-hot encoding helper to the training frame and
# then to the test frame.
titanic_df, test_df = (
    titanic_feature_creation.one_hot_encode_class_variables(frame)
    for frame in (titanic_df, test_df)
)


In [None]:
# Add the "young / old miss" features to both frames via the shared helper
# (training frame first, then the test frame).
titanic_df, test_df = (
    titanic_feature_creation.define_young_old_miss_features(frame)
    for frame in (titanic_df, test_df)
)

In [None]:
# Age

#basic_routines.make_family_size_feature(test_df)
#basic_routines.make_family_size_feature(titanic_df)

# The helper returns, in order: training features, training targets and test
# features for the age model.
(
    X_train_regression,
    Y_train_regression,
    X_test_regression,
) = titanic_age_regression.create_age_training_test_sets(titanic_df, test_df)

# k Nearest Neighbors

# Candidate hyper-parameters handed to the kNN age-regressor factory.
parameters = {
    'n_neighbors': [16, 17, 18],
    'weights': ['uniform'],
}

# Rescale both regression feature sets before the regressor is created.
X_train_regression, X_test_regression = titanic_feature_transformation.min_max_scaling(
    X_train_regression,
    X_test_regression,
)

knn = titanic_age_regression.create_knn_regressor_age(
    X_train_regression,
    Y_train_regression,
    parameters,
)

# Hand the regressor to the helper that fills in the missing "Age" values of
# both frames.
titanic_df, test_df = titanic_age_regression.fill_in_missing_age_values(
    titanic_df,
    test_df,
    knn,
)

# Decision Tree Regressor

#parameters = {'min_samples_leaf':[20], 'min_weight_fraction_leaf':[0.0125,0.025,0.05], 'min_impurity_split':[4e-7]}
#regression_tree = titanic_age_regression.create_decision_tree_regressor_age(X_train_regression, Y_train_regression, parameters)
#titanic_df, test_df = titanic_age_regression.fill_in_missing_age_values(titanic_df, test_df, regression_tree)
#if visualize_data:
#    viz_tree = titanic_plotting.visualize_decision_tree_and_write_to_file(regression_tree,X_train_regression.columns,"AGES","Decision_Tree_Regressor_Age.pdf")
#    from IPython.display import Image
#    Image(viz_tree.create_png())

# Fill in with median ages corresponding to titles

#titanic_df, test_df = titanic_age_regression.fill_nan_ages_with_median_title_ages(titanic_df,test_df)

# Use Linear Regression to predict ages -- disabled: it performs very poorly and predicts ages > 100.

#parameters = {'alpha':[500]}
#linear_regressor = titanic_age_regression.create_linear_regressor(X_train_regression,Y_train_regression,parameters)
#titanic_df, test_df = titanic_age_regression.fill_in_missing_age_values(titanic_df, test_df, linear_regressor)
#print titanic_df[titanic_df['Age'] > 100]
#print test_df[test_df['Age'] > 100]

In [None]:
# Optional visualisation only: when plotting is enabled, pass the training
# frame (after the age-filling cell above has run) to the age/survival
# distribution plotting helper.
if visualize_data:
    titanic_plotting.plot_age_survival_distribution(titanic_df)

In [None]:
# define training and testing sets

# Label column on its own; every remaining training column is a feature.
Y_train = titanic_df["Survived"]
X_train = titanic_df.drop(labels=["Survived"], axis="columns")
# The test frame still carries "PassengerId" (used for the submission file),
# so it is removed from the feature matrix only.
X_test = test_df.drop(labels=["PassengerId"], axis="columns").copy()

# Feature Transformation

X_train, X_test = titanic_feature_transformation.min_max_scaling(
    X_train,
    X_test,
)

#X_train,X_test = titanic_feature_transformation.transform_features_by_pca(X_train,X_test)

In [None]:
# Hyper-parameters passed to the decision-tree factory (one candidate each).
# NOTE(review): 'min_impurity_split' was removed from scikit-learn's tree
# estimators in version 1.0 -- confirm the installed version still accepts it.
parameters = {'min_samples_leaf':[25],'min_weight_fraction_leaf':[0.1], 'min_impurity_split':[2e-1]}

dtc = titanic_classifiers.create_decision_tree_classifier(X_train,Y_train,parameters)

# Optionally write the tree to a PDF and show it in the notebook.
if visualize_data:
    viz_tree = titanic_plotting.visualize_decision_tree_and_write_to_file(dtc,X_train.columns,"01","Decision_Tree_Classifier.pdf")
    from IPython.display import Image, display
    # An Image(...) expression inside an `if` body is created and discarded;
    # display() is required for the picture to actually be rendered.
    display(Image(viz_tree.create_png()))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameters passed to the random-forest factory; only n_estimators has
# more than one candidate value.
parameters = {
    'min_samples_leaf': [5],
    'min_weight_fraction_leaf': [0.025],
    'n_estimators': [45, 50, 55],
    'criterion': ['gini'],
    'min_impurity_split': [0.1],
}

random_forest = titanic_classifiers.create_random_forest_classifier(
    X_train,
    Y_train,
    parameters,
)

# Predictions of the random forest on the test features.
Y_pred_rf = random_forest.predict(X_test)


In [None]:
from sklearn.svm import SVC, LinearSVC

# Hyper-parameters passed to the SVM factory: RBF kernel with three candidate
# values each for C and gamma.
parameters = {
    "C": [3.5, 4, 4.55],
    "kernel": ['rbf'],
    "gamma": [0.625, 0.75, 0.875],
}

svc = titanic_classifiers.create_support_vector_machine_classifier(
    X_train,
    Y_train,
    parameters,
)

# Predictions of the SVM on the test features.
Y_pred_svc = svc.predict(X_test)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Hyper-parameters passed to the gradient-boosting factory (one candidate each).
parameters = {
    'learning_rate': [0.1],
    'n_estimators': [35],
    'min_weight_fraction_leaf': [0.08],
    'max_depth': [None],
    'min_impurity_split': [0.075],
}

gbc = titanic_classifiers.create_gradient_boosted_classifier(
    X_train,
    Y_train,
    parameters,
)

# Predictions of the gradient-boosted model on the test features.
Y_pred_gb = gbc.predict(X_test)

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Hyper-parameters passed to the AdaBoost factory (one candidate each).
parameters = {
    "n_estimators": [500],
    "learning_rate": [0.01],
}

adbc = titanic_classifiers.create_adaboost_classifier(
    X_train,
    Y_train,
    parameters,
)

# Predictions of the AdaBoost model on the test features.
Y_pred_adbc = adbc.predict(X_test)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Hyper-parameters passed to the kNN classifier factory: five candidate
# neighbourhood sizes, uniform weighting.
parameters = {
    'n_neighbors': [5, 7, 9, 11, 13],
    'weights': ['uniform'],
}

# Note: this rebinds `knn`, which held the age regressor in the Age cell.
knn = titanic_classifiers.create_knn_classifier(
    X_train,
    Y_train,
    parameters,
)

# Predictions of the kNN classifier on the test features.
Y_pred_knn = knn.predict(X_test)

In [None]:
# put ensemble together
#total_score = knn_score + rf_score + svc_score + logreg_score

# Exactly one source of predictions is active; the alternatives are left
# commented out. Currently the SVM predictions are used.
#Y_pred_avg = (1./3.) * Y_pred_knn + (1./3.) * Y_pred_rf + (1./3.) * Y_pred_svc
#Y_pred_avg = Y_pred_rf
Y_pred_avg = Y_pred_svc
#Y_pred_avg = Y_pred_knn
#Y_pred_avg = Y_pred_adbc
#Y_pred_avg = Y_pred_gb

# Threshold every value at 0.5: anything below becomes 0, everything else 1.
Y_pred = list(0 if score < 0.5 else 1 for score in Y_pred_avg)

In [None]:
# Pair each test passenger id with its predicted label and write the result to
# 'titanic.csv' without the index column.
submission = pd.DataFrame(
    {"PassengerId": test_df["PassengerId"], "Survived": Y_pred}
)
submission.to_csv('titanic.csv', index=False)