In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

# Input data files are available in the "../input/" directory.
# Running this cell (click Run or press Shift+Enter) prints the names of the
# entries found in that directory.

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Read the raw train / test CSV files.
train_df = pd.read_csv('../input/train.csv')
test_df = pd.read_csv('../input/test.csv')

# cleanup of NaN values: cast every column to object and replace missing
# entries with None, so the helper functions below can test `value is None`.
# `combine` holds the cleaned frames: index 0 = train, index 1 = test.
combine = [
    raw_frame.astype(object).where(pd.notnull(raw_frame), None)
    for raw_frame in (train_df, test_df)
]

In [None]:
# defining ages by groups
def age_group(age):
    """Map an age to a categorical age-group label.

    Args:
        age: Age as a number, or None / NaN when the age is unknown.

    Returns:
        'undetermined_age' for a missing age, otherwise
        'is_children' (age <= 14), 'is_young' (14 < age <= 24),
        'is_adult' (24 < age <= 64) or 'is_senior' (age > 64).
    """
    # NaN fails every comparison below, so without this guard a missing age
    # would fall through to the final branch and be labelled 'is_senior'.
    if pd.isna(age):
        return 'undetermined_age'
    if age <= 14:
        return 'is_children'
    if age <= 24:
        return 'is_young'
    if age <= 64:
        return 'is_adult'
    return 'is_senior'
    
# applying age groups to dataframe (converting age to categorical feature)
# for both the train (index 0) and test (index 1) frames
for frame in combine:
    frame['Age'] = frame['Age'].apply(age_group)

In [None]:
# defining deck by cabin
def deck(cabin):
    """Return the deck letter of a cabin number.

    Args:
        cabin: Cabin identifier string (e.g. 'C85'), or None / NaN when the
            cabin is unknown.

    Returns:
        'undetermined_deck' for a missing cabin, otherwise the first
        character of the cabin string.
    """
    # NaN is a float and cannot be sliced, so treat it like None instead of
    # letting `cabin[:1]` raise a TypeError.
    if pd.isna(cabin):
        return 'undetermined_deck'
    return cabin[:1]

# getting decks from cabin numbers
# (the Cabin column is overwritten in both the train and test frames)
for frame in combine:
    frame['Cabin'] = frame['Cabin'].apply(deck)

In [None]:
def embarkation_port(embarked):
    """Expand a one-letter embarkation code into the port name.

    'C' -> 'Cherbourg', 'Q' -> 'Queenstown', 'S' -> 'Southampton'.
    Any other value (including None) is returned unchanged.
    """
    code_to_port = (
        ('C', 'Cherbourg'),
        ('Q', 'Queenstown'),
        ('S', 'Southampton'),
    )
    for code, port_name in code_to_port:
        if embarked == code:
            return port_name
    # Unrecognised input passes through untouched.
    return embarked
# getting port from embarked column
# (the Embarked column is overwritten in both the train and test frames)
for frame in combine:
    frame['Embarked'] = frame['Embarked'].apply(embarkation_port)

In [None]:
# applying number of relatives: Relatives = SibSp + Parch, after which the
# two source columns are dropped from both the train and test frames
for position, frame in enumerate(combine):
    frame['Relatives'] = frame['SibSp'] + frame['Parch']
    combine[position] = frame.drop(columns=['SibSp', 'Parch'])

In [None]:
# I am assuming these columns do not bring much to the analysis
combine[:] = [frame.drop(columns=['Name', 'Ticket', 'Fare']) for frame in combine]

# Point train_df / test_df at the cleaned frames for the plots below.
train_df, test_df = combine[0], combine[1]

In [None]:
# Overall count of passengers per survival outcome.
train_df.groupby(['Survived'])['Survived'].count().plot(kind='bar', stacked=True)

# One stacked bar chart per feature: passenger counts per feature value,
# split by survival outcome.
for feature in ['Age', 'Sex', 'Pclass', 'Cabin', 'Relatives']:
    counts = train_df.groupby([feature, 'Survived'])[feature].count()
    counts.unstack('Survived').plot(kind='bar', stacked=True)
plt.show()

In [None]:
import seaborn as sns

# Survival distribution by Age Groups: one density curve per age group.
# The 'is_senior' group is not plotted.
for group_key, legend_label in [('is_children', 'Children'),
                                ('is_young', 'Young'),
                                ('is_adult', 'Adults')]:
    sns.kdeplot(train_df.loc[train_df['Age'] == group_key, 'Survived'],
                label=legend_label, shade=True)
plt.xlabel('Survived')
plt.ylabel('Density')
plt.title('Survival distribution by Age Groups');

In [None]:
import seaborn as sns
# Survival distribution by Sex: one density curve of the Survived values per sex.
# Each label now matches the rows it describes (they were previously swapped:
# the 'male' rows were labelled 'Female' and vice versa).
sns.kdeplot(train_df.loc[train_df['Sex'] == 'male', 'Survived'], label = 'Male', shade = True)
sns.kdeplot(train_df.loc[train_df['Sex'] == 'female', 'Survived'], label = 'Female', shade = True)
plt.xlabel('Survived'); 
plt.ylabel('Density'); 
plt.title('Survival distribution by Sex');

In [None]:
import seaborn as sns

# Survival distribution by Deck: one density curve per deck letter, plus one
# for passengers whose deck could not be determined.
deck_curves = [(letter, 'Deck ' + letter) for letter in 'ABCDEFGT']
deck_curves.append(('undetermined_deck', 'Undetermined Deck'))

for deck_code, legend_label in deck_curves:
    sns.kdeplot(train_df.loc[train_df['Cabin'] == deck_code, 'Survived'],
                label=legend_label, shade=True)
plt.xlabel('Survived')
plt.ylabel('Density')
plt.title('Survival distribution by Decks');

In [None]:
import seaborn as sns

# Survival distribution by Pclass: one density curve per ticket class.
for class_number, legend_label in [(1, 'First Class'),
                                   (2, 'Second Class'),
                                   (3, 'Third Class')]:
    sns.kdeplot(train_df.loc[train_df['Pclass'] == class_number, 'Survived'],
                label=legend_label, shade=True)
plt.xlabel('Survived')
plt.ylabel('Density')
plt.title('Survival distribution by Pclass');

In [None]:
# Histogram of the age-group column, one panel per survival outcome.
# NOTE(review): newer seaborn releases renamed `size` to `height` - confirm
# against the installed version before re-running.
g = sns.FacetGrid(data=train_df, col='Survived', size=8)
g.map(plt.hist, 'Age', bins=5)

In [None]:
# Histogram of the age-group column on a grid: one row per Pclass, one column
# per survival outcome.
# NOTE(review): newer seaborn releases renamed `size` to `height` - confirm
# against the installed version before re-running.
grid = sns.FacetGrid(data=train_df, row='Pclass', col='Survived', size=4.2, aspect=1.6)
grid.map(plt.hist, 'Age', bins=5, alpha=.5)
grid.add_legend();

In [None]:
# Select only categorical variables
categorical_columns = ['Pclass', 'Sex', 'Age', 'Cabin', 'Embarked']

# Training frame: keep the categorical columns, attach the target as an
# integer column and one-hot encode the rest.
category_df = (
    combine[0]
    .select_dtypes('object')
    .filter(items=categorical_columns)
    .assign(Survived=combine[0]['Survived'].astype(np.int64))
)
dummy_df = pd.get_dummies(category_df)

# Correlations in one-hot encoded dataframe: keep the 8 columns with the
# largest absolute correlation to Survived. Survived correlates perfectly
# with itself, so it is expected to head this list.
most_correlated = dummy_df.corr()['Survived'].abs().sort_values(ascending=False)[:8]
combine[0] = dummy_df.loc[:, most_correlated.index]

# Test frame: same encoding, then align it to the columns chosen on train.
category_df = combine[1].select_dtypes('object').filter(items=categorical_columns)
dummy_df = pd.get_dummies(category_df)

# The test data has no 'Survived' column, and may lack a dummy column that
# exists in train, so selecting with `.loc[:, most_correlated.index]` raises
# KeyError on current pandas. Reindex instead:
#   1. feature columns absent from the test dummies are filled with 0
#      (no passenger has that category);
#   2. an empty 'Survived' placeholder is kept so the train and test frames
#      share the same column layout for the cells that follow.
feature_columns = most_correlated.index.drop('Survived')
combine[1] = (
    dummy_df
    .reindex(columns=feature_columns, fill_value=0)
    .reindex(columns=most_correlated.index)
)

In [None]:
# Point train_df / test_df at the encoded, feature-selected frames.
train_df, test_df = combine[0], combine[1]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the labelled data for validation. The split is seeded so
# that the validation scores below are reproducible from run to run; the
# seed value itself is arbitrary.
X_train, X_validation, y_train, y_validation = train_test_split(
    train_df.drop("Survived", axis=1),
    train_df["Survived"],
    test_size=0.25,
    random_state=42,
)

# Features of the unlabelled test set (its 'Survived' column is dropped).
X_test  = test_df.drop("Survived", axis=1).copy()
X_train.shape, y_train.shape, X_validation.shape, y_validation.shape, X_test.shape

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression: fit on the training split, predict the test set and
# report the validation accuracy as a percentage with two decimals.
logreg = LogisticRegression().fit(X_train, y_train)
Y_pred = logreg.predict(X_test)
acc_log = round(100 * logreg.score(X_validation, y_validation), 2)
acc_log

In [None]:
# Support Vector Machines
from sklearn.svm import SVC, LinearSVC

svc = SVC()
svc.fit(X_train, y_train)
Y_pred = svc.predict(X_test)
acc_svc = round(svc.score(X_validation, y_validation) * 100, 2)
acc_svc

In [None]:
# k-nearest neighbours (k = 3): fit on the training split, predict the test
# set and report the validation accuracy as a percentage with two decimals.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
Y_pred = knn.predict(X_test)
acc_knn = round(100 * knn.score(X_validation, y_validation), 2)
acc_knn

In [None]:
# Build the results file from the SVC predictions: re-read the raw test CSV
# for its PassengerId column, attach the predicted Survived values and write
# the two columns to results.csv.
survived = svc.predict(X_test)
result_df = (
    pd.read_csv('../input/test.csv')
    .assign(Survived=survived)
    .loc[:, ["PassengerId", "Survived"]]
)
result_df.to_csv('results.csv', index=False)