In [None]:
from IPython.display import HTML, display

# Inject a stylesheet that targets the `.output_png` class of rendered outputs.
# The HTML object is passed to display() explicitly: a cell that ends in
# `HTML(...);` has its result suppressed by the trailing semicolon, so the
# <style> element would never reach the page.
display(HTML("""
<style>
.output_png {
    display: table-cell;
    text-align: center;
    vertical-align: middle;}
</style>
"""))

In [None]:
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import os
print(os.listdir("../input"))

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
plt.rcParams['figure.figsize'] = (12, 9)

In [None]:
# Load the training and testing datasets and preview the training set.
from IPython.display import display

train_dataset = pd.read_csv('../input/train.csv')
test_dataset = pd.read_csv('../input/test.csv')
print('Training Dataset: %s, Testing Dataset: %s' % (str(train_dataset.shape), str(test_dataset.shape)))

# Only the final expression of a cell is rendered automatically, so the
# preview has to go through display(); a bare `train_dataset.head()` in the
# middle of the cell is evaluated and silently discarded.
display(train_dataset.head())

# Column dtypes, rendered as the cell's result.
train_dataset.dtypes.reset_index()

# Exploratory Data Analysis

In [None]:
# Passenger counts per class, split by outcome.
survivors = train_dataset[train_dataset['Survived'] == 1]['Pclass'].value_counts()
dead = train_dataset[train_dataset['Survived'] == 0]['Pclass'].value_counts()

# One row per outcome, one column per passenger class. value_counts() orders
# by frequency, so the class columns are sorted to give a stable legend order.
df_survival_pclass = pd.DataFrame([survivors, dead]).sort_index(axis=1)
# Row labels must follow the order in which the rows were stacked above:
# survivors first, then dead. (They were previously swapped, which mislabelled
# both bars of the chart.)
df_survival_pclass.index = ['Survived', 'Dead']
df_survival_pclass.plot(kind='bar', stacked=True, title='Passengers Dead and Survived by Passenger Classes');

In [None]:
# Complement of 'Survived', so both outcomes can be summed per group.
train_dataset['Dead'] = 1 - train_dataset['Survived']

# Select the two outcome columns *before* aggregating so the sum is never
# attempted on unrelated (e.g. text) columns, and pass the bar colours through
# `color` -- `colors` is not a keyword the plotting call accepts.
train_dataset.groupby('Sex')[['Survived', 'Dead']].sum().plot(
    kind='bar',
    stacked=True,
    color=['g', 'r'],
    title='Passengers Dead and Survived by Sex',
);

In [None]:
def null_check(train_dataset, test_dataset):
    """Print the number of missing values per column for both datasets.

    Parameters
    ----------
    train_dataset, test_dataset
        Objects exposing ``isnull().sum()``; the notebook passes pandas
        DataFrames.

    Returns
    -------
    None
        The counts are written to standard output, training set first,
        with a blank line separating the two reports.
    """
    sections = (
        ("Training Dataset:", train_dataset),
        ("\nTesting Dataset:", test_dataset),
    )
    for heading, frame in sections:
        # Print the heading before computing the counts, one report at a time.
        print(heading)
        missing_per_column = frame.isnull().sum()
        print(missing_per_column)

# Report missing values in both datasets before any cleaning.
null_check(train_dataset=train_dataset, test_dataset=test_dataset)

# Data Wrangling, Feature Selection

In [None]:
# Impute missing 'Age' and 'Fare' values with the median of the same column,
# computed separately for each dataset. The filled column is assigned back
# explicitly: calling fillna(..., inplace=True) on a column selection is a
# chained in-place update that is not guaranteed to write through to the
# parent DataFrame.
for column in ('Age', 'Fare'):
    train_dataset[column] = train_dataset[column].fillna(train_dataset[column].median())
    test_dataset[column] = test_dataset[column].fillna(test_dataset[column].median())

# Missing embarkation ports in the training set are filled with the constant 'S'.
# NOTE(review): presumably the most frequent port -- confirm against the data.
train_dataset['Embarked'] = train_dataset['Embarked'].fillna('S')

# Drop columns 'Cabin' and 'Ticket' since they contain a lot of noise.
# errors='ignore' keeps the cell re-runnable once the columns are already gone.
noisy_columns = ['Cabin', 'Ticket']
train_dataset.drop(columns=noisy_columns, inplace=True, errors='ignore')
test_dataset.drop(columns=noisy_columns, inplace=True, errors='ignore')

# Re-check the remaining missing values after cleaning.
null_check(train_dataset, test_dataset)

In [None]:
# Bar plot of 'Survived' against 'Sex' with capped error bars.
sns.barplot(data=train_dataset, x='Sex', y='Survived', capsize=0.2)
plt.title(label='Survival Based on Gender')
plt.show()