In [None]:
# Inject a stylesheet that centers elements carrying the `.output_png` class.
from IPython.display import HTML, display

# display() is required here: a bare `HTML(...)` expression terminated by ';'
# has its output suppressed, so the <style> element would never be rendered.
display(HTML("""
<style>
.output_png {
    display: table-cell;
    text-align: center;
    vertical-align: middle;}
</style>
"""))

In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV, KFold
from xgboost import XGBClassifier
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import os
# Print the names of the files found in the relative input directory
print(os.listdir("../input"))

import warnings
# Install a global 'ignore' filter: no warnings are shown after this point
warnings.filterwarnings('ignore')

# Render matplotlib figures inline in the notebook output
%matplotlib inline
# Plot styling. NOTE(review): the seaborn style is applied after the matplotlib
# style, so it presumably overrides overlapping settings — keep this order.
mpl.style.use('ggplot')
sns.set_style('white')
# Default size for every figure created afterwards
plt.rcParams['figure.figsize'] = (12, 9)

In [None]:
# Load and preview datasets
from IPython.display import display

train_dataset = pd.read_csv('../input/train.csv')
test_dataset = pd.read_csv('../input/test.csv')
print('Training Dataset: %s, Testing Dataset: %s' %(str(train_dataset.shape), str(test_dataset.shape)))

# Only the last expression of a cell is rendered automatically, so the row
# preview has to be displayed explicitly; otherwise its result is discarded.
display(train_dataset.head())

# Column name / dtype table (rendered as the cell's output)
train_dataset.dtypes.reset_index()

# Exploratory Data Analysis

In [None]:
# Passenger counts per 'Pclass', split by the value of 'Survived'
survivors = train_dataset[train_dataset['Survived'] == 1]['Pclass'].value_counts()
dead = train_dataset[train_dataset['Survived'] == 0]['Pclass'].value_counts()

# Rows are stacked in the order [survivors, dead]; the index labels must follow
# that same order, otherwise the two bars are mislabelled in the chart.
df_survival_pclass = pd.DataFrame([survivors, dead])
df_survival_pclass.index = ['Survived', 'Dead']
df_survival_pclass.plot(kind='bar', stacked=True, title='Passengers Dead and Survived by Passenger Classes');

In [None]:
# Complement of 'Survived', so both outcomes can be summed per group
train_dataset['Dead'] = 1 - train_dataset['Survived']

# Select the two outcome columns *before* aggregating so that text columns are
# never summed, and pass `color` (not `colors`, which current pandas/matplotlib
# reject as an unknown keyword).
(
    train_dataset
    .groupby('Sex')[['Survived', 'Dead']]
    .sum()
    .plot(kind='bar', stacked=True, color=['g', 'r'])
);

In [None]:
def null_check(train_dataset, test_dataset):
    """Print the per-column count of null values for both datasets.

    Args:
        train_dataset: object exposing ``isnull().sum()`` (a DataFrame here).
        test_dataset: same, for the testing split.

    Returns:
        None. The counts are written to standard output.
    """
    sections = (
        ("Training Dataset:", train_dataset),
        # The leading newline yields a blank line between the two reports
        ("\nTesting Dataset:", test_dataset),
    )
    for heading, frame in sections:
        print(heading)
        print(frame.isnull().sum())

# Report null counts for both datasets as loaded, before the wrangling cell runs
null_check(train_dataset, test_dataset)

# Data Wrangling, Feature Selection

In [None]:
# --- Impute missing values ---------------------------------------------------
# The filled column is assigned back rather than calling fillna(inplace=True)
# on a column selection: that chained in-place form acts on a temporary object
# under pandas Copy-on-Write and would leave the frame unchanged.
for dataset in (train_dataset, test_dataset):
    # 'Age' and 'Fare': each split is filled with its own median
    dataset['Age'] = dataset['Age'].fillna(dataset['Age'].median())
    dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].median())
    # 'Embarked': filled with the constant "S"
    # NOTE(review): presumably the most frequent port — confirm against the data
    dataset['Embarked'] = dataset['Embarked'].fillna("S")

# --- Drop noisy columns ------------------------------------------------------
# Drop columns 'Cabin' and 'Ticket' since they contain a lot of noise.
# errors='ignore' keeps the cell re-runnable once the columns are already gone,
# without a bare `except: pass` that would also hide unrelated failures.
train_dataset = train_dataset.drop(columns=["Cabin", "Ticket"], errors='ignore')
test_dataset = test_dataset.drop(columns=["Cabin", "Ticket"], errors='ignore')

# --- Encode categorical columns ----------------------------------------------
# LabelEncoder is imported by name in the import cell; the `preprocessing`
# module itself is not, so `preprocessing.LabelEncoder()` raised NameError.
# NOTE(review): only train_dataset is encoded, as in the original cell; the
# fitted encoders are kept so test_dataset can be transformed later if needed.
encoder_sex = LabelEncoder()
train_dataset['Sex'] = encoder_sex.fit_transform(train_dataset['Sex'])

encoder_embarked = LabelEncoder()
train_dataset['Embarked'] = encoder_embarked.fit_transform(train_dataset['Embarked'])

# Re-check null counts after cleaning
null_check(train_dataset, test_dataset)

In [None]:
# Preview the first rows of the training data after the wrangling cell
train_dataset.head()

In [None]:
# Bar plot of 'Survived' against 'Sex', drawn through the returned Axes object
ax = sns.barplot(data=train_dataset, x='Sex', y='Survived', capsize=0.2)
ax.set_title('Survival Based on Gender')
plt.show()

# XGBoost Regression Hyperparameter Tuning

In [None]:
# NOTE(review): disabled K-fold / grid-search experiment, kept for reference.
# XGBRegressor is not imported in this notebook's import cell (only
# XGBClassifier is) — add the import before re-enabling this code.

# y_train = train_dataset['Survived']
# df_ = train_dataset[['PassengerId','Pclass', 'Sex', 'Age', 'SibSp', 'Fare', 'Embarked']]
# df_.set_index('PassengerId', drop=True, inplace=True)
# X_train = df_.values

# X, y = X_train, y_train

# kf = KFold(n_splits=3, random_state=42, shuffle=True)
# xgbr = XGBRegressor()
# parameters = {'nthread':[4], 'objective':['reg:linear'], 'learning_rate': [.03, 0.05, .07], 'max_depth': [5, 6, 7], 'min_child_weight': [4], 'silent': [1], 'subsample': [0.7], 'colsample_bytree': [0.7], 'n_estimators': range(50,500,25)}
# for train_index, test_index in kf.split(X):
#     print("train_index:", train_index, "test_index", test_index)
#     X_train, X_test = X[train_index], X[test_index]
#     y_train, y_test = y[train_index], y[test_index]
    
#     xgb_grid = GridSearchCV(xgbr,parameters, cv=3, n_jobs=1, verbose=True)
#     xgb_grid.fit(X_train, y_train)

#     print(xgb_grid.best_score_)
#     print(xgb_grid.best_params_)

In [None]:
# NOTE(review): disabled final-fit / submission experiment, kept for reference.
# Before re-enabling: XGBRegressor is not imported in the import cell,
# X_train / y_train are only assigned inside the disabled cell above, and
# `dftest` is not defined in any visible cell.

# xgbr = XGBRegressor()
# parameters = {'nthread':[4], 'objective':['reg:linear'], 'learning_rate': [.03, 0.05, .07], 'max_depth': [5, 6, 7], 'min_child_weight': [4], 'silent': [1], 'subsample': [0.7], 'colsample_bytree': [0.7], 'n_estimators': [500]}
# xgb_grid = GridSearchCV(xgbr,parameters, cv = 3, n_jobs = 1, verbose=True)
# xgb_grid.fit(X_train, y_train)

# print(xgb_grid.best_score_)
# print(xgb_grid.best_params_)

# dftest_sorted=dftest.sort_values("PassengerId")
# dftest_sorted=dftest_sorted[["PassengerId","Survived"]]

# dftest_sorted.to_csv("Titanic.csv",index=False)
# print('print csv')