In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Loading Data 

In [None]:
# Load the Titanic train/test files and stack them into one frame so that
# cleaning and feature engineering are applied to both in the same way.
train_data = pd.read_csv('/kaggle/input/titanic/train.csv')
test_data = pd.read_csv('/kaggle/input/titanic/test.csv')

# ignore_index=True gives the combined frame a unique 0..n-1 index; without it
# the row labels of the two files are simply carried over and therefore repeat.
all_data = pd.concat([train_data, test_data], ignore_index=True)
train_data.shape, test_data.shape, all_data.shape

# Data Exploration

In [None]:
# Print each column's dtype and non-null count to spot missing values.
train_data.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the training set.
train_data.describe()

In [None]:
# Separate numeric from categorical columns: histograms suit the former,
# bar charts the latter.
numeric_cols = ['Age', 'SibSp', 'Fare', 'Parch']
categorical_cols = ['Survived', 'Embarked', 'Sex', 'Pclass', 'Cabin']
numeric_df = train_data[numeric_cols]
cat_df = train_data[categorical_cols]

In [None]:
# One histogram per numeric feature, each drawn on its own figure.
for feature in numeric_df.columns:
    fig, ax = plt.subplots()
    ax.hist(numeric_df[feature])
    ax.set_title(feature)
    plt.show()

In [None]:
# Pairwise correlations between the numeric features: printed as a table,
# then drawn as a heatmap. The matrix is computed once and reused.
numeric_corr = numeric_df.corr()
print(numeric_corr)
sns.heatmap(numeric_corr)

In [None]:
# Average Age, Fare, Parch and SibSp for each value of Survived
# (pivot_table aggregates with the mean by default).
train_data.pivot_table(index='Survived', values=['Age', 'SibSp', 'Parch', 'Fare'])

In [None]:
# Bar chart of category frequencies for every categorical column.
for feature in cat_df.columns:
    counts = cat_df[feature].value_counts()
    ax = sns.barplot(x=counts.index, y=counts)
    ax.set_title(feature)
    plt.show()

# Data Cleaning

In [None]:
# --- Missing embarkation port ---------------------------------------------
# Drop rows with an unknown port only when they carry a Survived label (the
# rows used for training). Unlabelled rows are always kept, because the
# submission needs one prediction for each of them; any gap there is filled
# with the most frequent port of the training file instead.
is_labelled = all_data['Survived'].notna()
port_missing = all_data['Embarked'].isna()
all_data = all_data[~(is_labelled & port_missing)].copy()
all_data['Embarked'] = all_data['Embarked'].fillna(train_data['Embarked'].mode()[0])

# --- Age / Fare -------------------------------------------------------------
# Impute with means taken from the training file only.
all_data['Age'] = all_data['Age'].fillna(train_data['Age'].mean())
all_data['Fare'] = all_data['Fare'].fillna(train_data['Fare'].mean())

# --- Fare transform ---------------------------------------------------------
# log(1 + Fare); log1p is the numerically stable form of np.log(Fare + 1).
# Fare was imputed just above, so no separate fillna on norm_fare is needed.
all_data['norm_fare'] = np.log1p(all_data['Fare'])
all_data['norm_fare'].hist();

# Feature Engineering

In [None]:
# Derive cabin_adv as the first character of each Cabin value's string form
# (a missing Cabin stringifies to 'nan', so it becomes 'n').
all_data['cabin_adv'] = [str(cabin)[0] for cabin in all_data['Cabin']]
all_data.head()

# Data Pre-processing for Modelling

In [None]:
# One-hot encode the non-numeric columns among the selected model inputs
# (get_dummies leaves numeric columns untouched). Survived is carried along so
# labelled and unlabelled rows can be separated afterwards.
model_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'norm_fare', 'Embarked', 'cabin_adv', 'Survived']
all_dummies = pd.get_dummies(all_data[model_columns])
all_dummies


In [None]:
# Rows whose Survived value is 0 or 1 form the training set; every other row
# is one to predict.
train_idx = all_dummies['Survived'].isin([0, 1])
x_train = all_dummies[train_idx]
y_train = all_dummies.loc[train_idx, 'Survived']
x_test = all_dummies[~train_idx]
x_train.shape, y_train.shape, x_test.shape

In [None]:
# Remove the target from BOTH feature matrices. x_test is sliced from the same
# frame as x_train, so it still carries a Survived column (holding no 0/1
# labels); leaving it in would give the model a different feature set at
# predict time than at fit time.
# errors='ignore' keeps this cell safe to re-run after the column is gone.
x_train = x_train.drop(columns=['Survived'], errors='ignore')
x_test = x_test.drop(columns=['Survived'], errors='ignore')
assert list(x_train.columns) == list(x_test.columns), "train/test feature columns differ"
x_train.head()

# Modelling 

In [None]:
# Cross-validation helper and classifier imports for the modelling cells.
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

In [None]:
# Baseline: 5-fold cross-validation scores for Gaussian Naive Bayes.
gnb = GaussianNB()
cross_val_score(estimator=gnb, X=x_train, y=y_train, cv=5)


In [None]:
# 5-fold cross-validation scores for a random forest. The seed is fixed
# (the same value the XGBoost model uses) so that the scores, and the model
# later fitted from `rf` for the submission, are reproducible across runs.
rf = RandomForestClassifier(random_state=42)
cross_val_score(rf, x_train, y_train, cv=5)

In [None]:
# 5-fold cross-validation scores for an XGBoost classifier with a fixed seed.
xgb_model = xgb.XGBClassifier(random_state=42, objective="binary:logistic")
cross_val_score(estimator=xgb_model, X=x_train, y=y_train, cv=5)

# Submission

In [None]:
# Fit the random forest on the full training set and predict the remaining
# rows; the sample submission file is loaded as the frame to fill in.
y_test = rf.fit(x_train, y_train).predict(x_test)
submission_df = pd.read_csv('/kaggle/input/titanic/gender_submission.csv')


In [None]:
# Inspect the raw predictions before they are written into the submission.
y_test

In [None]:
# Store the predictions, cast to int64, in the Survived column of the submission frame.
submission_df = submission_df.assign(Survived=y_test.astype('int64'))

In [None]:
# Write the submission to the current working directory, omitting the DataFrame index.
submission_df.to_csv('submission.csv', index=False)