### Importing required packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
# NOTE: this silences every warning for the rest of the session,
# including deprecation warnings raised by the plotting libraries.
warnings.filterwarnings("ignore")

import os
# List the entries of the input directory (shown as the cell output).
os.listdir("../input")

### Loading dataset

In [None]:
# Read the training CSV from the input directory into a DataFrame.
data = pd.read_csv("../input/train.csv")
# Display the first rows as a quick check of the columns and values.
data.head()

### Data types

In [None]:
# Print the dtype pandas inferred for each column.
print(data.dtypes)

We can see that numeric columns are already represented as int or float in this dataset, so no data type conversion is needed here.

### Proportion of target (Survived)

In [None]:
# Row count, a blank line, then the percentage share of each Survived value.
print("Total count:", len(data), end="\n\n")
print((data["Survived"].value_counts() / len(data) * 100).round(2))

Both target classes are reasonably well represented in this dataset; it is not heavily skewed toward either one.

### Unique Values

In [None]:
def get_uniquevals(df, max_unique=10):
    """Print a per-column summary of the distinct values in ``df``.

    For every column with at most ``max_unique`` distinct values (as returned
    by ``Series.unique``, so NaN counts as one distinct value) the distinct
    values are printed, followed by the frequency of each value from
    ``Series.value_counts`` (which omits NaN by default). For every other
    column only the number of distinct values is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are summarised.
    max_unique : int, default 10
        Largest number of distinct values for which the values themselves
        and their frequencies are printed.

    Returns
    -------
    None
        The summary is written to standard output.
    """
    separator = "-" * 40
    print(separator)
    for col in df.columns:
        series = df[col]
        # Compute the distinct values once and reuse them for the threshold
        # check, the listing and the count message.
        uniques = series.unique()
        if len(uniques) <= max_unique:
            print("{} - Unqiue Values:".format(series.name))
            print(uniques)
            print()
            print("{} - # of occurences of each values:".format(series.name))
            print(series.value_counts())
        else:
            print("{} has {} unqiue values:".format(series.name, len(uniques)))
        print(separator)

In [None]:
# Summarise the distinct values of every column in the training data.
get_uniquevals(data)

Pclass, Sex, SibSp, Parch and Embarked have **few distinct values**, so they can be treated as categorical features and converted to numeric values.

### Null Values

In [None]:
def getnullcounts(df):
    """Print the number of missing values per column of ``df``.

    Each column containing at least one null is printed as ``name : count``.
    The remaining columns are listed together, comma-separated, under the
    ``Non-null features:`` heading.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected with ``Series.isna``.

    Returns
    -------
    None
        The report is written to standard output.
    """
    separator = "-" * 20
    print(separator)
    non_nullcols = []
    for col in df.columns:
        series = df[col]
        # Count the nulls once and reuse the value in the message.
        null_count = series.isna().sum()
        if null_count > 0:
            print("{} : {}".format(series.name, null_count))
        else:
            # str() keeps the join below from raising TypeError when the
            # column labels are not strings (e.g. default integer labels).
            non_nullcols.append(str(series.name))
    print(separator)
    print('Non-null features:\n', ', '.join(non_nullcols))
    print(separator)

In [None]:
# Report which columns of the training data contain nulls, and how many.
getnullcounts(data)

Age and Embarked have null values, which should be imputed before prediction.

### Feature Elimination

In [None]:
def feature_elimination(df):
    """Print the columns whose cardinality makes them elimination candidates.

    A column is printed when either of the following holds:

    * its dtype is not ``object`` and every row holds a distinct value, or
    * its dtype is ``object`` and more than half of the rows hold distinct
      values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    None
        Candidate column names are written to standard output.
    """
    print('Features to be considered for elimiation:')
    n_rows = len(df)
    for col in df.columns:
        series = df[col]
        n_distinct = len(series.unique())
        # The two rules are mutually exclusive on dtype, so a single flag
        # per column is enough.
        if series.dtype == 'object':
            is_candidate = n_distinct > (n_rows * 0.50)
        else:
            is_candidate = n_distinct == n_rows
        if is_candidate:
            print(series.name)

In [None]:
# List the high-cardinality columns of the training data.
feature_elimination(data)

Note: These are only suggestions, based on the high number of distinct values. Care should be taken before eliminating a feature.
For example, "Name" can be used to create a feature called "title", which can be used for prediction.

### Visual Exploration

In [None]:
# Age distribution for each value of Survived.
f, ax = plt.subplots(figsize=(11, 5))
sns.boxplot(data=data, x="Survived", y="Age", ax=ax);

Passengers older than 60 had a very low chance of survival (apart from a few outliers).

In [None]:
# Age distribution per sex, with each sex further split by Survived.
f, ax = plt.subplots(figsize=(11, 5))
sns.boxplot(data=data, x="Sex", y="Age", hue="Survived", ax=ax);

This refines the finding above: it is male passengers older than 60 who had a very low chance of survival.

In [None]:
# Mean of Survived (i.e. the survival rate) for each sex.
f, ax = plt.subplots(figsize=(7, 3))
sns.barplot(data=data, x="Sex", y="Survived", ax=ax);

The survival rate of female passengers is higher than that of male passengers.

In [None]:
# Mean of Survived (i.e. the survival rate) for each passenger class.
sns.barplot(data=data, x="Pclass", y="Survived");

Passengers in class 1 had a high survival rate.

In [None]:
# Survival rate for each passenger class, split by sex.
sns.barplot(data=data, x="Pclass", y="Survived", hue="Sex");

In [None]:
# Survival rate for each value of SibSp.
sns.barplot(data=data, x="SibSp", y="Survived");

The more siblings/spouses a passenger had aboard, the lower the chance of survival.

In [None]:
# Survival rate for each value of Parch.
sns.barplot(data=data, x="Parch", y="Survived");

In [None]:
# Family size: the sum of the SibSp and Parch counts.
data["family"] = data["SibSp"] + data["Parch"]
# 1 when the family size is non-zero, otherwise 0. `!= 0` reproduces the
# earlier `0 if x == 0 else 1` mapping without a per-row Python lambda.
data["occumpanied"] = (data["family"] != 0).astype("int64")
# Survival rate (mean of Survived) for unaccompanied vs. accompanied
# passengers: the feature goes on x and the target on y, as in the other
# bar plots, so bar height reads directly as a survival rate.
sns.barplot(data=data, x="occumpanied", y="Survived");

Passengers who were accompanied by a family member (parent, child, sibling or spouse) had a higher survival rate than those travelling alone.

In [None]:
# Distribution of the non-null Age values across all passengers.
# NOTE(review): `distplot` is deprecated in newer seaborn releases in favour
# of `histplot` / `displot`; the warning is hidden by the global warnings
# filter. Confirm the installed seaborn version before migrating.
sns.distplot(data['Age'].dropna());

Passengers aged between 18 and 38 had a high survival rate compared to others.

In [None]:
# Age distribution restricted to rows where Survived equals 1.
survived = data.loc[data["Survived"].eq(1), "Age"].dropna()
sns.distplot(survived)
plt.title("Survived");

In [None]:
# Age distribution restricted to rows where Survived equals 0.
not_survived = data.loc[data["Survived"].eq(0), "Age"].dropna()
sns.distplot(not_survived)
plt.title("Not Survived");

Infants had a high survival rate, and elderly passengers (older than 65) were less likely to survive.

In [None]:
# Pairwise relationships between the columns of the frame.
# NOTE(review): `dropna()` with default arguments removes every row that has
# a null in ANY column, so only fully populated rows are plotted — confirm
# this is intended rather than dropping nulls per plotted column.
sns.pairplot(data.dropna());