In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        input_path = os.path.join(dirname, filename)
        print(input_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# In this notebook, I will try to build an understanding of the data and fit a logistic regression model to it.


In [None]:
# some necessary libraries and packages
import sys
import matplotlib
import scipy as sp
import IPython 
from IPython import display
import sklearn

import random
import time



In [None]:
# load data modelling libraries

# modeling algorithm
from sklearn import linear_model

# model helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

# visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pandas.plotting import scatter_matrix

#config the visualisers
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12,8

In [None]:
# Load the Titanic training data and the test/validation data.
data_raw = pd.read_csv('/kaggle/input/titanic/train.csv')
data_val = pd.read_csv('/kaggle/input/titanic/test.csv')

# Work on a deep copy so that changes made to data1 never affect data_raw.
data1 = data_raw.copy(deep=True)

# Both frames in one list, so the same cleaning steps can be applied to each in a loop.
data_cleaner = [data1, data_val]

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() emitted an extra "None" line.
data_raw.info()
data_raw.sample(10)

In [None]:
# Report the number of missing values per column, first for train and then for test/val.
for caption, frame in (('train columns with null values : \n', data1),
                       ('test/val columns with null values : \n', data_val)):
    print(caption, frame.isnull().sum())
    print("_" * 10)

# Summary statistics for every column (numeric and non-numeric) of the raw training data.
data_raw.describe(include='all')

In [None]:
# Impute missing values in every frame of data_cleaner:
#   Age      -> median of that frame
#   Embarked -> mode (most frequent value) of that frame
#   Fare     -> median of that frame
#
# The filled column is assigned back to the frame. Calling fillna(inplace=True)
# on a selected column (dataset['Age'].fillna(..., inplace=True)) is chained
# in-place mutation: it raises warnings on recent pandas and leaves the frame
# unchanged when Copy-on-Write is enabled.
for dataset in data_cleaner:
    dataset['Age'] = dataset['Age'].fillna(dataset['Age'].median())
    dataset['Embarked'] = dataset['Embarked'].fillna(dataset['Embarked'].mode()[0])
    dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].median())

# Drop these columns from the training frame only.
# The drop stays in place because data_cleaner holds a reference to this same
# object; errors='ignore' keeps the cell re-runnable once the columns are gone.
drop_columns = ['PassengerId','Cabin','Ticket']
data1.drop(drop_columns, axis=1, inplace=True, errors='ignore')

# Remaining missing values per column: train first, then test/val.
print(data1.isnull().sum())
print('_'*10)
print(data_val.isnull().sum())

In [None]:
# Create new features on both the train and the test/val frame:
#   FamilySize -> SibSp + Parch + 1 (presumably the passenger plus relatives aboard)
#   IsAlone    -> 1 when FamilySize is 1, otherwise 0
#   Title      -> title parsed from Name (Mr, Mrs, Master, ...)
#   FareBin    -> fare split into 4 quantile-based bins
#   AgeBin     -> age split into 5 equal-width bins
for dataset in data_cleaner:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

    # Mark everybody as alone first, then clear the flag where FamilySize > 1.
    # A single .loc[rows, column] assignment is used: the chained form
    # dataset['IsAlone'].loc[...] = 0 writes to an intermediate Series, which
    # triggers SettingWithCopyWarning and does not update the frame under
    # pandas Copy-on-Write.
    dataset['IsAlone'] = 1
    dataset.loc[dataset['FamilySize'] > 1, 'IsAlone'] = 0

    # Title extraction via two string splits, e.g. for the entry
    #   "Cumings, Mrs. John Bradley (Florence Briggs Thayer)"
    # splitting on ", " and taking the second part gives
    #   "Mrs. John Bradley (Florence Briggs Thayer)"
    # and splitting that on "." and taking the first part gives "Mrs".
    dataset['Title'] = dataset['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]

    # qcut builds quantile-based bins, so each of the 4 fare bins holds roughly
    # the same number of passengers.
    # NOTE(review): the bin edges are computed separately for each frame, so the
    # train and test/val intervals are not guaranteed to match.
    dataset['FareBin'] = pd.qcut(dataset['Fare'], 4)

    # cut builds equal-width bins without looking at how many rows fall into
    # each one, so the 5 age bins may contain different numbers of passengers.
    dataset['AgeBin'] = pd.cut(dataset['Age'].astype(int), 5)

In [None]:
data1['Title'].value_counts()
# As the counts show, too many distinct titles are present, and the rare ones may not be helpful,
# so they will be grouped under 'Misc' (i.e. miscellaneous).

In [None]:
# Titles that occur fewer than stat_min times in the training data are grouped under 'Misc'.
stat_min = 10
title_names = (data1['Title'].value_counts() < stat_min)  # True -> the title is rare in train

# Titles that are frequent enough in the training data to be kept as they are.
common_titles = title_names.index[~title_names]

# Apply the same grouping to every frame: Title is later encoded for both the
# train and the test/val frame, so both must share one set of categories.
# Any title that is not a common training title (including one that never
# appears in train) becomes 'Misc'; a per-row title_names.loc[x] lookup would
# raise KeyError for such a title.
for dataset in data_cleaner:
    dataset['Title'] = dataset['Title'].where(dataset['Title'].isin(common_titles), 'Misc')

print(data1['Title'].value_counts())
print('_'*10)

In [None]:
# Inspect both frames after the imputation and feature-creation steps,
# then show 10 random rows of the training frame.
data1.info()
data_val.info()
data1.sample(10)

In [None]:
# Now we will work on converting formats,
# i.e. converting categorical data into numerical codes.

In [None]:
# Encode the categorical features as integer codes (new '<feature>_Code' columns).
label = LabelEncoder()

# Nominal string features: fit the encoder once on the values of ALL frames and
# then transform each frame. Refitting on every frame separately can assign
# different codes to the same category in train and test/val whenever the two
# frames do not contain exactly the same set of values.
for column in ['Sex', 'Embarked', 'Title']:
    label.fit(pd.concat([dataset[column] for dataset in data_cleaner], ignore_index=True))
    for dataset in data_cleaner:
        dataset[column + '_Code'] = label.transform(dataset[column])

# Binned features: the interval categories come from a separate cut/qcut call
# per frame, so they are encoded per frame.
for dataset in data_cleaner:
    dataset['AgeBin_Code'] = label.fit_transform(dataset['AgeBin'])
    dataset['FareBin_Code'] = label.fit_transform(dataset['FareBin'])

# Target (y) variable
Target = ['Survived']

# Predictor (x) columns, in their original un-encoded form
data1_x = ['Sex','Pclass','Embarked','Title','SibSp','Parch','Age','Fare','FamilySize','IsAlone']

# Target followed by the predictors
data1_xy = Target + data1_x

In [None]:
# Display the column labels of the training frame.
data1.columns

In [None]:
# train test split !!

In [None]:
# Display the list of predictor column names.
data1_x

In [None]:
# Exploratory analysis:
# for every predictor that is not stored as float64, group the training rows by
# that predictor's values and print the mean of the target within each group.
for x in data1_x:
    if data1[x].dtype == 'float64':
        continue  # continuous columns are skipped

    survival_by_group = data1[[x, Target[0]]].groupby(x).mean()
    print("Survival correlation by :", x)
    print(survival_by_group)
    print('_'*13, '\n')

# Counts of each target value per title.
print(pd.crosstab(data1['Title'], data1[Target[0]]))
        

In [None]:
# Distribution plots on a 2 x 3 grid:
#   top row    -> box plots of Fare, Age and FamilySize
#   bottom row -> stacked histograms of the same columns, split by survival
# The six panels are produced from one description table instead of six
# copy-pasted blocks, so every panel is guaranteed to use the same settings.

# (column in data1, name shown in titles/labels, unit shown in labels)
plot_features = [
    ('Fare', 'Fare', '$'),
    ('Age', 'Age', 'years'),
    ('FamilySize', 'Family Size', '#'),
]

# Split the training rows once by outcome for the histograms.
survived_rows = data1[data1['Survived'] == 1]
dead_rows = data1[data1['Survived'] == 0]

plt.figure(figsize=[16, 12])

# Top row (grid positions 1-3): box plots; the mean is drawn as a line.
for position, (column, name, unit) in enumerate(plot_features, start=1):
    plt.subplot(2, 3, position)
    plt.boxplot(x=data1[column], showmeans=True, meanline=True)
    plt.title(f"{name} Boxplot")
    plt.ylabel(f"{name} ({unit})")

# Bottom row (grid positions 4-6): stacked histograms, green = survived, red = dead.
for position, (column, name, unit) in enumerate(plot_features, start=4):
    plt.subplot(2, 3, position)
    plt.hist(x=[survived_rows[column], dead_rows[column]],
             stacked=True, color=['g', 'r'], label=['Survived', 'Dead'])
    plt.title(f"{name} hist by Survival")
    plt.xlabel(f"{name}({unit})")
    plt.ylabel('# of passengers')
    plt.legend()