## Step 2: Data analysis

In [1]:
%matplotlib qt

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import exec.data_framework.utildata as ut
from utils.utilgen import viewdf

#sns.set_style("whitegrid")
%reload_ext autoreload
%autoreload 2

### Load and check data

In [2]:
# Data
# Forward slashes are used because they are accepted on Windows and POSIX
# alike; the previous backslash-separated literal only resolved on Windows.
dataTrainPath = 'data/data_derived/train1.csv'
# The first CSV column becomes the index.
dataTrain = pd.read_csv(dataTrainPath, index_col=0)
# Working copy: the cleaning steps below operate on dataC so that dataTrain
# keeps the data exactly as loaded.
dataC = dataTrain.copy()


In [3]:
# ###Data Dictionary
 
# Variable - Definition - Key
# Survived - Survival - 0 = No, 1 = Yes 
# Pclass - Ticket class - 1 = 1st, 2 = 2nd, 3 = 3rd 
# Sex - Sex 	 
# Age - Age in years 
# SibSp - Nb of siblings / spouses aboard the Titanic 	 
# Parch - Nb of parents / children aboard the Titanic 	 
# Ticket - Ticket number 	 
# Fare - Passenger fare 	 
# Cabin - Cabin number 	 
# Embarked - Port of Embarkation - C = Cherbourg, Q = Queenstown, S = Southampton 

# Variable Notes 
# 
# Pclass: A proxy for socio-economic status (SES) 
# 1st = Upper 
# 2nd = Middle
# 3rd = Lower 
# 
# Age: Age is fractional if less than 1. If the age is estimated, it is in the form of xx.5 
# 
# SibSp: The dataset defines family relations in this way... 
# Sibling = brother, sister, stepbrother, stepsister 
# Spouse = husband, wife (mistresses and fiancés were ignored) 
# 
# Parch: The dataset defines family relations in this way... 
# Parent = mother, father
# Child = daughter, son, stepdaughter, stepson 
# Some children travelled only with a nanny, therefore parch=0 for them. 

In [4]:
# Peek at five random passengers. The fixed seed keeps the displayed rows
# stable from one run of the notebook to the next.
display(dataTrain.sample(n=5, random_state=0))

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
536,1,2,"Hart, Miss. Eva Miriam",female,7.0,0,2,F.C.C. 13529,26.25,,S
545,0,1,"Douglas, Mr. Walter Donald",male,50.0,1,0,PC 17761,106.425,C86,C
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
332,0,1,"Partner, Mr. Austen",male,45.5,0,0,113043,28.5,C124,S
636,1,2,"Davis, Miss. Mary",female,28.0,0,0,237668,13.0,,S


In [5]:
# Hand a larger random slice to the viewdf helper imported from utils.utilgen.
# The fixed seed makes the slice repeatable between runs.
viewdf(dataTrain.sample(n=40, random_state=0))

In [6]:
# Summary statistics (count, mean, std, quartiles, extremes) of the numeric columns.
train_summary = dataTrain.describe()
display(train_summary)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,623.0,623.0,499.0,623.0,623.0,623.0
mean,0.373997,2.292135,29.812625,0.487961,0.362761,32.78885
std,0.484252,0.845673,14.436087,0.970348,0.745852,51.625832
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,1.0,21.0,0.0,0.0,7.925
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,39.0,1.0,0.0,30.5
max,1.0,3.0,74.0,8.0,5.0,512.3292


### Missing values and dummy variables

In [7]:
# Count the missing entries in every column of the working copy.
missing_per_column = dataC.isna().sum()
display(missing_per_column)


Survived      0
Pclass        0
Name          0
Sex           0
Age         124
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       476
Embarked      1
dtype: int64

In [8]:
# Impute one nan in Embarked (mode)
# Dummy Embarked and Sex

# The return value of imputeFeature is not used, so the call is relied on to
# fill dataC in place. It must run before Embarked is dummy-encoded.
ut.imputeFeature(dataC, 'Embarked', 'mode')
# An empty prefix separator yields EmbarkedC / EmbarkedQ / EmbarkedS, while the
# default separator yields Sex_female / Sex_male.
dataC = ut.dummyFeature(dataC, 'Embarked', prefix_sep='')
dataC = ut.dummyFeature(dataC, 'Sex')
# Fixed seed so the previewed rows do not change between runs.
display(dataC.sample(n=5, random_state=0))

Unnamed: 0_level_0,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,EmbarkedC,EmbarkedQ,EmbarkedS,Sex_female,Sex_male
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",,1,2,W./C. 6607,23.45,,0,0,1,1,0
108,1,3,"Moss, Mr. Albert Johan",,0,0,312991,7.775,,0,0,1,0,1
140,0,1,"Giglio, Mr. Victor",24.0,0,0,PC 17593,79.2,B86,1,0,0,0,1
177,0,3,"Lefebre, Master. Henry Forbes",,3,1,4133,25.4667,,0,0,1,0,1
264,0,1,"Harrison, Mr. William",40.0,0,0,112059,0.0,B94,0,0,1,0,1


In [9]:
# Keep a single indicator for sex: remove the Sex_male dummy and give the
# remaining Sex_female column the shorter name Female.
ut.dropFeature(dataC, 'Sex_male')
dataC.rename(columns=dict(Sex_female='Female'), inplace=True)

In [10]:
# Statistics for the full frame first, then for the rows lacking Age and the
# rows lacking Cabin, to see how those subsets differ from the whole.
display(dataC.describe())
for col_with_gaps in ('Age', 'Cabin'):
    rows_missing = dataC.loc[dataC[col_with_gaps].isna()]
    display(rows_missing.describe())

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,EmbarkedC,EmbarkedQ,EmbarkedS,Female
count,623.0,623.0,499.0,623.0,623.0,623.0,623.0,623.0,623.0,623.0
mean,0.373997,2.292135,29.812625,0.487961,0.362761,32.78885,0.182986,0.081862,0.735152,0.329053
std,0.484252,0.845673,14.436087,0.970348,0.745852,51.625832,0.386965,0.274375,0.441607,0.470247
min,0.0,1.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,21.0,0.0,0.0,7.925,0.0,0.0,0.0,0.0
50%,0.0,3.0,28.0,0.0,0.0,14.4542,0.0,0.0,1.0,0.0
75%,1.0,3.0,39.0,1.0,0.0,30.5,0.0,0.0,1.0,1.0
max,1.0,3.0,74.0,8.0,5.0,512.3292,1.0,1.0,1.0,1.0


Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,EmbarkedC,EmbarkedQ,EmbarkedS,Female
count,124.0,124.0,0.0,124.0,124.0,124.0,124.0,124.0,124.0,124.0
mean,0.290323,2.580645,,0.379032,0.16129,21.061557,0.209677,0.258065,0.532258,0.290323
std,0.455753,0.776659,,1.15183,0.500197,32.370833,0.40873,0.439345,0.500983,0.455753
min,0.0,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,3.0,,0.0,0.0,7.75,0.0,0.0,0.0,0.0
50%,0.0,3.0,,0.0,0.0,8.05,0.0,0.0,1.0,0.0
75%,1.0,3.0,,0.0,0.0,23.45,0.0,1.0,1.0,1.0
max,1.0,3.0,,8.0,2.0,227.525,1.0,1.0,1.0,1.0


Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,EmbarkedC,EmbarkedQ,EmbarkedS,Female
count,476.0,476.0,364.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0
mean,0.296218,2.638655,27.795549,0.491597,0.329832,18.184995,0.132353,0.102941,0.764706,0.302521
std,0.457069,0.594446,13.570141,1.047269,0.741572,23.011913,0.339231,0.304202,0.424629,0.459833
min,0.0,1.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,2.0,20.0,0.0,0.0,7.8542,0.0,0.0,1.0,0.0
50%,0.0,3.0,26.0,0.0,0.0,10.5,0.0,0.0,1.0,0.0
75%,1.0,3.0,35.0,1.0,0.0,21.075,0.0,0.0,1.0,1.0
max,1.0,3.0,74.0,8.0,5.0,227.525,1.0,1.0,1.0,1.0


In [11]:
# Cabin is removed outright; gaps in Age are filled with the column mean
# (mean imputation - analogue of using zero weights).
ut.dropFeature(dataC, 'Cabin')
ut.imputeFeature(dataC, 'Age', 'mean')

# Re-count the missing values per column after the two steps above.
remaining_missing = dataC.isna().sum()
display(remaining_missing)


Survived     0
Pclass       0
Name         0
Age          0
SibSp        0
Parch        0
Ticket       0
Fare         0
EmbarkedC    0
EmbarkedQ    0
EmbarkedS    0
Female       0
dtype: int64

In [12]:
# Drop Name, Ticket
ut.dropFeature(dataC, 'Ticket')
ut.dropFeature(dataC, 'Name')
# Fixed seed so the previewed rows stay the same between runs.
display(dataC.sample(5, random_state=0))

Unnamed: 0_level_0,Survived,Pclass,Age,SibSp,Parch,Fare,EmbarkedC,EmbarkedQ,EmbarkedS,Female
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
526,0,3,40.5,0,0,7.75,0,1,0,0
687,0,3,14.0,4,1,39.6875,0,0,1,0
767,0,1,29.812625,0,0,39.6,1,0,0,0
719,0,3,29.812625,0,0,15.5,0,1,0,0
557,1,1,48.0,1,0,39.6,1,0,0,1


### Distribution characteristics

In [13]:
# fig, ax = plt.subplots(2, 1)
# (ut.normFeatures(dataC, ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'])).boxplot(ax=ax[0])
# (ut.normFeatures(dataC, ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], 'maxmin')).boxplot(ax=ax[1])
# plt.tight_layout()


In [14]:
# Two stacked box plots of the same features: the upper panel uses
# normFeatures' default method, the lower one the 'maxmin' method.
fig, ax = plt.subplots(2, 1)
upper_panel, lower_panel = ax
default_scaled = ut.normFeatures(dataC, ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'])
sns.boxplot(data=default_scaled, ax=upper_panel)
maxmin_scaled = ut.normFeatures(dataC, ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], 'maxmin')
sns.boxplot(data=maxmin_scaled, ax=lower_panel)
plt.tight_layout()


### Feature-feature and class-feature relationships

#### Distribution Histogram

In [59]:
## --- Method 1
# dataC.hist()
# plt.tight_layout()

In [128]:
## --- Method 2
# ut.histColumns(dataC.drop(columns=['EmbarkedQ']))

In [162]:
## --- Method 3
# Reshape to long form (one 'variable' / 'value' pair per row) so that every
# column except EmbarkedQ gets its own facet with independent axes.
long_form = dataC.drop(columns='EmbarkedQ').melt()
g = sns.FacetGrid(long_form, col='variable', palette='Set1', sharey=False, sharex=False, col_wrap=3)
bar_style = {"rwidth": 0.5, 'edgecolor': 'black', 'alpha': 1.0}
g = g.map(sns.distplot, 'value', kde=False, hist_kws=bar_style)

#### Distribution Histogram (Filtered by Survived)

In [66]:
## --- Method 1
# for x in dataC.drop(columns='EmbarkedQ').groupby('Survived'):
#     x[1].hist()
#     plt.tight_layout()

In [132]:
## --- Method 2
# for x in dataC.drop(columns='EmbarkedQ').groupby('Survived'):
#     ut.histColumns(x[1])


In [179]:
## --- Method 3
# One facet grid (one figure) per outcome class. groupby yields
# (Survived value, sub-frame) pairs, which are unpacked by name here.
for survived, group in dataC.drop(columns='EmbarkedQ').groupby('Survived'):
    g = sns.FacetGrid(group.melt(), col='variable', palette='Set1', sharey=False, sharex=False, col_wrap=3)
    g = (g.map(sns.distplot, 'value', kde=False, bins=20, hist_kws={"rwidth": 0.6, 'edgecolor': 'black', 'alpha': 1.0}))
    # Label each figure with its class; without a title the two figures
    # cannot be told apart. Top margin is reserved so the title does not
    # overlap the facet titles.
    g.fig.suptitle('Survived = {}'.format(survived))
    g.fig.subplots_adjust(top=0.9)

#### Distribution Histogram (Filtered by Survived, combined)

In [113]:
## --- Method 1
# fig, ax = plt.subplots(2,2)
# dataC.groupby('Survived')['Female'].hist(alpha=0.5, normed=True, ax=ax[0,0])
# ax[0,0].set_xlabel('Female')
# dataC.groupby('Survived')['Age'].hist(alpha=0.5, normed=True, ax=ax[0,1])
# ax[0,1].set_xlabel('Age')
# dataC.groupby('Survived')['Pclass'].hist(alpha=0.5, normed=True, ax=ax[1,0])
# ax[1,0].set_xlabel('Pclass')
# dataC.groupby('Survived')['EmbarkedC'].hist(alpha=0.5, normed=True, ax=ax[1,1])
# ax[1,1].set_xlabel('EmbarkedC')
# plt.tight_layout()


In [120]:
## --- Method 2
# ut.histColumns(dataC.drop(columns='EmbarkedQ'), groupby='Survived')


In [185]:
## --- Method 3
# Keep Survived as an identifier while melting, so both classes can be
# overlaid (semi-transparent) inside each feature's facet.
long_by_class = dataC.drop(columns='EmbarkedQ').melt(['Survived'])
g = sns.FacetGrid(long_by_class, col='variable', hue='Survived',
                  palette='Set1', sharey=False, sharex=False, col_wrap=3)
overlay_style = {'rwidth': 0.6, 'edgecolor': 'black', 'alpha': 0.5}
g = g.map(sns.distplot, 'value', kde=False, bins=20, hist_kws=overlay_style)
g = g.add_legend()

#### Correlations

In [167]:
# Pairwise correlation matrix of the remaining columns, annotated with
# two-decimal values.
# A dedicated figure is created and passed explicitly: with the qt backend
# selected at the top of the notebook, figures stay open between cells, and a
# heatmap drawn without `ax` would land in whichever axes is currently active.
fig, ax = plt.subplots()
sns.heatmap(dataC.corr(), annot=True, fmt=".2f", ax=ax)
fig.tight_layout()

#### Linear regression by individual features

In [297]:
## --- Method 1
# from pandas.plotting import scatter_matrix
# scatter_matrix(dataC[['Survived','Age','Pclass','EmbarkedC']], alpha=0.3)

In [161]:
## --- Method 2
# sns.pairplot(dataC, vars=['Survived','Female','Age','Pclass','Fare','EmbarkedC'], kind="reg", 
#              markers="+", plot_kws={'scatter_kws': {'alpha': 0.5}}, diag_kws={'alpha': 1})

<seaborn.axisgrid.PairGrid at 0x22569b4e780>

In [169]:
## --- Method 3
# Grid of pairwise scatter plots with a linear fit in each panel.
linear_vars_a = ['Survived', 'Female', 'Age', 'Pclass', 'Fare', 'EmbarkedC']
g = sns.PairGrid(dataC, vars=linear_vars_a)
point_style = {'alpha': 0.5}
g = g.map(sns.regplot, logistic=False, marker="+", scatter_kws=point_style)


In [171]:
# Same linear-fit grid as before, this time with the family-size features
# (Parch, SibSp) in place of Pclass and Fare.
linear_vars_b = ['Survived', 'Female', 'Age', 'Parch', 'SibSp', 'EmbarkedC']
g = sns.PairGrid(dataC, vars=linear_vars_b)
point_style = {'alpha': 0.5}
g = g.map(sns.regplot, logistic=False, marker="+", scatter_kws=point_style)
plt.tight_layout()

#### Local regression by individual features

In [178]:
# Pairwise panels with a locally weighted (lowess) fit instead of a straight line.
lowess_vars = ['Survived', 'Age', 'Pclass', 'Parch', 'SibSp']
g = sns.PairGrid(dataC, vars=lowess_vars)
g = g.map(sns.regplot, lowess=True, marker="+")


  r = func(a, **kwargs)
  res = _lowess(y, x, frac=frac, it=it, delta=delta)
  res = _lowess(y, x, frac=frac, it=it, delta=delta)


#### Linear regression by individual features (filtered)

In [186]:
# Linear fits drawn separately for each value of Female (colour-coded),
# with a legend identifying the two groups.
by_sex_vars = ['Survived', 'Age', 'Pclass', 'SibSp', 'Parch']
g = sns.PairGrid(dataC, vars=by_sex_vars, hue='Female')
g = g.map(sns.regplot, logistic=False, marker="+", scatter_kws={'alpha': 0.5})
g = g.add_legend()

In [192]:
# Linear fits split by an Adult flag (Age > 20).
# The flag is added to a throw-away copy rather than to dataC itself, so the
# shared frame is never left with a stray 'Adult' column if plotting fails or
# the cell is interrupted.
# NOTE: Age was mean-imputed earlier in the notebook, so passengers whose age
# was missing carry the mean age and are counted as adults here.
adult_view = dataC.copy()
adult_view['Adult'] = (dataC['Age'] > 20)
g = sns.PairGrid(adult_view, vars=['Survived','Female','Pclass','SibSp','Parch'], hue='Adult')
g = (g.map(sns.regplot, logistic=False, marker="+",
          scatter_kws= {'alpha': 0.5}).add_legend())

### Data normalization

In [23]:
# Build the modelling frame from a copy of dataC: the sibling/spouse and
# parent/child counts are collapsed into 0/1 "has any" indicators, and the raw
# counts plus EmbarkedQ are dropped.
dataF = dataC.copy()
dataF['SibSpYes'] = (dataC['SibSp'] > 0).astype(int)
dataF['ParchYes'] = (dataC['Parch'] > 0).astype(int)
dataF.drop(columns=['SibSp', 'Parch', 'EmbarkedQ'], inplace=True)

# Statistics before and after clipping Age and Fare with nStd=3. clipFeature's
# return value is not used, so it is relied on to modify dataF in place.
display(dataF.describe())
ut.clipFeature(dataF, 'Age', nStd=3)
ut.clipFeature(dataF, 'Fare', nStd=3)
display(dataF.describe())

# Fixed seed so the previewed rows stay the same between runs.
display(dataF.sample(5, random_state=0))

# Draw on a dedicated figure: with the qt backend earlier figures stay open,
# and a box plot without an explicit `ax` would be drawn into whichever axes
# is currently active. The trailing semicolon suppresses the Axes repr.
fig, ax = plt.subplots()
sns.boxplot(data=ut.normFeatures(dataF, method='maxmin'), ax=ax);


Unnamed: 0,Survived,Pclass,Age,Fare,EmbarkedC,EmbarkedS,Female,SibSpYes,ParchYes
count,623.0,623.0,623.0,623.0,623.0,623.0,623.0,623.0,623.0
mean,0.373997,2.292135,29.812625,32.78885,0.182986,0.735152,0.329053,0.317817,0.24077
std,0.484252,0.845673,12.917218,51.625832,0.386965,0.441607,0.470247,0.466002,0.427895
min,0.0,1.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,22.0,7.925,0.0,0.0,0.0,0.0,0.0
50%,0.0,3.0,29.812625,14.4542,0.0,1.0,0.0,0.0,0.0
75%,1.0,3.0,35.0,30.5,0.0,1.0,1.0,1.0,0.0
max,1.0,3.0,74.0,512.3292,1.0,1.0,1.0,1.0,1.0


Unnamed: 0,Survived,Pclass,Age,Fare,EmbarkedC,EmbarkedS,Female,SibSpYes,ParchYes
count,623.0,623.0,623.0,623.0,623.0,623.0,623.0,623.0,623.0
mean,0.373997,2.292135,29.790669,30.502084,0.182986,0.735152,0.329053,0.317817,0.24077
std,0.484252,0.845673,12.848101,39.199255,0.386965,0.441607,0.470247,0.466002,0.427895
min,0.0,1.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,22.0,7.925,0.0,0.0,0.0,0.0,0.0
50%,0.0,3.0,29.812625,14.4542,0.0,1.0,0.0,0.0,0.0
75%,1.0,3.0,35.0,30.5,0.0,1.0,1.0,1.0,0.0
max,1.0,3.0,68.564278,187.666346,1.0,1.0,1.0,1.0,1.0


Unnamed: 0_level_0,Survived,Pclass,Age,Fare,EmbarkedC,EmbarkedS,Female,SibSpYes,ParchYes
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
204,0,3,45.5,7.225,1,0,0,0,0
557,1,1,48.0,39.6,1,0,1,1,0
144,0,3,19.0,6.75,0,0,0,0,0
4,1,1,35.0,53.1,0,1,1,1,0
634,0,1,29.812625,0.0,0,1,0,0,0


<matplotlib.axes._subplots.AxesSubplot at 0x2911cb41898>