# Hypothesis testing on titanic dataset

Via hypothesis testing, we'll try to find the answer to the following questions:

- Is passenger gender and cabin class related?
- Does the age of passengers vary across cabin classes?

## 1. Import data

In [4]:
# import pandas
import pandas as pd

# import titanic dataset
df = pd.read_csv('data/titanic.csv')
df

Unnamed: 0.1,Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,1,"Allen, Miss Elisabeth Walton",1st,29.00,female,1,1
1,2,"Allison, Miss Helen Loraine",1st,2.00,female,0,1
2,3,"Allison, Mr Hudson Joshua Creighton",1st,30.00,male,0,0
3,4,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.00,female,0,1
4,5,"Allison, Master Hudson Trevor",1st,0.92,male,1,0
...,...,...,...,...,...,...,...
1308,1309,"Zakarian, Mr Artun",3rd,27.00,male,0,0
1309,1310,"Zakarian, Mr Maprieder",3rd,26.00,male,0,0
1310,1311,"Zenni, Mr Philip",3rd,22.00,male,0,0
1311,1312,"Lievens, Mr Rene",3rd,24.00,male,0,0


## 2. Quick EDA

In [5]:
# categorical nominal
df['SexCode'].unique()

array([1, 0])

In [6]:
# categorical ordinal
df['PClass'].unique()

array(['1st', '2nd', '*', '3rd'], dtype=object)

In [8]:
# descriptive statistics
df.describe(include='all')

Unnamed: 0.1,Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
count,1313.0,1313,1313,756.0,1313,1313.0,1313.0
unique,,1310,4,,2,,
top,,"Connolly, Miss Kate",3rd,,male,,
freq,,2,711,,851,,
mean,657.0,,,30.397989,,0.342727,0.351866
std,379.174762,,,14.259049,,0.474802,0.477734
min,1.0,,,0.17,,0.0,0.0
25%,329.0,,,21.0,,0.0,0.0
50%,657.0,,,28.0,,0.0,0.0
75%,985.0,,,39.0,,1.0,1.0


In [9]:
# info on dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313 entries, 0 to 1312
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  1313 non-null   int64  
 1   Name        1313 non-null   object 
 2   PClass      1313 non-null   object 
 3   Age         756 non-null    float64
 4   Sex         1313 non-null   object 
 5   Survived    1313 non-null   int64  
 6   SexCode     1313 non-null   int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 71.9+ KB


### 2.1 Remove missing values

In [11]:
# remove * missing cabin class
df = df[df['PClass'] != '*']

In [12]:
# check if PClass is now clean
df['PClass'].unique()

array(['1st', '2nd', '3rd'], dtype=object)

In [13]:
# remove NULL values
df = df[["PClass", "Age"]].dropna()

In [14]:
# check new shape
df.shape

(756, 2)

## 3. Is passenger gender and cabin class related?

### 3.1 Normal distribution check

In [19]:
# check if observations have an approximately normal distribution

from scipy.stats import shapiro

shapiro(df["Age"])

ShapiroResult(statistic=0.9784689545631409, pvalue=4.15032630485257e-09)

In [18]:
# p_value < alpha
# it's not normally distributed
# therefore it's non-parametric

### 3.2 Run hypothesis test

In [27]:
# convert PClass strings into integers
p_class = df['PClass'].replace({'1st' : 1, '2nd' : 2, '3rd': 3})

# import mstats
from scipy.stats import mstats

# get p-value
mstats.kruskalwallis(list(p_class), list(df['Age'].values))

KruskalResult(statistic=1051.907060470326, pvalue=9.371812053548873e-231)

In [28]:
# p_value < alpha
# reject H0
# there is a significant difference between the distriubtion of gender across cabin classes

## 4. Does the age of passengers vary across cabin classes?

In [29]:
# compute count (frequency) of factors

In [30]:
# computing frequency (count) of factors
pd.crosstab(df.PClass, df.Age, margins=True)

Age,0.17,0.33,0.8,0.83,0.92,1.0,1.5,2.0,3.0,4.0,...,61.0,62.0,63.0,64.0,65.0,67.0,69.0,70.0,71.0,All
PClass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1st,0,0,0,0,1,0,0,1,0,1,...,3,2,2,5,1,1,1,1,2,226
2nd,0,0,1,1,0,3,0,4,3,2,...,0,0,0,1,0,0,0,0,1,212
3rd,1,1,0,1,0,2,2,2,3,4,...,0,0,1,0,1,0,0,0,0,318
All,1,1,1,2,1,5,2,7,6,7,...,3,2,3,6,2,1,1,1,3,756


In [31]:
# convert PClass strings into integers
#p_class = map(lambda x: int(x[0]), df['PClass'].values)

# get p-value
mstats.kruskalwallis(list(p_class), list(df['Age'].values))

# p_value < alpha
# there is a significant difference between the distribution of age across cabin classes

KruskalResult(statistic=1051.907060470326, pvalue=9.371812053548873e-231)