In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

In [2]:
data = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/hepatitis/hepatitis.data", header=None,
                names=['Class', 'AGE', 'SEX', 'STEROID', 'ANTIVIRALS', 'FATIGUE', 'MALAISE', 'ANOREXIA', 'LIVER BIG', 
                       'LIVER FIRM', 'SPLEEN PALPABLE', 'SPIDERS', 'ASCITES', 'VARICES', 'BILIRUBIN', 'ALK PHOSPHATE',
                       'SGOT', 'ALBUMIN', 'PROTIME', 'HISTOLOGY'])
df = pd.DataFrame(data)

In [7]:
print(df['AGE'].min())
print(df['AGE'].max())

7
78


A complete tally of age distribution

In [9]:
age_col = df['AGE']
age_data = {}
for x in age_col:
    age_data[x] = age_data.get(x, 0) + 1
dict(list(age_data.items()))

{30: 8,
 50: 6,
 78: 1,
 31: 4,
 34: 8,
 51: 6,
 23: 4,
 39: 6,
 32: 4,
 41: 3,
 47: 4,
 38: 8,
 66: 1,
 40: 3,
 22: 2,
 27: 4,
 42: 5,
 25: 3,
 49: 3,
 58: 2,
 61: 3,
 62: 2,
 26: 1,
 35: 2,
 37: 5,
 20: 3,
 65: 1,
 52: 3,
 33: 3,
 56: 3,
 28: 5,
 36: 7,
 44: 5,
 64: 1,
 45: 5,
 57: 2,
 24: 2,
 67: 1,
 59: 1,
 60: 1,
 48: 2,
 54: 5,
 7: 1,
 69: 1,
 72: 1,
 70: 1,
 46: 1,
 53: 1,
 43: 1}

Summary Statistics: Means, variance and standard deviation

In [12]:
ages = []
freq = []
for i in age_data:
    ages.append(i)
    freq.append(age_data.get(i))

In [15]:
print(np.array(ages).mean())
print(np.array(ages).var())
print(np.array(ages).std())

45.224489795918366
255.60266555601834
15.987578476930718


Efect Size

In [16]:
def get_dict_element(a_dict):
    arr = []
    for x in range(len(a_dict)):
        arr.append(a_dict.get(x, 1))
    return arr

Separate male and female images

In [20]:
m_age = {}
f_age = {}
ct_m = 0
ct_f = 0
ct = 0
sex_col = df['SEX']
for x in age_col:
    sex = sex_col.get(x, 0)      
    if sex == 1: 
        m_age[ct_m] = age_col.get(x, 0)
        ct_m += 1
    else:
        f_age[ct_f] = age_col.get(x, 0)
        ct_f += 1
    ct += 1

In [24]:
m = np.array(get_dict_element(m_age))
f = np.array(get_dict_element(f_age))

In [25]:
print(m[:5])
print(f[:5])

[39 39 34 62 35]
[41 28 58 58 20]


Calculate effect size

In [26]:
import math
def CohenEffectSize(group1, group2):
    diff = group1.mean() - group2.mean()
    var1 = group1.var()
    var2 = group2.var()
    n1, n2 = len(group1), len(group2)

    pooled_var = (n1*var1 + n2*var2)/(n1+n2)
    d = diff / math.sqrt(pooled_var)
    return d

In [27]:
d = CohenEffectSize(m, f)

In [28]:
print(d)

0.5757033718956814
