# Maternal Health Risk Data Exploration


In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys, os
sys.path.append("../../")

In [3]:
from utils.transformer import  DataTransformer
from utils.dataset import Dataset
import metrics
from sklearn.metrics import accuracy_score

In [4]:
!ls "../"

adult  census  fake_samples  mhr  news	texas


In [7]:
train_dset = Dataset(
    dataset_name="mhr",
    dataset_dir="../",
    subset="train",
    random_state=1000,
    return_filtered_cols=False
)

In [8]:
print(train_dset.data.shape)
train_dset.data.head()

(811, 7)


Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate,RiskLevel
0,19.0,120.0,60.0,7.0,98.4,70.0,low risk
1,50.0,130.0,100.0,16.0,98.0,75.0,mid risk
2,19.0,120.0,75.0,7.9,98.0,70.0,low risk
3,19.0,90.0,65.0,7.5,101.0,70.0,low risk
4,40.0,140.0,100.0,18.0,98.0,90.0,high risk


In [23]:
transformer = DataTransformer(
        discrete_encode="onehot",
        numerical_preprocess="standard",
        target="RiskLevel"
)

train_dset = Dataset(
    dataset_name="mhr",
    dataset_dir="../",
    subset="train",
    data_frac=None,
    random_state=1000,
)

test_dset = Dataset(
    dataset_name="mhr",
    dataset_dir="../",
    subset="test",
    data_frac=None,
    random_state=1000
)


test_data = test_dset.data
train_data = train_dset.data
transformer.fit(train_data, train_dset.cat_cols)

print(train_data.shape, test_data.shape)
print(f"# categorical columns: {len(train_dset.cat_cols)}")
print(f"features/label dim: {test_dset.get_dim()}")

f1_score = metrics.efficacy_test(realdata=test_data, fakedata=train_data, target_name="RiskLevel", transformer=transformer)
acc_score = metrics.efficacy_test(realdata=test_data, fakedata=train_data, target_name="RiskLevel", transformer=transformer, scorer=accuracy_score)

print(f"F1 Score: {f1_score} \nAcc Score: {acc_score}")

(811, 7) (203, 7)
# categorical columns: 1
features/label dim: (7, 0)
F1 Score: 0.6403940886699507 
Acc Score: 0.6403940886699507


In [24]:
from utils.misc import geometric_sequence
seed = 1000
subsets = geometric_sequence(start_value=20, 
                             common_ratio=2, 
                             size=10)

In [25]:
transformer = DataTransformer(
        discrete_encode="onehot",
        numerical_preprocess="standard",
        target="RiskLevel"
)

test_dset = Dataset(
        dataset_name="mhr",
        dataset_dir="../",
        subset="test",
        data_frac=None,
        random_state=seed,
)

test_data = test_dset.data
transformer.fit(test_data, test_dset.cat_cols)
subsets = [640, 320, 160, 80, 40, 20]

for subset in subsets:
   
    
    train_dset = Dataset(
        dataset_name="mhr",
        dataset_dir="../",
        subset="train",
        data_frac=subset,
        random_state=seed,
        )

    
    train_data = train_dset.data
    
    f1_train = metrics.efficacy_test(realdata=train_data, fakedata=train_data, target_name="RiskLevel", transformer=transformer)
    acc_train = metrics.efficacy_test(realdata=train_data, fakedata=train_data, target_name="RiskLevel", transformer=transformer, scorer=accuracy_score)
    
    f1_test = metrics.efficacy_test(realdata=test_data, fakedata=train_data, target_name="RiskLevel", transformer=transformer)
    acc_test = metrics.efficacy_test(realdata=test_data, fakedata=train_data, target_name="RiskLevel", transformer=transformer, scorer=accuracy_score)

    print(f"subset: {subset}: Train/Test-F1: {f1_train:.3f}/{f1_test:.3f} Train/Test-Acc: {acc_train:.3f}/{acc_test:.3f}")

subset: 640: Train/Test-F1: 0.620/0.635 Train/Test-Acc: 0.620/0.635
subset: 320: Train/Test-F1: 0.616/0.621 Train/Test-Acc: 0.616/0.621
subset: 160: Train/Test-F1: 0.613/0.586 Train/Test-Acc: 0.613/0.586
subset: 80: Train/Test-F1: 0.700/0.606 Train/Test-Acc: 0.700/0.606
subset: 40: Train/Test-F1: 0.750/0.581 Train/Test-Acc: 0.750/0.581
subset: 20: Train/Test-F1: 0.900/0.458 Train/Test-Acc: 0.900/0.458
