In [24]:
%matplotlib inline

import pandas as pd
import numpy as np
import glob
import os
import time
import pickle
import datetime
import networkx as nx

import sys
sys.path.append('/home/ngrav/project/')
from wearables.scripts import utils as wearutils
from wearables.scripts import data as weardata
from wearables.scripts import train as weartrain
from wearables.scripts import eval_ as weareval
from wearables.scripts import model as wearmodels
from wearables.scripts import DTW as weardtw
from wearables.scripts import error_analysis as wearerr

import torch
import torch.nn as nn
import torch.nn.functional as F

from scipy.stats import mannwhitneyu
from scipy.stats import spearmanr
from scipy.stats import chi2_contingency
from scipy.stats import kruskal
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sktime.datatypes._panel._convert import from_2d_array_to_nested
from sktime.regression.compose._ensemble import ComposableTimeSeriesForestRegressor
from sktime.classification.compose import ComposableTimeSeriesForestClassifier

import matplotlib.pyplot as plt
import seaborn as sns
from adjustText import adjust_text
# Global figure styling: 9 pt sans-serif text, font type 42 (TrueType) for
# PDF/PS output, frameless legends, no grid, 600 dpi for saved figures.
plt.rc('font', size=9, family='sans serif')
plt.rcParams.update({
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
    'legend.frameon': False,
    'axes.grid': False,
    'legend.markerscale': 1,
    'savefig.dpi': 600,
})
# Apply the seaborn "ticks" style after the rcParams settings above.
sns.set_style("ticks")

def loadpkl(fp):
    """Unpickle and return the object stored in the file at ``fp``.

    The file is opened in binary mode and is closed before returning.
    Unpickling can execute arbitrary code, so only use this on trusted files.
    """
    with open(fp, mode='rb') as handle:
        obj = pickle.load(handle)
    return obj

In [2]:
# Base directories for results (pfp) and saved models (mfp).
# NOTE(review): absolute, user-specific paths; the notebook only runs where they exist.
pfp = '/home/ngrav/project/wearables/results/'
mfp = '/home/ngrav/scratch/wearables_model_zoo'

# Filepaths to the pre-processed metadata CSV (with calculated metrics) and to
# the pickled trainer / model files in the model directory.
# NOTE(review): "bst" presumably means "best" — confirm.
pp_md_fp = os.path.join(pfp, 'md_220912.csv')
bst_trainer = os.path.join(mfp, 'trainer_itv71_InceptionTime_GA4.pkl')
bst_modelpkl = os.path.join(mfp, '281-itv71_InceptionTime_GA4.pkl')

# Load the up-to-date metadata; the first CSV column becomes the index.
md = pd.read_csv(pp_md_fp, index_col=0)
# Unpickle the trainer object (loadpkl wraps pickle.load, so the file must be trusted).
trainer = loadpkl(bst_trainer)

In [37]:
# Signed error column: yhat minus y, so a positive value means yhat is above y
# (presumably prediction minus ground truth — confirm against how md was built).
md['error'] = md['yhat'] - md['y']

In [38]:
# Bucket every row by its signed error: at or above +threshold, at or below
# -threshold, or in between (the default label).
threshold = 10
over_mask = md['error'] >= threshold
under_mask = md['error'] <= -threshold
md['Error group'] = 'lt{}wks'.format(threshold)
md.loc[over_mask, 'Error group'] = 'Higher-than-actual'
md.loc[under_mask, 'Error group'] = 'Lower-than-actual'


In [47]:
# Errors on the train split, separated by the 'Pre-term birth' flag, and the
# difference of their means (flagged minus unflagged).
in_train = md['split'] == 'train'
preterm = md['Pre-term birth']
a = md.loc[in_train & preterm, 'error']
b = md.loc[in_train & ~preterm, 'error']
np.mean(a) - np.mean(b)

2.306121300229327

In [48]:
# Mann-Whitney U test (scipy.stats) between the samples currently bound to a and b.
mannwhitneyu(a, b)

MannwhitneyuResult(statistic=85907.0, pvalue=4.258878894955199e-05)

In [36]:
# Number of rows of md per value of the 'split' column.
md['split'].value_counts()

train    1411
test      691
val       203
Name: split, dtype: int64

In [35]:
# Same count after keeping only the first row for each record_id.
md.drop_duplicates(subset='record_id')['split'].value_counts()

train    658
test     325
val      100
Name: split, dtype: int64

In [30]:
# NOTE(review): ad-hoc ratio; where 1.18 and 87 come from is not shown in this notebook — document or derive them.
1.18/87

0.013563218390804597

In [20]:
# Quartiles (25th, 50th, 75th percentiles) of the 'GA' column over all rows of md.
md['GA'].quantile(q=[0.25, 0.5, 0.75])

0.25    11.0
0.50    21.0
0.75    30.0
Name: GA, dtype: float64

In [21]:
# 'GA' quartiles restricted to rows selected by the 'Pre-term birth' mask.
md.loc[(md['Pre-term birth']), 'GA'].quantile(q=[0.25, 0.5, 0.75])

0.25    10.0
0.50    16.0
0.75    24.0
Name: GA, dtype: float64

In [22]:
# 'GA' quartiles for the complementary rows (inverted 'Pre-term birth' mask).
md.loc[~(md['Pre-term birth']), 'GA'].quantile(q=[0.25, 0.5, 0.75])

0.25    11.0
0.50    22.0
0.75    31.0
Name: GA, dtype: float64

In [25]:
# Compare 'GA' between rows flagged 'Pre-term birth' and the remaining rows
# with a Mann-Whitney U test. Note: this rebinds the notebook-level a and b.
preterm = md['Pre-term birth']
a = md.loc[preterm, 'GA']
b = md.loc[~preterm, 'GA']

mannwhitneyu(a, b)

MannwhitneyuResult(statistic=226779.5, pvalue=9.300808994185986e-08)

In [15]:
# Number of rows of md per 'gestage_by' code.
# NOTE(review): the meaning of codes 0-4 is not defined in this notebook.
md['gestage_by'].value_counts()

2.0    1189
1.0     651
3.0     413
4.0      38
0.0      14
Name: gestage_by, dtype: int64

In [None]:
# NOTE(review): unexecuted cell repeating the 'gestage_by' row count shown in the cell above; candidate for removal.
md['gestage_by'].value_counts()

In [12]:
# One row per record_id: keep the first row of md for each record_id.
pt_df = md.drop_duplicates(subset='record_id')
# Fraction of those rows per 'gestage_by' code. The denominator is the number
# of rows in pt_df (rows with a missing 'gestage_by' still count), taken from
# the data instead of a hard-coded total so it stays correct if md changes.
pt_df['gestage_by'].value_counts() / len(pt_df)

2.0    0.495845
1.0    0.289935
3.0    0.188366
4.0    0.019391
0.0    0.006464
Name: gestage_by, dtype: float64

In [14]:
# Number of rows of pt_df (one per record_id) per 'gestage_by' code.
pt_df['gestage_by'].value_counts() 

2.0    537
1.0    314
3.0    204
4.0     21
0.0      7
Name: gestage_by, dtype: int64

In [8]:
# Column names of pt_df that contain the substring 'deliv'.
[col for col in pt_df.columns if 'deliv' in col]

['deliv_mode']

In [9]:
# Number of ids in the trainer's train split.
len(trainer.data.train_ids)

1399

In [8]:
# Number of ids in the trainer's test split.
len(trainer.data.test_ids)

690

In [11]:
# Number of ids in the trainer's validation split.
len(trainer.data.val_ids)

216

In [13]:
def print_n(uids):
    """Report how many distinct groups and how many samples ``uids`` holds.

    Each uid is split on '_' and its first field is parsed as an integer group
    id. Prints the number of unique group ids and the number of uids, then
    returns the list of group ids (one per uid, in input order).
    """
    grp = [int(uid.split('_')[0]) for uid in uids]
    n_unique = len(np.unique(grp))
    print('n_grp: {}\tn_samples: {}'.format(n_unique, len(uids)))
    return grp

In [18]:
# Print group/sample counts for the train, test and val splits (in that order)
# and keep the per-uid group ids returned for each split.
train_pid, test_pid, val_pid = [
    print_n(split_ids)
    for split_ids in (
        trainer.data.train_ids,
        trainer.data.test_ids,
        trainer.data.val_ids,
    )
]

n_grp: 658	n_samples: 1399
n_grp: 325	n_samples: 690
n_grp: 100	n_samples: 216


In [20]:
# Totals across the three splits, computed from the group-id lists returned by
# print_n rather than retyped from its printed output. Unique groups are
# counted within each split and then summed, matching the per-split "n_grp"
# lines; each list has one entry per uid, so its length is the sample count.
split_pids = (train_pid, test_pid, val_pid)
n_grp_total = sum(len(np.unique(pids)) for pids in split_pids)
n_samples_total = sum(len(pids) for pids in split_pids)
print('N_grp: {}\tN_samples: {}'.format(n_grp_total, n_samples_total))

N_grp: 1083	N_samples: 2305


In [19]:
# check why 1260 got filtered out


1083