# Import Data

In [None]:
import pandas as pd
import numpy as np
from mcip import *

In [None]:
# Load the three CSV inputs used by the rest of the notebook.
# NOTE(review): the role of each file is inferred from its name and the section headings — confirm.
dfTest = pd.read_csv(filepath_or_buffer='./Data/testComplDataCI_5_year_Risk.csv')
dfTestMissing = pd.read_csv(filepath_or_buffer='./Data/preProcDataTestMissing.csv')
df = pd.read_csv(filepath_or_buffer='./Data/datasetOfCI_5_year_Risk.csv')
# Preview the first five rows of df.
df.iloc[:5]

In [None]:
# Number of rows in dfTest.
dfTest.shape[0]

In [None]:
# Number of rows in df.
df.shape[0]

# Plot histograms of the risks of complete data and imputed data with the threshold

- plot of the original complete Data risk values
- plot of the imputed Data risk values

In [None]:
# Plot the distribution of the 5-year risks in dfTest against the 1.67 threshold.
# `.values` replaces `Series.as_matrix()`, which was deprecated in pandas 0.23 and removed in 1.0.
# NOTE(review): `theshold` (sic) is kept as in the original call; presumably it matches the mcip signature — confirm.
plotDistributionCI(X=dfTest['5YearAbsRisk'].values, theshold=1.67, ymax=40, bins=300)

In [None]:
# Plot the distribution of the 5-year risks in df against the 1.67 threshold.
# `.values` replaces `Series.as_matrix()`, which was deprecated in pandas 0.23 and removed in 1.0.
# NOTE(review): `theshold` (sic) is kept as in the original call; presumably it matches the mcip signature — confirm.
plotDistributionCI(X=df['5YearAbsRisk'].values, theshold=1.67, ymax=400, bins=300)

# Print results

- Original data
    - HRs > 1.67
    - LRs <= 1.67
- Imputed Data, unique cases
    - HRs > 1.67
    - LRs <= 1.67
    - Uncertain: the interval of imputed risks crosses the 1.67 threshold

## Original complete dataset
- 5YearAbsRisk is the risk computed on the original complete dataset

In [None]:
# Flag rows of dfTest whose 5-year risk is strictly above 1.67 (1.0 = high risk, 0.0 = low risk).
cond = dfTest['5YearAbsRisk'].gt(1.67)
dfTest['Binary_Outcome'] = np.where(cond, 1.0, 0.0)

In [None]:
# Count the high-risk (1) and low-risk (0) rows of dfTest.
highRiskCount = (dfTest['Binary_Outcome'] == 1).sum()
lowRiskCount = (dfTest['Binary_Outcome'] == 0).sum()
print('HRs: ', highRiskCount)
print('LRs: ', lowRiskCount)

## Missing values dataset

In [None]:
# Flag rows of df whose 5-year risk is strictly above 1.67 (1.0 = high risk, 0.0 = low risk).
cond = df['5YearAbsRisk'].gt(1.67)
df['Binary_Outcome'] = np.where(cond, 1.0, 0.0)

In [None]:
# Distinct oldIndex values found in dfTest, and how many there are.
uniqueCases = pd.unique(dfTest['oldIndex'])
len(uniqueCases)

In [None]:
# Create the per-row variable Uncertainty_Outcome.
#
# For every case in uniqueCases, look at all rows of df sharing its oldIndex
# (the simulated data points with imputed values):
#   - all of them have a risk higher than 1.67  -> High Risk = 1
#   - none of them has a risk higher than 1.67  -> Low Risk  = 0
#   - both low and high risks are present       -> Uncertain = 2
#     (the interval of risk crosses the 1.67 point)
# Rows whose oldIndex is not in uniqueCases are left as NaN.

# Kept so that cells relying on this import keep working; the labelling below
# is vectorised and no longer needs a progress bar.
from tqdm import tqdm_notebook

# Restrict to rows belonging to a known case (also keeps NaN keys out of the groupby).
inCases = df['oldIndex'].notnull() & df['oldIndex'].isin(uniqueCases)
byCase = df.loc[inCases].groupby('oldIndex')['Binary_Outcome']

# Per-row group statistics replace one full-frame scan per case.
nHigh = byCase.transform('sum')
nRows = byCase.transform('count')

# Create the column first so the masked assignment below always targets an existing column.
df['Uncertainty_Outcome'] = np.nan
df.loc[inCases, 'Uncertainty_Outcome'] = np.select(
    [nHigh == nRows, nHigh == 0], [1.0, 0.0], default=2.0
)

In [None]:
# Preview the first five rows of df.
df.iloc[:5]

In [None]:
# Get a vector with one risk status per case, ordered like uniqueCases.
# A single groupby replaces one full-frame scan per case; a case with no rows
# in df yields NaN, exactly as the per-case maximum of an empty selection did.
uniqueUncertainty_Outcomes = (
    df.groupby('oldIndex')['Uncertainty_Outcome'].max().reindex(uniqueCases).values
)

In [None]:
# Outcome masks on dfTest (Binary_Outcome) and on the per-case imputed status.
# The two sides are combined element by element, so they must have the same length and order.
wasHigh = dfTest.Binary_Outcome == 1
wasLow = dfTest.Binary_Outcome == 0
nowHigh = uniqueUncertainty_Outcomes == 1
nowLow = uniqueUncertainty_Outcomes == 0
nowUncertain = uniqueUncertainty_Outcomes == 2

# Totals per imputed status.
print('High Risk Outcomes: ', np.sum(nowHigh))
print('Low Risk Outcomes: ', np.sum(nowLow))
print('Uncertain Outcomes: ', np.sum(nowUncertain))

# Transitions between definite statuses.
print('HRs --> HRs: ', np.sum(wasHigh & nowHigh))
print('LRs --> LRs: ', np.sum(wasLow & nowLow))
print('HRs --> LRs: ', np.sum(wasHigh & nowLow))
print('LRs --> HRs: ', np.sum(wasLow & nowHigh))

# Transitions into the uncertain status.
print('HRs --> Us: ', np.sum(wasHigh & nowUncertain))
print('LRs --> Us: ', np.sum(wasLow & nowUncertain))



## Example of case with imputed values

In [None]:
# All rows of df for case 22162.
df[df['oldIndex'] == 22162]

In [None]:
# Confidence interval (alpha=0.95) of C_menarcheage over the rows of case 22162.
confidenceInterval(X=df.loc[df['oldIndex'] == 22162, 'C_menarcheage'], alpha=0.95)

In [None]:
# Rows of dfTestMissing for case 22162.
dfTestMissing[dfTestMissing['oldIndex'] == 22162]

In [None]:
# Rows of dfTest for case 22162.
dfTest[dfTest['oldIndex'] == 22162]

In [None]:
# 5-year risk stored in dfTest for case 24442.
# NOTE(review): the neighbouring cells inspect case 22162 — confirm that 24442 is intended here.
dfTest['5YearAbsRisk'][dfTest['oldIndex'] == 24442]

## Plot of risk Distribution of a case near 1.67 risk

### Uncertain case

In [None]:
# Case ids labelled Uncertain (2).
df.loc[df['Uncertainty_Outcome'] == 2, 'oldIndex'].unique()

In [None]:
# Summary statistics of the rows labelled Uncertain (2).
df.loc[df['Uncertainty_Outcome'] == 2].describe()

In [None]:
# 5-year risks of every row of case 29763, as a NumPy array.
# `.values` replaces `Series.as_matrix()`, which was deprecated in pandas 0.23 and removed in 1.0.
df.loc[df.oldIndex == 29763, '5YearAbsRisk'].values

In [None]:
# Plot the distribution of the 5-year risks of case 29763.
# `.values` replaces `Series.as_matrix()`, which was deprecated in pandas 0.23 and removed in 1.0.
plotDistributionCIRisk(df.loc[df.oldIndex == 29763, '5YearAbsRisk'].values, ymax=100, bins=50)

#### HR --> U

In [None]:
# Case ids labelled Uncertain (2) whose G_5yearscore lies in [1.67, 7].
# NOTE(review): `between` includes both bounds, so a score of exactly 1.67 counts as high here,
# whereas Binary_Outcome uses a strict `> 1.67` — confirm which rule is intended.
df.loc[(df['Uncertainty_Outcome'] == 2) & df['G_5yearscore'].between(1.67, 7), 'oldIndex'].unique()

In [None]:
# Summary statistics of the Uncertain (2) rows whose G_5yearscore lies in [1.67, 7].
df.loc[(df['Uncertainty_Outcome'] == 2) & df['G_5yearscore'].between(1.67, 7)].describe()

#### LR --> U

In [None]:
# Case ids labelled Uncertain (2) whose G_5yearscore lies in [0, 1.669].
# NOTE(review): scores strictly between 1.669 and 1.67 fall in neither this range nor the
# [1.67, 7] range used for the high-risk side — confirm this gap is acceptable.
df.loc[(df['Uncertainty_Outcome'] == 2) & df['G_5yearscore'].between(0, 1.669), 'oldIndex'].unique()

In [None]:
# Summary statistics of the Uncertain (2) rows whose G_5yearscore lies in [0, 1.669].
df.loc[(df['Uncertainty_Outcome'] == 2) & df['G_5yearscore'].between(0, 1.669)].describe()

### HR 

In [None]:
# Case ids labelled High Risk (1).
df.loc[df['Uncertainty_Outcome'] == 1, 'oldIndex'].unique()

In [None]:
# Summary statistics of the rows labelled High Risk (1).
df.loc[df['Uncertainty_Outcome'] == 1].describe()

#### HR --> HR

In [None]:
# Case ids labelled High Risk (1) whose G_5yearscore lies in [1.67, 7].
df.loc[(df['Uncertainty_Outcome'] == 1) & df['G_5yearscore'].between(1.67, 7), 'oldIndex'].unique()

In [None]:
# Summary statistics of the High Risk (1) rows whose G_5yearscore lies in [1.67, 7].
df.loc[(df['Uncertainty_Outcome'] == 1) & df['G_5yearscore'].between(1.67, 7)].describe()

#### HR --> LR

In [None]:
# Case ids labelled Low Risk (0) whose G_5yearscore lies in [1.67, 8].
# NOTE(review): the upper bound is 8 here but 7 in the other high-score filters — confirm.
df.loc[(df['Uncertainty_Outcome'] == 0) & df['G_5yearscore'].between(1.67, 8), 'oldIndex'].unique()

In [None]:
# Summary statistics of the Low Risk (0) rows whose G_5yearscore lies in [1.67, 8].
# NOTE(review): the upper bound is 8 here but 7 in the other high-score filters — confirm.
df.loc[(df['Uncertainty_Outcome'] == 0) & df['G_5yearscore'].between(1.67, 8)].describe()

### LR

In [None]:
# Case ids labelled Low Risk (0).
df.loc[df['Uncertainty_Outcome'] == 0, 'oldIndex'].unique()

In [None]:
# Summary statistics of the rows labelled Low Risk (0).
df.loc[df['Uncertainty_Outcome'] == 0].describe()

#### LR --> LR

In [None]:
# Case ids labelled Low Risk (0) whose G_5yearscore lies in [0, 1.669].
df.loc[(df['Uncertainty_Outcome'] == 0) & df['G_5yearscore'].between(0, 1.669), 'oldIndex'].unique()

In [None]:
# Summary statistics of the Low Risk (0) rows whose G_5yearscore lies in [0, 1.669].
df.loc[(df['Uncertainty_Outcome'] == 0) & df['G_5yearscore'].between(0, 1.669)].describe()

#### LR --> HR

In [None]:
# Case ids labelled High Risk (1) whose G_5yearscore lies in [0, 1.669].
df.loc[(df['Uncertainty_Outcome'] == 1) & df['G_5yearscore'].between(0, 1.669), 'oldIndex'].unique()

In [None]:
# Summary statistics of the High Risk (1) rows whose G_5yearscore lies in [0, 1.669].
df.loc[(df['Uncertainty_Outcome'] == 1) & df['G_5yearscore'].between(0, 1.669)].describe()

# Analysis of Imputed values based on variables distribution
- Create a dataset with one row per original index (`oldIndex`) from the imputed values

- Imputed Data
    - Analysis of categories per variable
- Original Data
    - Analysis of categories per variable


In [None]:
# Build dfUnique: one row per case in uniqueCases.
#   - columns not listed in stringCols are averaged over the rows of the case
#   - columns listed in stringCols take the value of the first row of the case
uniqueData = []
stringCols = ['MRN_D','race']
cols = [name for name in df.columns if name not in stringCols]

for caseId in tqdm_notebook(uniqueCases):
    subset = df[df['oldIndex'] == caseId]
    averaged = subset[cols].mean(axis=0)
    firstStrings = subset[stringCols].iloc[0]

    # `.values` replaces `Series.as_matrix()`, which was deprecated in pandas 0.23
    # and removed in 1.0.
    uniqueData.append(np.concatenate((averaged.values, firstStrings.values)))

# Column order follows the concatenation above: averaged columns, then string columns.
dfUnique = pd.DataFrame(data=np.array(uniqueData), columns=cols + stringCols)

len(dfUnique)

In [None]:
# Preview the first five rows of dfUnique.
dfUnique.iloc[:5]

# Analysis of outcomes in terms of missing data

- Present an analysis for each of the following groups:
    - Uncertain
    - HR
    - LR
    - LRs --> HRs
    - HRs --> LRs

in terms of the percentage of missing values per variable.

In [None]:
# Re-read the two CSV files with missing values.
dfTestMissing = pd.read_csv('./Data/preProcDataTestMissing.csv')
dfTestMissingExt = pd.read_csv('./Data/preProcDataExtTestMissing.csv')

## Analysis merging test and validation data
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
dfTestMissing = pd.concat([dfTestMissing, dfTestMissingExt], ignore_index=True)
# Index by oldIndex so later cells can select rows by case id with .loc.
dfTestMissing.set_index('oldIndex', inplace=True)

In [None]:
# Number of rows in dfTestMissing.
dfTestMissing.shape[0]

## Uncertain cases

In [None]:
# Number of dfTestMissing rows selected by the oldIndex of the Uncertain (2) cases.
dfTestMissing.loc[dfUnique.loc[dfUnique['Uncertainty_Outcome'] == 2, 'oldIndex'], :].shape[0]

In [None]:
# Summary statistics of the dfTestMissing rows of the Uncertain (2) cases.
dfTestMissing.loc[dfUnique.loc[dfUnique['Uncertainty_Outcome'] == 2, 'oldIndex'], :].describe()

In [None]:
# Fraction of missing values per variable among the Uncertain (2) cases.
uncertainRows = dfTestMissing.loc[dfUnique.loc[dfUnique['Uncertainty_Outcome'] == 2, 'oldIndex'], :]
pd.DataFrame(uncertainRows.isnull().sum() / len(uncertainRows))

## HRs

In [None]:
# Number of dfTestMissing rows selected by the oldIndex of the High Risk (1) cases.
dfTestMissing.loc[dfUnique.loc[dfUnique['Uncertainty_Outcome'] == 1, 'oldIndex'], :].shape[0]

In [None]:
# Summary statistics of the dfTestMissing rows of the High Risk (1) cases.
dfTestMissing.loc[dfUnique.loc[dfUnique['Uncertainty_Outcome'] == 1, 'oldIndex'], :].describe()

In [None]:
# Fraction of missing values per variable among the High Risk (1) cases.
highRiskRows = dfTestMissing.loc[dfUnique.loc[dfUnique['Uncertainty_Outcome'] == 1, 'oldIndex'], :]
pd.DataFrame(highRiskRows.isnull().sum() / len(highRiskRows))

## LRs

In [None]:
# Number of dfTestMissing rows selected by the oldIndex of the Low Risk (0) cases.
dfTestMissing.loc[dfUnique.loc[dfUnique['Uncertainty_Outcome'] == 0, 'oldIndex'], :].shape[0]

In [None]:
# Summary statistics of the dfTestMissing rows of the Low Risk (0) cases.
dfTestMissing.loc[dfUnique.loc[dfUnique['Uncertainty_Outcome'] == 0, 'oldIndex'], :].describe()

In [None]:
# Fraction of missing values per variable among the Low Risk (0) cases.
lowRiskRows = dfTestMissing.loc[dfUnique.loc[dfUnique['Uncertainty_Outcome'] == 0, 'oldIndex'], :]
pd.DataFrame(lowRiskRows.isnull().sum() / len(lowRiskRows))

## LRs --> HRs

In [None]:
# Mask of cases that are low risk in dfTest (Binary_Outcome == 0) but High Risk (1) after imputation,
# and the number of dfTestMissing rows it selects.
# NOTE(review): the mask is positional over dfTest, while dfTestMissing was re-read and extended with
# the external set earlier in this file — confirm both have the same length and row order.
cond = np.asarray((dfTest['Binary_Outcome'] == 0) & (uniqueUncertainty_Outcomes == 1))
dfTestMissing.loc[cond].shape[0]

In [None]:
# Summary statistics of the dfTestMissing rows selected by the current `cond` mask.
dfTestMissing.loc[cond].describe()

In [None]:
# Fraction of missing values per variable among the rows selected by the current `cond` mask.
lowToHighRows = dfTestMissing.loc[cond, :]
pd.DataFrame(lowToHighRows.isnull().sum() / len(lowToHighRows))

## HRs --> LRs

In [None]:
# Mask of cases that are high risk in dfTest (Binary_Outcome == 1) but Low Risk (0) after imputation,
# and the number of dfTestMissing rows it selects.
# NOTE(review): the mask is positional over dfTest, while dfTestMissing was re-read and extended with
# the external set earlier in this file — confirm both have the same length and row order.
cond = np.asarray((dfTest['Binary_Outcome'] == 1) & (uniqueUncertainty_Outcomes == 0))
dfTestMissing.loc[cond].shape[0]

In [None]:
# Summary statistics of the dfTestMissing rows selected by the current `cond` mask.
dfTestMissing.loc[cond].describe()

In [None]:
# Fraction of missing values per variable among the rows selected by the current `cond` mask.
highToLowRows = dfTestMissing.loc[cond, :]
pd.DataFrame(highToLowRows.isnull().sum() / len(highToLowRows))