In [1]:
import os

import wget

# Fetch the helper module only when it is not already on disk, so that
# Restart & Run All works offline and does not pile up duplicate downloads.
# NOTE(review): the URL tracks the `master` branch, so the downloaded code can
# change between runs on different machines; consider pinning a commit hash.
UTILS_URL = 'https://raw.githubusercontent.com/BorisMuzellec/MissingDataOT/master/utils.py'
if not os.path.exists('utils.py'):
    wget.download(UTILS_URL, out='utils.py')

import numpy as np
import pandas as pd
# Star import kept on purpose: produce_NA below relies on the mask helpers
# (MAR_mask, MNAR_mask_logistic, MNAR_mask_quantiles, MNAR_self_mask_logistic)
# and other cells may rely on further names from utils.
from utils import *
import torch

In [2]:
# Seed both random number generators used in this notebook: NumPy's global
# generator and PyTorch's (the MCAR mask in produce_NA is drawn with
# torch.rand, which np.random.seed does not affect).
np.random.seed(0)
torch.manual_seed(0)

In [12]:
# Function produce_NA for generating missing values ------------------------------------------------------

def produce_NA(X, p_miss, mecha="MCAR", opt=None, p_obs=None, q=None):
    """
    Generate missing values for a given missing-data mechanism and proportion of missing values.

    Parameters
    ----------
    X : torch.Tensor, np.ndarray or array-like, shape (n, d)
        Data for which missing values will be simulated.
        Anything that is not already a tensor is converted to a float32 numpy
        array and then to a pytorch tensor, so such inputs are rounded to
        single precision before being widened to double in the output.
    p_miss : float
        Proportion of missing values to generate for variables which will have missing values.
    mecha : str
        Missing-data mechanism to be used: "MCAR" (default), "MAR" or "MNAR".
    opt : str
        Only used when mecha = "MNAR". Selects how the mask is generated:
        logistic regression ("logistic"), quantile censorship ("quantile") or
        logistic regression for a self-masked MNAR mechanism ("selfmasked").
    p_obs : float
        If mecha = "MAR", or mecha = "MNAR" with opt = "logistic" or "quantile",
        proportion of variables with *no* missing values that will be used for
        the masking model. For opt = "quantile" the value forwarded to the
        helper is 1 - p_obs.
    q : float
        If mecha = "MNAR" and opt = "quantile", quantile level at which the cuts should occur.

    Returns
    -------
    dict
        'X_init': the initial data matrix, as a double tensor.
        'X_incomp': the data with the generated missing values set to NaN, as a double tensor.
        'mask': a double tensor with 1 where a value was made missing and 0 elsewhere.

    Raises
    ------
    ValueError
        If `mecha` is not a supported mechanism, or if mecha = "MNAR" and `opt`
        is not one of the supported options. (Such inputs used to fall back
        to MCAR silently.)
    """
    if not torch.is_tensor(X):
        # np.asarray lets lists and DataFrames through as well as ndarrays.
        X = torch.from_numpy(np.asarray(X).astype(np.float32))

    if mecha == "MCAR":
        # Each entry is masked independently with probability p_miss.
        mask = (torch.rand(X.shape) < p_miss).double()
    elif mecha == "MAR":
        mask = MAR_mask(X, p_miss, p_obs).double()
    elif mecha == "MNAR":
        if opt == "logistic":
            mask = MNAR_mask_logistic(X, p_miss, p_obs).double()
        elif opt == "quantile":
            mask = MNAR_mask_quantiles(X, p_miss, q, 1 - p_obs).double()
        elif opt == "selfmasked":
            mask = MNAR_self_mask_logistic(X, p_miss).double()
        else:
            raise ValueError(
                'For mecha="MNAR", opt must be "logistic", "quantile" or '
                '"selfmasked"; got {!r}'.format(opt)
            )
    else:
        raise ValueError(
            'mecha must be "MCAR", "MAR" or "MNAR"; got {!r}'.format(mecha)
        )

    # Work on a copy so the tensor returned as X_init keeps every value.
    X_nas = X.clone()
    X_nas[mask.bool()] = np.nan

    return {'X_init': X.double(), 'X_incomp': X_nas.double(), 'mask': mask}

In [3]:
# Load the training table; 'trainSet.txt' is resolved relative to the current
# working directory and parsed with read_csv's default settings.
data = pd.read_csv('trainSet.txt')

In [4]:
# Number of complete rows, i.e. rows without a missing value in any column
data.dropna().shape[0]

6

In [5]:
# Fraction of missing values in each column (0 to 1), rounded to two decimals
data.isna().mean().round(2)

PatientID                0.00
ImageFile                0.00
Hospital                 0.00
Age                      0.00
Sex                      0.00
Temp_C                   0.18
Cough                    0.01
DifficultyInBreathing    0.00
WBC                      0.01
CRP                      0.04
Fibrinogen               0.68
LDH                      0.16
Ddimer                   0.72
Ox_percentage            0.28
PaO2                     0.20
SaO2                     0.68
pH                       0.24
CardiovascularDisease    0.02
RespiratoryFailure       0.18
Prognosis                0.00
dtype: float64

In [10]:
# Remove the listed columns, then keep only the rows that are complete in the
# remaining ones.
dataDropped = (
    data
    .drop(columns=['Ddimer', 'SaO2', 'Fibrinogen', 'Prognosis', 'PatientID', 'ImageFile', 'Hospital'])
    .dropna()
)
# Binary variables (Hackathon description_26_04.pdf), re-encoded as integer codes.
# pd.factorize numbers the categories in order of first appearance, so which
# raw value becomes 0 depends on the row order of the table.
dataDropped = dataDropped.assign(**{
    name: pd.factorize(dataDropped[name])[0]
    for name in ['Cough', 'Sex', 'DifficultyInBreathing', 'CardiovascularDisease', 'RespiratoryFailure']
})

Unnamed: 0,Age,Sex,Temp_C,Cough,DifficultyInBreathing,WBC,CRP,LDH,Ox_percentage,PaO2,pH,CardiovascularDisease,RespiratoryFailure
24,56.0,0,38.3,0,0,7.85,9.00,159.0,93.0,73.2,7.47,0,0
25,61.0,0,38.0,1,0,3.57,57.40,309.0,92.0,57.0,7.47,0,0
27,64.0,0,36.0,1,1,5.26,41.90,299.0,96.0,72.2,7.51,1,0
28,54.0,0,36.6,1,1,7.01,67.90,458.0,97.0,61.0,7.45,0,0
29,44.0,0,37.7,0,1,9.18,42.70,243.0,98.0,23.0,7.43,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
815,47.0,1,36.0,1,0,11.20,0.02,161.0,97.0,99.0,7.40,0,0
821,59.0,1,36.3,0,1,6.90,1.15,237.0,95.0,82.0,7.39,0,0
827,52.0,0,37.5,0,0,5.40,1.63,307.0,89.0,62.3,7.46,0,0
836,48.0,0,36.5,0,1,5.91,1.86,235.0,99.0,81.9,7.42,0,0


In [14]:
# Simulate MCAR missingness on the complete-case table: with mecha="MCAR",
# produce_NA masks each entry independently with probability p_miss (0.4 here).
dataDroppedMCAR = produce_NA(dataDropped.to_numpy(), p_miss=0.4, mecha="MCAR")

In [15]:
# Inspect the result: a dict of tensors under 'X_init', 'X_incomp' and 'mask'
dataDroppedMCAR

{'X_init': tensor([[56.0000,  0.0000, 38.3000,  ...,  7.4700,  0.0000,  0.0000],
         [61.0000,  0.0000, 38.0000,  ...,  7.4700,  0.0000,  0.0000],
         [64.0000,  0.0000, 36.0000,  ...,  7.5100,  1.0000,  0.0000],
         ...,
         [52.0000,  0.0000, 37.5000,  ...,  7.4600,  0.0000,  0.0000],
         [48.0000,  0.0000, 36.5000,  ...,  7.4200,  0.0000,  0.0000],
         [50.0000,  0.0000, 39.0000,  ...,  7.4400,  0.0000,  0.0000]],
        dtype=torch.float64),
 'X_incomp': tensor([[56.0000,  0.0000, 38.3000,  ...,  7.4700,  0.0000,  0.0000],
         [    nan,  0.0000, 38.0000,  ...,     nan,  0.0000,  0.0000],
         [64.0000,     nan,     nan,  ...,  7.5100,  1.0000,  0.0000],
         ...,
         [52.0000,     nan,     nan,  ...,     nan,     nan,     nan],
         [    nan,  0.0000, 36.5000,  ...,     nan,     nan,     nan],
         [50.0000,     nan, 39.0000,  ...,  7.4400,     nan,  0.0000]],
        dtype=torch.float64),
 'mask': tensor([[0., 0., 0.,  ..., 