# Breast Cancer clustering imputation method

## Import Data

In [None]:
import pandas as pd

In [None]:
# Load the pre-processed CSV files: the full dataset, the training set, and
# two test sets, each with a companion "Missing" file (judging by the file
# names, the same cases with values removed -- TODO confirm).
df = pd.read_csv('./Data/preProcData.csv')
dfTrain = pd.read_csv('./Data/preProcDataTrain.csv')
dfTest = pd.read_csv('./Data/preProcDataTest.csv')
dfTestMissing = pd.read_csv('./Data/preProcDataTestMissing.csv')
dfTestExt = pd.read_csv('./Data/preProcDataExtTest.csv')
dfTestMissingExt = pd.read_csv('./Data/preProcDataExtTestMissing.csv')

# Merge the validation and testing sets.  `DataFrame.append` was deprecated in
# pandas 1.4 and removed in pandas 2.0; `pd.concat` with ignore_index=True
# produces the same row order and a fresh 0..n-1 index.
dfTest = pd.concat([dfTest, dfTestExt], ignore_index=True)
dfTestMissing = pd.concat([dfTestMissing, dfTestMissingExt], ignore_index=True)

# Preview the first rows (.loc slicing is label-based, so rows 0 through 5
# inclusive are shown).
dfTestMissing.loc[0:5, :]

In [None]:
# Report how many rows the merged test set with missing values contains.
print('Number of instances: ', dfTestMissing.shape[0])

## Define categorical and continuous variables

In [None]:
# Feature names grouped by variable type.
categorical = [
    'black',
    'white',
    'asian',
    'hispanic',
    'c_numBiopsy',
    'C_firstlivebirthage',
    'C_1stDegreeRelativesQty_BC',
]
continuous = [
    'C_currentage',
    'C_menarcheage',
]

# Columns to introduce uncertainty (+-1 range).
linearVarCols = [
    'C_currentage',
    'C_menarcheage',
]

# Method of multiple clusters

## Parallel processing across multiple cores

In [None]:
from tqdm import tqdm_notebook
import mcip
from mcip import *
from joblib import Parallel, delayed
import multiprocessing

# One worker per CPU reported by the operating system.
numCores = multiprocessing.cpu_count()


# Run pipelineOfVariation once per case, in parallel.  The case indices are
# taken from dfTestMissing -- the frame actually passed as `dfTest` below --
# rather than from the separate dfTest frame, so the index range always
# matches the data being indexed.  joblib returns the results in input order,
# so X holds one result per case, ordered by case index.
X = Parallel(n_jobs=numCores)(
    delayed(pipelineOfVariation)(
        caseInd,
        dfTrain=dfTrain,
        dfTest=dfTestMissing,
        printOutput=False,
        tolerance_Value=0.9,
        categorical=categorical,
        continuous=continuous,
        radius=50,
        alpha=0.5,
        variations=True,
        partialLinear=True,
        linearVarCols=linearVarCols,
    )
    for caseInd in tqdm_notebook(range(len(dfTestMissing)))
)

In [None]:
##############################################################################
# Imputation of only one case (index 5), using the same keyword arguments as
# the parallel run.  The call is wrapped in a string literal, so it is not
# executed; remove the surrounding triple quotes to run it.
"""
pipelineOfVariation(caseInd=5, dfTrain=dfTrain, dfTest=dfTestMissing,printOutput=False, \
                       tolerance_Value=0.9,categorical=categorical,continuous=continuous, \
                       radius=50, alpha=0.5,variations=True, partialLinear=True, \
                       linearVarCols=linearVarCols)
"""

## To dataframe

In [None]:
# Imported explicitly: no numpy import is visible in this notebook, so `np`
# would otherwise have to come from the `from mcip import *` star import.
import numpy as np

# X is a list of numpy arrays; stack them into one 2-D array.  A single
# vstack call copies every row once, whereas stacking inside a loop re-copies
# the whole growing array on each iteration.  The leading empty array (default
# float dtype, zero rows) preserves the previous behaviour: the same dtype
# promotion, and an empty X still yields an array of shape (0, n_columns).
x = np.vstack([np.empty((0, dfTestMissing.shape[1])), *X])

In [None]:
# Display the (rows, columns) dimensions of the stacked array.
x.shape

In [None]:
# Wrap the stacked array in a DataFrame, reusing the column labels of
# dfTestMissing, then preview the first three rows.
x = pd.DataFrame(x, columns=dfTestMissing.columns)
x.head(n=3)

In [None]:
# Count the missing values per column; if every count is zero, the imputation
# was successful.
x.isna().sum()

## Race variable based on most frequent race

In [None]:
# Impute race based on the most frequent race (per the original author's note).
# NOTE(review): getRace is not defined in this notebook -- presumably it comes
# from the `mcip` star import; confirm its exact behaviour there.
x = getRace(x)

In [None]:
# Preview the first five rows (the default for head) after the race step.
x.head()

In [None]:
# Re-count the missing values per column after the race step.
x.isna().sum()

In [None]:
# Apply the same getRace step to dfTest (the merged test set loaded without
# the "Missing" suffix); the result is written to disk in the save step.
dfTest = getRace(x=dfTest)
# Preview the first five rows of the result.
dfTest.head(5)

## Save dataset

In [None]:
# Write the imputed dataset and the race-processed test set to CSV, leaving
# the row index out of both files.
x.to_csv(path_or_buf='./Data/datasetOfCI.csv', index=False)
dfTest.to_csv(path_or_buf='./Data/preProcDataTestWithRace.csv', index=False)