# Breast Cancer clustering imputation method

## Import Data

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Alternative input file, kept for reference only (not loaded):
#df = pd.read_csv('./Data/barlow_risk_data_multiple.txt',header=None,sep="\s*", engine='python')
# Read the CSV into the notebook-wide frame `df` and preview its first four rows.
athena_csv_path = './Data/Athena_UCLA_Data.csv'
df = pd.read_csv(athena_csv_path)
df.head(n=4)

In [None]:
# Report how many rows were loaded (first entry of the frame's shape).
print('Number of instances: ' , df.shape[0])

### Unique elements for each Variable

In [None]:
# List the distinct values found in every column, one line per column.
for column_name in df.columns:
    distinct_values = df[column_name].unique()
    print('Unique values of variable \"' + column_name + '\": ' + str(distinct_values))

### Replace Missing values
- '\\0' values
- '99' values

In [None]:
# Turn every literal backslash-zero placeholder string into NaN across the frame.
df.replace(value=np.nan,to_replace='\\0',inplace=True)

# Every column except the age column uses 99 as a missing-value code.
cols = [i for i in df.columns if i!='C_currentage'] # 99 is not a missing value for age according to the dictionary

# Assign whole columns instead of writing through df.loc[:, cols]: a .loc
# write tries to keep each column's existing dtype, which an integer column
# cannot do once NaN is introduced. Whole-column assignment lets the dtype change.
df[cols] = df[cols].replace(to_replace=99, value=np.nan)

print('List of Variables and the number of missing values per variable: ')
df.isnull().sum()

### Change type of string columns to numeric

In [None]:
# Convert the columns that were read as strings into numeric dtype.
# The original cell converted C_1stDegreeRelativesQty_BC twice; the repeat was
# a no-op and is dropped here.
# NOTE(review): the duplicated line may have been meant for a different column - confirm.
numeric_columns = [
    'C_1stDegreeRelativesQty_BC',
    'C_firstlivebirthage',
    'asian',
    'hispanic',
]
for column_name in numeric_columns:
    # Bracket indexing makes it explicit that a column (not an attribute) is set.
    df[column_name] = pd.to_numeric(df[column_name])

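### Replace Missing values (second pass, after numeric conversion)
- '\\0' values
- '99' values (repeated so that 99 codes in the columns just converted to numeric are also replaced)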

In [None]:
# Turn every literal backslash-zero placeholder string into NaN across the frame.
df.replace(value=np.nan,to_replace='\\0',inplace=True)

# Every column except C_currentage has its 99 values treated as missing.
cols = [i for i in df.columns if i!='C_currentage']

# Assign whole columns instead of writing through df.loc[:, cols]: a .loc
# write tries to keep each column's existing dtype, which an integer column
# cannot do once NaN is introduced. Whole-column assignment lets the dtype change.
df[cols] = df[cols].replace(to_replace=99, value=np.nan)

print('List of Variables and the number of missing values per variable: ')
df.isnull().sum()

## Recode Hispanic value 4 as 1

- The variable takes the values 0, 1 and 4. Both 1 and 4 indicate that the person is Hispanic, so 4 is merged into 1.

In [None]:
# Merge code 4 into code 1 in the hispanic column.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on `df.hispanic`: with pandas Copy-on-Write the
# chained in-place call may operate on a temporary Series and leave `df` unchanged.
df['hispanic'] = df['hispanic'].replace(to_replace=4, value=1)

## Dropping American Indians

- American Indian cases are dropped because the Gail model in our possession does not account for this race.

In [None]:
# Keep only the rows whose amindian flag is not 1, then discard the flag column.
not_amindian = df['amindian'] != 1
cols = [name for name in df.columns if name != 'amindian']

# Row filter, column selection and a fresh 0..n-1 index in one step.
df = df.loc[not_amindian, cols].reset_index(drop=True)

df.shape

## Number of missing values per variable

In [None]:
# Count of missing entries in each column.
df.isna().sum()

In [None]:
# Missing entries in each column, expressed as a fraction of the row count.
df.isna().sum().div(len(df))

In [None]:
# Row count of df at this point in the notebook.
len(df)

## Split into training, testing and validation sets

- Here we provide the option of generating a validation set. For our analysis we merged the validation and testing sets, resulting in a 60% training set and a 40% test set.

- 20% testing data
- 80% training data

- If `external_validation=True`:

    - 20% testing data
    - 20% external testing data
    - 60% training data

In [None]:
import mcip
from mcip import *

In [None]:
# Split a copy of df into training, testing and external-testing frames.
# NOTE(review): train_test_split presumably comes from the `mcip` wildcard
# import; its exact sampling behaviour is not visible here.
dfTrain, dfTest, dfExtTest = train_test_split(
    df.copy(),
    testSetSize=0.2,
    extTestSetSize=0.2,
    external_validation=True,
    as_dataframe=True,
)

# Report the shape of each partition.
for label, part in (
    ('Shape of training data: ', dfTrain),
    ('Shape of testing data: ', dfTest),
    ('Shape of external testing data: ', dfExtTest),
):
    print(label, part.shape)

In [None]:
# Number of distinct oldIndex values in the test set (NaN, if present, counts as one value).
dfTest['oldIndex'].nunique(dropna=False)

## Create a complete dataset of test cases

- Dropping cases with missing values from the validation and test sets.

In [None]:
# Keep only fully observed test rows and renumber them from 0.
dfTest = dfTest.dropna(axis=0).reset_index(drop=True)
dfTest.head(n=5)

In [None]:
# Shape of the test set after rows with missing values were dropped.
dfTest.shape

In [None]:
# Keep only fully observed external-test rows and renumber them from 0.
dfExtTest = dfExtTest.dropna(axis=0).reset_index(drop=True)
dfExtTest.head(n=5)

In [None]:
# Shape of the external test set after rows with missing values were dropped.
dfExtTest.shape

## Save data 

In [None]:
# Write the full frame and each partition to CSV, without the index column.
csv_outputs = (
    ('./Data/preProcData.csv', df),
    ('./Data/preProcDataTrain.csv', dfTrain),
    ('./Data/preProcDataTest.csv', dfTest),
    ('./Data/preProcDataExtTest.csv', dfExtTest),
)
for csv_path, frame in csv_outputs:
    frame.to_csv(csv_path, index=False)

In [None]:
# Display the current contents of `cols`.
cols

# Data Analysis of variables

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

plt.style.use('ggplot')

font = {'family' : 'DejaVu Sans',
        'size'   : 14}

plt.rc('font', **font)

plt.rc('axes',labelsize=18)

fig, axes = plt.subplots(3, 3, figsize=(30, 30));

cols = [i for i in df.columns if i not in ['G_5yearscore','MRN_D']]

dicrete_cols = ['Black','White','Asian','Hispanic','Number of biopsies','First Degree Relatives \n with breast cancer']

ylabel = 'counts'

titles = ['Black','White','Asian','Hispanic','Current age','Menarche age','Age of first birth','Number of biopsies',
          'First Degree Relatives \n with breast cancer']

xlabels = ['Black/Counts','White/Counts','Asian/Counts','Hispanic/Counts','Current age/Years','Menarche age/Years',
           'Age of first birth/Years','Number of biopsies/Counts','First Degree Relatives \n with breast cancer/Counts']

subPlotRow = 0
subPlotCol = 0

for col,xlabel,title in zip(cols,xlabels,titles):
    
    if title in dicrete_cols:
        counts = df[col].dropna().astype('category').value_counts().as_matrix()
        bars = len(df[col].dropna().astype('category').unique())
        axes[subPlotRow,subPlotCol].bar(range(bars),counts,
                                         alpha=0.5,width = 0.5,color='blue')
        axes[subPlotRow,subPlotCol].set_title('Distribution of ' + title)
        axes[subPlotRow,subPlotCol].set_xlabel(xlabel)
        axes[subPlotRow,subPlotCol].set_ylabel(ylabel)
        axes[subPlotRow,subPlotCol].set_xticks(range(bars))# df[col].dropna().astype('category').unique())
    else:
        axes[subPlotRow,subPlotCol].hist(df[col].dropna().as_matrix(),
                                         alpha=0.5,bins=20,color = 'blue', align='left')
        
        if title=='Menarche age':
            axes[subPlotRow,subPlotCol].set_xticks(range(int(df[col].dropna().min()),
                                                         int(df[col].dropna().max()),2))
            
        axes[subPlotRow,subPlotCol].set_title('Distribution of ' + title)
        axes[subPlotRow,subPlotCol].set_xlabel(xlabel)
        axes[subPlotRow,subPlotCol].set_ylabel(ylabel)
    
    if subPlotCol == 2:
        subPlotRow += 1
        subPlotCol = -1
        
    subPlotCol += 1
    