**Note:**
This note book is to provide a general structure of the project and responsibility distribution

In [2]:
from sklearn.base import BaseEstimator, RegressorMixin
from scipy.optimize import minimize
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import mean_squared_error, make_scorer
import pandas as pd
import random
from collections import Counter

from random import shuffle
import os


## Load Data

In [27]:
#os.listdir('/Users/NhungLe/Box Sync/Free/Data-Science-Projects/Breast Cancer Diagnosis/Raw Data/')

In [28]:
path = '/Users/NhungLe/Box Sync/Free/Data-Science-Projects/Breast Cancer Diagnosis/Raw Data/'

In [15]:
files = ['calc_case_description_train_set.csv', 'mass_case_description_train_set.csv',
        'calc_case_description_test_set.csv','mass_case_description_test_set.csv' ]

In [52]:
dataframes = {}
for file in files:
    name_l = file.split('_')
    f_name = str(name_l[0]) + '_' + str(name_l[3])
    df = pd.read_csv(path + file)
    dataframes[f_name] = df

In [55]:
# cal_train = pd.read_csv(path + 'calc_case_description_train_set.csv')
# cal_test = pd.read_csv(path + 'calc_case_description_test_set.csv')
# mass_train = pd.read_csv(path + 'mass_case_description_train_set.csv')
# mass_test = pd.read_csv(path + 'mass_case_description_test_set.csv')

In [59]:
for key in dataframes.keys(): 
    df = dataframes[key]
    print('Shape of {} is {}'.format(key, df.shape))
    print('Number of patients from {} is {}'.format(key, df['patient_id'].nunique() ))

Shape of calc_train is (1546, 14)
Number of patients from calc_train is 602
Shape of mass_train is (1318, 14)
Number of patients from mass_train is 691
Shape of calc_test is (326, 14)
Number of patients from calc_test is 151
Shape of mass_test is (378, 14)
Number of patients from mass_test is 201


In [None]:
print(1)

In [18]:
columns = ['image view', 'abnormality id', 'abnormality type', 'calc type', 'calc distribution', 'assessment', 'pathology',
      'subtlety']

In [20]:
for col in columns: 
    print('---------------')
    print('Unique values of column {}'.format(col))
    print(cal_train_desc[col].value_counts())

---------------
Unique values of column image view
MLO    807
CC     739
Name: image view, dtype: int64
---------------
Unique values of column abnormality id
1    1172
2     219
3      88
4      35
5      20
6      10
7       2
Name: abnormality id, dtype: int64
---------------
Unique values of column abnormality type
calcification    1546
Name: abnormality type, dtype: int64
---------------
Unique values of column calc type
PLEOMORPHIC                                                 664
AMORPHOUS                                                   138
PUNCTATE                                                    106
LUCENT_CENTER                                                93
VASCULAR                                                     82
FINE_LINEAR_BRANCHING                                        77
COARSE                                                       35
ROUND_AND_REGULAR-LUCENT_CENTER                              31
PLEOMORPHIC-FINE_LINEAR_BRANCHING                         

In [13]:
# Some patients have more than 1 pathology
multi_path_cal = cal_train_desc.groupby('patient_id').filter(lambda x: x['pathology'].nunique() > 1)

In [14]:
multi_path_cal

Unnamed: 0,patient_id,breast density,left or right breast,image view,abnormality id,abnormality type,calc type,calc distribution,assessment,pathology,subtlety,image file path,cropped image file path,ROI mask file path
293,P_00418,3,RIGHT,MLO,1,calcification,COARSE-ROUND_AND_REGULAR,,2,BENIGN_WITHOUT_CALLBACK,4,Calc-Training_P_00418_RIGHT_MLO/1.3.6.1.4.1.95...,Calc-Training_P_00418_RIGHT_MLO_1/1.3.6.1.4.1....,Calc-Training_P_00418_RIGHT_MLO_1/1.3.6.1.4.1....
292,P_00418,3,RIGHT,CC,1,calcification,COARSE-ROUND_AND_REGULAR,,2,BENIGN_WITHOUT_CALLBACK,4,Calc-Training_P_00418_RIGHT_CC/1.3.6.1.4.1.959...,Calc-Training_P_00418_RIGHT_CC_1/1.3.6.1.4.1.9...,Calc-Training_P_00418_RIGHT_CC_1/1.3.6.1.4.1.9...
290,P_00418,3,LEFT,CC,1,calcification,PLEOMORPHIC,CLUSTERED,4,MALIGNANT,2,Calc-Training_P_00418_LEFT_CC/1.3.6.1.4.1.9590...,Calc-Training_P_00418_LEFT_CC_1/1.3.6.1.4.1.95...,Calc-Training_P_00418_LEFT_CC_1/1.3.6.1.4.1.95...
291,P_00418,3,LEFT,MLO,1,calcification,PLEOMORPHIC,CLUSTERED,4,MALIGNANT,2,Calc-Training_P_00418_LEFT_MLO/1.3.6.1.4.1.959...,Calc-Training_P_00418_LEFT_MLO_1/1.3.6.1.4.1.9...,Calc-Training_P_00418_LEFT_MLO_1/1.3.6.1.4.1.9...
329,P_00467,3,LEFT,CC,1,calcification,COARSE,,2,BENIGN_WITHOUT_CALLBACK,3,Calc-Training_P_00467_LEFT_CC/1.3.6.1.4.1.9590...,Calc-Training_P_00467_LEFT_CC_1/1.3.6.1.4.1.95...,Calc-Training_P_00467_LEFT_CC_1/1.3.6.1.4.1.95...
330,P_00467,3,LEFT,MLO,1,calcification,COARSE,,2,BENIGN_WITHOUT_CALLBACK,3,Calc-Training_P_00467_LEFT_MLO/1.3.6.1.4.1.959...,Calc-Training_P_00467_LEFT_MLO_1/1.3.6.1.4.1.9...,Calc-Training_P_00467_LEFT_MLO_1/1.3.6.1.4.1.9...
331,P_00467,3,RIGHT,CC,1,calcification,PLEOMORPHIC,CLUSTERED,4,MALIGNANT,3,Calc-Training_P_00467_RIGHT_CC/1.3.6.1.4.1.959...,Calc-Training_P_00467_RIGHT_CC_1/1.3.6.1.4.1.9...,Calc-Training_P_00467_RIGHT_CC_1/1.3.6.1.4.1.9...
332,P_00467,3,RIGHT,MLO,1,calcification,PLEOMORPHIC,CLUSTERED,4,MALIGNANT,3,Calc-Training_P_00467_RIGHT_MLO/1.3.6.1.4.1.95...,Calc-Training_P_00467_RIGHT_MLO_1/1.3.6.1.4.1....,Calc-Training_P_00467_RIGHT_MLO_1/1.3.6.1.4.1....
447,P_00557,2,RIGHT,MLO,3,calcification,PLEOMORPHIC,CLUSTERED,2,BENIGN_WITHOUT_CALLBACK,4,Calc-Training_P_00557_RIGHT_MLO/1.3.6.1.4.1.95...,Calc-Training_P_00557_RIGHT_MLO_3/1.3.6.1.4.1....,Calc-Training_P_00557_RIGHT_MLO_3/1.3.6.1.4.1....
445,P_00557,2,RIGHT,MLO,1,calcification,PLEOMORPHIC,CLUSTERED,2,BENIGN_WITHOUT_CALLBACK,4,Calc-Training_P_00557_RIGHT_MLO/1.3.6.1.4.1.95...,Calc-Training_P_00557_RIGHT_MLO_1/1.3.6.1.4.1....,Calc-Training_P_00557_RIGHT_MLO_1/1.3.6.1.4.1....


In [49]:
multi_path_cal['patient_id'].nunique()

14

In [50]:
multi_path_cal.groupby('patient_id')['pathology'].nunique()

patient_id
P_00418    2
P_00467    2
P_00557    2
P_00600    2
P_00858    2
P_00937    2
P_00992    2
P_01156    2
P_01200    2
P_01276    2
P_01284    2
P_01409    2
P_01582    2
P_01819    2
Name: pathology, dtype: int64

**Note 1:** 
- There are 14 patients from Calcification train with more than 1 pathology so we can just leave these cases out.
- For these 14 patients, sometimes it is because they have biopsy for left and right breasts, each has a different pathology. Sometimes, on the same breast, some patient (e.g., P_00600) has both pathologies.

# Mass Train

In [3]:
mass_train_desc.sort_values(by = 'patient_id', inplace = True)

In [17]:
mass_train_desc['patient_id'].nunique()

691

In [16]:
mass_train_desc.shape

(1318, 14)

In [34]:
columns = ['image view', 'abnormality id', 'abnormality type', 'mass shape', 'mass margins', 'assessment', 'pathology',
      'subtlety']

In [35]:
for col in columns: 
    print('---------------')
    print('Unique values of column {}'.format(col))
    print(mass_train_desc[col].value_counts())

---------------
Unique values of column image view
MLO    711
CC     607
Name: image view, dtype: int64
---------------
Unique values of column abnormality id
1    1216
2      68
3      23
4       7
6       2
5       2
Name: abnormality id, dtype: int64
---------------
Unique values of column abnormality type
mass    1318
Name: abnormality type, dtype: int64
---------------
Unique values of column mass shape
IRREGULAR                                   351
OVAL                                        321
LOBULATED                                   305
ROUND                                       123
ARCHITECTURAL_DISTORTION                     80
IRREGULAR-ARCHITECTURAL_DISTORTION           45
LYMPH_NODE                                   26
ASYMMETRIC_BREAST_TISSUE                     20
FOCAL_ASYMMETRIC_DENSITY                     19
OVAL-LYMPH_NODE                               6
LOBULATED-IRREGULAR                           5
LOBULATED-LYMPH_NODE                          3
ROUND-OVAL  

In [53]:
multi_path_mass = mass_train_desc.groupby('patient_id').filter(lambda x: x['pathology'].nunique()> 1)

In [56]:
multi_path_mass['patient_id'].nunique()

13

### Check if every patient has both mass and cal images

In [59]:
cal_patient = cal_train_desc['patient_id'].unique().tolist()
mass_patient = mass_train_desc['patient_id'].unique().tolist()

In [70]:
print('Number of patients with cal images: {}'.format(len(cal_patient)))
print('Number of patients with mass images: {}'.format(len(mass_patient))) 
print('Number of patients that are in one list but not other {}'.format(len(np.setdiff1d(cal_patient, mass_patient).tolist())))
print('Number of patients that are in both lists {}'.format(len(set(cal_patient).intersection(set(mass_patient)))))

Number of patients with cal images: 602
Number of patients with mass images: 691
Number of patients that are in one list but not other 557
Number of patients that are in both lists 45
