In [1]:
%matplotlib inline
import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score, auc
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import StratifiedKFold
import warnings
# Blanket filter: every warning category is silenced for the whole kernel
# session, which includes deprecation/future warnings raised by the libraries
# imported above.
# NOTE(review): consider narrowing this to specific categories/modules so that
# API deprecations stay visible when dependencies are upgraded.
warnings.filterwarnings('ignore')

### Read data files

In [2]:
# Full table: every missing value is replaced by 0 right after loading.
df_all = pd.read_csv('./data/data_all.csv').fillna(0)
# Relabel zeros in Calcs1 (which, after the fill above, includes formerly
# missing entries) as the string 'None'.
# The result is assigned back to the column explicitly: calling
# `replace(..., inplace=True)` on `df_all.Calcs1` mutates a temporary Series
# and is not guaranteed to reach `df_all` (it never does under pandas
# Copy-on-Write).
df_all['Calcs1'] = df_all['Calcs1'].replace(0, 'None')

# Raw test-set table; only its ID column is used by the split cell further down.
df_test = pd.read_csv('./data/data_test.csv')

# Cross-validation predictions table; only its ID column is used by the split
# cell further down.
df_cv = pd.read_csv('./data/predictions_cv.csv')

In [4]:
# Size = the largest of Length / Width / Thickness for each row.
# A column-wise max is vectorized and ignores NaN consistently; the builtin
# max() applied row by row returns an order-dependent result when one of the
# values is NaN (NaN compares False against everything), and df_test has not
# been through fillna.
df_all['Size'] = df_all[['Length', 'Width', 'Thickness']].max(axis=1)
# Note: df_test is re-derived from df_all in the split cell below, so this
# column on the raw test frame is not what the later statistics cells read.
df_test['Size'] = df_test[['Length', 'Width', 'Thickness']].max(axis=1)

### Split into train and test sets

In [6]:
# IDs present in the test file and in the cross-validation predictions file.
# `.values` yields the same NumPy array that `Series.as_matrix()` used to
# return; `as_matrix()` was deprecated in pandas 0.23 and removed in 1.0.
test_ids = df_test['ID'].values
cv_ids = df_cv['ID'].values

# From here on df_cv / df_test are re-bound to the matching rows of df_all
# (the frame that went through fillna and carries the Size column), replacing
# the raw frames loaded from disk.
df_cv = df_all[df_all.ID.isin(cv_ids)]
df_test = df_all[df_all.ID.isin(test_ids)]

### Get statistics

In [9]:
# Count distinct patients per split. Each ID is split on '_' and the first
# piece is kept as the patient key, so several IDs sharing that prefix
# collapse into one entry of the set.
# After this cell test_ids / cv_ids hold sets of those prefixes rather than
# the full ID arrays.
print('Number of patients:')
test_ids = {case_id.split('_')[0] for case_id in test_ids}
print('Test: {}'.format(len(test_ids)))
cv_ids = {case_id.split('_')[0] for case_id in cv_ids}
print('Train: {}'.format(len(cv_ids)))
# The union counts a prefix that appears in both splits only once.
print('All: {}'.format(len(cv_ids | test_ids)))

Number of patients:
Test: 91
Train: 1139
All: 1227


In [11]:
# Row counts per split, overall and by the value of the Cancer column
# (0 is reported as benign, 1 as malignant). Summing a boolean mask gives the
# number of matching rows.
print('Number of cases:')
print('Train: {}'.format(df_cv.shape[0]))
print('Train benign: {}'.format(int(df_cv['Cancer'].eq(0).sum())))
print('Train malignant: {}'.format(int(df_cv['Cancer'].eq(1).sum())))
print('Test: {}'.format(df_test.shape[0]))
print('Test benign: {}'.format(int(df_test['Cancer'].eq(0).sum())))
print('Test malignant: {}'.format(int(df_test['Cancer'].eq(1).sum())))

Number of cases:
Train: 1278
Train benign: 1151
Train malignant: 127
Test: 99
Test benign: 84
Test malignant: 15


In [14]:
# Mean of the Age column per split and over both splits pooled together.
# NOTE(review): df_all had every missing value filled with 0 at load time; if
# Age had gaps, those rows count as age 0 here — confirm.
print('Mean age:')
print('Train: {}'.format(np.mean(df_cv.Age)))
print('Test: {}'.format(np.mean(df_test.Age)))
# `.values` replaces Series.as_matrix(), which was removed in pandas 1.0.
# The pooled list is test rows followed by train rows.
all_age = list(df_test.Age.values) + list(df_cv.Age.values)
print('All: {}'.format(np.mean(all_age)))

Mean age:
Train: 53.2472613459
Test: 52.3232323232
All: 53.1808278867


In [17]:
# Mean and standard deviation of the Size column per split and pooled.
# np.std is called with its default ddof=0, i.e. the population standard
# deviation, for all three lines.
print('Mean size (STD):')
print('Train: {} ({})'.format(np.mean(df_cv.Size), np.std(df_cv.Size)))
print('Test: {} ({})'.format(np.mean(df_test.Size), np.std(df_test.Size)))
# `.values` replaces Series.as_matrix(), which was removed in pandas 1.0.
# The pooled list is test rows followed by train rows.
all_size = list(df_test.Size.values) + list(df_cv.Size.values)
print('All: {} ({})'.format(np.mean(all_size), np.std(all_size)))

Mean size (STD):
Train: 25.9248826291 (14.8328398064)
Test: 26.8787878788 (12.7455856706)
All: 25.9934640523 (14.6947379705)
