In [1]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import statsmodels.formula.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings("ignore")
sns.set_style("ticks")
%matplotlib inline

In [2]:
bcdr_d0G_med = pd.read_csv('../data/clean/bcdr_d0G_med.csv')
bcdr_d0G_med.head()

Unnamed: 0,patient_id,study_id,series,lesion_id,segmentation_id,image_view,mammography_type,mammography_nodule,mammography_calcification,mammography_microcalcification,...,t_homo,t_savgh,t_svarh,t_senth,t_entro,t_dvarh,t_denth,t_inf1h,t_inf2h,classification
0,3,4,1,7,10,2,1,1,0,0,...,0.52281,55.3474,2771.1561,3.0068,4.3143,3.9515,1.5656,-0.19055,0.77071,Malign
1,3,4,1,7,11,4,1,1,0,0,...,0.46888,54.2754,2654.1706,3.1893,4.7304,6.7713,1.8039,-0.14996,0.72962,Malign
2,4,8,1,9,14,4,1,1,1,0,...,0.48007,36.5319,1166.2156,2.5313,4.0464,5.9552,1.7124,-0.028458,0.32486,Benign
3,4,8,1,9,15,2,1,1,1,0,...,0.43897,48.3651,2081.4079,3.0147,4.6922,7.3839,1.8264,-0.081846,0.57253,Benign
4,5,10,1,11,18,2,1,1,0,0,...,0.44106,38.5474,1306.9212,2.5311,4.2309,7.7829,1.8325,-0.024156,0.27899,Benign


In [20]:
bcdr_d0G_med['classification'] = bcdr_d0G_med['classification'].astype(str)

bcdr_d0G_med['diagnosis'] = [1 if x.strip() == 'Malign' else 0 for x in bcdr_d0G_med['classification']] 
bcdr_d0G_med = bcdr_d0G_med.drop(columns=[
    'patient_id', 'study_id','series',
    'lesion_id',  'segmentation_id',   'mammography_type', 
    'mammography_axillary_adenopathy', 'classification']) 
bcdr_d0G_med.head()

Unnamed: 0,image_view,mammography_nodule,mammography_calcification,mammography_microcalcification,mammography_architectural_distortion,mammography_stroma_distortion,age,density,i_mean,i_std_dev,...,t_homo,t_savgh,t_svarh,t_senth,t_entro,t_dvarh,t_denth,t_inf1h,t_inf2h,diagnosis
0,2,1,0,0,1,0,74,1.0,0.85478,0.096944,...,0.52281,55.3474,2771.1561,3.0068,4.3143,3.9515,1.5656,-0.19055,0.77071,1
1,4,1,0,0,1,0,74,1.0,0.83313,0.11988,...,0.46888,54.2754,2654.1706,3.1893,4.7304,6.7713,1.8039,-0.14996,0.72962,1
2,4,1,1,0,0,0,59,3.0,0.55903,0.070948,...,0.48007,36.5319,1166.2156,2.5313,4.0464,5.9552,1.7124,-0.028458,0.32486,0
3,2,1,1,0,0,0,59,3.0,0.75233,0.094601,...,0.43897,48.3651,2081.4079,3.0147,4.6922,7.3839,1.8264,-0.081846,0.57253,0
4,2,1,0,0,0,0,64,1.0,0.58568,0.069084,...,0.44106,38.5474,1306.9212,2.5311,4.2309,7.7829,1.8325,-0.024156,0.27899,0


In [None]:
plt.figure(figsize=(20,10))
sns.heatmap(bcdr_d0G_med.corr(), annot=True)

## Feature selection using P-value

In [21]:
data = bcdr_d0G_med

# Correlation metrics
label_encoder = LabelEncoder()
data.iloc[:,data.shape[1] - 1] = label_encoder.fit_transform(data.iloc[:,0]).astype('float64')
corr = data.corr()
#sns.heatmap(corr)


# Next, we compare the correlation between features 
# and remove one of two features that have a correlation higher than 0.9
columns = np.full((corr.shape[0],), True, dtype=bool)
for i in range(corr.shape[0]):
    for j in range(i+1, corr.shape[0]):
        if corr.iloc[i,j] >= 0.9:
            if columns[j]:
                columns[j] = False
selected_columns = data.columns[columns]
data = data[selected_columns]

# pvalue feature selection
selected_columns = selected_columns[1:].values

import statsmodels.formula.api as sm

def backward_elimination(x, Y, sl, columns):
    numVars = len(x[0])
    for i in range(0, numVars):
        regressor_OLS = sm.OLS(Y, x).fit()
        maxVar = max(regressor_OLS.pvalues).astype(float)
        if maxVar > sl:
            for j in range(0, numVars - i):
                if (regressor_OLS.pvalues[j].astype(float) == maxVar):
                    x = np.delete(x, j, 1)
                    columns = np.delete(columns, j)
                    
    regressor_OLS.summary()
    return x, columns

SL = 0.05
data_modeled, selected_columns = backward_elimination(data.iloc[:,1:].values, data.iloc[:,0].values, SL, selected_columns)


# Moving the result to a new Dataframe
result = pd.DataFrame()
result['diagnosis'] = data.iloc[:,0]

# Creating a Dataframe with the columns selected using the p-value and correlation
fs_data = pd.DataFrame(data = data_modeled, columns = selected_columns)

In [22]:
fs_data.head()

Unnamed: 0,mammography_nodule,i_mean,i_skewness,s_x_center_mass,s_y_center_mass,s_solidity,s_extent,t_corr,t_homo,t_senth
0,1.0,0.85478,-0.14516,0.13751,0.28846,0.82342,0.53535,0.77767,0.52281,3.0068
1,1.0,0.83313,-0.64293,0.21043,0.42407,0.61582,0.38115,0.73583,0.46888,3.1893
2,1.0,0.55903,1.2048,0.22107,0.7664,0.9721,0.72496,0.25991,0.48007,2.5313
3,1.0,0.75233,0.22278,0.2298,0.4992,0.97414,0.65971,0.54014,0.43897,3.0147
4,1.0,0.58568,0.28343,0.14915,0.23309,0.94872,0.64729,0.11137,0.44106,2.5311


## Feature selection using P-value (form 2) 

In [23]:
results = sm.OLS(bcdr_d0G_med.diagnosis, bcdr_d0G_med.drop(columns=['diagnosis'])).fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:              diagnosis   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 2.854e+05
Date:                Tue, 04 Jun 2019   Prob (F-statistic):               0.00
Time:                        22:22:15   Log-Likelihood:                 1680.9
No. Observations:                 598   AIC:                            -3292.
Df Residuals:                     563   BIC:                            -3138.
Df Model:                          35                                         
Covariance Type:            nonrobust                                         
                                           coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------


In [24]:
fs_data.to_csv('../data/clean/fs_data.csv', index=False)