In [97]:
import os
from pycm import *
import scipy.io
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import svm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

try: os.mkdir("result")
except: pass

labels_name=['healthy', 'ground', 'micronodules', 'emphysema', 'fibrosis']
labels_dict={'healthy': 1, 'ground': 2, 'micronodules': 3, 'emphysema': 4, 'fibrosis': 5}


def plot_CM(CM,labels_name,name,is_save):
    fig=plt.figure(figsize=(12, 12))
    plt.imshow(CM, cmap=plt.cm.Blues);
    for i in range(CM.shape[0]):
        for j in range(CM.shape[0]):
            if(CM[i, j] > CM.max()/2) : color="white"
            else: color="black"
            plt.text(j, i,CM[i, j] ,horizontalalignment="center",color=color,fontsize=17)
    plt.xticks(np.arange(CM.shape[0]), labels_name,fontsize='x-large',rotation=-30,fontweight='bold')
    plt.yticks(np.arange(CM.shape[0]),  labels_name,fontsize='x-large',fontweight='bold')
    plt.title(name,fontsize=18,fontweight='bold'); plt.ylabel('True label',fontsize=18); plt.xlabel('Predicted label',fontsize=18);
    if is_save:
        fig.savefig("result/"+name+'.jpg', dpi=3*fig.dpi)
        plt.close(fig)




### read features and concert to dataframe
feature_name='texture_features'
label_ILD=[]
features=[]
for ll in labels_name:
    label_path=os.path.join(feature_name+'/',ll)
    files=os.listdir(label_path)
    try: files.remove('.DS_Store')
    except: pass
    for row in files:
        file_path=os.path.join(label_path,row)
        feature_vector = scipy.io.loadmat(file_path)['feature_vector']

        # features.append(np.concatenate([feature_vector[0][:3*258],feature_vector[0][258*4:1080+258],feature_vector[0][1080+258*2: 1080+3*258],feature_vector[0][1080+258*4:]]))
        features.append(feature_vector[0])
    label_ILD+=[labels_dict[ll]]*len(files)
features=np.array(features)
label_ILD=np.array(label_ILD)
label_ILD = np.reshape(label_ILD,(features.shape[0],1))


for same_class_size in [False ]:
    if same_class_size: name="_same_class_size"
    else: name=""

    ########### select Train and Test sets #############
    # split data between train and test 
    # we choose 25% of data for Test
    # after selecting Train , test we shuffles each set using unison_shuffled_copies
    ####################################################
    Data=pd.DataFrame(np.concatenate([features, label_ILD],axis=1))
    if same_class_size:
        class_size=np.min(Data[Data.shape[1]-1].value_counts())
        Data=Data.groupby(Data.shape[1]-1).apply(lambda s: s.sample(n=class_size,replace=False,random_state=0))
        Data = Data.reset_index(level=[None])
        Data=Data.set_index('level_1')

    Data=Data.sample(frac=1,random_state=5) ## shuffle
    Train=Data.sample(frac=0.75,replace=False,random_state=0)
    Test= Data.drop(index=Train.index)

    x_train =  Train.loc[:,[i for i in range(Train.shape[1]-1)]].values
    y_train = Train.loc[:,[Train.shape[1]-1]].values.ravel()

    x_test =  Test.loc[:,[i for i in range(Test.shape[1]-1)]].values
    y_test = Test.loc[:,[Test.shape[1]-1]].values.ravel()

    ########### Data preprocessing ##############
    #  Data preprocessing is:
    #       1) zero-mean and scale variances to one 
    #       2) PCA for 0.95% of total varince
    #############################################
    scaler = StandardScaler()
    pca = PCA()
    x_train = scaler.fit_transform(x_train)
    pca.fit(x_train)
    cumsum = np.cumsum(pca.explained_variance_ratio_)
    pca_num = np.argmax(cumsum > 0.9999999)
    pca = PCA(n_components=pca_num)
    x_train = pca.fit_transform(x_train)


    ########### TRAINING ##############
    clf = svm.SVC(kernel='rbf',decision_function_shape='ovo' ,class_weight='balanced'  ,max_iter=-1)
    clf.fit(x_train, y_train)

    #**** predict label for Test data
    x_test=scaler.transform(x_test)
    x_test=pca.transform(x_test)
    label_predict_test=clf.predict(x_test)

    #*****
    cm = ConfusionMatrix(actual_vector=y_test, predict_vector=label_predict_test)
    CM=np.array([list(row.values()) for row in list(cm.matrix.values())])
    plot_CM(CM,labels_name,'CM_ILD_Test'+name,True)

    #*****
    accuracy=np.array(list((cm.ACC).values()))
    precision=np.array(list((cm.PPV).values()))
    recall=np.array(list((cm.TPR).values()))
    true_negative_rate=np.array(list((cm.TNR).values()))
    AUC=np.array(list((cm.AUC).values()))
    F1=np.array(list((cm.F1).values()))
    overall_accuracy=round(100*cm.Overall_ACC,2)
    overal_F1=round(100*cm.F1_Macro,2)

    df=pd.DataFrame(
        {"Accuracy":np.round(100*accuracy,2),
        "Recall":np.round(100*recall,2),
        "Precision":np.round(100*precision,2),
        "TN rate":np.round(100*true_negative_rate,2),
        "AUC":np.round(100*AUC,2),
        "F1":np.round(100*F1,2)})
    df=df.rename(index=dict((v-1,k) for k,v in labels_dict.items()))
    with open('result/ILD_Test'+name+'.txt', mode='w') as file_object:
        print(df, file=file_object)
        print('\n\t**** overal accuracy: '+str(overall_accuracy)+' % ', file=file_object)
        print('\t**** overal F1-score (Macro): '+str(overal_F1)+' %', file=file_object)
    #*****
    print('\nFeature vector length'+name+' :',Train.shape[1]-1)
    print('SVM, RBF kernel,PCA components: ',pca_num)
    print('\tAccuracy on Test: ' ,overall_accuracy,'%')
    print('\tF1_score on Test: ',overal_F1,'%')




'\n\n    ########### TRAINING ##############\n    clf = svm.SVC(kernel=\'rbf\',decision_function_shape=\'ovo\' ,class_weight=\'balanced\'  ,max_iter=-1)\n    clf.fit(x_train, y_train)\n\n    #**** predict label for Test data\n    x_test=scaler.transform(x_test)\n    x_test=pca.transform(x_test)\n    label_predict_test=clf.predict(x_test)\n\n    #*****\n    cm = ConfusionMatrix(actual_vector=y_test, predict_vector=label_predict_test)\n    CM=np.array([list(row.values()) for row in list(cm.matrix.values())])\n    plot_CM(CM,labels_name,\'CM_ILD_Test\'+name,True)\n\n    #*****\n    accuracy=np.array(list((cm.ACC).values()))\n    precision=np.array(list((cm.PPV).values()))\n    recall=np.array(list((cm.TPR).values()))\n    true_negative_rate=np.array(list((cm.TNR).values()))\n    AUC=np.array(list((cm.AUC).values()))\n    F1=np.array(list((cm.F1).values()))\n    overall_accuracy=round(100*cm.Overall_ACC,2)\n    overal_F1=round(100*cm.F1_Macro,2)\n\n    df=pd.DataFrame(\n        {"Accuracy"

In [98]:
res={
    'I_hvg_lattice':[0,{'Pk':0,'Z':0}],
    'I_hvg_Nolattice':[0,{'Pk':0,'Z':0}],
    'I_ivg_lattice':[0,{'Pk':0,'Z':0}],
    'I_ivg_Nolattice':[0,{'Pk':0,'Z':0}],
    'I_wavelet':[0],
    'I2_hvg_lattice':[0,{'Pk':0,'Z':0}],
    'I2_hvg_Nolattice':[0,{'Pk':0,'Z':0}],
    'I2_ivg_lattice':[0,{'Pk':0,'Z':0}],
    'I2_ivg_Nolattice':[0,{'Pk':0,'Z':0}],
    'I2_wavelet':[0]
    }

n_pcs= pca.components_.shape[0]
most_important = [np.abs(pca.components_[i]).argmax()+1 for i in range(n_pcs)]
for i,row in enumerate(most_important):
    
    if 1<=row<=258 : 
        res['I_hvg_lattice'][0]+=pca.explained_variance_ratio_[i]*100
        if row%258 in [1,2]:
            res['I_hvg_lattice'][1]['Pk']+=pca.explained_variance_ratio_[i]*100
        else:
            res['I_hvg_lattice'][1]['Z']+=pca.explained_variance_ratio_[i]*100
    
    elif 258+1<=row<=258+258 :
        res['I_hvg_Nolattice'][0]+=pca.explained_variance_ratio_[i]*100
        if row%258 in [1,2]:
            res['I_hvg_Nolattice'][1]['Pk']+=pca.explained_variance_ratio_[i]*100
        else:
            res['I_hvg_Nolattice'][1]['Z']+=pca.explained_variance_ratio_[i]*100

    elif 2*258+1<=row<=258+258*2 :
        res['I_ivg_lattice'][0]+=pca.explained_variance_ratio_[i]*100
        if row%258 in [1,2]:
            res['I_ivg_lattice'][1]['Pk']+=pca.explained_variance_ratio_[i]*100
        else:
            res['I_ivg_lattice'][1]['Z']+=pca.explained_variance_ratio_[i]*100

    elif 3*258+1<=row<=258+258*3 :
        res['I_ivg_Nolattice'][0]+=pca.explained_variance_ratio_[i]*100
        if row%258 in [1,2]:
            res['I_ivg_Nolattice'][1]['Pk']+=pca.explained_variance_ratio_[i]*100
        else:
            res['I_ivg_Nolattice'][1]['Z']+=pca.explained_variance_ratio_[i]*100

    if 1080+1<=row<=258+1080 : 
        res['I2_hvg_lattice'][0]+=pca.explained_variance_ratio_[i]*100
        if row%258 in [1,2]:
            res['I2_hvg_lattice'][1]['Pk']+=pca.explained_variance_ratio_[i]*100
        else:
            res['I2_hvg_lattice'][1]['Z']+=pca.explained_variance_ratio_[i]*100

    elif 1080+258+1<=row<=258+258+1080 :
        res['I2_hvg_Nolattice'][0]+=pca.explained_variance_ratio_[i]*100
        if row%258 in [1,2]:
            res['I2_hvg_Nolattice'][1]['Pk']+=pca.explained_variance_ratio_[i]*100
        else:
            res['I2_hvg_Nolattice'][1]['Z']+=pca.explained_variance_ratio_[i]*100

    elif 1080+2*258+1<=row<=258+258*2+1080 :
        res['I2_ivg_lattice'][0]+=pca.explained_variance_ratio_[i]*100
        if row%258 in [1,2]:
            res['I2_ivg_lattice'][1]['Pk']+=pca.explained_variance_ratio_[i]*100
        else:
            res['I2_ivg_lattice'][1]['Z']+=pca.explained_variance_ratio_[i]*100

    elif 1080+3*258+1<=row<=258+258*3+1080 :
        res['I2_ivg_Nolattice'][0]+=pca.explained_variance_ratio_[i]*100
        if row%258 in [1,2]:
            res['I2_ivg_Nolattice'][1]['Pk']+=pca.explained_variance_ratio_[i]*100
        else:
            res['I2_ivg_Nolattice'][1]['Z']+=pca.explained_variance_ratio_[i]*100


    elif 4*258+1<=row<=4*258+48 :
        res['I_wavelet'][0]+=pca.explained_variance_ratio_[i]*100
    elif 1080+4*258+1<=row<=4*258+48+1080 :
        res['I2_wavelet'][0]+=pca.explained_variance_ratio_[i]*100


    

for key in res.keys():
    res[key][0]= round(res[key][0],3)
    try:
        res[key][1]['Z']= round(res[key][1]['Z'],3)
        res[key][1]['Pk']= round(res[key][1]['Pk'],3)
    except:
        pass

res={k: v for k, v in sorted(res.items(), key=lambda item: item[1],reverse=True)}


res

{'I2_hvg_lattice': [29.005, {'Pk': 0, 'Z': 29.005}],
 'I2_ivg_lattice': [23.958, {'Pk': 0, 'Z': 23.958}],
 'I_ivg_lattice': [17.588, {'Pk': 0.498, 'Z': 17.091}],
 'I_hvg_lattice': [10.478, {'Pk': 0.676, 'Z': 9.801}],
 'I_hvg_Nolattice': [7.175, {'Pk': 0.005, 'Z': 7.17}],
 'I2_wavelet': [6.26],
 'I_wavelet': [4.634],
 'I_ivg_Nolattice': [0.9, {'Pk': 0.326, 'Z': 0.574}],
 'I2_hvg_Nolattice': [0.002, {'Pk': 0, 'Z': 0.002}],
 'I2_ivg_Nolattice': [0.0, {'Pk': 0, 'Z': 0.0}]}

*****
******
******

In [95]:

labels_name=['healthy', 'ground', 'micronodules', 'emphysema', 'fibrosis']
labels_dict={'healthy': 1, 'ground': 2, 'micronodules': 3, 'emphysema': 4, 'fibrosis': 5}


### read features and concert to dataframe
feature_name='texture_features/graph'
label_ILD=[]
features_graph=[]
for ll in labels_name:
    label_path=os.path.join(feature_name+'/',ll)
    files=os.listdir(label_path)
    try: files.remove('.DS_Store')
    except: pass
    for row in files:
        file_path=os.path.join(label_path,row)
        feature_vector = scipy.io.loadmat(file_path)['feature_vector'] 
        features_graph.append(feature_vector[0])
    label_ILD+=[labels_dict[ll]]*len(files)
features_graph=np.array(features_graph)
label_ILD=np.array(label_ILD)
label_ILD = np.reshape(label_ILD,(features_graph.shape[0],1))

feature_name='texture_features/wavelet'
features_wavelet=[]
for ll in labels_name:
    label_path=os.path.join(feature_name+'/',ll)
    files=os.listdir(label_path)
    try: files.remove('.DS_Store')
    except: pass
    for row in files:
        file_path=os.path.join(label_path,row)
        feature_vector = scipy.io.loadmat(file_path)['feature_vector']
        features_wavelet.append(feature_vector[0])
features_wavelet=np.array(features_wavelet)



Data=pd.DataFrame(np.concatenate([features_graph, features_wavelet, label_ILD],axis=1))
Data[Data.shape[1]-1].value_counts()




3.0    11629
1.0     7828
5.0     5251
2.0     2986
4.0     1981
Name: 1401, dtype: int64

In [97]:

for same_class_size in [False ,True ]:
    if same_class_size: name="_same_class_size"
    else: name=""

    ########### select Train and Test sets #############
    # split data between train and test 
    # we choose 25% of data for Test
    # after selecting Train , test we shuffles each set using unison_shuffled_copies
    ####################################################
    Data=pd.DataFrame(np.concatenate([features_graph, features_wavelet, label_ILD],axis=1))
    if same_class_size:
        class_size=np.min(Data[Data.shape[1]-1].value_counts())
        Data=Data.groupby(Data.shape[1]-1).apply(lambda s: s.sample(n=class_size,replace=False,random_state=0))
        Data = Data.reset_index(level=[None])
        Data=Data.set_index('level_1')

    Data=Data.sample(frac=1,random_state=5) ## shuffle
    Train=Data.sample(frac=0.75,replace=False,random_state=0)
    Test= Data.drop(index=Train.index)

    x_train =  Train.loc[:,[i for i in range(Train.shape[1]-1)]].values
    y_train = Train.loc[:,[Train.shape[1]-1]].values.ravel()

    x_test =  Test.loc[:,[i for i in range(Test.shape[1]-1)]].values
    y_test = Test.loc[:,[Test.shape[1]-1]].values.ravel()

    ########### Data preprocessing ##############
    #  Data preprocessing is:
    #       1) zero-mean and scale variances to one 
    #       2) PCA for 0.95% of total varince
    #############################################
    scaler = StandardScaler()
    pca = PCA()
    x_train = scaler.fit_transform(x_train)
    pca.fit(x_train)
    cumsum = np.cumsum(pca.explained_variance_ratio_)
    pca_num = np.argmax(cumsum > 0.95)
    pca = PCA(n_components=pca_num)
    x_train = pca.fit_transform(x_train)


    ########### TRAINING ##############
    clf = svm.SVC(kernel='rbf',decision_function_shape='ovo' ,class_weight='balanced'  ,max_iter=-1)
    clf.fit(x_train, y_train)

    #**** predict label for Test data
    x_test=scaler.transform(x_test)
    x_test=pca.transform(x_test)
    label_predict_test=clf.predict(x_test)

    #*****
    cm = ConfusionMatrix(actual_vector=y_test, predict_vector=label_predict_test)

    #*****
    accuracy=np.array(list((cm.ACC).values()))
    precision=np.array(list((cm.PPV).values()))
    recall=np.array(list((cm.TPR).values()))
    true_negative_rate=np.array(list((cm.TNR).values()))
    AUC=np.array(list((cm.AUC).values()))
    F1=np.array(list((cm.F1).values()))
    overall_accuracy=round(100*cm.Overall_ACC,2)
    overal_F1=round(100*cm.F1_Macro,2)

    df=pd.DataFrame(
        {"Accuracy":np.round(100*accuracy,2),
        "Recall":np.round(100*recall,2),
        "Precision":np.round(100*precision,2),
        "TN rate":np.round(100*true_negative_rate,2),
        "AUC":np.round(100*AUC,2),
        "F1":np.round(100*F1,2)})
    df=df.rename(index=dict((v-1,k) for k,v in labels_dict.items()))

    print('\nFeature vector length'+name+' :',Train.shape[1]-1)
    print('SVM, RBF kernel,PCA components: ',pca_num)
    print('\tAccuracy on Test: ' ,overall_accuracy,'%')
    print('\tF1_score on Test: ',overal_F1,'%')





Feature vector length : 1401
SVM, RBF kernel,PCA components:  583
	Accuracy on Test:  95.61 %
	F1_score on Test:  95.56 %

Feature vector length_same_class_size : 1401
SVM, RBF kernel,PCA components:  567
	Accuracy on Test:  91.72 %
	F1_score on Test:  91.75 %
