In [7]:
import os
from pycm import *
import scipy.io
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import svm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Create the output directory used for saved figures. An already-existing
# directory is fine; any other failure (e.g. missing permissions) is raised
# here instead of being silently swallowed and resurfacing later at save time.
os.makedirs("result", exist_ok=True)

# Class names handled by this notebook.
labels_name = ['healthy', 'ground', 'micronodules', 'emphysema', 'fibrosis']
# Numeric code of each class: its 1-based position in labels_name.
labels_dict = {label: code for code, label in enumerate(labels_name, start=1)}


def plot_CM(CM,labels_name,name,is_save):
    """Draw a confusion matrix as an annotated heat map.

    Parameters
    ----------
    CM : 2-D numpy array
        Matrix to display; rows are labelled 'True label', columns
        'Predicted label'.
    labels_name : sequence of str
        Tick labels, applied to both axes (so CM is expected to be square).
    name : str
        Figure title; also used as the file stem when saving.
    is_save : bool
        If true, the figure is written to ``result/<name>.jpg`` and closed.
        Otherwise it is left open so the notebook can display it.
    """
    n_rows, n_cols = CM.shape
    # Cells brighter than half the maximum get white text so it stays readable
    # on the dark end of the colormap. Computed once instead of once per cell.
    threshold = CM.max()/2 if CM.size else 0

    fig=plt.figure(figsize=(12, 12))
    plt.imshow(CM, cmap=plt.cm.Blues)
    for i in range(n_rows):
        # Iterate over the column count (the original reused the row count).
        for j in range(n_cols):
            color = "white" if CM[i, j] > threshold else "black"
            plt.text(j, i, CM[i, j], horizontalalignment="center", color=color, fontsize=17)
    plt.xticks(np.arange(n_rows), labels_name,fontsize='x-large',rotation=-30,fontweight='bold')
    plt.yticks(np.arange(n_rows), labels_name,fontsize='x-large',fontweight='bold')
    plt.title(name,fontsize=18,fontweight='bold')
    plt.ylabel('True label',fontsize=18)
    plt.xlabel('Predicted label',fontsize=18)
    if is_save:
        # Make sure the target directory exists so saving does not depend on
        # another cell having created it.
        os.makedirs("result", exist_ok=True)
        fig.savefig(os.path.join("result", name+'.jpg'), dpi=3*fig.dpi)
        plt.close(fig)




### Read the per-sample feature vectors and build the feature matrix and label column
# Expected layout: texture_features/<class name>/<one .mat file per sample>,
# each file holding a 'feature_vector' entry whose first row is the sample.
feature_name='texture_features'
label_ILD=[]
features=[]
for ll in labels_name:
    label_path=os.path.join(feature_name,ll)
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order (and therefore every seeded sample/split below) reproducible.
    # The macOS '.DS_Store' metadata file is skipped.
    files=sorted(f for f in os.listdir(label_path) if f != '.DS_Store')
    for row in files:
        file_path=os.path.join(label_path,row)
        feature_vector = scipy.io.loadmat(file_path)['feature_vector']
        features.append(feature_vector[0])
    # One numeric class code per file read for this class.
    label_ILD+=[labels_dict[ll]]*len(files)
if not features:
    raise ValueError("no feature files found under '%s'" % feature_name)
features=np.array(features)
# Labels as a column vector so they can be concatenated next to the features.
label_ILD=np.array(label_ILD)
label_ILD = np.reshape(label_ILD,(features.shape[0],1))


for same_class_size in [ True]:
    # Suffix describing the current configuration.
    name = "_same_class_size" if same_class_size else ""

    ########### select Train and Test sets #############
    # 1) optionally down-sample every class to the size of the smallest class
    # 2) shuffle all rows
    # 3) draw 75% of the rows for Train; the remaining 25% form Test
    # All random draws use fixed seeds.
    ####################################################
    Data=pd.DataFrame(np.concatenate([features, label_ILD],axis=1))
    label_col = Data.shape[1]-1   # labels live in the last column
    if same_class_size:
        class_size=np.min(Data[label_col].value_counts())
        # Sample each class separately (fresh seed per class) and stack the
        # results. Rows keep their original index labels, so no index
        # reshuffling is needed afterwards.
        Data = pd.concat([
            group.sample(n=class_size,replace=False,random_state=0)
            for _, group in Data.groupby(label_col)
        ])

    Data=Data.sample(frac=1,random_state=5) ## shuffle
    Train=Data.sample(frac=0.75,replace=False,random_state=0)
    Test= Data.drop(index=Train.index)

    # Every column but the last is a feature; the last column is the label.
    x_train = Train.iloc[:, :-1].values
    y_train = Train.iloc[:, -1].values

    x_test = Test.iloc[:, :-1].values
    y_test = Test.iloc[:, -1].values

    ########### Data preprocessing ##############
    #  Data preprocessing is:
    #       1) zero-mean and scale variances to one
    #       2) PCA keeping enough components to explain more than 95%
    #          of the total variance
    #  Scaler and PCA are fitted on Train only and then applied to Test.
    #############################################
    scaler = StandardScaler()
    pca = PCA()
    x_train = scaler.fit_transform(x_train)
    pca.fit(x_train)
    cumsum = np.cumsum(pca.explained_variance_ratio_)
    # argmax gives the 0-based index of the first component at which the
    # cumulative ratio exceeds 0.95; the number of components to keep is that
    # index + 1. Without the +1 the model stays at or below the target and
    # n_components becomes 0 when the first component alone crosses it.
    pca_num = int(np.argmax(cumsum > 0.95)) + 1
    pca = PCA(n_components=pca_num)
    x_train = pca.fit_transform(x_train)
    x_test=scaler.transform(x_test)
    x_test=pca.transform(x_test)



In [29]:


n_pcs= pca.components_.shape[0]

# Index of the input feature with the largest absolute loading on EACH component.
most_important = [np.abs(pca.components_[i]).argmax() for i in range(n_pcs)]

# The first four feature groups each occupy one contiguous block of this many
# columns; everything after them is reported as 'wavelet'.
block_size = 256+2+256+2
group_names = ['HVG_lattice', 'HVG_Nolattice', 'IVG_lattice', 'IVG_Nolattice']
# Offsets inside a block that are reported with the '_P' suffix; all other
# offsets get '_Z'.
# NOTE(review): offsets kept exactly as in the original code - confirm they
# match the column layout produced by the feature extractor.
p_offsets = (0, 1, 258, 259)

important_features=[]
for i,row in enumerate(most_important):
    # Block k covers indices [k*block_size, (k+1)*block_size). The previous
    # `row <= k*block_size` tests put the first index of each block into the
    # preceding group, which made offset 0 unreachable for blocks 1..3.
    block, offset = divmod(int(row), block_size)
    if block < len(group_names):
        group = group_names[block] + ('_P' if offset in p_offsets else '_Z')
    else:
        group = 'wavelet'
    # Pair the group name with the component's explained variance in percent.
    important_features.append([group, round(pca.explained_variance_ratio_[i]*100,2)])

important_features



[['IVG_lattice_Z', 7.48],
 ['IVG_lattice_Z', 4.6],
 ['wavelet', 3.04],
 ['IVG_Nolattice_Z', 2.28],
 ['IVG_lattice_Z', 1.7],
 ['IVG_lattice_Z', 1.12],
 ['IVG_lattice_Z', 1.04],
 ['IVG_lattice_Z', 0.94],
 ['HVG_lattice_Z', 0.66],
 ['wavelet', 0.56],
 ['IVG_Nolattice_Z', 0.52],
 ['wavelet', 0.45],
 ['HVG_Nolattice_Z', 0.43],
 ['IVG_lattice_Z', 0.42],
 ['HVG_Nolattice_Z', 0.4],
 ['IVG_Nolattice_Z', 0.38],
 ['HVG_Nolattice_Z', 0.38],
 ['HVG_Nolattice_Z', 0.36],
 ['HVG_Nolattice_Z', 0.35],
 ['HVG_Nolattice_Z', 0.34],
 ['IVG_Nolattice_Z', 0.33],
 ['IVG_Nolattice_Z', 0.32],
 ['IVG_lattice_Z', 0.32],
 ['IVG_Nolattice_Z', 0.3],
 ['HVG_Nolattice_Z', 0.3],
 ['HVG_Nolattice_Z', 0.3],
 ['HVG_Nolattice_Z', 0.29],
 ['HVG_Nolattice_Z', 0.29],
 ['IVG_Nolattice_Z', 0.28],
 ['HVG_Nolattice_Z', 0.28],
 ['HVG_Nolattice_Z', 0.28],
 ['HVG_Nolattice_Z', 0.28],
 ['HVG_Nolattice_Z', 0.28],
 ['IVG_Nolattice_Z', 0.27],
 ['HVG_Nolattice_Z', 0.27],
 ['IVG_Nolattice_Z', 0.27],
 ['HVG_Nolattice_Z', 0.27],
 ['HVG_Nolat