In [228]:
# import pandas as pd
import numpy as np

## Dataset 1: Gastrointestinal Lesions in Regular Colonoscopy Data Set

In [229]:
# Location of the comma-separated colonoscopy data file.
filePath = 'data.txt'
# Parse the file into a NumPy array, ignoring its first line.
data = np.genfromtxt(fname=filePath, skip_header=1, delimiter=',')

In [230]:
# Flip the array so that every row is one sample and every column is one
# feature, then work on plain Python lists.
data_transpose = data.T
data_list = data_transpose.tolist()

# Separate the samples by the light type stored in column 1:
# 1 -> "White Light Frame (WL)", 2 -> "NBI Frame (NBI)".
# Rows whose light type is neither 1 nor 2 end up in neither list.
data_WL = [sample for sample in data_list if sample[1] == 1]
data_NBI = [sample for sample in data_list if sample[1] == 2]

# Sanity checks on the separation.
print(not all(sample[1] == 1 for sample in data_WL))   # should be False
print(not all(sample[1] == 2 for sample in data_NBI))  # should be False
print(len(data_list) == (len(data_WL) + len(data_NBI)))  # should be True

False
False
True


In [231]:
# separating features and targets out of data_WL and data_NBI
def _labels_and_features(samples):
    """Split sample rows into (labels, features).

    The label is column 0 of each row; the features are everything from
    column 2 onwards (column 1 is the light type and is dropped).
    """
    return [s[0] for s in samples], [s[2:] for s in samples]


class_WL, fea_WL = _labels_and_features(data_WL)
class_NBI, fea_NBI = _labels_and_features(data_NBI)

In [232]:
# %matplotlib qt

---

In [233]:
from sklearn.model_selection import LeaveOneOut
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import confusion_matrix

# X -> features, y -> label
# Pool both light types into a single design matrix (NBI samples first,
# then WL samples) together with the matching label vector.
X = np.array(fea_NBI + fea_WL)
y = np.array(class_NBI + class_WL)

# Leave-one-out cross-validation: every sample is the test set exactly once.
cv = LeaveOneOut()

y_true, y_pred = [], []
fea_importances = []

for train_ix, test_ix in cv.split(X):
    # Held-out sample vs. everything else.
    X_train, y_train = X[train_ix, :], y[train_ix]
    X_test, y_test = X[test_ix, :], y[test_ix]

    # A fresh extra-trees ensemble is fitted on every fold; its feature
    # importances are kept so they can be averaged over folds afterwards.
    model = ExtraTreesClassifier(criterion='entropy', n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    fea_importances.append(model.feature_importances_)

    # The test fold holds a single sample, so take element 0 of each array.
    yhat = model.predict(X_test)
    y_true.append(y_test[0])
    y_pred.append(yhat[0])

# Aggregate the per-fold predictions into a confusion matrix and accuracy.
cm = confusion_matrix(y_true, y_pred)
acc = accuracy_score(y_true, y_pred)
print(cm)
print('Accuracy: %.3f' % acc)

[[35  0  7]
 [ 0 20 10]
 [ 6  0 74]]
Accuracy: 0.849


In [234]:
import matplotlib.pyplot as plt

# Average the per-fold importances collected during cross-validation, then
# rescale so the most important feature has value 1.
mean_fea_imp = np.mean(np.array(fea_importances), axis=0)
feature_importance_normalized = mean_fea_imp / np.max(mean_fea_imp)

# One bar per feature, labelled 1..n_features. The count is derived from the
# data instead of being hard-coded, so the plot keeps working if the number
# of features changes.
feature_labels = range(1, len(feature_importance_normalized) + 1)
plt.bar(feature_labels, feature_importance_normalized)
plt.xlabel('Feature Labels')
plt.ylabel('Feature Importances')
plt.title('Comparison of different Feature Importances')
plt.show()

In [235]:
import networkx as nx

from sklearn import preprocessing
from sklearn.metrics.pairwise import euclidean_distances

# Keep only the features whose normalized importance exceeds 0.4.
thres = (feature_importance_normalized > 0.4)
print(np.shape(np.transpose(X)[thres]))
sel_fea = np.transpose(np.transpose(X)[thres])

# L2-normalize each sample, then compute all pairwise squared Euclidean
# distances between samples.
X_normalized = preprocessing.normalize(sel_fea, norm='l2')
print(X_normalized)
euclidean_dist = euclidean_distances(X_normalized)
squared_euclidean = np.square(euclidean_dist)
print(np.shape(squared_euclidean))

# Work on a copy so the thresholding below does not silently overwrite
# `squared_euclidean` (the original aliased the two names).
adj_matrix = squared_euclidean.copy()
np.fill_diagonal(adj_matrix, 0)

# Statistics over every entry that differs from the matrix minimum
# (the minimum is the zeroed diagonal, so these are the non-zero distances).
non_min_entries = adj_matrix[adj_matrix != np.min(adj_matrix)]
mean_non_min = np.mean(non_min_entries)
print('mean = ', mean_non_min)
print('max = ', np.max(adj_matrix))
print('min = ', np.min(non_min_entries))

# Keep an edge only between samples closer than (mean - 0.01); every other
# entry is zeroed, and a zero entry means "no edge" in the resulting graph.
adj_matrix[adj_matrix >= -0.01 + mean_non_min] = 0

print(np.count_nonzero(adj_matrix))

# nx.from_numpy_matrix was removed in NetworkX 3.0; from_numpy_array is the
# supported replacement. Node i of G corresponds to row i of X.
G = nx.from_numpy_array(adj_matrix)

(8, 152)
[[3.74511844e-01 8.83775113e-01 9.83321211e-02 ... 1.48712158e-01
  1.52050595e-01 1.39115672e-04]
 [3.91944404e-01 8.66749537e-01 1.14755534e-01 ... 1.78545144e-01
  1.57172336e-01 1.85966440e-04]
 [3.85837762e-01 8.76552187e-01 9.84911839e-02 ... 1.58088993e-01
  1.57701994e-01 6.54434250e-05]
 ...
 [3.81101050e-01 8.72844341e-01 1.27033683e-01 ... 1.66933907e-01
  1.57659801e-01 5.21549836e-05]
 [4.13582163e-01 8.70664068e-01 1.01722474e-01 ... 1.40537628e-01
  1.39199175e-01 2.42581331e-04]
 [3.46186403e-01 9.18626916e-01 7.38168047e-02 ... 9.44727001e-02
  9.51131930e-02 6.31622056e-05]]
(152, 152)
mean =  0.013825795822924033
max =  0.16386584564512396
min =  8.488796042849246e-05
6476


In [236]:
# Drop samples that have no edges left after thresholding.
G.remove_nodes_from(list(nx.isolates(G)))

# Colour every node by its PREDICTED class.
# Node labels are the original sample indices, so once isolates are removed
# they are no longer 0..len(G)-1. Looking the label up by node (instead of by
# position) keeps each colour attached to the right sample.
class_colors = {
    1: 'blue',   # 1 -> hyperplasic
    2: 'red',    # 2 -> serrated adenoma
    3: 'green',
}
# Unexpected labels fall back to gray so the colour list always has exactly
# one entry per node.
color_map = [class_colors.get(y_pred[node], 'gray') for node in G.nodes()]

nx.draw(G, pos=nx.spring_layout(G),node_size=20, node_color=color_map, width=0.05)
plt.savefig('graph_1_pred.png', dpi = 600)
# print("Cluatering Coefficient of each nodes" ,nx.clustering(G))
# print("No of triangels:", nx.triangles(G))
print("Transitivity:" , nx.transitivity(G))
print("Average Clustering coeff :",nx.average_clustering(G))
# print("Generalized Degree of nodes:", nx.generalized_degree(G))
# NOTE: nx.diameter raises if the graph is not connected.
print("diameter:", nx.diameter(G, e=None, usebounds=False))

Transitivity: 0.7121981586448675
Average Clustering coeff : 0.7249874787574186
diameter: 8


In [237]:
# Isolated nodes were already removed from G in the previous cell.
# G.remove_nodes_from(list(nx.isolates(G)))

# Colour every node by its TRUE class.
# Node labels are the original sample indices; after isolate removal they are
# not necessarily 0..len(G)-1, so index `y` by node label rather than by
# position to keep colours aligned with the samples.
class_colors = {
    1: 'blue',   # 1 -> hyperplasic
    2: 'red',    # 2 -> serrated adenoma
    3: 'green',
}
# Unexpected labels fall back to gray so the colour list always has exactly
# one entry per node.
color_map = [class_colors.get(y[node], 'gray') for node in G.nodes()]

nx.draw(G,pos=nx.spring_layout(G), node_size=20, node_color=color_map, width = 0.1)
plt.savefig('graph_1_true.png', dpi = 600)


## Dataset 2: LSVT Voice Rehabilitation Data Set

In [238]:
# Path to the Excel workbook for dataset 2. This rebinds the `filePath`
# name that previously pointed at the dataset 1 text file.
filePath = 'LSVT_voice_rehabilitation.xlsx'

In [239]:
import pandas as pd

# Open the workbook once and read its three sheets. The context manager
# closes the underlying file handle as soon as the sheets are loaded.
with pd.ExcelFile(filePath) as xls:
    df1 = pd.read_excel(xls, 'Data')                  # sheet 'Data'
    df2 = pd.read_excel(xls, 'Binary response')       # sheet 'Binary response'
    df3 = pd.read_excel(xls, 'Subject demographics')  # sheet 'Subject demographics'

In [240]:
# Convert the three sheets ('Data', 'Binary response', 'Subject demographics')
# to NumPy arrays, in that order.
fea, labels, participants = (sheet.to_numpy() for sheet in (df1, df2, df3))

In [241]:
# X -> features, y -> label
X = fea
# `labels` has one column; take it as a flat vector.
y = np.transpose(labels)[0]

import seaborn as sns
import matplotlib.pyplot as plt

# The original cell called plt.savefig without drawing anything, which wrote
# a blank image. NOTE(review): judging by the output file name and the unused
# seaborn import, a feature-correlation heatmap was presumably intended --
# confirm.
corr = np.corrcoef(X.astype(float), rowvar=False)  # feature-by-feature correlations
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, cmap='coolwarm', vmin=-1, vmax=1,
            xticklabels=False, yticklabels=False, ax=ax)
ax.set_title('Feature correlation matrix (dataset 2)')
fig.savefig('corr_dataset-2.png' , dpi = 900)

In [242]:
from sklearn.model_selection import LeaveOneOut
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import confusion_matrix

# X -> features, y -> label
# Feature matrix and label vector for dataset 2 (`labels` has a single
# column, which is flattened here).
X = fea
y = labels.T[0]

# Leave-one-out cross-validation: one held-out sample per fold.
cv = LeaveOneOut()

y_true, y_pred, fea_importances = [], [], []

for train_ix, test_ix in cv.split(X):
    X_train, X_test = X[train_ix, :], X[test_ix, :]
    y_train, y_test = y[train_ix], y[test_ix]

    # Fit a fresh extra-trees ensemble on the training part of this fold.
    model = ExtraTreesClassifier(random_state=0, n_estimators=100, criterion='entropy')
    model.fit(X_train, y_train)
    yhat = model.predict(X_test)

    # Record this fold's importances together with the true and predicted
    # label of the single held-out sample.
    fea_importances.append(model.feature_importances_)
    y_true.append(y_test[0])
    y_pred.append(yhat[0])

# Summarize all folds.
cm = confusion_matrix(y_true, y_pred)
acc = accuracy_score(y_true, y_pred)
print(cm)
print('Accuracy: %.3f' % acc)

[[27 15]
 [ 3 81]]
Accuracy: 0.857


In [243]:
import matplotlib.pyplot as plt

# Average the per-fold importances collected during cross-validation, then
# rescale so the most important feature has value 1.
mean_fea_imp = np.mean(np.array(fea_importances), axis=0)
feature_importance_normalized = mean_fea_imp / np.max(mean_fea_imp)

# One bar per feature, labelled 1..n_features. The count is derived from the
# data instead of being hard-coded, so the plot keeps working if the number
# of features changes.
feature_labels = range(1, len(feature_importance_normalized) + 1)
plt.bar(feature_labels, feature_importance_normalized)
plt.xlabel('Feature Labels')
plt.ylabel('Feature Importances')
plt.title('Comparison of different Feature Importances')
plt.show()

In [244]:
import networkx as nx

from sklearn import preprocessing
from sklearn.metrics.pairwise import euclidean_distances

# Keep only the features whose normalized importance exceeds 0.3.
thres = (feature_importance_normalized > 0.3)
print(np.shape(np.transpose(X)[thres]))
sel_fea = np.transpose(np.transpose(X)[thres])

# L2-normalize each sample, then compute all pairwise squared Euclidean
# distances between samples.
X_normalized = preprocessing.normalize(sel_fea, norm='l2')
print(X_normalized)
euclidean_dist = euclidean_distances(X_normalized)
squared_euclidean = np.square(euclidean_dist)
print(np.shape(squared_euclidean))

# Work on a copy so the thresholding below does not silently overwrite
# `squared_euclidean` (the original aliased the two names).
adj_matrix = squared_euclidean.copy()
np.fill_diagonal(adj_matrix, 0)

# Statistics over every entry that differs from the matrix minimum
# (the minimum is the zeroed diagonal, so these are the non-zero distances).
non_min_entries = adj_matrix[adj_matrix != np.min(adj_matrix)]
mean_non_min = np.mean(non_min_entries)
print('mean = ', mean_non_min)
print('max = ', np.max(adj_matrix))
print('min = ', np.min(non_min_entries))

# Keep an edge only between samples closer than (mean - 0.279); every other
# entry is zeroed, and a zero entry means "no edge" in the resulting graph.
adj_matrix[adj_matrix >= -0.279 + mean_non_min] = 0

print(np.count_nonzero(adj_matrix))

# nx.from_numpy_matrix was removed in NetworkX 3.0; from_numpy_array is the
# supported replacement. Node i of G corresponds to row i of X.
G = nx.from_numpy_array(adj_matrix)

(31, 126)
[[ 1.51183227e-05  7.15444471e-06  8.24201491e-08 ... -8.35047796e-01
  -3.34973122e-01 -1.33541161e-01]
 [ 2.96706216e-05  1.05195633e-05  1.24157866e-07 ... -8.51295550e-01
  -3.61641487e-01 -1.66481759e-01]
 [ 1.42144754e-04  1.64806919e-04  1.94196518e-06 ... -8.83311892e-01
  -3.34176368e-01 -1.09303634e-01]
 ...
 [ 1.11918160e-05  7.81871795e-06  9.20350315e-08 ... -7.92920876e-01
  -2.88957979e-01 -1.12876471e-01]
 [ 1.89482027e-05  9.22834523e-06  1.09835715e-07 ... -8.49183312e-01
  -3.35756749e-01 -1.32295457e-01]
 [ 5.24132505e-05  2.70669255e-05  3.21557663e-07 ... -8.73436209e-01
  -3.10635190e-01 -1.23687156e-01]]
(126, 126)
mean =  0.28911450681076456
max =  1.7261245547419248
min =  3.634076633540761e-05
2340


In [245]:
# Drop samples that have no edges left after thresholding.
G.remove_nodes_from(list(nx.isolates(G)))

# Colour every node by its PREDICTED binary response (label 1 or 2).
# NOTE(review): the original comments ("hyperplasic", "serrated adenoma")
# were copied from dataset 1 and do not describe this dataset; presumably
# 1 = acceptable and 2 = unacceptable -- confirm against the dataset
# description.
# Node labels are the original sample indices, so once isolates are removed
# they are no longer 0..len(G)-1. Look the label up by node, not by position.
class_colors = {1: 'blue', 2: 'red'}
# Unexpected labels fall back to gray so the colour list always has exactly
# one entry per node.
color_map = [class_colors.get(y_pred[node], 'gray') for node in G.nodes()]

nx.draw_spring(G, node_size=20, node_color=color_map, width=0.1)
plt.savefig('graph_2_pred.png', dpi = 600)


In [246]:
# Colour every node by its TRUE binary response (label 1 or 2).
# NOTE(review): the original comments ("hyperplasic", "serrated adenoma")
# were copied from dataset 1 and do not describe this dataset; presumably
# 1 = acceptable and 2 = unacceptable -- confirm against the dataset
# description.
# Node labels are the original sample indices; after isolate removal they are
# not necessarily 0..len(G)-1, so index `y` by node label rather than by
# position to keep colours aligned with the samples.
class_colors = {1: 'blue', 2: 'red'}
# Unexpected labels fall back to gray so the colour list always has exactly
# one entry per node.
color_map = [class_colors.get(y[node], 'gray') for node in G.nodes()]

nx.draw_spring(G,  node_size=20, node_color=color_map, width = 0.1)
plt.savefig('graph_2true2.png', dpi = 600)



In [251]:
# Make sure no isolated nodes remain (a no-op if they were already removed).
G.remove_nodes_from(list(nx.isolates(G)))

# Global clustering statistics of the similarity graph.
print("Transitivity:", nx.transitivity(G))
print("Average Clustering coeff :", nx.average_clustering(G))

# Diameter taken as the largest shortest-path length found from any source
# node: nx.shortest_path_length(G) yields (source, {target: length}) pairs.
# Presumably used in place of nx.diameter because the graph may be
# disconnected -- confirm.
diameter = max(
    max(lengths_by_target.values())
    for _source, lengths_by_target in nx.shortest_path_length(G)
)
print("diameter:", diameter)

Transitivity: 0.7632939115199299
Average Clustering coeff : 0.7259376829614688
diameter: 13


In [203]:
# Display the `diameter` value computed in the metrics cell above.
# NOTE(review): this cell's execution count (203) is lower than that of the
# preceding cells, so its stored output may be stale -- re-run after a
# kernel restart.
diameter

13