In [1]:
# Print the installed versions of the two libraries so the environment this
# notebook was run with is recorded alongside its outputs.
import skimage
print('skimage:', skimage.__version__)

import sklearn
print('sklearn:', sklearn.__version__)

skimage: 0.20.0
sklearn: 1.2.2


In [64]:
from skimage import io
from skimage.morphology import erosion, dilation
from skimage.feature import graycomatrix, graycoprops

import pandas as pd



import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np

import sys
import os
import glob
import math


from scipy.stats import mannwhitneyu, f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from sklearn.metrics import recall_score, make_scorer, confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold
from sklearn.linear_model import LogisticRegression
 


# PREDICTING FLG ANALYSIS

### 1. Age and thenar hyperlinearity
### 2. Age and palmar hyperlinearity
### 3. Expanded variables


In [65]:
"""
SCORING
"""


specificity = make_scorer(recall_score, pos_label=0) 
sensitivity = make_scorer(recall_score, pos_label=1)

make_scorer(recall_score, pos_label=0) 
make_scorer(recall_score, pos_label=1)

scoring = ['accuracy', 'roc_auc', 'recall', sensitivity, specificity]


sorted(sklearn.metrics.SCORERS.keys())

def model_test_cv(kfoldsplits, data=None, labels=None, model_list=None, scorers=None):
    """Cross-validate every model under every scorer and print mean/SD per pair.

    Parameters
    ----------
    kfoldsplits : int
        Number of folds passed to ``KFold(n_splits=...)`` (no shuffling).
    data, labels : array-like, optional
        Training features and targets. When omitted, the notebook-level
        ``trainDataGlobal`` / ``trainLabelsGlobal`` are read at call time.
    model_list : list of (name, estimator) tuples, optional
        Models to evaluate. Defaults to the notebook-level ``models``.
    scorers : list, optional
        Scorer names or scorer callables handed to ``cross_val_score``.
        Defaults to the notebook-level ``scoring``.

    Returns
    -------
    None
        Results are printed only.

    Side effects
    ------------
    Appends each model name to the notebook-level ``names`` list once per
    scorer; callers reset ``names = []`` before calling.
    """
    # Fall back to the notebook globals so existing calls keep working.
    data = trainDataGlobal if data is None else data
    labels = trainLabelsGlobal if labels is None else labels
    model_list = models if model_list is None else model_list
    scorers = scoring if scorers is None else scorers

    # Unshuffled KFold is deterministic, so one splitter can serve every run.
    kfold = KFold(n_splits=kfoldsplits)
    for scorer in scorers:
        for name, model in model_list:
            cv_results = cross_val_score(model, data, labels, cv=kfold, scoring=scorer)
            names.append(name)
            print(f"{name}: {scorer} mean is {cv_results.mean()*100:.1f}% with SD {cv_results.std()*100.0:.1f}")


    

In [72]:
### Load metadata
# Read the spreadsheet into a DataFrame. Later cells pull the hyperlinearity
# scores, age, the 'flg_binary' label and the clinical columns from it.
# The path is relative to the notebook's working directory.
mcv4 = pd.read_excel("merged_scores_2023_updated.xlsx")



In [66]:
# Hyperlinearity scores and age as plain lists, one entry per spreadsheet row.
thenar_scores, palmar_scores, age = (
    list(mcv4[column]) for column in ('thenar_scores', 'palmar_scores', 'age')
)

# Target column; kept as a pandas Series rather than a list.
globallabels = mcv4['flg_binary']

# Additional covariates used by the "expanded variables" models.
sex, easi, asthma, allergy, hayfever, igelevel = (
    list(mcv4[column])
    for column in ('sex', 'easi', 'h_asthmadr', 'h_allergydr', 'h_hayfeverdr', 'igelevel')
)

# Prepare features and labels (hyperlinearity and age only)

In [103]:
# Scale the thenar hyperlinearity scores by 1/10_000, so one unit of this
# feature corresponds to 10_000 raw score points (this is the unit quoted when
# the odds ratios are printed later).
thenar_scores_10000 = [x/10_000 for x in thenar_scores]

In [104]:
# Build the two-feature dataset [scaled thenar score, age], keeping only rows
# where both values are present. all_flg collects the label of every kept row
# in the same order, so features and labels stay aligned.
age_and_thenar = []
all_flg = []
for i in range(len(thenar_scores)):
    features = [thenar_scores_10000[i], age[i]]
    if any(np.isnan(value) for value in features):
        continue  # incomplete row: drop it
    age_and_thenar.append(features)
    # .iloc is positional, matching the positional list indexing above.
    all_flg.append(globallabels.iloc[i])

# Persist the kept labels for use outside the notebook.
all_flg_df = pd.DataFrame(all_flg, columns=['FLG labels'])
all_flg_df.to_csv("all_flg.csv")



In [106]:
# Scale the palmar hyperlinearity scores by 1/10000, so one unit of this
# feature corresponds to 10000 raw score points.
palmar_scores_10000 = [x/10000 for x in palmar_scores]

In [107]:
# Build the two-feature dataset [scaled palmar score, age], keeping only rows
# where both values are present.
age_and_palmar = []
palmar_rows_kept = []
for i in range(len(palmar_scores)):
    features = [palmar_scores_10000[i], age[i]]
    if any(np.isnan(value) for value in features):
        continue  # incomplete row: drop it
    age_and_palmar.append(features)
    palmar_rows_kept.append(i)

# No palmar-specific label list is built: the palmar models reuse all_flg,
# which holds the labels of rows with a valid thenar score and age. That is
# only correct if exactly the same rows survive both filters, so verify it
# instead of relying on the list lengths happening to match.
thenar_rows_kept = [
    i for i in range(len(thenar_scores))
    if not (np.isnan(thenar_scores[i]) or np.isnan(age[i]))
]
if palmar_rows_kept != thenar_rows_kept:
    raise ValueError(
        "Rows kept for age_and_palmar differ from rows kept for age_and_thenar; "
        "all_flg would be misaligned with the palmar features."
    )

In [108]:
# Sizes of the label list and of the thenar / palmar two-feature datasets,
# printed one per line in that order.
print(len(all_flg), len(age_and_thenar), len(age_and_palmar), sep="\n")


531
531
531


## Thenar hyperlinearity and age

In [70]:
# Experiment inputs: thenar hyperlinearity + age features and their labels.
datasetx, labelsx = age_and_thenar, all_flg

# Hold-out fraction and the seed reused by the split and both estimators.
test_size, seed = 0.1, 9

print("[INFO] training classifier...")
# Unfitted estimators; model_test_cv() reads this list when cross-validating.
models = [
    ('LogReg', LogisticRegression(random_state=seed, max_iter=400)),
    ('SVM-L', LinearSVC(random_state=seed, dual=False)),
]
print("done")

trainDataGlobal, testDataGlobal, trainLabelsGlobal, testLabelsGlobal = train_test_split(
    np.array(datasetx),
    np.array(labelsx),
    test_size=test_size,
    random_state=seed,
)

print("[STATUS] splitted train and test data...")
print(f"Train data  : {trainDataGlobal.shape}")
print(f"Test data   : {testDataGlobal.shape}")
print(f"Train labels: {trainLabelsGlobal.shape}")
print(f"Test labels : {testLabelsGlobal.shape}")

# model_test_cv() appends to `names`, so start from an empty list each run.
names = []
model_test_cv(kfoldsplits=10)

[INFO] training classifier...
done
[STATUS] splitted train and test data...
Train data  : (477, 2)
Test data   : (54, 2)
Train labels: (477,)
Test labels : (54,)
LogReg: accuracy mean is 64.5% with SD 9.4
SVM-L: accuracy mean is 74.0% with SD 5.2
LogReg: roc_auc mean is 66.5% with SD 9.3
SVM-L: roc_auc mean is 76.0% with SD 4.3
LogReg: recall mean is 52.9% with SD 11.0
SVM-L: recall mean is 51.2% with SD 7.8
LogReg: make_scorer(recall_score, pos_label=1) mean is 52.9% with SD 11.0
SVM-L: make_scorer(recall_score, pos_label=1) mean is 51.2% with SD 7.8
LogReg: make_scorer(recall_score, pos_label=0) mean is 72.1% with SD 13.2
SVM-L: make_scorer(recall_score, pos_label=0) mean is 89.5% with SD 6.0


## Palmar hyperlinearity and age

In [71]:
"""
AGE_AND_PALMAR
"""
datasetx = age_and_palmar
labelsx = all_flg

test_size=0.1
seed=9

print("[INFO] training classifier...")
models = []
models.append(('LogReg', LogisticRegression(random_state=seed, max_iter=400)))
models.append(('SVM-L', LinearSVC(random_state=seed, dual=False)))
print("done")

trainDataGlobal, testDataGlobal, trainLabelsGlobal, testLabelsGlobal = train_test_split(np.array(datasetx),
                                                                                          np.array(labelsx),
                                                                                          test_size=test_size,
                                                                                          random_state=seed)
                                                                           #shuffle=True) 
                                                                                                  
print("[STATUS] splitted train and test data...")
print("Train data  : {}".format(trainDataGlobal.shape))
print("Test data   : {}".format(testDataGlobal.shape))
print("Train labels: {}".format(trainLabelsGlobal.shape))
print("Test labels : {}".format(testLabelsGlobal.shape))


names   = []
model_test_cv(kfoldsplits=10)                    #

[INFO] training classifier...
done
[STATUS] splitted train and test data...
Train data  : (477, 2)
Test data   : (54, 2)
Train labels: (477,)
Test labels : (54,)
LogReg: accuracy mean is 62.7% with SD 7.8
SVM-L: accuracy mean is 70.2% with SD 4.8
LogReg: roc_auc mean is 62.8% with SD 8.8
SVM-L: roc_auc mean is 74.5% with SD 6.2
LogReg: recall mean is 51.5% with SD 10.1
SVM-L: recall mean is 47.7% with SD 4.5
LogReg: make_scorer(recall_score, pos_label=1) mean is 51.5% with SD 10.1
SVM-L: make_scorer(recall_score, pos_label=1) mean is 47.7% with SD 4.5
LogReg: make_scorer(recall_score, pos_label=0) mean is 70.3% with SD 9.8
SVM-L: make_scorer(recall_score, pos_label=0) mean is 85.5% with SD 7.4


# LogReg odds ratios

In [121]:
"""
AGE_AND_THENAR
"""
datasetx = age_and_thenar
labelsx = all_flg

test_size=0.1
seed=9

print("[INFO] training classifier...")
models = []
models.append(('LogReg', LogisticRegression(random_state=seed, max_iter=400)))
models.append(('SVM-L', LinearSVC(random_state=seed, dual=False)))
print("done")

trainDataGlobal, testDataGlobal, trainLabelsGlobal, testLabelsGlobal = train_test_split(np.array(datasetx),
                                                                                          np.array(labelsx),
                                                                                          test_size=test_size,
                                                                                          random_state=seed)
                                                                           #shuffle=True) 
                                                                                                  
print("[STATUS] splitted train and test data...")
print("Train data  : {}".format(trainDataGlobal.shape))
print("Test data   : {}".format(testDataGlobal.shape))
print("Train labels: {}".format(trainLabelsGlobal.shape))
print("Test labels : {}".format(testLabelsGlobal.shape))


names   = []
model_test_cv(kfoldsplits=10)                    #

[INFO] training classifier...
done
[STATUS] splitted train and test data...
Train data  : (477, 2)
Test data   : (54, 2)
Train labels: (477,)
Test labels : (54,)
LogReg: accuracy mean is 74.4% with SD 3.1
SVM-L: accuracy mean is 74.4% with SD 3.2
LogReg: roc_auc mean is 76.9% with SD 4.6
SVM-L: roc_auc mean is 76.9% with SD 4.5
LogReg: recall mean is 52.4% with SD 6.4
SVM-L: recall mean is 52.0% with SD 6.7
LogReg: make_scorer(recall_score, pos_label=1) mean is 52.4% with SD 6.4
SVM-L: make_scorer(recall_score, pos_label=1) mean is 52.0% with SD 6.7
LogReg: make_scorer(recall_score, pos_label=0) mean is 89.3% with SD 7.5
SVM-L: make_scorer(recall_score, pos_label=0) mean is 89.6% with SD 7.3


In [122]:
# Labels for the two columns of age_and_thenar, in column order. Defined here
# so the cell does not depend on a `feature_names` list that is only created
# further down the notebook (and was left holding palmar labels in the kernel).
thenar_feature_names = ['thenar_hyperlinearity_scores', 'age']

# Fit one logistic regression on the current training split (thenar + age).
logreg = LogisticRegression(random_state=seed, max_iter=400)
logreg.fit(trainDataGlobal, trainLabelsGlobal)

# Exponentiating a coefficient gives the odds ratio for a one-unit increase of
# that feature; the score was divided by 10_000, hence the unit in the message.
coefficients = logreg.coef_[0]
odds_ratios = np.exp(coefficients)
for i, odds_ratio in enumerate(odds_ratios):
    if i == 0:
        print(f"Odds Ratio for feature {thenar_feature_names[i]}: {odds_ratio} (per 10_000 hyperlinearity score)")
    else:
        print(f"Odds Ratio for feature {thenar_feature_names[i]}: {odds_ratio}")

Odds Ratio for feature palmar_hyperlinearity_scores: 7.88314155582787 (per 10_000 hyperlinearity score)
Odds Ratio for feature age: 0.9658315648691034


In [132]:
# Display labels used when printing odds ratios, one per model input.
feature_names = [
    'hyperlinearity_scores', 'age', 'sex', 'easi',
    'asthma', 'allergy', 'hayfever',
]

In [124]:
# Keep only the first two labels so they line up with the two-column
# (hyperlinearity score, age) models.
# NOTE(review): this overwrites the seven-name list in place; the full list is
# redefined in the "Expanded variables" section before it is needed again.
feature_names = feature_names[:2]

In [126]:
# Re-create the palmar + age split so the odds-ratio cell that follows fits
# on this training set.
datasetx, labelsx = age_and_palmar, all_flg

# Hold-out fraction and the seed reused by the split and both estimators.
test_size, seed = 0.1, 9

print("[INFO] training classifier...")
# Unfitted estimators; model_test_cv() reads this list when cross-validating.
models = [
    ('LogReg', LogisticRegression(random_state=seed, max_iter=400)),
    ('SVM-L', LinearSVC(random_state=seed, dual=False)),
]
print("done")

trainDataGlobal, testDataGlobal, trainLabelsGlobal, testLabelsGlobal = train_test_split(
    np.array(datasetx),
    np.array(labelsx),
    test_size=test_size,
    random_state=seed,
)

print("[STATUS] splitted train and test data...")
print(f"Train data  : {trainDataGlobal.shape}")
print(f"Test data   : {testDataGlobal.shape}")
print(f"Train labels: {trainLabelsGlobal.shape}")
print(f"Test labels : {testLabelsGlobal.shape}")

# model_test_cv() appends to `names`, so start from an empty list each run.
names = []
model_test_cv(kfoldsplits=10)

[INFO] training classifier...
done
[STATUS] splitted train and test data...
Train data  : (477, 2)
Test data   : (54, 2)
Train labels: (477,)
Test labels : (54,)
LogReg: accuracy mean is 70.7% with SD 5.2
SVM-L: accuracy mean is 70.4% with SD 5.0
LogReg: roc_auc mean is 74.2% with SD 6.4
SVM-L: roc_auc mean is 74.3% with SD 6.3
LogReg: recall mean is 48.7% with SD 5.2
SVM-L: recall mean is 48.1% with SD 5.3
LogReg: make_scorer(recall_score, pos_label=1) mean is 48.7% with SD 5.2
SVM-L: make_scorer(recall_score, pos_label=1) mean is 48.1% with SD 5.3
LogReg: make_scorer(recall_score, pos_label=0) mean is 85.5% with SD 7.4
SVM-L: make_scorer(recall_score, pos_label=0) mean is 85.5% with SD 7.4


In [127]:
# Labels for the two columns of age_and_palmar, in column order. Defined here
# so the printed names do not depend on whatever `feature_names` currently
# holds in the kernel.
palmar_feature_names = ['palmar_hyperlinearity_scores', 'age']

# Fit one logistic regression on the current training split (palmar + age).
logreg = LogisticRegression(random_state=seed, max_iter=400)
logreg.fit(trainDataGlobal, trainLabelsGlobal)

# Exponentiating a coefficient gives the odds ratio for a one-unit increase of
# that feature. The palmar score was divided by 10000 when the features were
# built, so one unit is 10_000 raw score points (not 5000).
coefficients = logreg.coef_[0]
odds_ratios = np.exp(coefficients)
for i, odds_ratio in enumerate(odds_ratios):
    if i == 0:
        print(f"Odds Ratio for feature {palmar_feature_names[i]}: {odds_ratio} (per 10_000 hyperlinearity score)")
    else:
        print(f"Odds Ratio for feature {palmar_feature_names[i]}: {odds_ratio}")


Odds Ratio for feature palmar_hyperlinearity_scores: 5.819976084039307 (per 5000 hyperlinearity score)
Odds Ratio for feature age: 0.9759711514824588


# Expanded variables 

In [133]:
# Display labels for the seven inputs of the expanded-variable models, in the
# order the feature rows are assembled (score, age, sex, EASI, asthma,
# allergy, hay fever).
feature_names = [
    'hyperlinearity_scores', 'age', 'sex', 'easi',
    'asthma', 'allergy', 'hayfever',
]

In [134]:
# Recompute the scaled thenar scores (raw score / 10_000) so this section does
# not rely on the value left over from the earlier cell.
thenar_scores_10000 = [x/10_000 for x in thenar_scores]


In [135]:
# Build the seven-feature thenar dataset (IgE level is not included) and the
# matching label list, keeping only rows where every feature is present.
expanded_variables_wo_ige_flg = []
expanded_variables_wo_ige = []
for i in range(len(thenar_scores_10000)):
    features = [thenar_scores_10000[i], age[i], sex[i], easi[i], asthma[i],
                allergy[i], hayfever[i]]
    if any(np.isnan(value) for value in features):
        continue  # incomplete row: drop it
    expanded_variables_wo_ige.append(features)
    # .iloc is positional, matching the positional list indexing above.
    expanded_variables_wo_ige_flg.append(globallabels.iloc[i])

# Both lengths must match: one label per kept feature row.
print(len(expanded_variables_wo_ige_flg))
print(len(expanded_variables_wo_ige))





528
528


In [136]:
# Thenar
# Experiment inputs: the seven-feature thenar dataset and its own label list.
datasetx, labelsx = expanded_variables_wo_ige, expanded_variables_wo_ige_flg

# Hold-out fraction and the seed reused by the split and both estimators.
test_size, seed = 0.1, 9

print("[INFO] training classifier...")
# Unfitted estimators; model_test_cv() reads this list when cross-validating.
models = [
    ('LogReg', LogisticRegression(random_state=seed, max_iter=400)),
    ('SVM-L', LinearSVC(random_state=seed, dual=False)),
]
print("done")

trainDataGlobal, testDataGlobal, trainLabelsGlobal, testLabelsGlobal = train_test_split(
    np.array(datasetx),
    np.array(labelsx),
    test_size=test_size,
    random_state=seed,
)

print("[STATUS] splitted train and test data...")
print(f"Train data  : {trainDataGlobal.shape}")
print(f"Test data   : {testDataGlobal.shape}")
print(f"Train labels: {trainLabelsGlobal.shape}")
print(f"Test labels : {testLabelsGlobal.shape}")

# model_test_cv() appends to `names`, so start from an empty list each run.
names = []
model_test_cv(kfoldsplits=10)

[INFO] training classifier...
done
[STATUS] splitted train and test data...
Train data  : (475, 7)
Test data   : (53, 7)
Train labels: (475,)
Test labels : (53,)
LogReg: accuracy mean is 74.8% with SD 6.3
SVM-L: accuracy mean is 74.5% with SD 6.6
LogReg: roc_auc mean is 76.6% with SD 9.7
SVM-L: roc_auc mean is 76.7% with SD 9.6
LogReg: recall mean is 52.3% with SD 11.9
SVM-L: recall mean is 52.3% with SD 11.9
LogReg: make_scorer(recall_score, pos_label=1) mean is 52.3% with SD 11.9
SVM-L: make_scorer(recall_score, pos_label=1) mean is 52.3% with SD 11.9
LogReg: make_scorer(recall_score, pos_label=0) mean is 89.1% with SD 5.9
SVM-L: make_scorer(recall_score, pos_label=0) mean is 88.8% with SD 6.1


In [137]:
# Fit one logistic regression on the current training split and report
# exp(coefficient) for each input as its odds ratio.
logreg = LogisticRegression(random_state=seed, max_iter=400)
logreg.fit(trainDataGlobal, trainLabelsGlobal)

coefficients = logreg.coef_[0]
odds_ratios = np.exp(coefficients)

for i, ratio in enumerate(odds_ratios):
    # Only the first input (the scaled hyperlinearity score) carries a unit note.
    suffix = " (per 10_000 hyperlinearity score)" if i == 0 else ""
    print(f"Odds Ratio for feature {feature_names[i]}: {ratio}{suffix}")

Odds Ratio for feature hyperlinearity_scores: 8.448283133361121 (per 10_000 hyperlinearity score)
Odds Ratio for feature age: 0.9630401552050338
Odds Ratio for feature sex: 0.6465470171163346
Odds Ratio for feature easi: 1.0000683745299495
Odds Ratio for feature asthma: 1.311884690582012
Odds Ratio for feature allergy: 1.1215817408329332
Odds Ratio for feature hayfever: 0.906418632612487


In [98]:
# Recompute the scaled palmar scores (raw score / 10_000) for the expanded
# palmar model built in the next cell.
palmar_scores_10000 = [x/10_000 for x in palmar_scores]


In [139]:
# Build the seven-feature palmar dataset (IgE level is not included) and the
# matching label list, keeping only rows where every feature is present.
expanded_variables_wo_ige_flg_palmar = []
expanded_variables_wo_ige_palmar = []
for i in range(len(palmar_scores_10000)):
    features = [palmar_scores_10000[i], age[i], sex[i], easi[i], asthma[i],
                allergy[i], hayfever[i]]
    if any(np.isnan(value) for value in features):
        continue  # incomplete row: drop it
    expanded_variables_wo_ige_palmar.append(features)
    # .iloc is positional, matching the positional list indexing above.
    expanded_variables_wo_ige_flg_palmar.append(globallabels.iloc[i])

# Both lengths must match: one label per kept feature row.
print(len(expanded_variables_wo_ige_flg_palmar))
print(len(expanded_variables_wo_ige_palmar))

# EXPANDED VARIABLES W/O IGE PALMAR
datasetx = expanded_variables_wo_ige_palmar
labelsx = expanded_variables_wo_ige_flg_palmar

# Hold-out fraction and the seed reused by the split and both estimators.
test_size = 0.1
seed = 9

print("[INFO] training classifier...")
# Unfitted estimators; model_test_cv() reads this list when cross-validating.
models = [
    ('LogReg', LogisticRegression(random_state=seed, max_iter=400)),
    ('SVM-L', LinearSVC(random_state=seed, dual=False)),
]
print("done")

trainDataGlobal, testDataGlobal, trainLabelsGlobal, testLabelsGlobal = train_test_split(
    np.array(datasetx),
    np.array(labelsx),
    test_size=test_size,
    random_state=seed,
)

print("[STATUS] splitted train and test data...")
print("Train data  : {}".format(trainDataGlobal.shape))
print("Test data   : {}".format(testDataGlobal.shape))
print("Train labels: {}".format(trainLabelsGlobal.shape))
print("Test labels : {}".format(testLabelsGlobal.shape))

# model_test_cv() appends to `names`, so start from an empty list each run.
names = []
model_test_cv(kfoldsplits=10)

528
528
[INFO] training classifier...
done
[STATUS] splitted train and test data...
Train data  : (475, 7)
Test data   : (53, 7)
Train labels: (475,)
Test labels : (53,)
LogReg: accuracy mean is 71.0% with SD 8.4
SVM-L: accuracy mean is 71.4% with SD 8.4
LogReg: roc_auc mean is 74.3% with SD 11.5
SVM-L: roc_auc mean is 74.3% with SD 11.6
LogReg: recall mean is 48.7% with SD 13.3
SVM-L: recall mean is 49.8% with SD 12.9
LogReg: make_scorer(recall_score, pos_label=1) mean is 48.7% with SD 13.3
SVM-L: make_scorer(recall_score, pos_label=1) mean is 49.8% with SD 12.9
LogReg: make_scorer(recall_score, pos_label=0) mean is 85.1% with SD 8.8
SVM-L: make_scorer(recall_score, pos_label=0) mean is 85.1% with SD 8.8


In [82]:
feature_names = ["palmar_hyperlinearity_scores", "age", "sex", "easi", "asthma",
                              "allergy", "hayfever"]

In [140]:
# Fit one logistic regression on the current training split (expanded palmar
# features) and report exp(coefficient) for each input as its odds ratio.
logreg = LogisticRegression(random_state=seed, max_iter=400)
logreg.fit(trainDataGlobal, trainLabelsGlobal)

coefficients = logreg.coef_[0]
odds_ratios = np.exp(coefficients)
for i, odds_ratio in enumerate(odds_ratios):
    if i == 0:
        # The palmar score was divided by 10_000 when the features were built,
        # so one unit of this input is 10_000 raw score points (not 5000).
        print(f"Odds Ratio for feature {feature_names[i]}: {odds_ratio} (per 10_000 hyperlinearity score)")
    else:
        print(f"Odds Ratio for feature {feature_names[i]}: {odds_ratio}")


Odds Ratio for feature hyperlinearity_scores: 6.318023256801032 (per 5000 hyperlinearity score)
Odds Ratio for feature age: 0.9730412314530356
Odds Ratio for feature sex: 0.722639703636423
Odds Ratio for feature easi: 1.0045563264179167
Odds Ratio for feature asthma: 1.225375302369595
Odds Ratio for feature allergy: 1.1354999030802664
Odds Ratio for feature hayfever: 0.970917631727967


In [None]:
 # NOTE(review): leftover scratch cell (never executed, "In [None]"). It only
 # evaluates if `i` is still defined from an earlier loop, and it uses the raw
 # palmar_scores rather than palmar_scores_10000. Consider deleting it.
 [palmar_scores[i], age[i], sex[i], easi[i], asthma[i],
                              allergy[i], hayfever[i]]