In [1]:
import pandas as pd
import numpy as np 
import seaborn as sns

from scipy import stats

In [2]:
df = pd.read_excel("../gallstone/dataset/dataset-uci.xlsx", sheet_name='sheet1').reset_index()

# dataset source and documentation can be found at https://archive.ics.uci.edu/dataset/1150/gallstone-1 

# Load Data 

In [15]:
df.columns = ['ID', 'gallstoneStatus', 'age', 'gender', 'comorbidity', 'coroaryArtDisease', 'hypothyromidism',
             'hyperlipiidemia', 'diabetes', 'height', 'weight', 'bmi', 'totalBodyWater', 'extracellularWater',
             'intracellularWater', 'extracellularFluid', 'totalBodyFatRatio%', 'leanMass%', 'protein%', 'visceralFatRat',
             'boneMass', 'muscleMass', 'obesity%', 'totalFat', 'visceralFatArea', 'visceralMuscleArea', 'hepaticFatAcc', 
             'glucose', 'TC', 'LDL', 'HDL', 'triglyceride', 'AST', 'ALT', 'ALP', 'creatinine', 'GFR', 'reactiveProtein', 
             'hemoglobin', 'vitaminD']

In [16]:
df['genderBin'] = df['gender'].apply(lambda x: 'female' if x == 1 else 'male')
df['gallstoneStatusBin'] = df['gallstoneStatus'].apply(lambda x: x == 1)
df['comordbidBin'] = df['comorbidity'].apply(lambda x: x == 1)

# Descriptive Stats Stuff

In [17]:
patientSumm = ['age', 'height', 'weight', 'bmi' , 'obesity%']

df[patientSumm].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,319.0,48.068966,12.114558,20.0,38.5,49.0,56.0,96.0
height,319.0,167.15674,10.05303,145.0,159.5,168.0,175.0,191.0
weight,319.0,80.56489,15.709069,42.9,69.6,78.8,91.25,143.5
bmi,319.0,28.877116,5.313707,17.4,25.25,28.3,31.85,49.7
obesity%,319.0,35.850125,109.799718,0.4,13.9,25.6,41.75,1954.0


In [18]:
df['gallstoneStatusBin'].value_counts()

gallstoneStatusBin
False    161
True     158
Name: count, dtype: int64

In [19]:
df.groupby(by=['gallstoneStatusBin', 'genderBin'])['ID'].count()

gallstoneStatusBin  genderBin
False               female       67
                    male         94
True                female       90
                    male         68
Name: ID, dtype: int64

### Is There an Association Between Patient Demographics and Gallstone Incidence? 

##### Gender

In [20]:
gender_gall = stats.fisher_exact(pd.crosstab(df['gallstoneStatus'], df['gender']))
gender_gall.statistic, gender_gall.pvalue

(np.float64(1.8568920105355575), np.float64(0.007195029358147645))

##### Height, Weight, Age, BMI, obesity

In [21]:
gallstone_summ_stat = { patient: stats.spearmanr(df[patient], df['gallstoneStatus']) for patient in patientSumm }
gallstone_summ_stat = [(item, gallstone) for item, gallstone in gallstone_summ_stat.items()]
gallstone_summ_stat

[('age',
  SignificanceResult(statistic=np.float64(0.04264314616421671), pvalue=np.float64(0.4478606826300362))),
 ('height',
  SignificanceResult(statistic=np.float64(-0.12100414394945586), pvalue=np.float64(0.030721273922701017))),
 ('weight',
  SignificanceResult(statistic=np.float64(0.051167433182639964), pvalue=np.float64(0.3623539234091636))),
 ('bmi',
  SignificanceResult(statistic=np.float64(0.12113140932311382), pvalue=np.float64(0.030544738979242862))),
 ('obesity%',
  SignificanceResult(statistic=np.float64(-0.003131991330908795), pvalue=np.float64(0.9555651080770513)))]

In [22]:
df.groupby(by=['gallstoneStatusBin'])['bmi'].mean()

gallstoneStatusBin
False    28.238509
True     29.527848
Name: bmi, dtype: float64

In [23]:
df.groupby(by=['gallstoneStatusBin'])['height'].mean()

gallstoneStatusBin
False    168.229814
True     166.063291
Name: height, dtype: float64

### Do those with comordibidies have more gallstone 

In [24]:
comordbid_gals = stats.mannwhitneyu(df[df['comordbidBin'] == True]['gallstoneStatus'], df[df['comordbidBin'] == False]['gallstoneStatus'])
comordbid_gals.statistic, comordbid_gals.pvalue

(np.float64(10884.5), np.float64(0.9939552402033391))

# Results 

In [25]:
df_lab = ['glucose', 'TC', 'LDL', 'HDL', 'triglyceride', 'AST', 'ALP', 'creatinine', 'GFR', 'reactiveProtein', 
          'hemoglobin', 'vitaminD']

df_bioimpend = ['totalBodyWater', 'extracellularWater', 'intracellularWater', 'extracellularFluid',
                'totalBodyFatRatio%', 'leanMass%', 'protein%', 'visceralFatRat', 'boneMass', 'muscleMass',
                'obesity%', 'totalFat', 'visceralFatArea', 'visceralMuscleArea', 'hepaticFatAcc']

df_demographs = ['gallstoneStatus', 'gender', 'age', 'comorbidity', 'coroaryArtDisease', 'diabetes', 'bmi',
                 'height', 'hyperlipiidemia', 'hypothyromidism', 'weight', 'coroaryArtDisease', 'comorbidity']

df_results = df_lab + df_bioimpend + df_demographs

df_results = df[df_results]

### Which features are most important for gallstone?

In [26]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split

In [27]:
X = df_results.drop(columns=['gallstoneStatus'])
Y = df_results['gallstoneStatus']

In [28]:
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.3, random_state=42)

fscore, pvalue = f_classif(Xtrain, Ytrain)

In [29]:
fscores = pd.Series(fscore, index=Xtrain.columns)
features = fscores[fscores > 0.70]
features = features.sort_values(ascending=False)
features 

vitaminD              25.275755
reactiveProtein       15.978928
boneMass              14.945238
hemoglobin            13.644494
extracellularWater    10.882063
extracellularFluid     7.927943
HDL                    7.003220
gender                 6.267352
hyperlipiidemia        6.226669
totalBodyFatRatio%     5.910276
leanMass%              5.909446
creatinine             5.554153
totalBodyWater         5.496420
muscleMass             4.370901
AST                    3.883265
visceralMuscleArea     2.924792
comorbidity            2.380052
comorbidity            2.380052
triglyceride           2.300982
ALP                    2.274790
height                 2.181929
totalFat               1.782059
obesity%               1.771478
protein%               1.319335
visceralFatRat         1.108358
coroaryArtDisease      1.065820
coroaryArtDisease      1.065820
glucose                0.999506
diabetes               0.935591
visceralFatArea        0.868278
dtype: float64

In [30]:
# save features to csv 

featuresList = features.index.tolist()
pd.Series(features).to_csv('fscore_features.csv')

# save results to csv

df_results.to_csv('df_results.csv')

### features bioimpendance only

df_bioimpend = df_bioimpend + df_demographs
df_bioimpend = df_bioimpend.drop(columns=['gallstoneStatus'])

Xbio = df[df_bioimpend]
Ybio = df['gallstoneStatus']


#df_bioimpend  = df[df_bioimpend]
#df_bioimpend.to_csv('df_bioimpend.csv')

fstat_bio, pvalue_bio = f_classif(Xbio, Ybio)

bioimpend_gal = pd.DataFrame({'feature': Xbio.columns, 'fstattistic': fstat_bio, 'pvalue': pvalue_bio})
bioimpend_gal = bioimpend_gal.sort_values(by='fstattistic', ascending=False).reset_index()
bioimpend_gal

# Heatmaps

In [31]:
# TODO group feature importances in bins, then plot in heatmaps using gallstone means as colour
# # TODO separate do table for gallstone non gallstoen means 

In [32]:
features_df = features.to_frame().reset_index()
features_df.columns = ['feature', 'fscore']

df_resultsHM = df_results.groupby(by=['gallstoneStatus']).mean().reset_index()
df_resultsHM = df_resultsHM.T.reset_index()
df_resultsHM.columns = ['feature', 'nonGallstoneMean', 'gallstoneMean']

df_resultsHM = df_resultsHM.merge(features_df, on='feature', how='inner')
df_resultsHM = df_resultsHM.drop_duplicates()
df_resultsHM = df_resultsHM.set_index('feature').sort_index(level='fscore', ascending=False)