In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import balanced_accuracy_score, confusion_matrix, ConfusionMatrixDisplay

from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split

from sklearn.utils.class_weight import compute_class_weight


import joblib

import matplotlib.pyplot as plt
import seaborn as sns
import os

import TrainingUtils

In [2]:
# Resolve the current working directory plus its parent and grandparent
# directories; cwd and parent_dir are used below to build input file paths.
cwd = os.getcwd()
parent_dir = os.path.split(cwd)[0]
grandparent_dir = os.path.split(parent_dir)[0]

In [3]:
# Input configuration to train with. Options:
#   Rec_Only, Age_Sex, Symptoms, Age_Sex_Symptoms
input_type = "Age_Sex"

In [4]:
# Audio feature set to train on. Options:
#   FeatureStates, OpenSmile, MFCC
audio_feature = "FeatureStates"

In [5]:
# Classifier family to train. Options:
#   SVM, LR, MLP
model_type = "SVM"

In [6]:
# Read the pickled set of files used for training from the working directory
train_files_path = f'{cwd}/FEMH_train_files.pkl'
train_files = joblib.load(train_files_path)

In [7]:
# Load the audio features for the selected feature set. MFCC features are read
# from a pickle; every other feature set is read from a CSV whose first column
# is used as the index.
if audio_feature != 'MFCC':
    femh_df = pd.read_csv(f"{parent_dir}/Audio Features/{audio_feature}_FEMH.csv", index_col=0)
else:
    femh_df = pd.read_pickle(f"{parent_dir}/Audio Features/{audio_feature}_FEMH.pkl")

# Strip only the trailing extension: splitting once from the right keeps any
# dots that belong to the recording name itself, whereas splitting on the first
# '.' would truncate such names. Names without a dot are left unchanged.
femh_df['filename'] = femh_df['file'].str.rsplit('.', n=1).str[0]
femh_df = femh_df.drop(columns=['file'])
femh_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,504,505,506,507,508,509,510,511,pathology,filename
0,0.198858,-0.671097,-0.5538,0.430371,0.689427,0.451664,0.513923,-0.669326,3.029441,-0.140924,...,-0.467423,0.029338,0.75236,-0.57808,1.039529,0.202895,1.36336,0.883992,Atrophy,Atrophy-00002mg
1,0.545076,-0.047681,-1.163096,0.268823,0.509433,-0.136339,0.201685,-0.742708,-0.09448,0.041155,...,1.091321,-0.360934,0.273519,-0.935269,-0.079642,-0.262875,-0.492177,0.851083,Atrophy,Atrophy-0001297
2,0.298088,-0.969536,-1.342669,0.438397,-0.010687,0.75673,0.694832,2.412334,0.193469,2.678018,...,-1.58867,-1.013131,0.818641,4.83507,-0.30983,0.503137,1.045041,0.54652,Atrophy,Atrophy-0001apo
3,0.29754,-0.991699,2.728217,-0.110462,0.275403,0.368062,-0.597715,-0.435604,6.930976,-0.512924,...,-1.465304,-1.550032,0.37024,0.287871,1.318349,0.779946,2.578023,0.35936,Atrophy,Atrophy-0001qd3
4,0.56907,-0.708248,-0.53242,0.829588,0.291864,-0.042466,-0.483135,-1.057416,2.88513,0.608033,...,1.912285,1.891305,0.523399,-0.637222,0.400236,0.318714,-0.948448,0.026111,Atrophy,Atrophy-0002ipt


In [8]:
# Read the pickled table of per-recording demographics and symptoms
demographics_path = f'{parent_dir}/Audio Demographics/FEMH_demographics.pkl'
demographics = pd.read_pickle(demographics_path)
demographics.head()

Unnamed: 0,ID,Sex,Age,Disease category,Narrow pitch range,Decreased volume,Fatigue,Dryness,Lumping,Heartburn,...,Occupational vocal demand,Diabetes,Hypertension,CAD,Head and Neck Cancer,Head injury,CVA,Voice handicap index - 10,pathology,filename
0,00002mg,1,97,3.Atrophy,0,0,0,0,0,0,...,2,0,0,0,0,0,0,12,Atrophy,Atrophy-00002mg
1,0001297,1,86,3.Atrophy,0,0,0,0,0,0,...,4,0,1,0,0,0,1,36,Atrophy,Atrophy-0001297
2,0001apo,2,45,3.Atrophy,0,0,0,1,0,0,...,3,0,0,0,0,0,0,16,Atrophy,Atrophy-0001apo
3,0001qd3,1,75,3.Atrophy,1,1,0,0,1,0,...,3,0,0,1,0,0,0,19,Atrophy,Atrophy-0001qd3
4,0002ipt,1,64,3.Atrophy,0,0,0,1,0,0,...,2,0,0,1,0,0,0,34,Atrophy,Atrophy-0002ipt


In [9]:
# Join the audio features with the demographics according to input_type and
# map each pathology to either Malignant or Benign
train_df = TrainingUtils.format_input_dataframe(
    femh_df,
    demographics,
    train_files,
    input_type=input_type,
)
print(train_df.shape)
train_df.head()

(1340, 516)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,506,507,508,509,510,511,pathology,filename,Age,Sex
0,0.198858,-0.671097,-0.5538,0.430371,0.689427,0.451664,0.513923,-0.669326,3.029441,-0.140924,...,0.75236,-0.57808,1.039529,0.202895,1.36336,0.883992,Benign,Atrophy-00002mg,97,1
1,0.545076,-0.047681,-1.163096,0.268823,0.509433,-0.136339,0.201685,-0.742708,-0.09448,0.041155,...,0.273519,-0.935269,-0.079642,-0.262875,-0.492177,0.851083,Benign,Atrophy-0001297,86,1
2,0.29754,-0.991699,2.728217,-0.110462,0.275403,0.368062,-0.597715,-0.435604,6.930976,-0.512924,...,0.37024,0.287871,1.318349,0.779946,2.578023,0.35936,Benign,Atrophy-0001qd3,75,1
3,0.56907,-0.708248,-0.53242,0.829588,0.291864,-0.042466,-0.483135,-1.057416,2.88513,0.608033,...,0.523399,-0.637222,0.400236,0.318714,-0.948448,0.026111,Benign,Atrophy-0002ipt,64,1
4,0.195015,-0.90238,-0.70036,0.338424,0.612395,0.594198,0.045864,-1.305794,2.438155,0.67744,...,0.061292,-1.131177,1.715775,0.550369,1.798784,-0.572442,Benign,Atrophy-0002m5r,76,1


In [10]:
# Split the training frame into model inputs (everything except the label and
# the filename key) and the label column
target_column = 'pathology'
X_train = train_df.drop(columns=[target_column, 'filename'])
y_train = train_df[target_column]

In [11]:
# Audio feature names: every femh_df column other than the label and filename
non_feature_columns = ['pathology', 'filename']
audio_features = list(femh_df.columns.drop(non_feature_columns))
audio_features

['0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '30',
 '31',
 '32',
 '33',
 '34',
 '35',
 '36',
 '37',
 '38',
 '39',
 '40',
 '41',
 '42',
 '43',
 '44',
 '45',
 '46',
 '47',
 '48',
 '49',
 '50',
 '51',
 '52',
 '53',
 '54',
 '55',
 '56',
 '57',
 '58',
 '59',
 '60',
 '61',
 '62',
 '63',
 '64',
 '65',
 '66',
 '67',
 '68',
 '69',
 '70',
 '71',
 '72',
 '73',
 '74',
 '75',
 '76',
 '77',
 '78',
 '79',
 '80',
 '81',
 '82',
 '83',
 '84',
 '85',
 '86',
 '87',
 '88',
 '89',
 '90',
 '91',
 '92',
 '93',
 '94',
 '95',
 '96',
 '97',
 '98',
 '99',
 '100',
 '101',
 '102',
 '103',
 '104',
 '105',
 '106',
 '107',
 '108',
 '109',
 '110',
 '111',
 '112',
 '113',
 '114',
 '115',
 '116',
 '117',
 '118',
 '119',
 '120',
 '121',
 '122',
 '123',
 '124',
 '125',
 '126',
 '127',
 '128',
 '129',
 '130',
 '131',
 '132',
 '133',
 '134',
 '135',
 '136',
 '137',
 '138'

In [12]:
# Whatever is left in X_train once the audio features are removed is treated
# as the symptom/demographic input (if any)
remaining_columns = X_train.columns.drop(audio_features)
symptom_features = list(remaining_columns)
symptom_features

['Age', 'Sex']

In [13]:
# Create the preprocessor from the feature-set name and the two column lists
preprocessor = TrainingUtils.create_preprocessor(
    audio_feature,
    audio_features,
    symptom_features,
)

In [14]:
# Balanced weight for each class present in the training labels
class_labels = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=class_labels, y=y_train)

# Pair every class label with its weight
class_weight_dict = dict(zip(class_labels, class_weights))
class_weight_dict

{'Benign': 0.5134099616858238, 'Malignant': 19.142857142857142}

In [15]:
# Assemble the classification pipeline from the preprocessor, the class
# weights and the selected model type
pipeline = TrainingUtils.create_classification_pipeline(
    preprocessor,
    class_weight_dict,
    model_type,
)

In [16]:
# Parameter grid handed to TrainingUtils.train_model in the next cell
param_grid = {
    'classifier__C': [0.1],
    'classifier__gamma': ['scale'],
    'classifier__kernel': ['linear', 'rbf'],
}

In [17]:
# Train on the training split using the pipeline and parameter grid defined above
model = TrainingUtils.train_model(
    pipeline,
    X_train,
    y_train,
    model_type=model_type,
    param_grid=param_grid,
)

Training SVM 
 parameter grid: {'classifier__C': [0.1], 'classifier__gamma': ['scale'], 'classifier__kernel': ['linear', 'rbf']}
Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV 1/5] END classifier__C=0.1, classifier__gamma=scale, classifier__kernel=linear;, score=(train=0.943, test=0.771) total time=   2.5s
[CV 2/5] END classifier__C=0.1, classifier__gamma=scale, classifier__kernel=linear;, score=(train=0.902, test=0.892) total time=   2.3s
[CV 3/5] END classifier__C=0.1, classifier__gamma=scale, classifier__kernel=linear;, score=(train=0.928, test=0.643) total time=   2.8s
[CV 4/5] END classifier__C=0.1, classifier__gamma=scale, classifier__kernel=linear;, score=(train=0.926, test=0.798) total time=   2.3s
[CV 5/5] END classifier__C=0.1, classifier__gamma=scale, classifier__kernel=linear;, score=(train=0.924, test=0.791) total time=   1.4s
[CV 1/5] END classifier__C=0.1, classifier__gamma=scale, classifier__kernel=rbf;, score=(train=0.948, test=0.794) total time=   2.6

In [18]:
# Persisting the trained model is opt-in, so a plain "Run All" writes nothing
# and cannot overwrite a previously saved model. Set SAVE_MODEL to True to
# write the model under "<parent_dir>/Pre-Trained Models/".
SAVE_MODEL = False
if SAVE_MODEL:
    joblib.dump(model, f'{parent_dir}/Pre-Trained Models/{model_type}_{audio_feature}_{input_type}.pkl', compress=True)