This notebook is for trying out the following idea:


*   First, train a regression model on a dataset with the symptoms and differential diagnoses to predict the DD from just the symptoms.
*   Then, train a separate classification model to predict the pathology purely from the differential diagnosis.
*   Finally, combine the two models and apply the prediction of the second model to the output of the first.

Conclusion:


*   Differential diagnoses are pretty good at predicting the pathology
*   However the first step does not work very well, and the two steps combined seem to give an overall recall of around 53%.




## Loading the data

In [None]:
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [None]:
!pip install git+https://github.com/nina-adhikari/disease_prediction --force-reinstall

Collecting git+https://github.com/nina-adhikari/disease_prediction
  Cloning https://github.com/nina-adhikari/disease_prediction to /tmp/pip-req-build-vlq15qbx
  Running command git clone --filter=blob:none --quiet https://github.com/nina-adhikari/disease_prediction /tmp/pip-req-build-vlq15qbx
  Resolved https://github.com/nina-adhikari/disease_prediction to commit 7417f17bdfd1ea4710ef331f1259762809f62140
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: disease-prediction
  Building wheel for disease-prediction (setup.py) ... [?25l[?25hdone
  Created wheel for disease-prediction: filename=disease_prediction-0.1-py3-none-any.whl size=4658 sha256=59c7dd8f05177879e207aecbdb2aea996483c82741abb9b256d7ba47f59bb06e
  Stored in directory: /tmp/pip-ephem-wheel-cache-breo7lxf/wheels/c1/30/69/a4efc8ebfadf754cf631ddaf3e9e848bd514c4db078acf14f5
Successfully built disease-prediction
Installing collected packages: disease-prediction
  Attempting uninstall

In [None]:
from disease_prediction.data import datasets as ds

In [None]:
DRIVE = '/content/drive/MyDrive/Disease-Prediction/ddx-dataset/'

SUBSETS = ['train', 'test', 'validate']

In [None]:
df = ds.load_datasets(subsets=SUBSETS,ddx=True, directory=DRIVE)

In [None]:
for subset in SUBSETS:
    df[subset].set_index('index', inplace=True)

In [None]:
d = {'Y': 1, 'N': 0}

# Convert Y/N to 1/0
for subset in SUBSETS:
    df[subset]['lesion_larger_than_1cm'] = df[subset]['lesion_larger_than_1cm'].map(d)

Optionally, drop the rows where one of our ten diseases is not a pathology:

In [None]:
for subset in SUBSETS:
    df[subset] = df[subset][df[subset]['PATHOLOGY'].isin(ds.DISEASES)]

In [None]:
df['train']

Unnamed: 0_level_0,AGE,DIFFERENTIAL_DIAGNOSIS,SEX,PATHOLOGY,INITIAL_EVIDENCE,swollen_nodes,std,sweating,diarrhea,pain,...,menarche_12,breastfed_9,J81,Z99.2,i10,i25.1,ww_effort,cancer_metastatic,osteoporosis,ww_movement
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,21,"{'Acute COPD exacerbation / infection': None, ...",M,HIV (initial infection),sweating,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,8,"{'Acute COPD exacerbation / infection': None, ...",M,Allergic sinusitis,itchy_nose,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
13,49,"{'Acute COPD exacerbation / infection': None, ...",F,Anaphylaxis,lost_consciousness,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18,69,"{'Acute COPD exacerbation / infection': None, ...",M,Tuberculosis,cough,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
19,30,"{'Acute COPD exacerbation / infection': None, ...",F,Tuberculosis,cough_blood,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1023490,30,"{'Acute COPD exacerbation / infection': None, ...",M,HIV (initial infection),nausea,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1023491,7,"{'Acute COPD exacerbation / infection': None, ...",F,HIV (initial infection),nausea,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1023492,66,"{'Acute COPD exacerbation / infection': None, ...",F,HIV (initial infection),pain,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1023493,54,"{'Acute COPD exacerbation / infection': None, ...",M,HIV (initial infection),sweating,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Preprocessing

In [None]:
import pandas as pd

differentials = {}

for subset in SUBSETS:
    temp = pd.DataFrame(df[subset].index.tolist()).join(pd.DataFrame(df[subset]['DIFFERENTIAL_DIAGNOSIS'].values.tolist()), how='left', validate='1:1').set_index(0)
    differentials[subset] = df[subset].join(temp[ds.DISEASES].add_prefix('DD_'), how='left', validate='1:1', ).drop(columns=['DIFFERENTIAL_DIAGNOSIS']).fillna(0)

In [None]:
differentials['train']

Unnamed: 0_level_0,AGE,SEX,PATHOLOGY,INITIAL_EVIDENCE,swollen_nodes,std,sweating,diarrhea,pain,pain_char,...,DD_HIV (initial infection),DD_Whooping cough,DD_Chagas,DD_Tuberculosis,DD_Influenza,DD_SLE,DD_Sarcoidosis,DD_Anaphylaxis,DD_Allergic sinusitis,DD_Localized edema
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,21,M,HIV (initial infection),sweating,0,0,0,0,0,,...,0.518950,0.0,0.321782,0.000000,0.000000,0.0,0.024300,0.000000,0.0,0.0
10,8,M,Allergic sinusitis,itchy_nose,0,0,0,0,1,heavy,...,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1.0,0.0
13,49,F,Anaphylaxis,lost_consciousness,0,0,0,0,0,,...,0.055240,0.0,0.042723,0.000000,0.000000,0.0,0.000000,0.088188,0.0,0.0
18,69,M,Tuberculosis,cough,0,0,0,0,1,heavy,...,0.000000,0.0,0.035444,0.350517,0.000000,0.0,0.000000,0.000000,0.0,0.0
19,30,F,Tuberculosis,cough_blood,0,0,0,0,1,a knife stroke,...,0.000000,0.0,0.000000,0.389381,0.000000,0.0,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1023490,30,M,HIV (initial infection),nausea,0,0,0,0,0,,...,0.271761,0.0,0.254517,0.000000,0.193150,0.0,0.000000,0.000000,0.0,0.0
1023491,7,F,HIV (initial infection),nausea,0,0,0,0,0,,...,0.434512,0.0,0.237610,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0
1023492,66,F,HIV (initial infection),pain,0,0,0,0,0,,...,0.345198,0.0,0.219237,0.000000,0.278058,0.0,0.016098,0.000000,0.0,0.0
1023493,54,M,HIV (initial infection),sweating,0,0,0,0,0,,...,0.396437,0.0,0.254620,0.000000,0.313172,0.0,0.000000,0.000000,0.0,0.0


In [None]:
extra_columns = set(differentials['train'].columns) - set(differentials['test'].columns)

In [None]:
extra_columns == set(differentials['train'].columns) - set(differentials['validate'].columns)

True

Drop the columns in the train set which are not present in the validation and test sets:

In [None]:
differentials['train'] = differentials['train'].drop(columns=extra_columns)

In [None]:
for subset in SUBSETS:
    print(len(differentials[subset].columns))

105
105
105


## Predicting DD's from symptoms

In [None]:
DD_DISEASES = [col for col in differentials['train'].columns if col.startswith('DD_')]

In [None]:
X_first = {}
y_first = {}

for subset in SUBSETS:
    X_first[subset] = differentials[subset].drop(columns=['PATHOLOGY']+DD_DISEASES)
    y_first[subset] = differentials[subset][DD_DISEASES].copy()

In [None]:
CATEGORICAL_FEATURES = [col for col in X_first['train'].columns if X_first['train'][col].dtype == 'object']

CATEGORICAL_FEATURES

['SEX',
 'INITIAL_EVIDENCE',
 'pain_char',
 'pain_somewhere',
 'pain_radiate',
 'lesion_color',
 'lesions_peeling',
 'lesion_location',
 'trav1',
 'swelling_location']

In [None]:
NUMERICAL_FEATURES = [col for col in X_first['train'].columns if (set(X_first['train'][col].unique()) != set([0,1])) and (X_first['train'][col].dtype != 'object')]

NUMERICAL_FEATURES

['AGE',
 'pain_intensity',
 'pain_precise',
 'pain_sudden',
 'lesion_pain_swollen',
 'lesion_pain_intense',
 'itching_severity']

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import VarianceThreshold

SEED = 42

### Linear regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse

pipe = make_pipeline(
    ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), NUMERICAL_FEATURES),
            ('cat', OneHotEncoder(handle_unknown='ignore'), CATEGORICAL_FEATURES),
            ],
        remainder='passthrough'
    ),
    VarianceThreshold(),
    #SelectFromModel(LinearRegression()),
    LinearRegression()
)




In [None]:
pipe.fit(X_first['train'], y_first['train'])

In [None]:
pipe.score(X=X_first['validate'], y=y_first['validate'])

0.45513734537277967

### Decision tree

In [None]:
from sklearn.tree import DecisionTreeRegressor

dt = DecisionTreeRegressor(random_state=SEED)

pipe_tree = make_pipeline(
    ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), NUMERICAL_FEATURES),
            ('cat', OneHotEncoder(handle_unknown='ignore'), CATEGORICAL_FEATURES),
            ],
        remainder='passthrough'
    ),
    dt
)

pipe_tree.fit(X_first['train'], y_first['train'])

In [None]:
pipe_tree.score(X=X_first['validate'], y=y_first['validate'])

0.3313552199656854

### SGD Regression

In [None]:
from sklearn.linear_model import SGDRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import GridSearchCV

sgd = SGDRegressor(random_state=SEED, shuffle=True, penalty='elasticnet')

param_grid = {
    'alpha': [0.0001,0.1],
    #'average': False,
    #'early_stopping': False,
    #'epsilon': 0.1,
    #'eta0': 0.01,
    #'l1_ratio': 0.15,
    #'learning_rate': 'invscaling',
    'loss': ('squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'),
    #'n_iter_no_change': 5,
    'penalty': ('l2', 'l1', 'elasticnet'),
    #'power_t': 0.25,
    #'tol': 0.001,
    #'validation_fraction': 0.1,
    }

gs = GridSearchCV(
    sgd,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
)

pipe_sgd = make_pipeline(
    ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), NUMERICAL_FEATURES),
            ('cat', OneHotEncoder(handle_unknown='ignore'), CATEGORICAL_FEATURES),
            ],
        remainder='passthrough'
    ),
    #gs,
    MultiOutputRegressor(sgd, ),

)

In [None]:
pipe_sgd.fit(X_first['train'], y_first['train'])

In [None]:
pipe_sgd.score(X=X_first['validate'], y=y_first['validate'])

0.45460216065452624

In [None]:
pipe_sgd.steps[1][1].best_params_

{'alpha': 0.0001, 'loss': 'squared_error', 'penalty': 'elasticnet'}

### AdaBoot Regression

In [None]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import LinearRegression

ada = AdaBoostRegressor(random_state=SEED, n_estimators=5, estimator=LinearRegression(), loss='square')

pipe_ada = make_pipeline(
    ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), NUMERICAL_FEATURES),
            ('cat', OneHotEncoder(handle_unknown='ignore'), CATEGORICAL_FEATURES),
            ],
        remainder='passthrough'
    ),
    MultiOutputRegressor(ada, ),
)

In [None]:
pipe_ada.fit(X_first['train'], y_first['train'])

In [None]:
pipe_ada.score(X=X_first['validate'], y=y_first['validate'])

0.4539395478233711

### Model comparison

In [None]:
X = X_first['train'].iloc[0:3]
y = y_first['train'].iloc[0:3]

n_train = X.shape[0]

In [None]:
n_train

3

In [None]:
y

Unnamed: 0_level_0,DD_HIV (initial infection),DD_Whooping cough,DD_Chagas,DD_Tuberculosis,DD_Influenza,DD_SLE,DD_Sarcoidosis,DD_Anaphylaxis,DD_Allergic sinusitis,DD_Localized edema
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0.51895,0.0,0.321782,0.0,0.0,0.0,0.0243,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
13,0.05524,0.0,0.042723,0.0,0.0,0.0,0.0,0.088188,0.0,0.0


In [None]:
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
from sklearn.svm import LinearSVR
import numpy as np
from itertools import combinations
from math import factorial

param_grid = [
    {"kernel": ["linear"]},
    {"kernel": ["poly"], "degree": [2, 3]},
    {"kernel": ["rbf"]},
]

svr = LinearSVR(random_state=SEED)

cv = RepeatedStratifiedKFold(n_splits=2, n_repeats=10, random_state=0)

search = GridSearchCV(estimator=svc, param_grid=param_grid, scoring="roc_auc", cv=cv)
search.fit(X, y)

ValueError: Supported target types are: ('binary', 'multiclass'). Got 'continuous-multioutput' instead.

In [None]:
pairwise_bayesian = []

for model_i, model_k in combinations(range(len(model_scores)), 2):
    model_i_scores = model_scores.iloc[model_i].values
    model_k_scores = model_scores.iloc[model_k].values
    differences = model_i_scores - model_k_scores
    t_post = t(
        df, loc=np.mean(differences), scale=corrected_std(differences, n_train, n_test)
    )
    worse_prob = t_post.cdf(rope_interval[0])
    better_prob = 1 - t_post.cdf(rope_interval[1])
    rope_prob = t_post.cdf(rope_interval[1]) - t_post.cdf(rope_interval[0])

    pairwise_bayesian.append([worse_prob, better_prob, rope_prob])

pairwise_bayesian_df = pd.DataFrame(
    pairwise_bayesian, columns=["worse_prob", "better_prob", "rope_prob"]
).round(3)

pairwise_comp_df = pairwise_comp_df.join(pairwise_bayesian_df)
pairwise_comp_df

## Predicting disease from symptoms

In [None]:
X_second = {}
y_second = {}

for subset in SUBSETS:
    X_second[subset] = differentials[subset][DD_DISEASES].copy()
    y_second[subset] = differentials[subset]['PATHOLOGY'].copy()

In [None]:
X_second['train']

Unnamed: 0_level_0,DD_HIV (initial infection),DD_Whooping cough,DD_Chagas,DD_Tuberculosis,DD_Influenza,DD_SLE,DD_Sarcoidosis,DD_Anaphylaxis,DD_Allergic sinusitis,DD_Localized edema
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0.518950,0.0,0.321782,0.000000,0.000000,0.0,0.024300,0.000000,0.0,0.0
10,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1.0,0.0
13,0.055240,0.0,0.042723,0.000000,0.000000,0.0,0.000000,0.088188,0.0,0.0
18,0.000000,0.0,0.035444,0.350517,0.000000,0.0,0.000000,0.000000,0.0,0.0
19,0.000000,0.0,0.000000,0.389381,0.000000,0.0,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
1023490,0.271761,0.0,0.254517,0.000000,0.193150,0.0,0.000000,0.000000,0.0,0.0
1023491,0.434512,0.0,0.237610,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0
1023492,0.345198,0.0,0.219237,0.000000,0.278058,0.0,0.016098,0.000000,0.0,0.0
1023493,0.396437,0.0,0.254620,0.000000,0.313172,0.0,0.000000,0.000000,0.0,0.0


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report as cr

lr = LogisticRegression(max_iter=100)


lr.fit(X_second['train'], y_second['train'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
print(cr(y_second['validate'], lr.predict(X_second['validate'])))

                         precision    recall  f1-score   support

     Allergic sinusitis       1.00      1.00      1.00      2136
            Anaphylaxis       0.93      1.00      0.97      3754
                 Chagas       0.93      0.76      0.84      1124
HIV (initial infection)       0.97      0.98      0.97      3852
              Influenza       1.00      0.98      0.99      3590
        Localized edema       0.98      1.00      0.99      3694
     Pulmonary embolism       1.00      0.81      0.89       956
                    SLE       0.97      0.99      0.98      1579
            Sarcoidosis       0.96      0.99      0.98      3028
           Tuberculosis       1.00      0.99      1.00      2007
         Whooping cough       1.00      1.00      1.00       545

               accuracy                           0.97     26265
              macro avg       0.98      0.95      0.96     26265
           weighted avg       0.97      0.97      0.97     26265



In [None]:
print(cr(y_second['validate'], lr.predict(pipe.predict(X_first['validate']))))



                         precision    recall  f1-score   support

     Allergic sinusitis       0.85      0.84      0.84      2136
            Anaphylaxis       0.92      0.38      0.54      3754
                 Chagas       0.29      0.32      0.31      1124
HIV (initial infection)       0.41      0.78      0.54      3852
              Influenza       0.98      0.16      0.28      3590
        Localized edema       0.76      0.60      0.67      3694
     Pulmonary embolism       0.09      0.45      0.15       956
                    SLE       1.00      0.32      0.48      1579
            Sarcoidosis       0.62      0.55      0.58      3028
           Tuberculosis       0.54      0.52      0.53      2007
         Whooping cough       1.00      1.00      1.00       545

               accuracy                           0.52     26265
              macro avg       0.68      0.54      0.54     26265
           weighted avg       0.71      0.52      0.53     26265



In [None]:
pipe.predict(X_first['validate'])

array([[ 4.49161530e-02,  8.82387161e-04,  1.14204407e-01, ...,
         5.58061600e-02,  8.88061523e-03,  2.12120056e-01],
       [ 4.93240356e-03, -7.61032104e-04,  1.27983093e-02, ...,
         6.99043274e-03,  5.32890320e-01,  1.81579590e-02],
       [ 5.31635284e-02,  6.96420670e-04,  1.16756439e-01, ...,
         5.93590736e-02,  6.39343262e-03,  2.19760895e-01],
       ...,
       [ 2.52056122e-02, -4.03881073e-04,  2.23274231e-02, ...,
         6.97755814e-03,  1.25076294e-01, -2.52151489e-03],
       [-2.00843811e-03,  5.41210175e-04,  2.02140808e-02, ...,
        -9.74178314e-04,  3.40270996e-03,  2.26974487e-04],
       [ 1.85312271e-01,  5.70774078e-04,  1.74026489e-01, ...,
         2.35033035e-03,  3.52478027e-03, -1.31378174e-02]])

# Appendix

## Checking how well differential diagnosis predicts pathology

In [None]:
import pandas as pd
import numpy as np

In [None]:
X_train = df['train']['DIFFERENTIAL_DIAGNOSIS'].copy()
y_train = df['train']['PATHOLOGY'].copy()

X_val = df['validate']['DIFFERENTIAL_DIAGNOSIS'].copy()
y_val = df['validate']['PATHOLOGY'].copy()

In [None]:
X_train

index
1          {'Acute COPD exacerbation / infection': None, ...
10         {'Acute COPD exacerbation / infection': None, ...
13         {'Acute COPD exacerbation / infection': None, ...
18         {'Acute COPD exacerbation / infection': None, ...
19         {'Acute COPD exacerbation / infection': None, ...
                                 ...                        
1023490    {'Acute COPD exacerbation / infection': None, ...
1023491    {'Acute COPD exacerbation / infection': None, ...
1023492    {'Acute COPD exacerbation / infection': None, ...
1023493    {'Acute COPD exacerbation / infection': None, ...
1023494    {'Acute COPD exacerbation / infection': None, ...
Name: DIFFERENTIAL_DIAGNOSIS, Length: 209143, dtype: object

In [None]:
y_train

index
1          HIV (initial infection)
10              Allergic sinusitis
13                     Anaphylaxis
18                    Tuberculosis
19                    Tuberculosis
                    ...           
1023490    HIV (initial infection)
1023491    HIV (initial infection)
1023492    HIV (initial infection)
1023493    HIV (initial infection)
1023494    HIV (initial infection)
Name: PATHOLOGY, Length: 209143, dtype: object

In [None]:
X_train = pd.DataFrame(X_train.values.tolist()).fillna(0)
X_val = pd.DataFrame(X_val.values.tolist()).fillna(0)

In [None]:
X_train = X_train[ds.DISEASES]
X_val = X_val[ds.DISEASES]

In [None]:
X_train

Unnamed: 0,HIV (initial infection),Whooping cough,Chagas,Tuberculosis,Influenza,SLE,Sarcoidosis,Anaphylaxis,Allergic sinusitis,Localized edema
0,0.518950,0.0,0.321782,0.000000,0.000000,0.0,0.024300,0.000000,0.0,0.0
1,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1.0,0.0
2,0.055240,0.0,0.042723,0.000000,0.000000,0.0,0.000000,0.088188,0.0,0.0
3,0.000000,0.0,0.035444,0.350517,0.000000,0.0,0.000000,0.000000,0.0,0.0
4,0.000000,0.0,0.000000,0.389381,0.000000,0.0,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
209138,0.271761,0.0,0.254517,0.000000,0.193150,0.0,0.000000,0.000000,0.0,0.0
209139,0.434512,0.0,0.237610,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0
209140,0.345198,0.0,0.219237,0.000000,0.278058,0.0,0.016098,0.000000,0.0,0.0
209141,0.396437,0.0,0.254620,0.000000,0.313172,0.0,0.000000,0.000000,0.0,0.0


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report as cr

lr = LogisticRegression(max_iter=10, solver='saga')

new_cols = [col for col in X_train.columns if col not in ['Spontaneous pneumothorax', 'Spontaneous rib fracture']]
new_train = X_train[new_cols].copy()
new_val = X_val[new_cols].copy()

lr.fit(new_train, y_train)



In [None]:
print(cr(y_val, lr.predict(new_val)))

                         precision    recall  f1-score   support

     Allergic sinusitis       1.00      1.00      1.00      2136
            Anaphylaxis       0.94      1.00      0.97      3754
                 Chagas       0.93      0.76      0.83      1124
HIV (initial infection)       0.97      0.98      0.97      3852
              Influenza       1.00      0.98      0.99      3590
        Localized edema       0.98      1.00      0.99      3694
     Pulmonary embolism       1.00      0.81      0.89       956
                    SLE       0.97      0.99      0.98      1579
            Sarcoidosis       0.96      0.99      0.98      3028
           Tuberculosis       1.00      0.99      1.00      2007
         Whooping cough       1.00      1.00      1.00       545

               accuracy                           0.97     26265
              macro avg       0.98      0.95      0.96     26265
           weighted avg       0.97      0.97      0.97     26265



Pretty well!

In [None]:
lr.decision_function

array([[-5.07343657e-03, -2.58905124e-02, -3.51715777e-03,
        -3.85523644e-03,  4.57232716e-02, -3.07439381e-03,
        -3.73991759e-03,  3.65924267e-02, -3.97838306e-02,
         3.68885465e-03,  5.52618014e-02, -1.15681427e-02,
        -2.73429836e-04, -3.60371360e-02, -2.11337191e-02,
         1.50891819e-02, -4.15541151e-03, -7.77129046e-03,
        -6.07512022e-03, -1.10815251e-03, -4.81046489e-03,
         4.31906975e-02, -3.08351741e-02, -2.46430905e-02,
        -2.45048049e-02, -1.07806837e-02, -6.66724580e-03,
         8.15042227e-02, -1.94810451e-02,  3.93736084e-04,
        -1.07816040e-02, -1.32240097e-02,  1.45128440e-02,
         2.93163804e-02, -2.92849264e-02,  5.65798467e-02,
         2.54727180e-02, -9.67682508e-03, -2.55158767e-02,
        -7.85132841e-03, -2.53538607e-02,  6.54349274e-02,
        -1.91058015e-02,  4.58447161e-02,  8.50319573e-02,
        -5.03511003e-03, -2.53677770e-03],
       [-2.02456992e-02, -1.28261074e+00,  1.21019913e+00,
         3.22

In [None]:
# Create a DataFrame with feature names and coefficients
coefficients_df = pd.DataFrame({'Feature': new_cols, 'Coefficient': lr.coef_[0]})

# Display the DataFrame
coefficients_df

Unnamed: 0,Feature,Coefficient
0,HIV (initial infection),-0.011551
1,Whooping cough,-0.003758
2,Chagas,0.019939
3,Tuberculosis,-0.00964
4,Influenza,-0.009631
5,SLE,-0.018685
6,Sarcoidosis,-0.006695
7,Anaphylaxis,0.028034
8,Allergic sinusitis,-0.004847
9,Localized edema,0.079138


## Using feature selection

### SVC -> Gradient Boosting

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier

SEED = 42
C = 1/np.sqrt(X_train.shape[0])

pipe = make_pipeline(
    SelectFromModel(
        LinearSVC(
            random_state = SEED,
            max_iter = 10,
            C = C
        ),
        threshold = 'median'
    ),
    GradientBoostingClassifier(random_state = SEED, n_estimators=10)
)


In [None]:
pipe.fit(new_train, y_train)



In [None]:
print(cr(y_val, pipe.predict(new_val)))

                         precision    recall  f1-score   support

     Allergic sinusitis       1.00      1.00      1.00      2136
            Anaphylaxis       0.97      1.00      0.98      3754
                 Chagas       0.99      0.75      0.85      1124
HIV (initial infection)       0.96      0.99      0.98      3852
              Influenza       0.99      0.99      0.99      3590
        Localized edema       1.00      1.00      1.00      3694
     Pulmonary embolism       1.00      0.96      0.98       956
                    SLE       0.99      1.00      0.99      1579
            Sarcoidosis       0.97      1.00      0.99      3028
           Tuberculosis       1.00      1.00      1.00      2007
         Whooping cough       1.00      1.00      1.00       545

               accuracy                           0.98     26265
              macro avg       0.99      0.97      0.98     26265
           weighted avg       0.98      0.98      0.98     26265



In [None]:
pipe.named_steps['gradientboostingclassifier'].feature_importances_

array([1.67967469e-01, 4.16153175e-03, 8.33855980e-04, 5.73154905e-03,
       2.71947271e-02, 8.10618623e-03, 9.60285689e-03, 3.46370063e-03,
       1.72679280e-01, 5.56007800e-02, 1.03530049e-01, 1.76453628e-02,
       1.35813791e-01, 1.66030255e-06, 3.70097449e-04, 1.69959234e-02,
       4.21747867e-02, 1.23054322e-01, 7.51847266e-04, 7.32594873e-04,
       6.49531408e-02, 9.52565187e-03, 8.55394148e-03, 2.05548952e-02])

### SVC -> Logistic

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

SEED = 42
C = 1/np.sqrt(X_train.shape[0])

pipe2 = make_pipeline(
    SelectFromModel(
        LinearSVC(
            random_state = SEED,
            max_iter = 10,
            #C = C
        ),
        max_features = 25
    ),
    LogisticRegression(
        random_state = SEED,
        max_iter = 10,
        solver = 'saga'
    ),
)


In [None]:
pipe2.fit(new_train, y_train)



In [None]:
print(cr(y_val, pipe2.predict(new_val)))

                         precision    recall  f1-score   support

     Allergic sinusitis       1.00      1.00      1.00      2136
            Anaphylaxis       1.00      1.00      1.00      3754
                 Chagas       0.97      0.83      0.89      1124
HIV (initial infection)       0.97      0.99      0.98      3852
              Influenza       0.97      0.99      0.98      3590
        Localized edema       1.00      1.00      1.00      3694
     Pulmonary embolism       1.00      0.99      0.99       956
                    SLE       0.99      0.99      0.99      1579
            Sarcoidosis       0.99      1.00      0.99      3028
           Tuberculosis       0.97      0.72      0.82      2007
         Whooping cough       0.53      1.00      0.69       545

               accuracy                           0.97     26265
              macro avg       0.94      0.96      0.94     26265
           weighted avg       0.98      0.97      0.97     26265



In [None]:
pipe2.named_steps['linearsvc'].feature_importances_

array([1.67967469e-01, 4.16153175e-03, 8.33855980e-04, 5.73154905e-03,
       2.71947271e-02, 8.10618623e-03, 9.60285689e-03, 3.46370063e-03,
       1.72679280e-01, 5.56007800e-02, 1.03530049e-01, 1.76453628e-02,
       1.35813791e-01, 1.66030255e-06, 3.70097449e-04, 1.69959234e-02,
       4.21747867e-02, 1.23054322e-01, 7.51847266e-04, 7.32594873e-04,
       6.49531408e-02, 9.52565187e-03, 8.55394148e-03, 2.05548952e-02])