## Loading the data

In [2]:
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [3]:
DRIVE = '/content/drive/MyDrive/Disease-Prediction/ddx-dataset/'

In [4]:
!pip install git+https://github.com/nina-adhikari/disease_prediction

Collecting git+https://github.com/nina-adhikari/disease_prediction
  Cloning https://github.com/nina-adhikari/disease_prediction to /tmp/pip-req-build-agdmolke
  Running command git clone --filter=blob:none --quiet https://github.com/nina-adhikari/disease_prediction /tmp/pip-req-build-agdmolke
  Resolved https://github.com/nina-adhikari/disease_prediction to commit aaed0a65174aa3d0771a78936378945bcb167bad
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: disease-prediction
  Building wheel for disease-prediction (setup.py) ... [?25l[?25hdone
  Created wheel for disease-prediction: filename=disease_prediction-0.1-py3-none-any.whl size=11270 sha256=c5e21d3ecd46056fac1a4d84e8da3bdf8582f85431bd0a91f27631d96e254803
  Stored in directory: /tmp/pip-ephem-wheel-cache-q6igj9tt/wheels/c1/30/69/a4efc8ebfadf754cf631ddaf3e9e848bd514c4db078acf14f5
Successfully built disease-prediction
Installing collected packages: disease-prediction
Successfully installe

In [5]:
from disease_prediction.data import datasets as ds

In [6]:
SUBSETS = ['train', 'test', 'validate']

In [7]:
df = ds.load_datasets(subsets=SUBSETS, directory=DRIVE)

In [None]:
df['test']

Unnamed: 0_level_0,AGE,SEX,PATHOLOGY,INITIAL_EVIDENCE,sweating,pain,pain_char,pain_somewhere,pain_intensity,pain_precise,...,stridor,patho_endo,breastfed_9,anorexia,new_fatigue,vomiting_cough,coughing_fits,vaccination,cont_pertussis,wheezing_inhale
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
22,10,F,Influenza,cough,0,1,sharp,belly,3,1,...,0,0,0,0,0,0,0,0,0,0
24,79,M,Influenza,sore_throat,1,1,exhausting,temple(R),6,8,...,0,0,0,0,0,0,0,0,0,0
35,20,M,Influenza,fever,0,1,a cramp,pubis,4,5,...,0,0,0,0,0,0,0,0,0,0
38,3,F,Localized edema,pain,0,1,sensitive,temple(L),2,3,...,0,0,0,0,0,0,0,0,0,0
39,87,M,Localized edema,pain,0,1,sharp,sole(L),3,8,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134509,9,M,Localized edema,gained_weight,0,0,,nowhere,0,0,...,0,0,0,0,0,0,0,0,0,0
134520,17,F,Anaphylaxis,nausea,0,0,,nowhere,0,0,...,0,0,0,0,0,0,0,0,0,0
134521,4,F,Influenza,muscle_pain,0,0,,nowhere,0,0,...,0,0,0,0,0,0,0,0,0,0
134522,21,M,Allergic sinusitis,eye_itching,0,0,,nowhere,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
d = {'Y': 1, 'N': 0}

# drop the columns that have a single value in all three datasets and convert Y/N to 1/0
for subset in SUBSETS:
    df[subset].drop(columns=['pain_radiate', 'lesions_peeling'], inplace=True)
    df[subset]['lesion_larger_than_1cm'] = df[subset]['lesion_larger_than_1cm'].map(d)

In [9]:
for subset in SUBSETS:
    df[subset].set_index('index', inplace=True)

## Using fewer samples

### Feature selection

Here are the features we consider "important". We have removed all the questions about antecedents and family history etc

In [10]:
SYMPTOM_PHRASES = {
    'pain_char': 'pain can be characterized as',
    'lesion_color': 'rash color',
    'pain_somewhere': 'pain location',
    'lesion_location': 'rash location',
    'swelling_location': 'swelling location',
    'swollen_nodes': 'swollen lymph nodes',
    'sweating': 'increased sweating',
    'diarrhea': 'diarrhea',
    'pain': 'pain',
    'pain_intensity': 'pain intensity',
    'fever': 'fever presence',
    'lesions': 'skin lesions or rashes',
    'lesion_pain_swollen': 'swollen rash',
    'lesion_pain_intense': 'rash pain intensity',
    'itching_severity': 'itching severity',
    'nausea': 'nausea or vomiting',
    'weight_loss': 'weight loss',
    'itchy_nose': 'itchy nose or throat',
    'eye_itching': 'eye itching',
    'runny_nose': 'nasal congestion or runny nose',
    'short_breath': 'shortness of breath',
    'swelling': 'swelling',
    'lost_consciousness': 'loss of consciousness or fainting',
    'stridor': 'high-pitched breathing sound',
    'cough': 'cough presence',
    'cough_blood': 'coughing up blood',
    'ulcers': 'mouth ulcers or sores',
    'anorexia': 'unintentional weight loss or appetite loss',
    'new_fatigue': 'new fatigue or muscle aches',
    'convulsion': 'muscle contractions or absence episodes',
    'red_eye': 'redness in eyes',
    'gained_weight': 'recent weight gain',
    'dizziness': 'lightheadedness or faint feeling',
    'wheezing_exhale': 'wheezing on exhale',
    'fatigue_ext': 'extreme fatigue affecting activities',
    'sore_throat': 'sore throat',
    'muscle_pain': 'diffuse muscle pain',
    'lost_appetite': 'loss of appetite or early fullness',
    'vomiting_cough': 'vomiting after coughing',
    'coughing_fits': 'intense coughing fits',
    'chills': 'chills or shivers',
    'vag_discharge': 'vaginal discharge',
    'wheezing_inhale': 'wheezing on inhale or after coughing',
    'fatigue': 'constant fatigue or non-restful sleep',
    'confusion': 'confusion or disorientation',
    'bruising': 'unusual bleeding or bruising',
    'contact_allergy': 'had an allergic reaction',
}

In [11]:
main_cols = ['AGE', 'SEX', 'PATHOLOGY', 'INITIAL_EVIDENCE', ]

In [12]:
cols_to_keep = list(SYMPTOM_PHRASES.keys()) + main_cols

Drop the columns we don't like:

In [13]:
for subset in SUBSETS:
    for col in df[subset].columns:
        if col not in cols_to_keep:
            df[subset].drop(columns=[col], inplace=True)

In [13]:
df['test'].columns

Index(['AGE', 'SEX', 'PATHOLOGY', 'INITIAL_EVIDENCE', 'sweating', 'pain',
       'pain_char', 'pain_somewhere', 'pain_intensity', 'fatigue_ext', 'fever',
       'sore_throat', 'lesions', 'lesion_color', 'lesion_pain_swollen',
       'lesion_location', 'lesion_pain_intense', 'itching_severity',
       'muscle_pain', 'lost_appetite', 'cough', 'runny_nose', 'gained_weight',
       'swelling', 'swelling_location', 'swollen_nodes', 'diarrhea',
       'weight_loss', 'convulsion', 'short_breath', 'itchy_nose',
       'eye_itching', 'cough_blood', 'fatigue', 'ulcers', 'red_eye',
       'vag_discharge', 'nausea', 'contact_allergy', 'wheezing_exhale',
       'dizziness', 'lost_consciousness', 'chills', 'stridor', 'anorexia',
       'new_fatigue', 'vomiting_cough', 'coughing_fits', 'wheezing_inhale'],
      dtype='object')

In [14]:
def get_numerical_features(data_frame):
    columns = []
    for col in data_frame.columns:
        if data_frame[col].dtype == 'int64' and set(data_frame[col].unique()) != set([0,1]):
            columns.append(col)
    return columns

In [15]:
def get_boolean_features(data_frame):
    columns = []
    numericals = get_numerical_features(data_frame)
    for col in data_frame.columns:
        if data_frame[col].dtype == 'int64' and set(data_frame[col].unique()) == set([0,1]):
            columns.append(col)
    return columns


Create a new column which contains a list of all the symptoms that are not initial evidence:

In [16]:
from collections import defaultdict

def get_keys_with_value_one(dictionary):
    keys = []
    for key, val in dictionary.items():
        if val == 1:
            keys.append(key)
    return keys

def get_dict_from_booleans(data_frame):
    dfq = defaultdict(list)
    bools = get_boolean_features(data_frame)
    for (key, val) in get_keys_with_value_one(data_frame[bools].stack()):
        dfq[key].append(val)

    return dict(dfq)

In [17]:
for subset in SUBSETS:
    df[subset]['OTHER_SYMPTOMS'] = get_dict_from_booleans(df[subset])

In [18]:
df['test']

Unnamed: 0_level_0,AGE,SEX,PATHOLOGY,INITIAL_EVIDENCE,sweating,pain,pain_char,pain_somewhere,pain_intensity,fatigue_ext,...,dizziness,lost_consciousness,chills,stridor,anorexia,new_fatigue,vomiting_cough,coughing_fits,wheezing_inhale,OTHER_SYMPTOMS
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
22,10,F,Influenza,cough,0,1,sharp,belly,3,0,...,1,1,0,0,0,0,0,0,0,"[pain, lesions, swelling, diarrhea, short_brea..."
24,79,M,Influenza,sore_throat,1,1,exhausting,temple(R),6,0,...,0,0,0,0,0,0,0,0,0,"[sweating, pain, fever, lesions, muscle_pain, ..."
35,20,M,Influenza,fever,0,1,a cramp,pubis,4,0,...,1,1,0,0,0,0,0,0,0,"[pain, lesions, swelling, diarrhea, short_brea..."
38,3,F,Localized edema,pain,0,1,sensitive,temple(L),2,1,...,0,0,0,0,0,0,0,0,0,"[pain, fatigue_ext, fever, lesions, swollen_no..."
39,87,M,Localized edema,pain,0,1,sharp,sole(L),3,0,...,0,0,0,0,0,0,0,0,0,"[pain, gained_weight, swelling]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134509,9,M,Localized edema,gained_weight,0,0,,nowhere,0,0,...,0,0,0,0,0,0,0,0,0,
134520,17,F,Anaphylaxis,nausea,0,0,,nowhere,0,0,...,0,0,0,0,0,0,0,0,0,
134521,4,F,Influenza,muscle_pain,0,0,,nowhere,0,0,...,0,0,0,0,0,0,0,0,0,
134522,21,M,Allergic sinusitis,eye_itching,0,0,,nowhere,0,0,...,0,0,0,0,0,0,0,0,0,


In [18]:
irrelevant = [0, 'NA', 'nowhere', 'N', 'not specified']

main_cols += ['OTHER_SYMPTOMS']

### Sample selection

In [19]:
def get_dict_from_row(row):
    d = {}
    for (key, val) in dict(row).items():
        if key not in main_cols and val not in irrelevant:
            d[key] = val
    if type(row['OTHER_SYMPTOMS']) == list:
        ns = row['OTHER_SYMPTOMS'] + [d]
    elif d == {}:
        ns = []
    else:
        ns = [d]
    return ns

In [20]:
for subset in SUBSETS:
    df[subset]['NEW_SYMPTOMS'] = df[subset].apply(get_dict_from_row, axis=1)

In [22]:
df['train']

Unnamed: 0_level_0,AGE,SEX,PATHOLOGY,INITIAL_EVIDENCE,swollen_nodes,sweating,diarrhea,pain,pain_char,pain_somewhere,...,muscle_pain,lost_appetite,vomiting_cough,coughing_fits,chills,vag_discharge,wheezing_inhale,fatigue,OTHER_SYMPTOMS,NEW_SYMPTOMS
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,21,M,HIV (initial infection),sweating,0,0,0,0,,nowhere,...,0,0,0,0,0,0,0,0,"[itchy_nose, eye_itching, runny_nose]","[itchy_nose, eye_itching, runny_nose, {'itchy_..."
10,8,M,Allergic sinusitis,itchy_nose,0,0,0,1,heavy,sole(L),...,0,0,0,0,0,0,0,0,"[pain, swelling]","[pain, swelling, {'pain': 1, 'pain_char': 'hea..."
13,49,F,Anaphylaxis,lost_consciousness,0,0,0,0,,nowhere,...,0,0,0,0,0,0,0,0,"[eye_itching, runny_nose]","[eye_itching, runny_nose, {'eye_itching': 1, '..."
18,69,M,Tuberculosis,cough,0,0,0,1,heavy,occiput,...,1,1,0,0,0,0,0,0,"[pain, fever, lesions, runny_nose, cough, fati...","[pain, fever, lesions, runny_nose, cough, fati..."
19,30,F,Tuberculosis,cough_blood,0,0,0,1,sharp,epigastric,...,0,0,0,0,0,0,0,0,"[pain, lesions, nausea, contact_allergy, short...","[pain, lesions, nausea, contact_allergy, short..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1023490,30,M,HIV (initial infection),nausea,0,0,0,0,,nowhere,...,0,0,0,0,0,0,0,0,,[]
1023491,7,F,HIV (initial infection),nausea,0,0,0,0,,nowhere,...,0,0,0,0,0,0,0,0,,[]
1023492,66,F,HIV (initial infection),pain,0,0,0,0,,nowhere,...,0,0,0,0,0,0,0,0,,[]
1023493,54,M,HIV (initial infection),sweating,0,0,0,0,,nowhere,...,0,0,0,0,0,0,0,0,,[]


In [21]:
for subset in SUBSETS:
    df[subset].drop(columns=['OTHER_SYMPTOMS'], inplace=True)
    df[subset].rename(columns={'NEW_SYMPTOMS': 'OTHER_SYMPTOMS'}, inplace=True)

Keep only those samples that have at least one symptom from our list of important symptoms:

In [352]:
X = {}
y = {}

for subset in SUBSETS:
    X[subset] = df[subset].loc[df[subset]['OTHER_SYMPTOMS'].astype(bool)].drop(columns=['PATHOLOGY'])
    y[subset] = df[subset].loc[df[subset]['OTHER_SYMPTOMS'].astype(bool)]['PATHOLOGY'].copy()

In [353]:
for subset in SUBSETS:
    X[subset] = X[subset].drop(columns=['OTHER_SYMPTOMS'])

### Prep for sklearn algorithms

In [None]:
import numpy as np
import pandas as pd

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report as cr
from sklearn.feature_selection import SelectFromModel

SEED = 42

In [None]:
CATEGORICAL_FEATURES = [col for col in X['train'].columns if X['train'][col].dtype == 'object']

CATEGORICAL_FEATURES

['SEX',
 'INITIAL_EVIDENCE',
 'pain_char',
 'pain_somewhere',
 'lesion_color',
 'lesion_location',
 'swelling_location']

In [None]:
NUMERICAL_FEATURES = [col for col in X['train'].columns if (set(X['train'][col].unique()) != set([0,1])) and (X['train'][col].dtype != 'object')]

NUMERICAL_FEATURES

['AGE',
 'pain_intensity',
 'lesion_pain_swollen',
 'lesion_pain_intense',
 'itching_severity']

### Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(
    #max_iter=10,
    solver='saga',
    class_weight='balanced',
    penalty='l2',
    l1_ratio=0.1,
    random_state=SEED
)

lr_pipeline = make_pipeline(
    ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), NUMERICAL_FEATURES),
            ('cat', OneHotEncoder(handle_unknown='ignore'), CATEGORICAL_FEATURES)
        ],
        remainder='passthrough'
    ),
    lr
)

In [None]:
lr_pipeline.fit(X['train'], y['train'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
print(cr(y['validate'], lr_pipeline.predict(X['validate'])))

                         precision    recall  f1-score   support

     Allergic sinusitis       0.81      0.86      0.83       399
            Anaphylaxis       0.94      0.40      0.57       717
                 Chagas       0.26      0.40      0.32       228
HIV (initial infection)       0.50      0.13      0.21       733
              Influenza       0.63      0.38      0.47       731
        Localized edema       0.53      0.91      0.67       728
                    SLE       0.35      0.64      0.45       311
            Sarcoidosis       0.61      0.63      0.62       547
           Tuberculosis       0.49      0.84      0.62       405
         Whooping cough       1.00      1.00      1.00       111

               accuracy                           0.56      4910
              macro avg       0.61      0.62      0.58      4910
           weighted avg       0.62      0.56      0.54      4910



### Gradient boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.utils.class_weight import compute_sample_weight

weighted_y = {}

for subset in SUBSETS:
    weighted_y[subset] = compute_sample_weight(class_weight='balanced', y=y[subset])

gb = GradientBoostingClassifier(
    random_state = SEED,
    #n_estimators=10,
    loss='log_loss',
    #criterion= 'friedman_mse',
    max_depth=10,
    #learning_rate=0.1,
    #min_samples_leaf=5,
    #min_samples_split=5,
    #max_features='sqrt',
    #min_weight_fraction_leaf=0.5,
    #subsample=1,
)

gb_pipeline = make_pipeline(
    ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), NUMERICAL_FEATURES),
            ('cat', OneHotEncoder(handle_unknown='ignore'), CATEGORICAL_FEATURES)
        ],
        remainder='passthrough'
    ),
    gb
)

In [None]:
gb_pipeline.fit(X['train'], y['train'], gradientboostingclassifier__sample_weight=weighted_y['train'])

In [None]:
print(cr(y['validate'], gb_pipeline.predict(X['validate'])))

                         precision    recall  f1-score   support

     Allergic sinusitis       0.70      0.92      0.79       399
            Anaphylaxis       0.67      0.50      0.57       717
                 Chagas       0.32      0.31      0.31       228
HIV (initial infection)       0.45      0.34      0.39       733
              Influenza       0.57      0.37      0.45       731
        Localized edema       0.55      0.78      0.64       728
                    SLE       0.45      0.55      0.50       311
            Sarcoidosis       0.55      0.62      0.58       547
           Tuberculosis       0.48      0.55      0.51       405
         Whooping cough       1.00      1.00      1.00       111

               accuracy                           0.55      4910
              macro avg       0.57      0.59      0.57      4910
           weighted avg       0.55      0.55      0.54      4910



### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    criterion='entropy',
    #n_estimators = 500, # number of trees in ensemble
    max_depth = 50, # max_depth of each tree
    #min_samples_leaf = 5,
    bootstrap= True, # sampling with replacement
    max_samples = 2500, # number of training samples selected with replacement to build tree
    random_state = SEED, # for consistency
    class_weight='balanced_subsample'
)

rf_pipeline = make_pipeline(
    ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), NUMERICAL_FEATURES),
            ('cat', OneHotEncoder(handle_unknown='ignore'), CATEGORICAL_FEATURES)
        ],
        remainder='passthrough'
    ),
    rf
)

In [None]:
rf_pipeline.fit(X['train'], y['train'])

In [None]:
print(cr(y['validate'], rf_pipeline.predict(X['validate'])))

                         precision    recall  f1-score   support

     Allergic sinusitis       0.64      0.98      0.77       399
            Anaphylaxis       0.65      0.52      0.58       717
                 Chagas       0.98      0.20      0.33       228
HIV (initial infection)       0.44      0.46      0.45       733
              Influenza       0.61      0.37      0.46       731
        Localized edema       0.53      0.98      0.69       728
                    SLE       0.76      0.38      0.50       311
            Sarcoidosis       0.60      0.62      0.61       547
           Tuberculosis       0.58      0.44      0.50       405
         Whooping cough       1.00      1.00      1.00       111

               accuracy                           0.58      4910
              macro avg       0.68      0.59      0.59      4910
           weighted avg       0.61      0.58      0.56      4910



### Neural networks

In [24]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [None]:
!pip install datasets

In [26]:
from datasets import Dataset, load_dataset

In [238]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

y_labeled = {}

for subset in SUBSETS:
    y_labeled[subset] = le.fit_transform(y[subset])

In [354]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder()

for subset in SUBSETS:
    y[subset] = ohe.fit_transform(y[subset].values.reshape(-1, 1)).toarray()

In [None]:
y['train']

In [110]:
def df_to_dataset(X, y, shuffle=True, batch_size=32):
  df = X.copy()
  labels = y.copy()
  df = {key: value.values[:,tf.newaxis] for key, value in df.items()}
  ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(df))
  ds = ds.batch(batch_size)
  ds = ds.prefetch(batch_size)
  return ds

In [375]:
ds = {}
for subset in SUBSETS:
    ds[subset] = df_to_dataset(X[subset], y[subset])

In [291]:
NUMERICALS = get_numerical_features(X['train'])

NUMERICALS

['AGE',
 'pain_intensity',
 'lesion_pain_swollen',
 'lesion_pain_intense',
 'itching_severity']

In [292]:
CATEGORICALS = [col for col in X['train'].columns if X['train'][col].dtype == 'object']

CATEGORICALS

['SEX',
 'INITIAL_EVIDENCE',
 'pain_char',
 'pain_somewhere',
 'lesion_color',
 'lesion_location',
 'swelling_location']

In [113]:
def get_normalization_layer(name, dataset):
  # Create a Normalization layer for the feature.
  normalizer = tf.keras.layers.Normalization(axis=None)

  # Prepare a Dataset that only yields the feature.
  feature_ds = dataset.map(lambda x, y: x[name])

  # Learn the statistics of the data.
  normalizer.adapt(feature_ds)

  return normalizer

In [115]:
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
  # Create a layer that turns strings into integer indices.
  if dtype == 'string':
    index = tf.keras.layers.StringLookup(max_tokens=max_tokens)
  # Otherwise, create a layer that turns integer values into integer indices.
  else:
    index = tf.keras.layers.IntegerLookup(max_tokens=max_tokens)

  # Prepare a `tf.data.Dataset` that only yields the feature.
  feature_ds =  dataset.map(lambda x, y: x[name])

  # Learn the set of possible values and assign them a fixed integer index.
  index.adapt(feature_ds)

  # Encode the integer indices.
  encoder = tf.keras.layers.CategoryEncoding(num_tokens=index.vocabulary_size())

  # Apply multi-hot encoding to the indices. The lambda function captures the
  # layer, so you can use them, or include them in the Keras Functional model later.
  return lambda feature: encoder(index(feature))

In [376]:
inputs_not_converted = ['swollen_nodes', 'sweating', 'diarrhea', 'pain', 'fever', 'lesions', 'nausea', 'weight_loss', 'itchy_nose', 'eye_itching', 'runny_nose', 'contact_allergy', 'short_breath', 'swelling', 'lost_consciousness', 'stridor', 'cough', 'cough_blood', 'ulcers', 'anorexia', 'new_fatigue', 'convulsion', 'red_eye', 'gained_weight', 'dizziness', 'wheezing_exhale', 'fatigue_ext', 'sore_throat', 'muscle_pain', 'lost_appetite', 'vomiting_cough', 'coughing_fits', 'chills', 'vag_discharge', 'wheezing_inhale', 'fatigue']
all_inputs = []
encoded_features = []

for header in inputs_not_converted:
    numeric_col = tf.keras.Input(shape=(1,), name=header)
    all_inputs.append(numeric_col)
    encoded_features.append(numeric_col)

In [377]:
# Numerical features.
for header in NUMERICALS:
    numeric_col = tf.keras.Input(shape=(1,), name=header)
    normalization_layer = get_normalization_layer(header, ds['train'])
    encoded_numeric_col = normalization_layer(numeric_col)
    all_inputs.append(numeric_col)
    encoded_features.append(encoded_numeric_col)

In [378]:
for feature in CATEGORICALS:
    categorical_col = tf.keras.Input(shape=(1,), name=feature, dtype='string')
    encoding_layer = get_category_encoding_layer(
        name=feature,
        dataset=ds['train'],
        dtype='string',
        #max_tokens=5
        )
    encoded_categorical_col = encoding_layer(categorical_col)
    all_inputs.append(categorical_col)
    encoded_features.append(encoded_categorical_col)

In [365]:
from sklearn.utils import compute_class_weight

class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_labeled['train']), y=y_labeled['train'])

In [403]:
history = []

In [562]:
all_features = tf.keras.layers.concatenate(encoded_features)
x = tf.keras.layers.Dense(32, activation="relu")(all_features)
x = tf.keras.layers.Dropout(0.5)(x)
x = tf.keras.layers.Dense(32, activation="relu")(x)
x = tf.keras.layers.Dropout(0.5)(x)
#x = tf.keras.layers.Dense(32, activation="selu")(x)
#x = tf.keras.layers.Dropout(0.5)(x)

# If using one-hot encoding
output = tf.keras.layers.Dense(10)(x)

# If using label encoding
#output = tf.keras.layers.Dense(1)(x)

model = tf.keras.Model(all_inputs, output)

In [563]:
optimizer = tf.keras.optimizers.Adam(
    #learning_rate=0.001,
    #epsilon=0.0001
)

# If using one-hot encoding
cross_entropy_loss = tf.keras.losses.CategoricalFocalCrossentropy(
    from_logits=True,
    gamma=2,
    alpha=class_weights
)

# If using label encoding
sparse_cross_entropy_loss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
    #label_smoothing=0.01
)

hinge_loss = tf.keras.losses.Hinge()

squared_hinge_loss = tf.keras.losses.SquaredHinge()

accuracy = tf.keras.metrics.CategoricalAccuracy()
precision = tf.keras.metrics.Precision()
recall = tf.keras.metrics.Recall()
f1 = tf.keras.metrics.F1Score()
patr = tf.keras.metrics.PrecisionAtRecall(recall=0.6)
ratp = tf.keras.metrics.RecallAtPrecision(precision=0.6)

metrics = [
    accuracy,
    precision,
    recall,
    #patr,
    #ratp,
    f1
    ]

In [564]:
model.compile(
    optimizer=optimizer,
    loss=cross_entropy_loss,
    metrics=metrics,
    loss_weights=class_weights)

history.append(model.fit(ds['train'], epochs=4, validation_data=ds['validate']))

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [554]:
model.compile(
    optimizer=optimizer,
    loss=squared_hinge_loss,
    metrics=metrics,
    loss_weights=class_weights)

history.append(model.fit(ds['train'], epochs=2, validation_data=ds['validate']))

Epoch 1/2
Epoch 2/2


In [590]:
for his in history[-8:]:
    print()
    total = len(layers)-69
    print(his.model.get_compile_config()['loss']['class_name'])
    layers = his.model.get_config()['layers']
    print('Total layers: ', total)
    print('\tName: ', layers[-3]['config']['name'].split('_')[0], 'Activation: ', layers[-3]['config']['activation'])
    print('\tName: ', layers[-2]['config']['name'].split('_')[0], 'Rate: ', layers[-2]['config']['rate'])
    #if total > 2:
        #try:
            #print('Name: ', layers[-4]['config']['name'].split('_')[0], 'Rate: ', layers[-4]['config']['rate'])
        #except:
            #print('Name: ', layers[-4]['config']['name'].split('_')[0], 'Activation: ', layers[-4]['config']['activation'])
        #print('Name: ', layers[-5]['config']['name'].split('_')[0], 'Activation: ', layers[-5]['config']['activation'])
    results = his.model.get_metrics_result()
    for result in results:
        score = results[result].numpy()
        print(str(result) + ': ' + str(score))



SquaredHinge
Total layers:  2
	Name:  dense Activation:  selu
	Name:  dropout Rate:  0.5
loss: 0.25044158
categorical_accuracy: 0.56904274
precision_55: 0.96605164
recall_55: 0.2665988
f1_score: [0.76783913 0.5468384  0.32967034 0.29251102 0.48408714 0.6828774
 0.5068493  0.641193   0.5074257  1.        ]

SquaredHinge
Total layers:  2
	Name:  dense Activation:  relu
	Name:  dropout Rate:  0.5
loss: 0.24657348
categorical_accuracy: 0.58350307
precision_56: 0.9689796
recall_56: 0.24175152
f1_score: [0.7584216  0.581265   0.3284672  0.3982558  0.48263893 0.68299586
 0.49122804 0.6316695  0.5110822  1.        ]

SquaredHinge
Total layers:  2
	Name:  dense Activation:  relu
	Name:  dropout Rate:  0.5
loss: 0.24657348
categorical_accuracy: 0.58350307
precision_56: 0.9689796
recall_56: 0.24175152
f1_score: [0.7584216  0.581265   0.3284672  0.3982558  0.48263893 0.68299586
 0.49122804 0.6316695  0.5110822  1.        ]

CategoricalFocalCrossentropy
Total layers:  2
	Name:  dense Activation:  

## Binary classification

In [None]:
y['train'].map(lambda x: 1 if x == 'SLE' else 0).unique()

array([0, 1])

In [None]:
y['train'].unique()

array(['HIV (initial infection)', 'Allergic sinusitis', 'Anaphylaxis',
       'Tuberculosis', 'SLE', 'Chagas', 'Localized edema', 'Sarcoidosis',
       'Influenza', 'Whooping cough'], dtype=object)

In [None]:
y_i = []
diseases = list(df['train']['PATHOLOGY'].unique())

for disease in diseases:
    y_i_curr = {}
    for subset in SUBSETS:
        y_i_curr[subset] = y[subset].map(lambda x: x if x == disease else 'Other')
    y_i.append(y_i_curr)

In [None]:
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import f1_score as f1
import numpy as np


np.set_printoptions(precision=2)

scores = {}

lr_scores = []

for i in range(len(diseases)):
    disease = diseases[i]
    lr = LogisticRegression(max_iter=10, solver='saga')
    lr_pipeline = make_pipeline(
        ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), NUMERICAL_FEATURES),
                ('cat', OneHotEncoder(handle_unknown='ignore'), CATEGORICAL_FEATURES)
            ],
            remainder='passthrough'
        ),
        lr
    )
    lr_pipeline.fit(X['train'], y_i[i]['train'])
    score = f1(y_i[i]['validate'], lr_pipeline.predict(X['validate']), pos_label='Other')
    lr_scores.append(score)

scores['LogisticRegression'] = lr_scores


In [None]:
from sklearn.ensemble import RandomForestClassifier

rf_scores = []

for i in range(len(diseases)):
    disease = diseases[i]
    rf = RandomForestClassifier(n_estimators=100, random_state=SEED, max_depth=20, min_samples_leaf=5)

    rf_pipeline = make_pipeline(
        ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), NUMERICAL_FEATURES),
                ('cat', OneHotEncoder(handle_unknown='ignore'), CATEGORICAL_FEATURES)
            ],
            remainder='passthrough'
        ),
        rf
    )
    rf_pipeline.fit(X['train'], y_i[i]['train'])
    score = f1(y_i[i]['validate'], rf_pipeline.predict(X['validate']), pos_label='Other')
    rf_scores.append(score)

scores['RandomForest'] = rf_scores

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

gb_scores = []

for i in range(len(diseases)):
    disease = diseases[i]
    gb = GradientBoostingClassifier(random_state = SEED, n_estimators=10)
    gb_pipeline = make_pipeline(
        ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), NUMERICAL_FEATURES),
                ('cat', OneHotEncoder(handle_unknown='ignore'), CATEGORICAL_FEATURES)
            ],
            remainder='passthrough'
        ),
        gb
    )
    gb_pipeline.fit(X['train'], y_i[i]['train'])
    score = f1(y_i[i]['validate'], gb_pipeline.predict(X['validate']), pos_label='Other')
    gb_scores.append(score)

scores['GradientBoosting'] = gb_scores

In [None]:
from sklearn.ensemble import AdaBoostClassifier

ada_scores = []

for i in range(len(diseases)):
    disease = diseases[i]
    ada = AdaBoostClassifier(random_state = SEED, n_estimators=10, estimator=RandomForestClassifier(random_state = SEED, n_estimators=10))
    ada_pipeline = make_pipeline(
        ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), NUMERICAL_FEATURES),
                ('cat', OneHotEncoder(handle_unknown='ignore'), CATEGORICAL_FEATURES)
            ],
            remainder='passthrough'
        ),
        ada
    )
    ada_pipeline.fit(X['train'], y_i[i]['train'])
    score = f1(y_i[i]['validate'], ada_pipeline.predict(X['validate']), pos_label='Other')
    ada_scores.append(score)

scores['AdaBoost'] = ada_scores

In [None]:
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

xgb_scores = []

for i in range(len(diseases)):
    print(str(i) + '/' + str(len(diseases)))
    disease = diseases[i]
    xgb = XGBClassifier(random_state = SEED, n_estimators=10)
    xgb_pipeline = make_pipeline(
        ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), NUMERICAL_FEATURES),
                ('cat', OneHotEncoder(handle_unknown='ignore'), CATEGORICAL_FEATURES)
            ],
            remainder='passthrough'
        ),
        xgb
    )
    le = LabelEncoder()
    xgb_pipeline.fit(X['train'], le.fit_transform(y_i[i]['train']))
    score = f1(le.transform(y_i[i]['validate']), xgb_pipeline.predict(X['validate']), pos_label=le.transform(['Other'])[0])
    xgb_scores.append(score)

scores['XGBoost'] = xgb_scores

0/10
1/10
2/10
3/10
4/10
5/10
6/10
7/10
8/10
9/10


In [None]:
scores

{'LogisticRegression': [0.9214043035107587,
  0.9801946962067808,
  0.9504366564591131,
  0.9657704351581903,
  0.9781984473040518,
  0.9808316748716874,
  0.9549444119368052,
  0.960373998219056,
  0.9305253519514707,
  1.0],
 'RandomForest': [0.9194312796208531,
  0.9863711911357341,
  0.9507936507936507,
  0.9661162341840018,
  0.9781984473040518,
  0.9808316748716874,
  0.9552203905062552,
  0.96158940397351,
  0.9335269802256732,
  1.0],
 'GradientBoosting': [0.9193353141850995,
  0.9862619100376689,
  0.9436255204230899,
  0.9661162341840018,
  0.9781984473040518,
  0.9808316748716874,
  0.9549065420560747,
  0.9618606701940036,
  0.9332291201429209,
  1.0],
 'AdaBoost': [0.9157165213503242,
  0.9817052888346823,
  0.9349566001578176,
  0.9610556807209527,
  0.9693693693693695,
  0.9761556159799205,
  0.9437340153452686,
  0.9510073764174832,
  0.9231976678999888,
  1.0],
 'XGBoost': [0.9209827357237715,
  0.9845435338596686,
  0.9507936507936507,
  0.9661856558259746,
  0.978198

In [None]:
scores_df = pd.DataFrame(scores).transpose().rename(columns=dict(zip(range(10), diseases)))

In [None]:
scores_df

Unnamed: 0,HIV (initial infection),Allergic sinusitis,Anaphylaxis,Tuberculosis,SLE,Chagas,Localized edema,Sarcoidosis,Influenza,Whooping cough
LogisticRegression,0.921404,0.980195,0.950437,0.96577,0.978198,0.980832,0.954944,0.960374,0.930525,1.0
RandomForest,0.919431,0.986371,0.950794,0.966116,0.978198,0.980832,0.95522,0.961589,0.933527,1.0
GradientBoosting,0.919335,0.986262,0.943626,0.966116,0.978198,0.980832,0.954907,0.961861,0.933229,1.0
AdaBoost,0.915717,0.981705,0.934957,0.961056,0.969369,0.976156,0.943734,0.951007,0.923198,1.0
XGBoost,0.920983,0.984544,0.950794,0.966186,0.978198,0.980725,0.954253,0.962765,0.932243,1.0


## Using few features

### Age, sex and initial evidence

In [None]:
for subset in SUBSETS:
    df[subset] = df[subset][['AGE', 'SEX', 'INITIAL_EVIDENCE', 'PATHOLOGY']]

In [None]:
df['train']

Unnamed: 0,AGE,SEX,INITIAL_EVIDENCE,PATHOLOGY
0,21,M,sweating,HIV (initial infection)
1,8,M,itchy_nose,Allergic sinusitis
2,49,F,lost_consciousness,Anaphylaxis
3,69,M,cough,Tuberculosis
4,30,F,cough_blood,Tuberculosis
...,...,...,...,...
202285,30,M,nausea,HIV (initial infection)
202286,7,F,nausea,HIV (initial infection)
202287,66,F,pain,HIV (initial infection)
202288,54,M,sweating,HIV (initial infection)


In [None]:
X = {}
y = {}

for subset in SUBSETS:
    X[subset] = df[subset].drop(columns=['PATHOLOGY'])
    y[subset] = df[subset]['PATHOLOGY'].copy()

In [None]:
import pandas as pd
import numpy as np

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report as cr

In [None]:
from sklearn.linear_model import LogisticRegression

solver = 'saga'
max_iter = 10

lr = LogisticRegression(max_iter=max_iter, solver=solver)

lr_pipe = make_pipeline(
    ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), ['AGE']),
            ('cat', OneHotEncoder(handle_unknown='ignore'), ['SEX', 'INITIAL_EVIDENCE'])
        ],
        remainder='passthrough'
    ),
    lr
)

In [None]:
lr_pipe.fit(X['train'], y['train'])



In [None]:
print(cr(y['validate'], lr_pipe.predict(X['validate'])))

                         precision    recall  f1-score   support

     Allergic sinusitis       0.72      0.93      0.81      2136
            Anaphylaxis       0.98      0.40      0.57      3754
                 Chagas       1.00      0.22      0.37      1124
HIV (initial infection)       0.38      0.55      0.45      3852
              Influenza       0.66      0.33      0.44      3590
        Localized edema       0.52      1.00      0.68      3694
                    SLE       1.00      0.32      0.48      1579
            Sarcoidosis       0.60      0.72      0.65      3028
           Tuberculosis       0.62      0.51      0.56      2007
         Whooping cough       1.00      1.00      1.00       545

               accuracy                           0.59     25309
              macro avg       0.75      0.60      0.60     25309
           weighted avg       0.68      0.59      0.57     25309



### Just initial evidence

In [None]:
X_train_one = {}

for subset in SUBSETS:
    X_train_one[subset] = df[subset][['INITIAL_EVIDENCE']].copy()

In [None]:
from sklearn.linear_model import LogisticRegression

solver = 'saga'
max_iter = 10

lr_one = LogisticRegression(max_iter=max_iter, solver=solver)

lr_one_pipe = make_pipeline(
    ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), ['INITIAL_EVIDENCE'])
        ],
        remainder='passthrough'
    ),
    lr_one
)

In [None]:
lr_one_pipe.fit(X_train_one['train'], y['train'])



In [None]:
print(cr(y['validate'], lr_one_pipe.predict(X_train_one['validate'])))

                         precision    recall  f1-score   support

     Allergic sinusitis       0.86      0.83      0.85      2136
            Anaphylaxis       0.81      0.47      0.60      3754
                 Chagas       1.00      0.22      0.37      1124
HIV (initial infection)       0.35      0.39      0.37      3852
              Influenza       0.61      0.41      0.49      3590
        Localized edema       0.52      1.00      0.68      3694
                    SLE       1.00      0.32      0.48      1579
            Sarcoidosis       0.60      0.72      0.65      3028
           Tuberculosis       0.57      0.66      0.61      2007
         Whooping cough       1.00      1.00      1.00       545

               accuracy                           0.59     25309
              macro avg       0.73      0.60      0.61     25309
           weighted avg       0.65      0.59      0.58     25309



## Older Models

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
def getXy(dataframe):
    temp = dataframe.drop(dataframe.loc[dataframe['PATHOLOGY'] == 'Ebola'].index)
    return temp.drop(columns=['PATHOLOGY']), temp['PATHOLOGY'].copy()

In [None]:
X_train, y_train = getXy(df['train'])

X_validate, y_validate = getXy(df['validate'])

In [None]:
y_validate

2                  Influenza
6         Allergic sinusitis
12                       SLE
15                 Influenza
19                 Influenza
                 ...        
132421          Tuberculosis
132423             Influenza
132425             Influenza
132433           Sarcoidosis
132441             Influenza
Name: PATHOLOGY, Length: 25309, dtype: object

### Preprocessing and Utilities

In [None]:
categories = [
    [
        'eye_itching', 'fever', 'chills', 'lesions', 'new_fatigue', 'diarrhea', 'cough', 'fatigue_ext', 'bruising', 'dizziness', 'runny_nose', 'coughing_fits', 'convulsion', 'confusion', 'pain', 'weight_loss', 'wheezing_exhale', 'lost_consciousness', 'red_eye', 'fatigue', 'ulcers', 'nausea', 'cough_blood', 'short_breath', 'sore_throat', 'sweating', 'muscle_pain', 'vag_discharge', 'contact_allergy', 'swelling', 'vomiting_cough', 'anorexia', 'lost_appetite', 'wheezing_inhale', 'swollen_nodes', 'stridor', 'gained_weight', 'itchy_nose'
    ],
    ['M', 'F'],
    [
        'Europe', 'South America', 'South Africa', 'Central America', 'West Africa', 'North America', 'Asia', 'N'
    ],
    ['sensitive', 'burning', 'sharp', 'a pulse', 'a knife stroke', 'NA', 'a cramp', 'heavy', 'tugging', 'exhausting', 'tedious'],
    ['pink', 'red', 'pale', 'NA'],
    [
        'temple(R)', 'side of the neck(R)', 'shoulder(R)', 'knee(R)', 'iliac fossa(L)', 'cervical spine', 'side of the neck(L)', 'occiput', 'toe (3)(R)', 'dorsal aspect of the foot(L)', 'dorsal aspect of the foot(R)', 'hip(R)', 'sole(L)', 'finger (index)(L)', 'calf(R)', 'thigh(R)', 'flank(L)', 'finger (middle)(R)', 'toe (1)(R)', 'toe (1)(L)', 'top of the head', 'dorsal aspect of the wrist(L)', 'pharynx', 'nowhere', 'hypochondrium(R)', 'palmar face of the wrist(R)', 'shoulder(L)', 'palmar face of the wrist(L)', 'thumb(R)', 'toe (2)(L)', 'toe (3)(L)', 'thigh(L)', 'knee(L)', 'little toe (4)(R)', 'forehead', 'hypochondrium(L)', 'temple(L)', 'pubis', 'toe (2)(R)', 'back of the neck', 'iliac fossa(R)', 'hip(L)', 'dorsal aspect of the wrist(R)', 'calf(L)', 'belly', 'finger (index)(R)', 'epigastric', 'sole(R)', 'flank(R)', 'back of head', 'finger (middle)(L)'
    ],
    [
        'nose', 'side of the neck(R)', 'palace', 'shoulder(R)', 'thyroid cartilage', 'cervical spine', 'side of the neck(L)', 'labia minora(L)', 'internal cheek(R)', 'thigh(R)', 'nowhere', 'internal cheek(L)', 'ankle(R)', 'shoulder(L)', 'penis', 'vagina', 'lumbar spine', 'ankle(L)', 'labia minora(R)', 'thigh(L)', 'under the tongue', 'forehead', 'upper lip(R)', 'cheek(R)', 'bottom lip(R)', 'back of the neck', 'scrotum', 'cheek(L)', 'epigastric', 'thoracic spine'
    ],
    [
        'nose', 'toe (1)(R)', 'toe (1)(L)', 'thigh(L)', 'calf(L)', 'cheek(L)', 'sole(R)', 'nowhere', 'dorsal aspect of the foot(L)', 'dorsal aspect of the foot(R)', 'sole(L)', 'forehead', 'calf(R)', 'thigh(R)', 'cheek(R)'
    ]

]

In [None]:
CATEGORICAL_FEATURES = [col for col in df['train'].columns if df['train'][col].dtype == 'object']

CATEGORICAL_FEATURES

['SEX',
 'PATHOLOGY',
 'INITIAL_EVIDENCE',
 'pain_char',
 'pain_somewhere',
 'lesion_color',
 'lesion_location',
 'trav1',
 'swelling_location']

In [None]:
NUMERICAL_FEATURES = [col for col in df['train'].columns if (set(df['train'][col].unique()) != set([0,1])) and (df['train'][col].dtype != 'object')]

NUMERICAL_FEATURES

['AGE',
 'pain_intensity',
 'pain_precise',
 'pain_sudden',
 'lesion_pain_swollen',
 'lesion_pain_intense',
 'itching_severity']

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import precision_recall_fscore_support as prfs
from sklearn.metrics import recall_score
from sklearn.metrics import make_scorer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report as cr
from sklearn.feature_selection import SelectFromModel


SEED = 42
BETA = 2

CATEGORICAL_FEATURES.remove('PATHOLOGY')

In [None]:
def custom_pipeline(classifier):
    features_preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), NUMERICAL_FEATURES),
            ('cat', OneHotEncoder(handle_unknown='ignore'), CATEGORICAL_FEATURES)
        ],
        remainder='passthrough'
    )
    if classifier == None:
        return make_pipeline(features_preprocessor)
    return make_pipeline(features_preprocessor, classifier)

In [None]:
def score(model, X, y):
    y_pred = model.predict(X)
    #return prfs(y, y_pred, beta=BETA)
    classes = model.classes_
    scores = recall_score(y, y_pred, labels=classes, average=None)
    ret = {}
    for i in range(len(classes)):
        ret[classes[i]] = scores[i]
    return pd.DataFrame(ret, index=['recall']).transpose()

In [None]:
def print_score(model):
    print(cr(y_validate, model.predict(X_validate)))

In [None]:
from PIL import Image, ImageDraw, ImageFont

def export_image(text, filename):
    W, H = (900,450)
    im = Image.new("RGBA",(W,H),"white")

    draw = ImageDraw.Draw(im)

    font = ImageFont.truetype(font=DRIVE + 'FreeMono.ttf', size=20)
    draw.text(xy=(50,50), text=text, fill='black', font=font)

    # Save Image
    im.save(filename, "PNG")

In [None]:
def fit_and_score(classifier):
    model = custom_pipeline(classifier)
    model.fit(X_train, y_train)
    text = str(classifier) + '\n\n' + cr(y_validate, model.predict(X_validate))
    print(text)
    export_image(text, str(classifier) + '.png')
    return model

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

solver = 'saga'
max_iter = 10

lr = LogisticRegression(max_iter=max_iter, solver=solver)

lr_model = fit_and_score(lr)



LogisticRegression(max_iter=10, solver='saga')

                         precision    recall  f1-score   support

     Allergic sinusitis       0.77      0.90      0.83      2136
            Anaphylaxis       0.90      0.42      0.57      3754
                 Chagas       0.99      0.22      0.37      1124
HIV (initial infection)       0.39      0.53      0.45      3852
              Influenza       0.65      0.34      0.44      3590
        Localized edema       0.52      1.00      0.68      3694
                    SLE       0.91      0.33      0.48      1579
            Sarcoidosis       0.60      0.71      0.65      3028
           Tuberculosis       0.57      0.56      0.56      2007
         Whooping cough       1.00      1.00      1.00       545

               accuracy                           0.59     25309
              macro avg       0.73      0.60      0.60     25309
           weighted avg       0.67      0.59      0.58     25309



In [None]:
import pickle

with open('logistic.pkl', 'wb') as f:
    pickle.dump(lr_model, f)

In [None]:
with open('logistic.pkl', 'rb') as f:
    lr_model = pickle.load(f)

In [None]:
X_validate.iloc[2].values.reshape(1, -1)

array([[56, 'F', 'pain', 1, 1, 'exhausting', 'occiput', 6, 6, 2, 1, 1, 1,
        0, 1, 'pink', 1, 'side of the neck(L)', 2, 1, 1, 1, 1, 1, 'N', 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 'nowhere', 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0]], dtype=object)

In [None]:
X_validate.iloc[2:4].shape

(2, 91)

In [None]:
d = dict(X_validate.iloc[2])

In [None]:
d

{'AGE': 56,
 'SEX': 'F',
 'INITIAL_EVIDENCE': 'pain',
 'sweating': 1,
 'pain': 1,
 'pain_char': 'exhausting',
 'pain_somewhere': 'occiput',
 'pain_intensity': 6,
 'pain_precise': 6,
 'pain_sudden': 2,
 'f17.210': 1,
 'fatigue_ext': 1,
 'fever': 1,
 'chills': 0,
 'lesions': 1,
 'lesion_color': 'pink',
 'lesion_pain_swollen': 1,
 'lesion_location': 'side of the neck(L)',
 'lesion_pain_intense': 2,
 'lesion_larger_than_1cm': 1,
 'itching_severity': 1,
 'muscle_pain': 1,
 'lost_appetite': 1,
 'runny_nose': 1,
 'trav1': 'N',
 'fam_j45': 0,
 'itchy_nose': 0,
 'eye_itching': 0,
 'urban1': 0,
 'z84.89': 0,
 'I30': 0,
 'cough': 1,
 'z92.25': 0,
 'severe_allergy': 0,
 'contact_allergy': 0,
 'diarrhea': 0,
 'short_breath': 0,
 'dizziness': 0,
 'nausea': 0,
 'swelling': 0,
 'swelling_location': 'nowhere',
 'wheezing_exhale': 0,
 'swollen_nodes': 0,
 'std': 0,
 'unprotected_sex': 0,
 'weight_loss': 0,
 'lost_consciousness': 0,
 'stridor': 0,
 'sore_throat': 0,
 'sex_hiv': 0,
 'nsaids': 0,
 'heart_v

In [None]:
np.array(d)

array({'AGE': 56, 'SEX': 'F', 'INITIAL_EVIDENCE': 'pain', 'sweating': 1, 'pain': 1, 'pain_char': 'exhausting', 'pain_somewhere': 'occiput', 'pain_intensity': 6, 'pain_precise': 6, 'pain_sudden': 2, 'f17.210': 1, 'fatigue_ext': 1, 'fever': 1, 'chills': 0, 'lesions': 1, 'lesion_color': 'pink', 'lesion_pain_swollen': 1, 'lesion_location': 'side of the neck(L)', 'lesion_pain_intense': 2, 'lesion_larger_than_1cm': 1, 'itching_severity': 1, 'muscle_pain': 1, 'lost_appetite': 1, 'runny_nose': 1, 'trav1': 'N', 'fam_j45': 0, 'itchy_nose': 0, 'eye_itching': 0, 'urban1': 0, 'z84.89': 0, 'I30': 0, 'cough': 1, 'z92.25': 0, 'severe_allergy': 0, 'contact_allergy': 0, 'diarrhea': 0, 'short_breath': 0, 'dizziness': 0, 'nausea': 0, 'swelling': 0, 'swelling_location': 'nowhere', 'wheezing_exhale': 0, 'swollen_nodes': 0, 'std': 0, 'unprotected_sex': 0, 'weight_loss': 0, 'lost_consciousness': 0, 'stridor': 0, 'sore_throat': 0, 'sex_hiv': 0, 'nsaids': 0, 'heart_valves': 0, 'gained_weight': 0, 'i80': 0, 'k74

In [None]:
d.keys()

dict_keys(['AGE', 'SEX', 'INITIAL_EVIDENCE', 'sweating', 'pain', 'pain_char', 'pain_somewhere', 'pain_intensity', 'pain_precise', 'pain_sudden', 'f17.210', 'fatigue_ext', 'fever', 'chills', 'lesions', 'lesion_color', 'lesion_pain_swollen', 'lesion_location', 'lesion_pain_intense', 'lesion_larger_than_1cm', 'itching_severity', 'muscle_pain', 'lost_appetite', 'runny_nose', 'trav1', 'fam_j45', 'itchy_nose', 'eye_itching', 'urban1', 'z84.89', 'I30', 'cough', 'z92.25', 'severe_allergy', 'contact_allergy', 'diarrhea', 'short_breath', 'dizziness', 'nausea', 'swelling', 'swelling_location', 'wheezing_exhale', 'swollen_nodes', 'std', 'unprotected_sex', 'weight_loss', 'lost_consciousness', 'stridor', 'sore_throat', 'sex_hiv', 'nsaids', 'heart_valves', 'gained_weight', 'i80', 'k74', 'sahs', 'synd_nephro', 'cortico', 'i50', 'lymph_surg', 'ca_blockers', 'convulsion', 'e66', 'vag_discharge', 'agri', 'fatigue', 'high_bp', 'menarche_12', 'ulcers', 'fam_allergies', 'red_eye', 'IV_drugs', 'anorexia', 'p

In [None]:
pd.DataFrame(d, index=[0])

Unnamed: 0,AGE,SEX,INITIAL_EVIDENCE,sweating,pain,pain_char,pain_somewhere,pain_intensity,pain_precise,pain_sudden,...,vaccination,cough_blood,e10_e11,v85.0,wheezing_inhale,breastfed_9,confusion,ebolacase,bruising,contact
0,56,F,pain,1,1,exhausting,occiput,6,6,2,...,0,0,0,0,0,0,0,0,0,0


In [None]:
lr_model.feature_names_in_

array(['AGE', 'SEX', 'INITIAL_EVIDENCE', 'swollen_nodes', 'std',
       'sweating', 'diarrhea', 'pain', 'pain_char', 'pain_somewhere',
       'pain_intensity', 'pain_precise', 'pain_sudden', 'fever',
       'unprotected_sex', 'lesions', 'lesion_color',
       'lesion_pain_swollen', 'lesion_location', 'lesion_pain_intense',
       'lesion_larger_than_1cm', 'itching_severity', 'nausea',
       'weight_loss', 'sex_hiv', 'trav1', 'fam_allergies', 'fam_j45',
       'j45', 'itchy_nose', 'eye_itching', 'runny_nose', 'urban1',
       'severe_allergy', 'contact_allergy', 'short_breath', 'swelling',
       'swelling_location', 'lost_consciousness', 'stridor', 'z84.89',
       'HIV', 'cortico', 'IV_drugs', 'e10_e11', 'f10.129', 'cough',
       'cough_blood', 'v85.0', 'I30', 'f17.210', 'high_bp', 'ulcers',
       'anorexia', 'new_fatigue', 'nsaids', 'i50', 'i80', 'lymph_surg',
       'synd_nephro', 'convulsion', 'e66', 'red_eye', 'agri',
       'gained_weight', 'k74', 'patho_endo', 'dizziness',
  

In [None]:
lr_model.predict(pd.DataFrame(d, index=[0]))

array(['Localized edema'], dtype=object)

In [None]:
another = {
  "AGE": 0,
  "SEX": 'F',
  "INITIAL_EVIDENCE": 0,
  "sweating": 0,
  "pain": 0,
  "f17.210": 0,
  "fatigue_ext": 0,
  "fever": 0,
  "sore_throat": 0,
  "lesions": 0,
  "muscle_pain": 0,
  "lost_appetite": 0,
  "cough": 0,
  "trav1": 0,
  "z92.25": 0,
  "runny_nose": 0,
  "heart_valves": 0,
  "cortico": 0,
  "gained_weight": 0,
  "i50": 0,
  "i80": 0,
  "k74": 0,
  "lymph_surg": 0,
  "swelling": 0,
  "sahs": 0,
  "synd_nephro": 0,
  "nsaids": 0,
  "swollen_nodes": 0,
  "std": 0,
  "diarrhea": 0,
  "unprotected_sex": 0,
  "weight_loss": 0,
  "sex_hiv": 0,
  "convulsion": 0,
  "short_breath": 0,
  "e66": 0,
  "agri": 0,
  "itchy_nose": 0,
  "eye_itching": 0,
  "urban1": 0,
  "z84.89": 0,
  "HIV": 0,
  "cough_blood": 0,
  "IV_drugs": 0,
  "f10.129": 0,
  "ca_blockers": 0,
  "I30": 0,
  "fatigue": 0,
  "high_bp": 0,
  "menarche_12": 0,
  "ulcers": 0,
  "red_eye": 0,
  "vag_discharge": 0,
  "nausea": 0,
  "severe_allergy": 0,
  "contact_allergy": 0,
  "wheezing_exhale": 0,
  "fam_allergies": 0,
  "fam_j45": 0,
  "j45": 0,
  "dizziness": 0,
  "lost_consciousness": 0,
  "chills": 0,
  "e10_e11": 0,
  "v85.0": 0,
  "stridor": 0,
  "patho_endo": 0,
  "confusion": 0,
  "contact": 0,
  "ebolacase": 0,
  "bruising": 0,
  "breastfed_9": 0,
  "anorexia": 0,
  "new_fatigue": 0,
  "vomiting_cough": 0,
  "coughing_fits": 0,
  "vaccination": 0,
  "cont_pertussis": 0,
  "wheezing_inhale": 0,
  "pain_char": 0,
  "pain_somewhere": 0,
  "pain_intensity": 0,
  "pain_precise": 0,
  "pain_sudden": 0,
  "lesion_color": 0,
  "lesion_pain_swollen": 0,
  "lesion_location": 0,
  "lesion_pain_intense": 0,
  "lesion_larger_than_1cm": 0,
  "itching_severity": 0,
  "swelling_location": 0
}

In [None]:
dataframe = pd.DataFrame(another, index=[0])
dataframe = dataframe[X_train.columns]

In [None]:
dataframe

Unnamed: 0,AGE,SEX,INITIAL_EVIDENCE,swollen_nodes,std,sweating,diarrhea,pain,pain_char,pain_somewhere,...,ca_blockers,vag_discharge,wheezing_inhale,fatigue,menarche_12,breastfed_9,confusion,contact,ebolacase,bruising
0,0,F,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
!pip list

Package                          Version
-------------------------------- ---------------------
absl-py                          1.4.0
aiohttp                          3.9.5
aiosignal                        1.3.1
alabaster                        0.7.16
albumentations                   1.3.1
altair                           4.2.2
annotated-types                  0.7.0
anyio                            3.7.1
argon2-cffi                      23.1.0
argon2-cffi-bindings             21.2.0
array_record                     0.5.1
arviz                            0.15.1
astropy                          5.3.4
astunparse                       1.6.3
async-timeout                    4.0.3
atpublic                         4.1.0
attrs                            23.2.0
audioread                        3.0.1
autograd                         1.6.2
Babel                            2.15.0
backcall                         0.2.0
beautifulsoup4                   4.12.3
bidict                           0.23.1

In [None]:
dataframe.dtypes.values

array([dtype('int64'), dtype('O'), dtype('int64'), dtype('int64'),
       dtype('int64'), dtype('int64'), dtype('int64'), dtype('int64'),
       dtype('int64'), dtype('int64'), dtype('int64'), dtype('int64'),
       dtype('int64'), dtype('int64'), dtype('int64'), dtype('int64'),
       dtype('int64'), dtype('int64'), dtype('int64'), dtype('int64'),
       dtype('int64'), dtype('int64'), dtype('int64'), dtype('int64'),
       dtype('int64'), dtype('int64'), dtype('int64'), dtype('int64'),
       dtype('int64'), dtype('int64'), dtype('int64'), dtype('int64'),
       dtype('int64'), dtype('int64'), dtype('int64'), dtype('int64'),
       dtype('int64'), dtype('int64'), dtype('int64'), dtype('int64'),
       dtype('int64'), dtype('int64'), dtype('int64'), dtype('int64'),
       dtype('int64'), dtype('int64'), dtype('int64'), dtype('int64'),
       dtype('int64'), dtype('int64'), dtype('int64'), dtype('int64'),
       dtype('int64'), dtype('int64'), dtype('int64'), dtype('int64'),
       dty

In [None]:
lr_model.predict(dataframe.astype(object))

array(['HIV (initial infection)'], dtype=object)

### Stochastic Gradient Descent Classifier

In [None]:
from sklearn.linear_model import SGDClassifier

loss = 'log_loss'
penalty = 'l1'
alpha = 0.01

sgd = SGDClassifier(
    #loss=loss,
    penalty=penalty,
    #alpha=alpha,
    random_state = SEED,
    shuffle=True
    )

fit_and_score(sgd)

SGDClassifier(penalty='l1', random_state=42)

                         precision    recall  f1-score   support

     Allergic sinusitis       0.77      0.90      0.83      2136
            Anaphylaxis       0.90      0.42      0.58      3754
                 Chagas       0.82      0.23      0.36      1124
HIV (initial infection)       0.40      0.38      0.39      3852
              Influenza       0.42      0.51      0.46      3590
        Localized edema       0.53      0.96      0.68      3694
                    SLE       0.40      0.48      0.44      1579
            Sarcoidosis       0.74      0.57      0.65      3028
           Tuberculosis       0.65      0.41      0.50      2007
         Whooping cough       1.00      1.00      1.00       545

               accuracy                           0.57     25309
              macro avg       0.66      0.59      0.59     25309
           weighted avg       0.62      0.57      0.56     25309



### K Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

neighbors = 5

knn = KNeighborsClassifier(
    n_neighbors = neighbors
    )
fit_and_score(knn)

KNeighborsClassifier()

                         precision    recall  f1-score   support

     Allergic sinusitis       0.44      0.81      0.57      2136
            Anaphylaxis       0.46      0.50      0.48      3754
                 Chagas       0.37      0.24      0.29      1124
HIV (initial infection)       0.32      0.33      0.33      3852
              Influenza       0.42      0.32      0.36      3590
        Localized edema       0.55      0.64      0.59      3694
                    SLE       0.50      0.31      0.38      1579
            Sarcoidosis       0.56      0.49      0.53      3028
           Tuberculosis       0.50      0.35      0.42      2007
         Whooping cough       0.89      0.80      0.84       545

               accuracy                           0.47     25309
              macro avg       0.50      0.48      0.48     25309
           weighted avg       0.47      0.47      0.46     25309



### Multinomial Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB(alpha=0.1)

fit_and_score(nb)

ValueError: Negative values in data passed to MultinomialNB (input X)

### Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state = SEED)

fit_and_score(dt)

DecisionTreeClassifier(random_state=42)

                         precision    recall  f1-score   support

     Allergic sinusitis       0.74      0.88      0.81      2136
            Anaphylaxis       0.65      0.52      0.58      3754
                 Chagas       0.56      0.26      0.35      1124
HIV (initial infection)       0.42      0.42      0.42      3852
              Influenza       0.55      0.36      0.43      3590
        Localized edema       0.54      0.93      0.68      3694
                    SLE       0.62      0.39      0.48      1579
            Sarcoidosis       0.63      0.63      0.63      3028
           Tuberculosis       0.53      0.54      0.53      2007
         Whooping cough       1.00      1.00      1.00       545

               accuracy                           0.58     25309
              macro avg       0.62      0.59      0.59     25309
           weighted avg       0.58      0.58      0.56     25309



### Ensemble methods

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state = SEED, n_estimators=10)

fit_and_score(rf)

RandomForestClassifier(n_estimators=10, random_state=42)

                         precision    recall  f1-score   support

     Allergic sinusitis       0.74      0.90      0.81      2136
            Anaphylaxis       0.66      0.51      0.58      3754
                 Chagas       0.68      0.23      0.35      1124
HIV (initial infection)       0.42      0.42      0.42      3852
              Influenza       0.57      0.36      0.44      3590
        Localized edema       0.53      0.95      0.68      3694
                    SLE       0.61      0.38      0.47      1579
            Sarcoidosis       0.64      0.63      0.63      3028
           Tuberculosis       0.52      0.55      0.53      2007
         Whooping cough       1.00      1.00      1.00       545

               accuracy                           0.58     25309
              macro avg       0.64      0.59      0.59     25309
           weighted avg       0.59      0.58      0.56     25309



In [None]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(random_state = SEED, n_estimators=10)

fit_and_score(gb)

GradientBoostingClassifier(n_estimators=10, random_state=42)

                         precision    recall  f1-score   support

     Allergic sinusitis       0.86      0.83      0.85      2136
            Anaphylaxis       0.59      0.46      0.52      3754
                 Chagas       1.00      0.22      0.37      1124
HIV (initial infection)       0.39      0.60      0.48      3852
              Influenza       0.71      0.29      0.41      3590
        Localized edema       0.52      1.00      0.68      3694
                    SLE       1.00      0.32      0.48      1579
            Sarcoidosis       0.78      0.56      0.65      3028
           Tuberculosis       0.57      0.66      0.61      2007
         Whooping cough       1.00      1.00      1.00       545

               accuracy                           0.59     25309
              macro avg       0.74      0.60      0.60     25309
           weighted avg       0.66      0.59      0.57     25309



In [None]:
from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier(random_state = SEED, n_estimators=10, estimator=RandomForestClassifier(random_state = SEED, n_estimators=10))

fit_and_score(ada)

AdaBoostClassifier(estimator=RandomForestClassifier(n_estimators=10,
                                                    random_state=42),
                   n_estimators=10, random_state=42)

                         precision    recall  f1-score   support

     Allergic sinusitis       0.74      0.90      0.81      2136
            Anaphylaxis       0.69      0.50      0.58      3754
                 Chagas       0.78      0.23      0.36      1124
HIV (initial infection)       0.42      0.42      0.42      3852
              Influenza       0.57      0.36      0.44      3590
        Localized edema       0.53      0.97      0.68      3694
                    SLE       0.65      0.38      0.48      1579
            Sarcoidosis       0.63      0.64      0.63      3028
           Tuberculosis       0.52      0.55      0.54      2007
         Whooping cough       1.00      1.00      1.00       545

               accuracy                           0.58     25309
              macro avg  

### XGBoost and CatBoost

In [None]:
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

xgb = XGBClassifier(random_state = SEED, n_estimators=10)

xgb_model = custom_pipeline(xgb)
xgb_model.fit(X_train, le.fit_transform(y_train))

In [None]:
print(cr(le.fit_transform(y_validate), xgb_model.predict(X_validate)))

              precision    recall  f1-score   support

           0       0.79      0.88      0.83      2136
           1       0.98      0.40      0.57      3754
           2       1.00      0.22      0.37      1124
           3       0.40      0.58      0.47      3852
           4       0.70      0.29      0.41      3590
           5       0.52      1.00      0.68      3694
           6       0.88      0.34      0.49      1579
           7       0.60      0.72      0.65      3028
           8       0.58      0.61      0.59      2007
           9       1.00      1.00      1.00       545

    accuracy                           0.60     25309
   macro avg       0.74      0.60      0.61     25309
weighted avg       0.69      0.60      0.58     25309



In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


This is using CatBoost the way we did everything else:

In [None]:
from catboost import CatBoostClassifier

cat = CatBoostClassifier(
    random_state = SEED,
    iterations=10,
    learning_rate=0.8
    )

fit_and_score(cat)

0:	learn: 1.7534206	total: 1.01s	remaining: 9.13s
1:	learn: 1.4637232	total: 1.91s	remaining: 7.66s
2:	learn: 1.3375018	total: 3.08s	remaining: 7.18s
3:	learn: 1.3414745	total: 4.57s	remaining: 6.85s
4:	learn: 15.5164142	total: 6.14s	remaining: 6.14s
5:	learn: 52.3325873	total: 7.6s	remaining: 5.07s
6:	learn: 131.4412794	total: 8.64s	remaining: 3.7s
7:	learn: 22.9945767	total: 9.56s	remaining: 2.39s
8:	learn: 75.1067938	total: 10.5s	remaining: 1.17s
9:	learn: 33.0058048	total: 11.4s	remaining: 0us
<catboost.core.CatBoostClassifier object at 0x785a37c0eb90>

                         precision    recall  f1-score   support

     Allergic sinusitis       0.80      0.87      0.83      2136
            Anaphylaxis       0.82      0.47      0.60      3754
                 Chagas       1.00      0.22      0.36      1124
HIV (initial infection)       0.42      0.39      0.40      3852
              Influenza       0.53      0.43      0.48      3590
        Localized edema       0.52      1.00 

However, CatBoost has support for categorical variables. So we should try to use it without one-hot encoding:

In [None]:
from catboost import CatBoostClassifier

X_val_for_cat = X_validate.copy()
X_val_for_cat[categorical_features] = X_val_for_cat[categorical_features].astype(str)
X_val_for_cat = X_val_for_cat[X_train.columns]

cat2 = CatBoostClassifier(
    iterations=5,
    learning_rate=0.7,
    random_seed=SEED,
    loss_function='MultiClass',
    cat_features=categorical_features
)
cat2.fit(
    X_train, y_train,
    cat_features=categorical_features,
    eval_set=(X_val_for_cat, y_validate),
    verbose=True,
    #plot=True
)

0:	learn: 1.2504885	test: 1.3112720	best: 1.3112720 (0)	total: 8.59s	remaining: 34.4s
1:	learn: 0.9793981	test: 1.0209881	best: 1.0209881 (1)	total: 17.2s	remaining: 25.8s
2:	learn: 0.9065016	test: 0.9301462	best: 0.9301462 (2)	total: 23.8s	remaining: 15.9s
3:	learn: 0.8663491	test: 0.8960783	best: 0.8960783 (3)	total: 31.6s	remaining: 7.89s
4:	learn: 0.8548126	test: 0.8865453	best: 0.8865453 (4)	total: 39.1s	remaining: 0us

bestTest = 0.8865452824
bestIteration = 4



<catboost.core.CatBoostClassifier at 0x785a37c0d930>

In [None]:
print(cr(y_validate, cat2.predict(X_val_for_cat, )))

                         precision    recall  f1-score   support

     Allergic sinusitis       0.80      0.86      0.83      2136
            Anaphylaxis       1.00      0.40      0.57      3754
                 Chagas       1.00      0.22      0.37      1124
HIV (initial infection)       0.34      0.39      0.37      3852
              Influenza       0.58      0.47      0.52      3590
        Localized edema       0.52      1.00      0.68      3694
                    SLE       1.00      0.32      0.48      1579
            Sarcoidosis       0.60      0.72      0.65      3028
           Tuberculosis       0.57      0.62      0.60      2007
         Whooping cough       1.00      1.00      1.00       545

               accuracy                           0.59     25309
              macro avg       0.74      0.60      0.61     25309
           weighted avg       0.67      0.59      0.58     25309



#### Stacking

In [None]:
from sklearn.ensemble import StackingClassifier

estimators = [
    ('xgb', xgb_model),
    ('cat', cat2)
]

stacking = StackingClassifier(
    estimators=estimators,
    #final_estimator=RandomForestClassifier(random_state = SEED, n_estimators=10)
)

stacking.fit(X_train, y_train)

0:	learn: 1.2504885	total: 8.54s	remaining: 34.2s
1:	learn: 0.9793981	total: 15.3s	remaining: 22.9s
2:	learn: 0.9065016	total: 24.4s	remaining: 16.2s
3:	learn: 0.8663491	total: 30.5s	remaining: 7.62s
4:	learn: 0.8548126	total: 38.7s	remaining: 0us
0:	learn: 1.2523221	total: 5.74s	remaining: 22.9s
1:	learn: 0.9657569	total: 10.5s	remaining: 15.8s
2:	learn: 0.8891795	total: 17.9s	remaining: 11.9s
3:	learn: 0.8613044	total: 22.9s	remaining: 5.71s
4:	learn: 0.8511641	total: 27.6s	remaining: 0us
0:	learn: 1.2465605	total: 6.46s	remaining: 25.8s
1:	learn: 0.9561243	total: 12.6s	remaining: 18.9s
2:	learn: 0.8874947	total: 19.8s	remaining: 13.2s
3:	learn: 0.8648569	total: 24.6s	remaining: 6.16s
4:	learn: 0.8522853	total: 29.4s	remaining: 0us
0:	learn: 1.2390571	total: 6.84s	remaining: 27.4s
1:	learn: 1.0148520	total: 11.8s	remaining: 17.6s
2:	learn: 0.8939759	total: 17.9s	remaining: 11.9s
3:	learn: 0.8747247	total: 23.7s	remaining: 5.92s
4:	learn: 0.8573286	total: 28.4s	remaining: 0us
0:	learn

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
print(cr(y_validate, stacking.predict(X_val_for_cat)))

                         precision    recall  f1-score   support

     Allergic sinusitis       0.83      0.85      0.84      2136
            Anaphylaxis       0.78      0.48      0.60      3754
                 Chagas       0.90      0.23      0.36      1124
HIV (initial infection)       0.35      0.36      0.35      3852
              Influenza       0.58      0.47      0.52      3590
        Localized edema       0.52      1.00      0.68      3694
                    SLE       0.89      0.34      0.49      1579
            Sarcoidosis       0.60      0.72      0.65      3028
           Tuberculosis       0.58      0.52      0.55      2007
         Whooping cough       1.00      1.00      1.00       545

               accuracy                           0.59     25309
              macro avg       0.70      0.60      0.60     25309
           weighted avg       0.63      0.59      0.58     25309



### LightGBM

In [None]:
import lightgbm as lgb

lgbm = lgb.LGBMClassifier(random_state = SEED, n_estimators=10)

fit_and_score(lgbm)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.268306 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 667
[LightGBM] [Info] Number of data points in the train set: 202290, number of used features: 216
[LightGBM] [Info] Start training from score -2.043828
[LightGBM] [Info] Start training from score -1.987620
[LightGBM] [Info] Start training from score -3.084863
[LightGBM] [Info] Start training from score -1.941958
[LightGBM] [Info] Start training from score -2.020853
[LightGBM] [Info] Start training from score -1.983767
[LightGBM] [Info] Start training from score -2.835941
[LightGBM] [Info] Start training from score -2.251700
[LightGBM] [Info] Start training from score -2.521917
[LightGBM] [Info] Start training from score -3.506344
LGBMClassifier(n_estimators=10, random_state=42)

                         precision    recall  f1-score  

In [None]:
ct = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(categories=categories), categorical_features)
    ],
    remainder='passthrough'
)

ct.fit(X_train)
X_train_lgb = ct.transform(X_train)
X_validate_lgb = ct.transform(X_validate)

le = LabelEncoder()
y_train_lgb = le.fit_transform(y_train)
y_validate_lgb = le.transform(y_validate)

train_data = lgb.Dataset(X_train_lgb, label=y_train_lgb)
param = {'num_leaves': 31, 'objective': 'binary'}
num_round = 10

lgb.cv(param, train_data, num_round, nfold=5)


[LightGBM] [Info] Number of positive: 140870, number of negative: 20962
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.218417 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 667
[LightGBM] [Info] Number of data points in the train set: 161832, number of used features: 216
[LightGBM] [Info] Number of positive: 140869, number of negative: 20963
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.219071 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 667
[LightGBM] [Info] Number of data points in the train set: 161832, number of used features: 216
[LightGBM] [Info] Number of positive: 140869, number of negative: 20963
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of te

{'valid binary_logloss-mean': [0.31026741435934524,
  0.2684985724151307,
  0.23828579126893773,
  0.21465099582740743,
  0.19543375215229924,
  0.17941598297356015,
  0.1658524883158296,
  0.1542469662112118,
  0.14423764965624492,
  0.13552880638100318],
 'valid binary_logloss-stdv': [0.00023583415323031722,
  0.0003116537447577421,
  0.0003442647920699498,
  0.00037241342809398164,
  0.0003929957722390439,
  0.0004173139362707479,
  0.0004381317208824662,
  0.00045452563803426854,
  0.00047176629290164693,
  0.0004859948669684686]}

In [None]:
lgbooster = lgb.train(param, train_data, num_round)

[LightGBM] [Info] Number of positive: 176087, number of negative: 26203
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.419620 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 667
[LightGBM] [Info] Number of data points in the train set: 202290, number of used features: 216
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.870468 -> initscore=1.905104
[LightGBM] [Info] Start training from score 1.905104


In [None]:
print(cr(y_validate_lgb, lgbooster.predict(X_validate_lgb).round()))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.85      0.84      0.84      2136
           1       0.16      1.00      0.28      3754
           2       0.00      0.00      0.00      1124
           3       0.00      0.00      0.00      3852
           4       0.00      0.00      0.00      3590
           5       0.00      0.00      0.00      3694
           6       0.00      0.00      0.00      1579
           7       0.00      0.00      0.00      3028
           8       0.00      0.00      0.00      2007
           9       0.00      0.00      0.00       545

    accuracy                           0.22     25309
   macro avg       0.10      0.18      0.11     25309
weighted avg       0.10      0.22      0.11     25309



  _warn_prf(average, modifier, msg_start, len(result))


### Support Vector Machine Classifier

In [None]:
from sklearn.svm import LinearSVC

max_iter = 1

C = 1/np.sqrt(len(X_train))

svm = LinearSVC(
    random_state = SEED,
    max_iter=max_iter,
    C=C
    )

svm_model = fit_and_score(svm)



LinearSVC(C=0.0022233753825767564, max_iter=1, random_state=42)

                         precision    recall  f1-score   support

     Allergic sinusitis       0.73      0.92      0.82      2136
            Anaphylaxis       0.51      0.64      0.57      3754
                 Chagas       0.93      0.23      0.37      1124
HIV (initial infection)       0.43      0.40      0.41      3852
              Influenza       0.69      0.25      0.37      3590
        Localized edema       0.53      0.95      0.68      3694
                    SLE       0.88      0.33      0.48      1579
            Sarcoidosis       0.67      0.63      0.65      3028
           Tuberculosis       0.55      0.56      0.55      2007
         Whooping cough       0.99      1.00      1.00       545

               accuracy                           0.58     25309
              macro avg       0.69      0.59      0.59     25309
           weighted avg       0.62      0.58      0.56     25309



Warning: SVM takes a long time!

In [None]:
# from sklearn.svm import SVC

# svm = SVC(kernel='linear', random_state = SEED, decision_function_shape='ovo')

# svm_model = make_pipeline(ct, ss, svm)
# svm_model.fit(X_train, y_train)
# svm_model.score(X_validate, y_validate)

### Combinations

In [None]:
pipe = make_pipeline(
    ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('cat', OneHotEncoder(categories=categories), categorical_features)
        ],
        remainder='passthrough'
    ),
    SelectFromModel(
        LinearSVC(
            random_state = SEED,
            max_iter = 10,
            C = C
        ),
        threshold = 'median'
    ),
    GradientBoostingClassifier(random_state = SEED, n_estimators=10)
)


In [None]:
pipe.fit(X_train, y_train)



In [None]:
print(cr(y_validate, pipe.predict(X_validate)))

                         precision    recall  f1-score   support

     Allergic sinusitis       0.86      0.83      0.85      2136
            Anaphylaxis       0.59      0.46      0.52      3754
                 Chagas       1.00      0.22      0.37      1124
HIV (initial infection)       0.39      0.63      0.48      3852
              Influenza       0.74      0.25      0.37      3590
        Localized edema       0.52      1.00      0.68      3694
                    SLE       1.00      0.32      0.48      1579
            Sarcoidosis       0.78      0.56      0.65      3028
           Tuberculosis       0.57      0.66      0.61      2007
         Whooping cough       1.00      1.00      1.00       545

               accuracy                           0.59     25309
              macro avg       0.74      0.59      0.60     25309
           weighted avg       0.67      0.59      0.57     25309



In [None]:
pipe2 = make_pipeline(
    ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('cat', OneHotEncoder(categories=categories), categorical_features)
        ],
        remainder='passthrough'
    ),
    SelectFromModel(
        GradientBoostingClassifier(random_state = SEED, n_estimators=10),
        threshold = 'median'
    ),
    GradientBoostingClassifier(random_state = SEED, n_estimators=10)
)


In [None]:
pipe2.fit(X_train, y_train)

In [None]:
print(cr(y_validate, pipe2.predict(X_validate)))

                         precision    recall  f1-score   support

     Allergic sinusitis       0.86      0.83      0.85      2136
            Anaphylaxis       0.59      0.46      0.52      3754
                 Chagas       1.00      0.22      0.37      1124
HIV (initial infection)       0.39      0.60      0.48      3852
              Influenza       0.71      0.29      0.41      3590
        Localized edema       0.52      1.00      0.68      3694
                    SLE       1.00      0.32      0.48      1579
            Sarcoidosis       0.78      0.56      0.65      3028
           Tuberculosis       0.57      0.66      0.61      2007
         Whooping cough       1.00      1.00      1.00       545

               accuracy                           0.59     25309
              macro avg       0.74      0.60      0.60     25309
           weighted avg       0.66      0.59      0.57     25309

