## GPU stuff

Attempting to make the code run on a GPU using CuDF and CuML. Not successful so far.

In [None]:
!nvidia-smi

In [1]:
# This get the RAPIDS-Colab install files and test check your GPU.  Run this and the next cell only.
# Please read the output of this cell.  If your Colab Instance is not RAPIDS compatible, it will warn you and give you remediation steps.
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

Cloning into 'rapidsai-csp-utils'...
remote: Enumerating objects: 481, done.[K
remote: Counting objects: 100% (212/212), done.[K
remote: Compressing objects: 100% (121/121), done.[K
remote: Total 481 (delta 143), reused 124 (delta 91), pack-reused 269[K
Receiving objects: 100% (481/481), 133.58 KiB | 5.14 MiB/s, done.
Resolving deltas: 100% (245/245), done.
Collecting pynvml
  Downloading pynvml-11.5.0-py3-none-any.whl (53 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 53.1/53.1 kB 1.9 MB/s eta 0:00:00
Installing collected packages: pynvml
Successfully installed pynvml-11.5.0
***********************************************************************
Woo! Your instance has a Tesla T4 GPU!
We will install the latest stable RAPIDS via pip 24.4.*!  Please stand by, should be quick...
***********************************************************************

Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com
Collecting cuml-cu12==24.4.*
  Downloading https://pypi.nvidia.

In [2]:
import cudf
%load_ext cudf.pandas

In [None]:
import pandas as pd
import numpy as np
import ast
import warnings

DATA_DIR = ''

DISEASES = ['HIV (initial infection)', 'Whooping cough', 'Chagas',
    'Tuberculosis', 'Ebola', 'Influenza',
    'SLE', 'Sarcoidosis', 'Anaphylaxis',
    'Allergic sinusitis', 'Localized edema']

DISEASES_FR = ['VIH (Primo-infection)', 'Coqueluche', 'Chagas',
    'Tuberculose', 'Ebola', 'Possible influenza ou syndrome virémique typique',
    'Lupus érythémateux disséminé (LED)', 'Sarcoïdose', 'Anaphylaxie',
    'Rhinite allergique', 'Oedème localisé ou généralisé sans atteinte pulmonaire associée']

SYMPTOMS_WITH_STR_ENTRIES = ['trav1', 'lesion_larger_than_1cm', 'lesions_peeling',
    'pain_char', 'lesion_color', 'pain_somewhere',
    'pain_radiate', 'lesion_location', 'swelling_location']

REPLACE_DICT = {
    'AGE': 'unknown',
    'pain_char': 'NA',
    'pain_somewhere': 'nowhere',
    'pain_radiate': 'nowhere',
    'pain_intensity': '0',
    'pain_precise': '0',
    'pain_sudden': '0',
    'lesion_color': 'NA',
    'lesion_location': 'nowhere',
    'lesions_peeling': 'N',
    'lesion_pain_swollen': '0',
    'lesion_larger_than_1cm': 'N',
    'lesion_pain_intense': '0',
    'swelling_location': 'nowhere',
    'trav1': 'N',
    'itching_severity': '0'
}

INTEGER_COLS = ['AGE', 'pain_intensity', 'pain_precise', 'pain_sudden',
                'lesion_pain_swollen', 'lesion_pain_intense', 'itching_severity']

conditions = pd.DataFrame()
evidences = pd.DataFrame()
evidences_en = pd.DataFrame()

def set_dir(directory):
    global DATA_DIR
    DATA_DIR = directory

def load_metadata(directory = DATA_DIR):
    global DATA_DIR, conditions, evidences, evidences_en
    DATA_DIR = directory
    conditions = pd.read_json(DATA_DIR + 'release_conditions.json').transpose()
    evidences = pd.read_json(DATA_DIR + 'release_evidences.json').transpose().rename(columns={'possible-values': 'possible_values'})
    evidences_en = pd.read_csv(DATA_DIR + 'evidences_en.csv', index_col=0)

    evidences['possible_values_en'] = [list() for n in range(len(evidences))]

    for row in evidences.itertuples():
        vals = row.possible_values
        if len(row.value_meaning) == 0:
            continue
        for value in vals:
            row.possible_values_en.append(row.value_meaning[value]['en'])

    evidences_en['value_meaning'] = [dict(ast.literal_eval(thing)) for thing in evidences_en['value_meaning'].values]

def get_english(symptom, detail):
    try:
        val = evidences_en.loc[symptom]['value_meaning'][detail]['en']
    except KeyError:
        val = detail
    return val

def pad_list(l):
    if len(l) >= 2:
        return l
    else:
        return l + [1]

class DiagDataFrame(cudf.DataFrame):

    _metadata = ["ddx"]

    def __init__(self, *args, **kwargs):
        diff_diag = kwargs.pop('ddx', False)
        super().__init__(*args, **kwargs)
        self.ddx = diff_diag

    def format_and_translate(self):
        if self.ddx:
            self.dds_to_dicts()
        self.evidences_to_lists()
        self.evidences_to_dicts()
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            self.expand_evidences()
        self.rename_symptoms()
        self.translate_to_english()
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            self.replace_values()
        self.fillna(0, inplace=True)
        self.to_integers()

    def dds_to_dicts(self):
        self['DIFFERENTIAL_DIAGNOSIS'] = [dict(ast.literal_eval(thing)) for thing in self['DIFFERENTIAL_DIAGNOSIS'].values]

    def evidences_to_lists(self):
        self['EVIDENCES'] = [ast.literal_eval(thing) for thing in self['EVIDENCES'].values]

    def evidences_to_dicts(self):
        self['EVIDENCES'] = [dict([pad_list(symp.split('_@_')) for symp in symps]) for symps in self['EVIDENCES']]

    def expand_evidences(self):
        temp = cudf.DataFrame(self.pop('EVIDENCES').values.tolist())
        for column in temp.columns:
            self[column] = temp[column]

    def rename_symptoms(self):
        renames = {}
        for column in self.columns:
            if column in evidences.index:
                renames[column] = evidences_en.loc[evidences_en['name'] == column].index[0]

        self.rename(columns=renames, inplace=True)

    def translate_to_english(self):
        if self.ddx:
            self['DIFFERENTIAL_DIAGNOSIS'] = [{conditions.loc[k]['cond-name-eng']: v.pop(k) for k in list(v.keys())} for v in self['DIFFERENTIAL_DIAGNOSIS']]

        self['PATHOLOGY'] = [conditions.loc[k]['cond-name-eng'] for k in self['PATHOLOGY']]

        self['INITIAL_EVIDENCE'] = [evidences_en.loc[evidences_en['name'] == k].index[0] for k in self['INITIAL_EVIDENCE']]

        for column in SYMPTOMS_WITH_STR_ENTRIES:
            self[column] = [get_english(column, thing) for thing in self[column].values]

    def replace_values(self):
        for column in self.columns:
            if column in REPLACE_DICT:
                self.loc[self[column].isnull(), column] = REPLACE_DICT[column]

    def to_integers(self):
        for column in self.columns:
            if column in SYMPTOMS_WITH_STR_ENTRIES + ['SEX', 'PATHOLOGY', 'INITIAL_EVIDENCE']:
                continue
            self[column] = self[column].astype('int64')

    def _constructor(self, *args, **kwargs):
        return DiagDataFrame(*args, **kwargs)

def load_csv(filename, diseases=DISEASES_FR, ddx=False):
    if ddx:
        loader = cudf.read_csv(filename, iterator=True, chunksize=10000)
    else:
        loader = cudf.read_csv(filename, iterator=True, chunksize=10000,
                            usecols=lambda x: x != "DIFFERENTIAL_DIAGNOSIS")
    ddf = DiagDataFrame(cudf.concat([chunk[chunk['PATHOLOGY'].isin(diseases)] for chunk in loader]))
    ddf.format_and_translate()
    return ddf

def load_feather(filename):
    return DiagDataFrame(cudf.read_feather(filename))

def load_datasets(subsets=['train', 'validate', 'test'], ddx=False, directory=DATA_DIR, csv=False, diseases=DISEASES_FR):
    load_metadata(directory)
    df = {}
    for ds in subsets:
        if csv:
            df[ds] = load_csv(directory + 'release_' + ds + '_patients.csv', ddx=ddx, diseases=diseases)
        else:
            df[ds] = load_feather(directory + ds + '.feather')
    return df

## Regular stuff

In [2]:
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [3]:
DRIVE = '/content/drive/MyDrive/Disease-Prediction/ddx-dataset/'

In [4]:
!pip install git+https://github.com/nina-adhikari/disease_prediction

Collecting git+https://github.com/nina-adhikari/disease_prediction
  Cloning https://github.com/nina-adhikari/disease_prediction to /tmp/pip-req-build-ug59b5pn
  Running command git clone --filter=blob:none --quiet https://github.com/nina-adhikari/disease_prediction /tmp/pip-req-build-ug59b5pn
  Resolved https://github.com/nina-adhikari/disease_prediction to commit 05cff20b63666bcb740956f2dec38d65ea7be09f
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: disease-prediction
  Building wheel for disease-prediction (setup.py) ... [?25l[?25hdone
  Created wheel for disease-prediction: filename=disease_prediction-0.1-py3-none-any.whl size=3831 sha256=a880ac102c3de93efae6945b13f0f39b8ca6e506605498e3c2be58b58d3d4be3
  Stored in directory: /tmp/pip-ephem-wheel-cache-oryhat50/wheels/c1/30/69/a4efc8ebfadf754cf631ddaf3e9e848bd514c4db078acf14f5
Successfully built disease-prediction
Installing collected packages: disease-prediction
Successfully installed

In [5]:
from disease_prediction.data import datasets as ds

In [6]:
SUBSETS = ['train', 'test', 'validate']

In [7]:
df = ds.load_datasets(subsets=SUBSETS, directory=DRIVE)

In [8]:
df['test']

Unnamed: 0,AGE,SEX,PATHOLOGY,INITIAL_EVIDENCE,sweating,pain,pain_char,pain_somewhere,pain_intensity,pain_radiate,...,ebolacase,bruising,breastfed_9,anorexia,new_fatigue,vomiting_cough,coughing_fits,vaccination,cont_pertussis,wheezing_inhale
22,10,F,Influenza,cough,0,1,sharp,belly,3,nowhere,...,0,0,0,0,0,0,0,0,0,0
24,79,M,Influenza,sore_throat,1,1,exhausting,temple(R),6,nowhere,...,0,0,0,0,0,0,0,0,0,0
35,20,M,Influenza,fever,0,1,a cramp,pubis,4,nowhere,...,0,0,0,0,0,0,0,0,0,0
38,3,F,Localized edema,pain,0,1,sensitive,temple(L),2,nowhere,...,0,0,0,0,0,0,0,0,0,0
39,87,M,Localized edema,pain,0,1,sharp,sole(L),3,nowhere,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134509,9,M,Localized edema,gained_weight,0,0,,nowhere,0,nowhere,...,0,0,0,0,0,0,0,0,0,0
134520,17,F,Anaphylaxis,nausea,0,0,,nowhere,0,nowhere,...,0,0,0,0,0,0,0,0,0,0
134521,4,F,Influenza,muscle_pain,0,0,,nowhere,0,nowhere,...,0,0,0,0,0,0,0,0,0,0
134522,21,M,Allergic sinusitis,eye_itching,0,0,,nowhere,0,nowhere,...,0,0,0,0,0,0,0,0,0,0


In [9]:
d = {'Y': 1, 'N': 0}

# drop the columns that have a single value in all three datasets and convert Y/N to 1/0
for subset in SUBSETS:
    df[subset].drop(columns=['pain_radiate', 'lesions_peeling'], inplace=True)
    df[subset]['lesion_larger_than_1cm'] = df[subset]['lesion_larger_than_1cm'].map(d)

In [10]:
STANDARD_ORDER = ['PATHOLOGY', 'INITIAL_EVIDENCE', 'SEX',
    'trav1', 'pain_char', 'lesion_color', 'pain_somewhere',
    'lesion_location', 'swelling_location',
    'AGE', 'swollen_nodes', 'std', 'lesion_larger_than_1cm',
    'sweating', 'diarrhea', 'pain',
    'pain_intensity', 'pain_precise', 'pain_sudden',
    'fever', 'unprotected_sex', 'lesions',
    'lesion_pain_swollen', 'lesion_pain_intense', 'itching_severity',
    'nausea', 'weight_loss', 'sex_hiv', 'fam_allergies', 'fam_j45',
    'j45', 'itchy_nose', 'eye_itching', 'runny_nose', 'urban1',
    'severe_allergy', 'contact_allergy', 'short_breath', 'swelling',
    'lost_consciousness', 'stridor', 'z84.89', 'HIV',
    'cortico', 'IV_drugs', 'e10_e11', 'f10.129', 'cough', 'cough_blood',
    'v85.0', 'I30', 'f17.210', 'high_bp', 'ulcers', 'anorexia',
    'new_fatigue', 'nsaids', 'i50', 'i80', 'lymph_surg', 'synd_nephro',
    'convulsion', 'e66', 'red_eye', 'agri', 'gained_weight', 'k74',
    'patho_endo', 'dizziness', 'wheezing_exhale', 'fatigue_ext',
    'sore_throat', 'muscle_pain', 'lost_appetite', 'heart_valves', 'sahs',
    'cont_pertussis', 'vomiting_cough', 'coughing_fits', 'vaccination',
    'chills', 'z92.25', 'ca_blockers', 'vag_discharge', 'wheezing_inhale',
    'fatigue', 'menarche_12', 'breastfed_9', 'confusion', 'contact',
    'ebolacase', 'bruising']

In [11]:
len(STANDARD_ORDER)

92

In [12]:
list(df['test'].columns)

['AGE',
 'SEX',
 'PATHOLOGY',
 'INITIAL_EVIDENCE',
 'sweating',
 'pain',
 'pain_char',
 'pain_somewhere',
 'pain_intensity',
 'pain_precise',
 'pain_sudden',
 'f17.210',
 'fatigue_ext',
 'fever',
 'sore_throat',
 'lesions',
 'lesion_color',
 'lesion_pain_swollen',
 'lesion_location',
 'lesion_pain_intense',
 'lesion_larger_than_1cm',
 'itching_severity',
 'muscle_pain',
 'lost_appetite',
 'cough',
 'trav1',
 'z92.25',
 'runny_nose',
 'heart_valves',
 'cortico',
 'gained_weight',
 'i50',
 'i80',
 'k74',
 'lymph_surg',
 'swelling',
 'swelling_location',
 'sahs',
 'synd_nephro',
 'nsaids',
 'swollen_nodes',
 'std',
 'diarrhea',
 'unprotected_sex',
 'weight_loss',
 'sex_hiv',
 'convulsion',
 'short_breath',
 'e66',
 'agri',
 'itchy_nose',
 'eye_itching',
 'urban1',
 'z84.89',
 'HIV',
 'cough_blood',
 'IV_drugs',
 'f10.129',
 'ca_blockers',
 'I30',
 'fatigue',
 'high_bp',
 'menarche_12',
 'ulcers',
 'red_eye',
 'vag_discharge',
 'nausea',
 'severe_allergy',
 'contact_allergy',
 'wheezing_ex

In [13]:
set(df['train'].columns) - set(STANDARD_ORDER)

set()

In [14]:
set(STANDARD_ORDER) - set(df['test'].columns)

set()

In [15]:
textcolumns = [ 'PATHOLOGY', 'INITIAL_EVIDENCE', 'SEX'] + ds.SYMPTOMS_WITH_STR_ENTRIES

In [16]:
textcolumns.remove('pain_radiate')
textcolumns.remove('lesions_peeling')
textcolumns.remove('lesion_larger_than_1cm')

## Feature analysis

In [None]:
df['train'].drop(columns=textcolumns).corr()

Unnamed: 0,AGE,swollen_nodes,std,sweating,diarrhea,pain,pain_intensity,pain_precise,pain_sudden,fever,...,ca_blockers,vag_discharge,wheezing_inhale,fatigue,menarche_12,breastfed_9,confusion,contact,ebolacase,bruising
AGE,1.000000,0.001783,-0.002762,-0.000528,-0.001544,0.001417,0.002443,0.002019,0.002240,0.000480,...,0.002666,0.002324,-0.000780,0.002945,0.002449,0.003435,0.002188,0.001048,0.003157,0.003059
swollen_nodes,0.001783,1.000000,0.517607,0.318335,0.355019,0.489224,0.417984,0.437258,0.303732,0.376996,...,-0.027331,0.283805,-0.012428,-0.017983,-0.013464,-0.007197,-0.004947,-0.005162,-0.005701,-0.005585
std,-0.002762,0.517607,1.000000,0.456444,0.420518,0.344240,0.307555,0.312695,0.230321,0.474110,...,-0.019265,-0.014409,-0.008760,-0.012676,-0.009491,-0.005073,-0.003487,-0.003638,-0.004018,-0.003937
sweating,-0.000528,0.318335,0.456444,1.000000,0.256506,0.453384,0.405397,0.420040,0.297613,0.600354,...,-0.025335,-0.018948,-0.011520,-0.016670,-0.012481,-0.006672,-0.004586,-0.004785,-0.005284,-0.005177
diarrhea,-0.001544,0.355019,0.420518,0.256506,1.000000,0.470297,0.463526,0.354911,0.418894,0.329407,...,-0.026679,-0.019953,-0.012131,-0.003712,-0.013143,-0.007025,0.071731,0.076499,0.084890,0.083098
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
breastfed_9,0.003435,-0.007197,-0.005073,-0.006672,-0.007025,0.080242,0.068603,0.118476,0.053360,-0.008897,...,-0.004476,-0.003348,-0.002035,0.239962,0.363858,1.000000,-0.000810,-0.000845,-0.000934,-0.000915
confusion,0.002188,-0.004947,-0.003487,-0.004586,0.071731,-0.010097,-0.009216,-0.008857,-0.008437,0.044624,...,-0.003077,-0.002301,-0.001399,0.115694,-0.001516,-0.000810,1.000000,0.695363,0.791009,0.760269
contact,0.001048,-0.005162,-0.003638,-0.004785,0.076499,-0.010535,-0.009616,-0.009241,-0.008802,0.043903,...,-0.003210,-0.002401,-0.001460,0.120125,-0.001581,-0.000845,0.695363,1.000000,0.846542,0.826418
ebolacase,0.003157,-0.005701,-0.004018,-0.005284,0.084890,-0.011634,-0.010620,-0.010205,-0.009721,0.048194,...,-0.003545,-0.002652,-0.001612,0.136040,-0.001747,-0.000934,0.791009,0.846542,1.000000,0.925309


In [None]:
!pip install ydata_profiling

Collecting ydata_profiling
  Downloading ydata_profiling-4.8.3-py2.py3-none-any.whl (359 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/359.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.6/359.5 kB[0m [31m3.4 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m358.4/359.5 kB[0m [31m5.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m359.5/359.5 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Collecting visions[type_image_path]<0.7.7,>=0.7.5 (from ydata_profiling)
  Downloading visions-0.7.6-py3-none-any.whl (104 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.8/104.8 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Collecting htmlmin==0.1.12 (from ydata_profiling)
  Downloading htmlmin-0.1.12.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Col

In [None]:
from ydata_profiling import ProfileReport

In [None]:
report = ProfileReport(df['train'], title='Pandas Profiling Report', minimal=True)

In [None]:
report

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [None]:
import pandas as pd

dummies = pd.get_dummies(df['train']['PATHOLOGY'])

In [None]:
dummies

Unnamed: 0,Allergic sinusitis,Anaphylaxis,Chagas,Ebola,HIV (initial infection),Influenza,Localized edema,SLE,Sarcoidosis,Tuberculosis,Whooping cough
1,False,False,False,False,True,False,False,False,False,False,False
10,True,False,False,False,False,False,False,False,False,False,False
13,False,True,False,False,False,False,False,False,False,False,False
18,False,False,False,False,False,False,False,False,False,True,False
19,False,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...
1023490,False,False,False,False,True,False,False,False,False,False,False
1023491,False,False,False,False,True,False,False,False,False,False,False
1023492,False,False,False,False,True,False,False,False,False,False,False
1023493,False,False,False,False,True,False,False,False,False,False,False


In [None]:
dummies2 = pd.get_dummies(df['train']['INITIAL_EVIDENCE'])

In [None]:
dummies2

Unnamed: 0,anorexia,bruising,chills,confusion,contact_allergy,convulsion,cough,cough_blood,coughing_fits,diarrhea,...,stridor,sweating,swelling,swollen_nodes,ulcers,vag_discharge,vomiting_cough,weight_loss,wheezing_exhale,wheezing_inhale
1,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
10,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
13,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
18,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
19,False,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1023490,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1023491,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1023492,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1023493,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False


## Models

In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [18]:
def getXy(dataframe):
    temp = dataframe.drop(dataframe.loc[dataframe['PATHOLOGY'] == 'Ebola'].index)
    return temp.drop(columns=['PATHOLOGY']), temp['PATHOLOGY'].copy()

In [19]:
X_train, y_train = getXy(df['train'])

X_validate, y_validate = getXy(df['validate'])

In [20]:
y_validate

2                  Influenza
6         Allergic sinusitis
12                       SLE
15                 Influenza
19                 Influenza
                 ...        
132421          Tuberculosis
132423             Influenza
132425             Influenza
132433           Sarcoidosis
132441             Influenza
Name: PATHOLOGY, Length: 25309, dtype: object

### Preprocessing and Utilities

In [21]:
categories = [
    [
        'eye_itching', 'fever', 'chills', 'lesions', 'new_fatigue', 'diarrhea', 'cough', 'fatigue_ext', 'bruising', 'dizziness', 'runny_nose', 'coughing_fits', 'convulsion', 'confusion', 'pain', 'weight_loss', 'wheezing_exhale', 'lost_consciousness', 'red_eye', 'fatigue', 'ulcers', 'nausea', 'cough_blood', 'short_breath', 'sore_throat', 'sweating', 'muscle_pain', 'vag_discharge', 'contact_allergy', 'swelling', 'vomiting_cough', 'anorexia', 'lost_appetite', 'wheezing_inhale', 'swollen_nodes', 'stridor', 'gained_weight', 'itchy_nose'
    ],
    ['M', 'F'],
    [
        'Europe', 'South America', 'South Africa', 'Central America', 'West Africa', 'North America', 'Asia', 'N'
    ],
    ['sensitive', 'burning', 'sharp', 'a pulse', 'a knife stroke', 'NA', 'a cramp', 'heavy', 'tugging', 'exhausting', 'tedious'],
    ['pink', 'red', 'pale', 'NA'],
    [
        'temple(R)', 'side of the neck(R)', 'shoulder(R)', 'knee(R)', 'iliac fossa(L)', 'cervical spine', 'side of the neck(L)', 'occiput', 'toe (3)(R)', 'dorsal aspect of the foot(L)', 'dorsal aspect of the foot(R)', 'hip(R)', 'sole(L)', 'finger (index)(L)', 'calf(R)', 'thigh(R)', 'flank(L)', 'finger (middle)(R)', 'toe (1)(R)', 'toe (1)(L)', 'top of the head', 'dorsal aspect of the wrist(L)', 'pharynx', 'nowhere', 'hypochondrium(R)', 'palmar face of the wrist(R)', 'shoulder(L)', 'palmar face of the wrist(L)', 'thumb(R)', 'toe (2)(L)', 'toe (3)(L)', 'thigh(L)', 'knee(L)', 'little toe (4)(R)', 'forehead', 'hypochondrium(L)', 'temple(L)', 'pubis', 'toe (2)(R)', 'back of the neck', 'iliac fossa(R)', 'hip(L)', 'dorsal aspect of the wrist(R)', 'calf(L)', 'belly', 'finger (index)(R)', 'epigastric', 'sole(R)', 'flank(R)', 'back of head', 'finger (middle)(L)'
    ],
    [
        'nose', 'side of the neck(R)', 'palace', 'shoulder(R)', 'thyroid cartilage', 'cervical spine', 'side of the neck(L)', 'labia minora(L)', 'internal cheek(R)', 'thigh(R)', 'nowhere', 'internal cheek(L)', 'ankle(R)', 'shoulder(L)', 'penis', 'vagina', 'lumbar spine', 'ankle(L)', 'labia minora(R)', 'thigh(L)', 'under the tongue', 'forehead', 'upper lip(R)', 'cheek(R)', 'bottom lip(R)', 'back of the neck', 'scrotum', 'cheek(L)', 'epigastric', 'thoracic spine'
    ],
    [
        'nose', 'toe (1)(R)', 'toe (1)(L)', 'thigh(L)', 'calf(L)', 'cheek(L)', 'sole(R)', 'nowhere', 'dorsal aspect of the foot(L)', 'dorsal aspect of the foot(R)', 'sole(L)', 'forehead', 'calf(R)', 'thigh(R)', 'cheek(R)'
    ]

]

In [22]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import precision_recall_fscore_support as prfs
from sklearn.metrics import recall_score
from sklearn.metrics import make_scorer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report as cr
from sklearn.feature_selection import SelectFromModel


SEED = 42
BETA = 2

categorical_features = textcolumns.copy()
numerical_features = [column for column in STANDARD_ORDER if column not in categorical_features]
categorical_features.remove('PATHOLOGY')


In [23]:
df['test'][numerical_features].iloc[22].values

array([25,  1,  0,  0,  0,  0,  1,  5,  7,  2,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])

In [30]:
def custom_pipeline(classifier):
    features_preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('cat', OneHotEncoder(categories=categories), categorical_features)
        ],
        remainder='passthrough'
    )
    if classifier == None:
        return make_pipeline(features_preprocessor)
    return make_pipeline(features_preprocessor, classifier)

In [88]:
def score(model, X, y):
    y_pred = model.predict(X)
    #return prfs(y, y_pred, beta=BETA)
    classes = model.classes_
    scores = recall_score(y, y_pred, labels=classes, average=None)
    ret = {}
    for i in range(len(classes)):
        ret[classes[i]] = scores[i]
    return pd.DataFrame(ret, index=['recall']).transpose()

In [125]:
def print_score(model):
    print(cr(y_validate, model.predict(X_validate)))

In [126]:
from PIL import Image, ImageDraw, ImageFont

def export_image(text, filename):
    W, H = (900,450)
    im = Image.new("RGBA",(W,H),"white")

    draw = ImageDraw.Draw(im)

    font = ImageFont.truetype(font=DRIVE + 'FreeMono.ttf', size=20)
    draw.text(xy=(50,50), text=text, fill='black', font=font)

    # Save Image
    im.save(filename, "PNG")

In [217]:
def fit_and_score(classifier):
    model = custom_pipeline(classifier)
    model.fit(X_train, y_train)
    text = str(classifier) + '\n\n' + cr(y_validate, model.predict(X_validate))
    print(text)
    export_image(text, str(classifier) + '.png')
    return model

### Tensorflow with NumPy arrays

In [47]:
ct = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(categories=categories), categorical_features)
    ],
    remainder='passthrough'
)

In [48]:
X_train_transformed = ct.fit_transform(X_train)
X_validate_transformed = ct.transform(X_validate)

In [52]:
features_transformed = ct.get_feature_names_out()

In [61]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

y_train_transformed = le.fit_transform(y_train)
y_validate_transformed = le.transform(y_validate)

In [65]:
ohe = OneHotEncoder()

y_train_transformed = ohe.fit_transform(y_train_transformed.reshape(-1, 1)).toarray()
y_validate_transformed = ohe.transform(y_validate_transformed.reshape(-1, 1)).toarray()

In [70]:
y_train_transformed.shape

(202290, 10)

In [56]:
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras

In [78]:
from keras.losses import CategoricalCrossentropy
from keras.metrics import CategoricalAccuracy

model = keras.Sequential([
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(10, activation='softmax')
])

model.compile(loss=CategoricalCrossentropy(), optimizer='adam', metrics=[CategoricalAccuracy()])

In [80]:
model.fit(
    x = X_train_transformed,
    y = y_train_transformed,
    validation_data = (X_validate_transformed, y_validate_transformed),
    epochs=2,
    batch_size=64
    )

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x79ba1d9692a0>

Using Keras functional API:

In [82]:
inputs = keras.Input(shape=(X_train_transformed.shape[1],))
x = keras.layers.Dense(64, activation='relu')(inputs)
x = keras.layers.Dense(64, activation='relu')(x)
outputs = keras.layers.Dense(10, activation='softmax')(x)

model = keras.Model(inputs=inputs, outputs=outputs)

In [89]:
from keras.optimizers import RMSprop

model.compile(
    loss=CategoricalCrossentropy(),
    optimizer=RMSprop(),
    metrics=[CategoricalAccuracy()],
)

In [90]:
history = model.fit(
    x = X_train_transformed,
    y = y_train_transformed,
    validation_data = (X_validate_transformed, y_validate_transformed),
    epochs=2,
    batch_size=64
    )

Epoch 1/2
Epoch 2/2


### Logistic Regression

In [192]:
from sklearn.linear_model import LogisticRegression

solver = 'saga'
max_iter = 10

lr = LogisticRegression(max_iter=max_iter, solver=solver)

fit_and_score(lr)



LogisticRegression(max_iter=10, solver='saga')

                         precision    recall  f1-score   support

     Allergic sinusitis       0.77      0.90      0.83      2136
            Anaphylaxis       0.91      0.41      0.57      3754
                 Chagas       1.00      0.22      0.37      1124
HIV (initial infection)       0.40      0.58      0.47      3852
              Influenza       0.66      0.34      0.45      3590
        Localized edema       0.52      1.00      0.68      3694
                    SLE       0.95      0.32      0.48      1579
            Sarcoidosis       0.60      0.71      0.65      3028
           Tuberculosis       0.60      0.48      0.53      2007
         Whooping cough       1.00      1.00      1.00       545

               accuracy                           0.59     25309
              macro avg       0.74      0.60      0.60     25309
           weighted avg       0.67      0.59      0.58     25309



### Stochastic Gradient Descent Classifier

In [195]:
from sklearn.linear_model import SGDClassifier

loss = 'log_loss'
penalty = 'l1'
alpha = 0.01

sgd = SGDClassifier(
    #loss=loss,
    penalty=penalty,
    #alpha=alpha,
    random_state = SEED,
    shuffle=True
    )

fit_and_score(sgd)

SGDClassifier(penalty='l1', random_state=42)

                         precision    recall  f1-score   support

     Allergic sinusitis       0.77      0.90      0.83      2136
            Anaphylaxis       0.90      0.42      0.58      3754
                 Chagas       0.82      0.23      0.36      1124
HIV (initial infection)       0.40      0.38      0.39      3852
              Influenza       0.42      0.51      0.46      3590
        Localized edema       0.53      0.96      0.68      3694
                    SLE       0.40      0.48      0.44      1579
            Sarcoidosis       0.74      0.57      0.65      3028
           Tuberculosis       0.65      0.41      0.50      2007
         Whooping cough       1.00      1.00      1.00       545

               accuracy                           0.57     25309
              macro avg       0.66      0.59      0.59     25309
           weighted avg       0.62      0.57      0.56     25309



### K Nearest Neighbors

In [228]:
from sklearn.neighbors import KNeighborsClassifier

neighbors = 5

knn = KNeighborsClassifier(
    n_neighbors = neighbors
    )
fit_and_score(knn)

KNeighborsClassifier()

                         precision    recall  f1-score   support

     Allergic sinusitis       0.44      0.81      0.57      2136
            Anaphylaxis       0.46      0.50      0.48      3754
                 Chagas       0.37      0.24      0.29      1124
HIV (initial infection)       0.32      0.33      0.33      3852
              Influenza       0.42      0.32      0.36      3590
        Localized edema       0.55      0.64      0.59      3694
                    SLE       0.50      0.31      0.38      1579
            Sarcoidosis       0.56      0.49      0.53      3028
           Tuberculosis       0.50      0.35      0.42      2007
         Whooping cough       0.89      0.80      0.84       545

               accuracy                           0.47     25309
              macro avg       0.50      0.48      0.48     25309
           weighted avg       0.47      0.47      0.46     25309



### Multinomial Naive Bayes

In [196]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB(alpha=0.1)

fit_and_score(nb)

ValueError: Negative values in data passed to MultinomialNB (input X)

### Decision Tree Classifier

In [197]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state = SEED)

fit_and_score(dt)

DecisionTreeClassifier(random_state=42)

                         precision    recall  f1-score   support

     Allergic sinusitis       0.74      0.88      0.81      2136
            Anaphylaxis       0.65      0.52      0.58      3754
                 Chagas       0.56      0.26      0.35      1124
HIV (initial infection)       0.42      0.42      0.42      3852
              Influenza       0.55      0.36      0.43      3590
        Localized edema       0.54      0.93      0.68      3694
                    SLE       0.62      0.39      0.48      1579
            Sarcoidosis       0.63      0.63      0.63      3028
           Tuberculosis       0.53      0.54      0.53      2007
         Whooping cough       1.00      1.00      1.00       545

               accuracy                           0.58     25309
              macro avg       0.62      0.59      0.59     25309
           weighted avg       0.58      0.58      0.56     25309



### Ensemble methods

In [199]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state = SEED, n_estimators=10)

fit_and_score(rf)

RandomForestClassifier(n_estimators=10, random_state=42)

                         precision    recall  f1-score   support

     Allergic sinusitis       0.74      0.90      0.81      2136
            Anaphylaxis       0.66      0.51      0.58      3754
                 Chagas       0.68      0.23      0.35      1124
HIV (initial infection)       0.42      0.42      0.42      3852
              Influenza       0.57      0.36      0.44      3590
        Localized edema       0.53      0.95      0.68      3694
                    SLE       0.61      0.38      0.47      1579
            Sarcoidosis       0.64      0.63      0.63      3028
           Tuberculosis       0.52      0.55      0.53      2007
         Whooping cough       1.00      1.00      1.00       545

               accuracy                           0.58     25309
              macro avg       0.64      0.59      0.59     25309
           weighted avg       0.59      0.58      0.56     25309



In [247]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(random_state = SEED, n_estimators=10)

fit_and_score(gb)

GradientBoostingClassifier(n_estimators=10, random_state=42)

                         precision    recall  f1-score   support

     Allergic sinusitis       0.86      0.83      0.85      2136
            Anaphylaxis       0.59      0.46      0.52      3754
                 Chagas       1.00      0.22      0.37      1124
HIV (initial infection)       0.39      0.60      0.48      3852
              Influenza       0.71      0.29      0.41      3590
        Localized edema       0.52      1.00      0.68      3694
                    SLE       1.00      0.32      0.48      1579
            Sarcoidosis       0.78      0.56      0.65      3028
           Tuberculosis       0.57      0.66      0.61      2007
         Whooping cough       1.00      1.00      1.00       545

               accuracy                           0.59     25309
              macro avg       0.74      0.60      0.60     25309
           weighted avg       0.66      0.59      0.57     25309



In [201]:
from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier(random_state = SEED, n_estimators=10, estimator=RandomForestClassifier(random_state = SEED, n_estimators=10))

fit_and_score(ada)

AdaBoostClassifier(estimator=RandomForestClassifier(n_estimators=10,
                                                    random_state=42),
                   n_estimators=10, random_state=42)

                         precision    recall  f1-score   support

     Allergic sinusitis       0.74      0.90      0.81      2136
            Anaphylaxis       0.69      0.50      0.58      3754
                 Chagas       0.78      0.23      0.36      1124
HIV (initial infection)       0.42      0.42      0.42      3852
              Influenza       0.57      0.36      0.44      3590
        Localized edema       0.53      0.97      0.68      3694
                    SLE       0.65      0.38      0.48      1579
            Sarcoidosis       0.63      0.64      0.63      3028
           Tuberculosis       0.52      0.55      0.54      2007
         Whooping cough       1.00      1.00      1.00       545

               accuracy                           0.58     25309
              macro avg  

In [32]:
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

xgb = XGBClassifier(random_state = SEED, n_estimators=10)

xgb_model = custom_pipeline(xgb)
xgb_model.fit(X_train, le.fit_transform(y_train))
prfs(xgb_model, X_validate, le.fit_transform(y_validate))

(0.6768716420207156, 0.5939997637702272, 0.5767537667895815, None)

In [34]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


In [214]:
from catboost import CatBoostClassifier

cat = CatBoostClassifier(
    random_state = SEED,
    iterations=10,
    learning_rate=0.8
    )

fit_and_score(cat)

0:	learn: 1.7534206	total: 1.46s	remaining: 13.1s
1:	learn: 1.4637232	total: 2.98s	remaining: 11.9s
2:	learn: 1.3375018	total: 4.54s	remaining: 10.6s
3:	learn: 1.3414745	total: 5.5s	remaining: 8.24s
4:	learn: 15.5164142	total: 6.35s	remaining: 6.35s
5:	learn: 52.3325873	total: 7.2s	remaining: 4.8s
6:	learn: 131.4412794	total: 8.1s	remaining: 3.47s
7:	learn: 22.9945767	total: 8.97s	remaining: 2.24s
8:	learn: 75.1067938	total: 9.83s	remaining: 1.09s
9:	learn: 33.0058048	total: 10.7s	remaining: 0us
<catboost.core.CatBoostClassifier object at 0x79bfe3cbc310>

                         precision    recall  f1-score   support

     Allergic sinusitis       0.80      0.87      0.83      2136
            Anaphylaxis       0.82      0.47      0.60      3754
                 Chagas       1.00      0.22      0.36      1124
HIV (initial infection)       0.42      0.39      0.40      3852
              Influenza       0.53      0.43      0.48      3590
        Localized edema       0.52      1.00   

### Support Vector Machine Classifier

In [218]:
from sklearn.svm import LinearSVC

max_iter = 1

C = 1/np.sqrt(len(X_train))

svm = LinearSVC(
    random_state = SEED,
    max_iter=max_iter,
    C=C
    )

svm_model = fit_and_score(svm)



LinearSVC(C=0.0022233753825767564, max_iter=1, random_state=42)

                         precision    recall  f1-score   support

     Allergic sinusitis       0.73      0.92      0.82      2136
            Anaphylaxis       0.51      0.64      0.57      3754
                 Chagas       0.93      0.23      0.37      1124
HIV (initial infection)       0.43      0.40      0.41      3852
              Influenza       0.69      0.25      0.37      3590
        Localized edema       0.53      0.95      0.68      3694
                    SLE       0.88      0.33      0.48      1579
            Sarcoidosis       0.67      0.63      0.65      3028
           Tuberculosis       0.55      0.56      0.55      2007
         Whooping cough       0.99      1.00      1.00       545

               accuracy                           0.58     25309
              macro avg       0.69      0.59      0.59     25309
           weighted avg       0.62      0.58      0.56     25309



Warning: SVM takes a long time!

In [None]:
# from sklearn.svm import SVC

# svm = SVC(kernel='linear', random_state = SEED, decision_function_shape='ovo')

# svm_model = make_pipeline(ct, ss, svm)
# svm_model.fit(X_train, y_train)
# svm_model.score(X_validate, y_validate)

### Combinations

In [244]:
pipe = make_pipeline(
    ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('cat', OneHotEncoder(categories=categories), categorical_features)
        ],
        remainder='passthrough'
    ),
    SelectFromModel(
        LinearSVC(
            random_state = SEED,
            max_iter = 10,
            C = C
        ),
        threshold = 'median'
    ),
    GradientBoostingClassifier(random_state = SEED, n_estimators=10)
)


In [245]:
pipe.fit(X_train, y_train)



In [246]:
print(cr(y_validate, pipe.predict(X_validate)))

                         precision    recall  f1-score   support

     Allergic sinusitis       0.86      0.83      0.85      2136
            Anaphylaxis       0.59      0.46      0.52      3754
                 Chagas       1.00      0.22      0.37      1124
HIV (initial infection)       0.39      0.63      0.48      3852
              Influenza       0.74      0.25      0.37      3590
        Localized edema       0.52      1.00      0.68      3694
                    SLE       1.00      0.32      0.48      1579
            Sarcoidosis       0.78      0.56      0.65      3028
           Tuberculosis       0.57      0.66      0.61      2007
         Whooping cough       1.00      1.00      1.00       545

               accuracy                           0.59     25309
              macro avg       0.74      0.59      0.60     25309
           weighted avg       0.67      0.59      0.57     25309



In [248]:
pipe2 = make_pipeline(
    ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('cat', OneHotEncoder(categories=categories), categorical_features)
        ],
        remainder='passthrough'
    ),
    SelectFromModel(
        GradientBoostingClassifier(random_state = SEED, n_estimators=10),
        threshold = 'median'
    ),
    GradientBoostingClassifier(random_state = SEED, n_estimators=10)
)


In [249]:
pipe2.fit(X_train, y_train)

In [250]:
print(cr(y_validate, pipe2.predict(X_validate)))

                         precision    recall  f1-score   support

     Allergic sinusitis       0.86      0.83      0.85      2136
            Anaphylaxis       0.59      0.46      0.52      3754
                 Chagas       1.00      0.22      0.37      1124
HIV (initial infection)       0.39      0.60      0.48      3852
              Influenza       0.71      0.29      0.41      3590
        Localized edema       0.52      1.00      0.68      3694
                    SLE       1.00      0.32      0.48      1579
            Sarcoidosis       0.78      0.56      0.65      3028
           Tuberculosis       0.57      0.66      0.61      2007
         Whooping cough       1.00      1.00      1.00       545

               accuracy                           0.59     25309
              macro avg       0.74      0.60      0.60     25309
           weighted avg       0.66      0.59      0.57     25309



### Neural networks

In [251]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [291]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  df = dataframe.copy()
  labels = df.pop('PATHOLOGY')
  df = {key: value.values[:,tf.newaxis] for key, value in dataframe.items()}
  ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  ds = ds.prefetch(batch_size)
  return ds

In [292]:
batch_size = 5
train_ds = df_to_dataset(df['train'], batch_size=batch_size)

In [301]:
asdf = tf.concat(df['train'][numerical_features], 1)

In [304]:
layer = tf.keras.layers.Normalization(input_shape=(83,), axis=1)

In [305]:
layer.adapt(asdf)

In [307]:
layer(asdf)

<tf.Tensor: shape=(203008, 83), dtype=float32, numpy=
array([[-0.716, -0.21 , -0.148, ..., -0.025, -0.027, -0.027],
       [-1.272, -0.21 , -0.148, ..., -0.025, -0.027, -0.027],
       [ 0.481, -0.21 , -0.148, ..., -0.025, -0.027, -0.027],
       ...,
       [ 1.208, -0.21 , -0.148, ..., -0.025, -0.027, -0.027],
       [ 0.695, -0.21 , -0.148, ..., -0.025, -0.027, -0.027],
       [-0.674, -0.21 , -0.148, ..., -0.025, -0.027, -0.027]],
      dtype=float32)>

In [308]:
asdf

<tf.Tensor: shape=(203008, 83), dtype=int64, numpy=
array([[21,  0,  0, ...,  0,  0,  0],
       [ 8,  0,  0, ...,  0,  0,  0],
       [49,  0,  0, ...,  0,  0,  0],
       ...,
       [66,  0,  0, ...,  0,  0,  0],
       [54,  0,  0, ...,  0,  0,  0],
       [22,  0,  0, ...,  0,  0,  0]])>

In [313]:
categorical = tf.keras.layers.CategoryEncoding(
    #num_tokens = tf.keras.layers.StringLookup().vocabulary_size(),
    output_mode = 'one_hot'
    )

ValueError: num_tokens must be set to use this layer. If the number of tokens is not known beforehand, use the IntegerLookup layer instead.

In [310]:
pqrs = tf.concat(df['train'][categorical_features], 1)

In [312]:
pqrs

<tf.Tensor: shape=(203008, 8), dtype=string, numpy=
array([[b'sweating', b'M', b'N', ..., b'nowhere', b'nowhere', b'nowhere'],
       [b'itchy_nose', b'M', b'North America', ..., b'sole(L)',
        b'nowhere', b'dorsal aspect of the foot(R)'],
       [b'lost_consciousness', b'F', b'N', ..., b'nowhere', b'nowhere',
        b'nowhere'],
       ...,
       [b'pain', b'F', b'N', ..., b'nowhere', b'nowhere', b'nowhere'],
       [b'sweating', b'M', b'N', ..., b'nowhere', b'nowhere', b'nowhere'],
       [b'pain', b'M', b'N', ..., b'nowhere', b'nowhere', b'nowhere']],
      dtype=object)>

## Using a different representation of the dataset

In [368]:
def get_keys_with_value_one(dictionary):
  """
  This function returns a list of keys in a dictionary whose value is 1.

  Args:
    dictionary: The dictionary to search.

  Returns:
    A list of keys in the dictionary whose value is 1.
  """

  keys_with_value_one = []
  for key, value in dictionary.items():
    if value == 1:
      keys_with_value_one.append(key)
  return keys_with_value_one

In [369]:
strictly_numerical_features = ['AGE', 'pain_intensity', 'pain_precise', 'pain_sudden', 'lesion_pain_swollen', 'lesion_pain_intense', 'itching_severity']

features_with_1s = [feature for feature in numerical_features if feature not in strictly_numerical_features]

In [370]:
features_with_1s

['swollen_nodes',
 'std',
 'lesion_larger_than_1cm',
 'sweating',
 'diarrhea',
 'pain',
 'fever',
 'unprotected_sex',
 'lesions',
 'nausea',
 'weight_loss',
 'sex_hiv',
 'fam_allergies',
 'fam_j45',
 'j45',
 'itchy_nose',
 'eye_itching',
 'runny_nose',
 'urban1',
 'severe_allergy',
 'contact_allergy',
 'short_breath',
 'swelling',
 'lost_consciousness',
 'stridor',
 'z84.89',
 'HIV',
 'cortico',
 'IV_drugs',
 'e10_e11',
 'f10.129',
 'cough',
 'cough_blood',
 'v85.0',
 'I30',
 'f17.210',
 'high_bp',
 'ulcers',
 'anorexia',
 'new_fatigue',
 'nsaids',
 'i50',
 'i80',
 'lymph_surg',
 'synd_nephro',
 'convulsion',
 'e66',
 'red_eye',
 'agri',
 'gained_weight',
 'k74',
 'patho_endo',
 'dizziness',
 'wheezing_exhale',
 'fatigue_ext',
 'sore_throat',
 'muscle_pain',
 'lost_appetite',
 'heart_valves',
 'sahs',
 'cont_pertussis',
 'vomiting_cough',
 'coughing_fits',
 'vaccination',
 'chills',
 'z92.25',
 'ca_blockers',
 'vag_discharge',
 'wheezing_inhale',
 'fatigue',
 'menarche_12',
 'breastfed

In [386]:
from collections import defaultdict

def attach_evidences(dataframe):
    df_new = dataframe
    dfq = defaultdict(set)

    for (key, val) in get_keys_with_value_one(df_new.stack()):
        dfq[key].add(val)

    dfq = dict(dfq)
    df_new['EVIDENCES'] = dfq
    return df_new.drop(columns=features_with_1s)

In [387]:
df_collapsed_train = attach_evidences(df['train'])
df_collapsed_validate = attach_evidences(df['validate'])

### Sklearn

In [346]:
from sklearn.preprocessing import MultiLabelBinarizer

def multihot_pipeline(classifier):
    features_preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), strictly_numerical_features),
            ('cat', OneHotEncoder(categories=categories), categorical_features),
            ('evidences', MultiLabelBinarizer(classes=features_with_1s), 'EVIDENCES')
        ],
        remainder='passthrough'
    )
    return make_pipeline(features_preprocessor, classifier)

In [347]:
def multihot_fit_and_score(classifier):
    model = multihot_pipeline(classifier)
    model.fit(X_train, y_train)
    text = str(classifier) + '\n\n' + cr(y_validate, model.predict(X_validate))
    print(text)
    export_image(text, str(classifier) + '.png')
    return model

In [334]:
X_train

Unnamed: 0,AGE,SEX,PATHOLOGY,INITIAL_EVIDENCE,pain_char,pain_somewhere,pain_intensity,pain_precise,pain_sudden,lesion_color,lesion_pain_swollen,lesion_location,lesion_pain_intense,itching_severity,trav1,swelling_location,EVIDENCES
1,21,M,HIV (initial infection),sweating,,nowhere,0,0,0,,0,nowhere,0,0,N,nowhere,"[fam_allergies, fam_j45, j45, itchy_nose, eye_..."
10,8,M,Allergic sinusitis,itchy_nose,heavy,sole(L),6,8,8,,0,nowhere,0,0,North America,dorsal aspect of the foot(R),"[pain, swelling, cortico, nsaids, i50, i80, ly..."
13,49,F,Anaphylaxis,lost_consciousness,,nowhere,0,0,0,,0,nowhere,0,0,N,nowhere,"[fam_allergies, fam_j45, j45, eye_itching, run..."
18,69,M,Tuberculosis,cough,heavy,occiput,8,5,0,pink,0,forehead,2,2,N,nowhere,"[pain, fever, lesions, lesion_larger_than_1cm,..."
19,30,F,Tuberculosis,cough_blood,sharp,epigastric,9,1,3,pink,5,ankle(R),2,8,N,nose,"[pain, pain_precise, lesions, lesion_larger_th..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1023490,30,M,HIV (initial infection),nausea,,nowhere,0,0,0,,0,nowhere,0,0,N,nowhere,
1023491,7,F,HIV (initial infection),nausea,,nowhere,0,0,0,,0,nowhere,0,0,N,nowhere,
1023492,66,F,HIV (initial infection),pain,,nowhere,0,0,0,,0,nowhere,0,0,N,nowhere,
1023493,54,M,HIV (initial infection),sweating,,nowhere,0,0,0,,0,nowhere,0,0,N,nowhere,


In [348]:
from sklearn.linear_model import LogisticRegression

solver = 'saga'
max_iter = 10

lr = LogisticRegression(max_iter=max_iter, solver=solver)

multihot_fit_and_score(lr)

TypeError: MultiLabelBinarizer.fit_transform() takes 2 positional arguments but 3 were given

### Tensorflow

In [389]:
df_collapsed_train.fillna(0, inplace=True)

In [390]:
df_collapsed_train

Unnamed: 0,AGE,SEX,PATHOLOGY,INITIAL_EVIDENCE,pain_char,pain_somewhere,pain_intensity,pain_precise,pain_sudden,lesion_color,lesion_pain_swollen,lesion_location,lesion_pain_intense,itching_severity,trav1,swelling_location,EVIDENCES
1,21,M,HIV (initial infection),sweating,,nowhere,0,0,0,,0,nowhere,0,0,N,nowhere,"{j45, runny_nose, itchy_nose, eye_itching, fam..."
10,8,M,Allergic sinusitis,itchy_nose,heavy,sole(L),6,8,8,,0,nowhere,0,0,North America,dorsal aspect of the foot(R),"{nsaids, lymph_surg, i80, swelling, synd_nephr..."
13,49,F,Anaphylaxis,lost_consciousness,,nowhere,0,0,0,,0,nowhere,0,0,N,nowhere,"{j45, runny_nose, eye_itching, fam_allergies, ..."
18,69,M,Tuberculosis,cough,heavy,occiput,8,5,0,pink,0,forehead,2,2,N,nowhere,"{runny_nose, cough, f17.210, muscle_pain, lost..."
19,30,F,Tuberculosis,cough_blood,sharp,epigastric,9,1,3,pink,5,ankle(R),2,8,N,nose,"{wheezing_exhale, lesion_larger_than_1cm, cont..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1023490,30,M,HIV (initial infection),nausea,,nowhere,0,0,0,,0,nowhere,0,0,N,nowhere,0
1023491,7,F,HIV (initial infection),nausea,,nowhere,0,0,0,,0,nowhere,0,0,N,nowhere,0
1023492,66,F,HIV (initial infection),pain,,nowhere,0,0,0,,0,nowhere,0,0,N,nowhere,0
1023493,54,M,HIV (initial infection),sweating,,nowhere,0,0,0,,0,nowhere,0,0,N,nowhere,0


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [446]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  df = dataframe.copy()
  labels = df.pop('PATHOLOGY')
  df = {key: value.values[:,tf.newaxis] for key, value in dataframe.items()}
  ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  ds = ds.prefetch(batch_size)
  return ds

In [391]:
batch_size = 25
train_ds = df_to_dataset(df_collapsed_train, batch_size=batch_size)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type set).

In [449]:
def get_normalization_layer(name, dataframe):
  # Create a Normalization layer for the feature.
  normalizer = tf.keras.layers.Normalization(axis=None)

  # Prepare a Dataset that only yields the feature.
  feature_ds = tf.data.Dataset.from_tensor_slices(dataframe[name])

  # Learn the statistics of the data.
  normalizer.adapt(feature_ds)

  return normalizer

In [405]:
strictly_numerical_features.remove('AGE')

In [450]:
all_inputs = []
encoded_features = []

# Numerical features.
for header in strictly_numerical_features:
  numeric_col = tf.keras.Input(shape=(1,), name=header)
  normalization_layer = get_normalization_layer(header, df_collapsed_train)
  encoded_numeric_col = normalization_layer(numeric_col)
  all_inputs.append(numeric_col)
  encoded_features.append(encoded_numeric_col)

In [1]:
all_inputs

NameError: name 'all_inputs' is not defined

In [351]:
asdf = tf.concat(df['train'][strictly_numerical_features], 1)

normal = tf.keras.layers.Normalization(input_shape=(len(strictly_numerical_features),), axis=1)

normal.adapt(asdf)

In [352]:
asdf

<tf.Tensor: shape=(203008, 7), dtype=int64, numpy=
array([[21,  0,  0, ...,  0,  0,  0],
       [ 8,  6,  8, ...,  0,  0,  0],
       [49,  0,  0, ...,  0,  0,  0],
       ...,
       [66,  0,  0, ...,  0,  0,  0],
       [54,  0,  0, ...,  0,  0,  0],
       [22,  0,  0, ...,  0,  0,  0]])>

In [434]:
categorical_features

['INITIAL_EVIDENCE',
 'SEX',
 'trav1',
 'pain_char',
 'lesion_color',
 'pain_somewhere',
 'lesion_location',
 'swelling_location']

In [435]:
def get_category_encoding_layer(name, dataframe, dtype, max_tokens=None):
  # Create a layer that turns strings into integer indices.
  if dtype == 'string':
    index = tf.keras.layers.StringLookup(max_tokens=max_tokens)
  # Otherwise, create a layer that turns integer values into integer indices.
  else:
    index = tf.keras.layers.IntegerLookup(max_tokens=max_tokens)

  # Prepare a `tf.data.Dataset` that only yields the feature.
  feature_ds = dataframe[name].copy()

  # Learn the set of possible values and assign them a fixed integer index.
  index.adapt(feature_ds)

  # Encode the integer indices.
  encoder = tf.keras.layers.CategoryEncoding(num_tokens=index.vocabulary_size())

  # Apply multi-hot encoding to the indices. The lambda function captures the
  # layer, so you can use them, or include them in the Keras Functional model later.
  return lambda feature: encoder(index(feature))

In [436]:
for feature in categorical_features:
    categorical_col = tf.keras.Input(shape=(1,), name=feature, dtype='string')
    encoding_layer = get_category_encoding_layer(
        name=feature,
        dataframe=df_collapsed_train,
        dtype='string',
        #max_tokens=5
        )
    encoded_categorical_col = encoding_layer(categorical_col)
    all_inputs.append(categorical_col)
    encoded_features.append(encoded_categorical_col)

In [437]:
all_inputs

[<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'pain_intensity')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'pain_precise')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'pain_sudden')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'lesion_pain_swollen')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'lesion_pain_intense')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'itching_severity')>,
 <KerasTensor: shape=(None, 1) dtype=string (created by layer 'INITIAL_EVIDENCE')>,
 <KerasTensor: shape=(None, 1) dtype=string (created by layer 'SEX')>,
 <KerasTensor: shape=(None, 1) dtype=string (created by layer 'trav1')>,
 <KerasTensor: shape=(None, 1) dtype=string (created by layer 'pain_char')>,
 <KerasTensor: shape=(None, 1) dtype=string (created by layer 'lesion_color')>,
 <KerasTensor: shape=(None, 1) dtype=string (created by layer 'pain_somewhere')>,
 <KerasTensor: shape=(None, 

In [438]:
age_col = tf.keras.Input(shape=(1,), name='AGE', dtype='int32')
encoding_layer = get_category_encoding_layer(
    name='AGE',
    dataframe=df_collapsed_train,
    dtype='int',
    #max_tokens=5
    )
encoded_age_col = encoding_layer(age_col)
all_inputs.append(age_col)
encoded_features.append(encoded_age_col)

In [439]:
evi_col = tf.keras.Input(shape=(1,), name='EVIDENCES', dtype='int32')
evi_index = tf.keras.layers.StringLookup(vocabulary=features_with_1s, output_mode='multi_hot')
evi_encoder = tf.keras.layers.CategoryEncoding(num_tokens=evi_index.vocabulary_size())
evi_layer = lambda x: evi_encoder(evi_index(x))

encoded_evi_col = evi_layer(evi_col)
all_inputs.append(evi_col)
encoded_features.append(encoded_evi_col)

In [440]:
all_inputs

[<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'pain_intensity')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'pain_precise')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'pain_sudden')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'lesion_pain_swollen')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'lesion_pain_intense')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'itching_severity')>,
 <KerasTensor: shape=(None, 1) dtype=string (created by layer 'INITIAL_EVIDENCE')>,
 <KerasTensor: shape=(None, 1) dtype=string (created by layer 'SEX')>,
 <KerasTensor: shape=(None, 1) dtype=string (created by layer 'trav1')>,
 <KerasTensor: shape=(None, 1) dtype=string (created by layer 'pain_char')>,
 <KerasTensor: shape=(None, 1) dtype=string (created by layer 'lesion_color')>,
 <KerasTensor: shape=(None, 1) dtype=string (created by layer 'pain_somewhere')>,
 <KerasTensor: shape=(None, 

In [441]:
all_features = tf.keras.layers.concatenate(encoded_features)
x = tf.keras.layers.Dense(32, activation="relu")(all_features)
x = tf.keras.layers.Dropout(0.5)(x)
output = tf.keras.layers.Dense(1)(x)

model = tf.keras.Model(all_inputs, output)