#Imports

In [1]:
import pandas as pd
import numpy as np
import joblib
import re
from google.colab import drive
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from time import time
import warnings
warnings.filterwarnings('ignore')

# Loading Dataset

We're using a prepared dataset in csv format, already splitted into train, test and validation datasets. <br/>
This is done so we are able to compare with other models based on LM and LLM. <br/>
In the case of classic ML, we're training with train and val, and use test to calculate the score

In [2]:
train = pd.read_csv("med_all_bio_train.csv")
val = pd.read_csv('med_all_bio_val.csv')
test = pd.read_csv('med_all_bio_test.csv')

In [3]:
# Verify number of rows and columns in each file

print(f'Train:\t{train.shape}\nVal:\t{val.shape}\nTest:\t{test.shape}')

Train:	(64796, 2)
Val:	(15195, 2)
Test:	(1005, 2)


# Basic EDA

In [12]:
# Naive inspection

train.sample(10, random_state=0)

Unnamed: 0,medicamento,substancia
60636,PREGABALINA 150 mg,PREGABALINA
43620,FLUIMUCIL D 600 MG ENV 5 G,ACETILCISTEINA
31121,fenitOÍNA 250MG (50MG/ML) AMP 5ML,FENITOINA
9427,BROMOPRIDA 4MG/ML GTS,BROMOPRIDA
12821,DIFENIDRAMINA 50MG/ML EV,DIFENIDRAMINA
58895,CLOR. DE MIDAZOLAM 5MG/ML SOL. INJETAVEL AMP 2ML,MIDAZOLAM
19607,CLOPIDOGREL 75 MG COMP,CLOPIDOGREL
20498,NitroPRUSSETO de Sodio (Alta Vigilancia) - F/A...,NITROPRUSSIATO (NITROPRUSSETO)
1351,ISOXSUPRINA 5 MG/ML SOLUCAO INJETAVEL AMPOLA 2ML,ISOXSUPRINA
29252,ZIDOVUDINA 100MG – CAPSULA,ZIDOVUDINA


In [13]:
val.sample(10, random_state=0)

Unnamed: 0,medicamento,substancia
6080,Vodol pó 20 mg/g frasco c/ 30 g,NITRATO DE MICONAZOL (TOPICO)
1972,ACEBROFILINA 25MG/ML XPE PED.,AMBROXOL (ACEBROFILINA)
2607,VITAMINA C 500MG/5ML AMP 5ML AMP,ACIDO ASCORBICO (VITAMINA C)
15120,PROPOVAN 10MG/ML 10ML AMP,PROPOFOL
1553,Expec Xpe 120mL (Não Padrão),CLORIDRATO DE OXOMEMAZINA + IODETO DE POTASSIO...
12300,METILDOPA 250MG,METILDOPA
10772,BACLOFENO 10MG COMPRIMIDO,BACLOFENO
167,VANCOMICINA 500MG - FR - AMP,VANCOMICINA CLORIDRATO
6672,ETOMIDATO 20 MG AMPOLA 10ML,ETOMIDATO
11971,Papaverina 50mg/ml Ampola 2ml,PAPAVERINA


In [15]:
test.sample(10, random_state=0)

Unnamed: 0,medicamento,substancia
710,"Cloreto de potassio 60 mg/mL (0,6 meq/mL) fr 1...",CLORETO DE POTASSIO
666,Ácidos Graxos Essenciais Loção Hidratante Bisn...,ACIDOS GRAXOS ESSENCIAIS (TOPICO)
298,LAMOTRIGINA 100MG - COMPRIMIDO,LAMOTRIGINA
356,"AloPURinol 100mg, comprimido (ORAL)",AloPURinol
571,Água,AGUA PARA INJECAO
1000,(Do paciente) Mesalazina 500 mg supositório,MESALAZINA
27,Synvisc One Seringa 8mg/ml 6ml,HILANO + CLORETO DE SODIO + FOSFATO DISSODICO ...
231,FENILEFRINA 10MG/ML AMP 1ML,FENILEFRINA
306,VASOGARD 100MG 30 CP C,Cilostazol
764,NIMODIPINA 30 MILIGRAMAS (COMPRIMIDO),NiMODipino


In [16]:
# Our train dataset doesn't contain all the classes
print(' # of classes in full dataset: ', pd.concat([train, test, val]).substancia.nunique())
print(' # of classes in train dataset: ', pd.concat([train, val]).substancia.nunique())

 # of classes in full dataset:  2191
 # of classes in train dataset:  2182


#Preprocessing

In [17]:
# Therefore we define a mininum number of examples per class
min_samples_per_class = 10

# Find classes to remove
full = pd.concat([train, test, val])
class_counts = full['substancia'].value_counts()
classes_to_remove = class_counts[class_counts < min_samples_per_class].index

# Filter all dataframes
filtered_dataset = full[~full['substancia'].isin(classes_to_remove)]
train_filtered = train[~train['substancia'].isin(classes_to_remove)]
test_filtered = test[~test['substancia'].isin(classes_to_remove)]
val_filtered = val[~val['substancia'].isin(classes_to_remove)]


In [18]:
# After the filtering, our train dataset contains all classes present in the full dataset, which is good for evaluating
print(' # of classes in filtered dataset: ', filtered_dataset.substancia.nunique())
print(' # of classes in filtered dataset: ', train_filtered.substancia.nunique())




 # of classes in filtered dataset:  1050
 # of classes in filtered dataset:  1050


In [19]:
# As we don't need the val dataset for validation, we'll include it in the training
train_filtered = pd.concat([train_filtered, val_filtered])

#Pipeline and Training

In [20]:
# Separate into features and target
X_train, y_train = train_filtered.medicamento, train_filtered.substancia
X_test, y_test = test_filtered.medicamento, test_filtered.substancia

In [21]:
# Preprocessing function to remove unwanted chars
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[\W\d_]+', '', text) # Only letters
    return text


# Define our pipeline for evaluating different models / parameters
def evaluate_model(classifier, parameters):
    pipeline = Pipeline([
        ('vect', TfidfVectorizer()),
        ('clf', classifier),
    ])

    return pipeline.set_params(**parameters).fit(X_train, y_train).score(X_test, y_test, scoring='f1_macro')

In [22]:
# Define our hyperparameter dict
parameters = {
    'vect__max_features':1000,
    'vect__analyzer':'char',
    'vect__strip_accents':'ascii',
    'vect__ngram_range':(1,3),
    'vect__preprocessor':preprocess_text,
}

In [24]:
evaluate_model(RandomForestClassifier(n_jobs=-1), parameters)

0.9534391534391534