#Imports

In [None]:
import pandas as pd
import numpy as np
import joblib
import re
import os
import gc
from google.colab import drive
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from time import time
import warnings
warnings.filterwarnings('ignore')

# Loading Dataset

We're using a prepared dataset in csv format, already splitted into train, test and validation datasets. <br/>
This is done so we are able to compare with other models based on LM and LLM. <br/>
In the case of classic ML, we're training with train and val, and using test to calculate the score.

In [None]:
train = pd.read_csv("med_all_bio_train.csv")
val = pd.read_csv('med_all_bio_val.csv')
test = pd.read_csv('med_all_bio_test.csv')

In [None]:
# Verify number of rows and columns in each file

print(f'Train:\t{train.shape}\nVal:\t{val.shape}\nTest:\t{test.shape}')

Train:	(64796, 2)
Val:	(15195, 2)
Test:	(1005, 2)


# Basic EDA

In [None]:
# Naive inspection of random rows

train.sample(10)

Unnamed: 0,medicamento,substancia
59308,BEVACizumabe SOLUCAO INJETAVEL IV 25MG/ML 4ML## ®,BEVACIZUMABE
41201,RISPERDAL1MG/ML FR 30ML,RISPERIDONA
49193,FLUoxetina 20mg Cap,FLUOXETINA
44042,HIDROCORTISONA 500 MG FRA,HIDROCORTISONA SUCCINATO SODICO
49287,ADENOSina 3MG/ML 2ML AMP,ADENOSINA
14221,Petidina 50 mg/ml amp 2 ml,PETIDINA (MEPERIDINA)
58747,Benerva 300mg Cp rev,TIAMINA (VITAMINA B1)
28533,BECLOMETASONA 50MCG SPRAY N.P.,beclometasona
51596,MIDAZOLAM 2MG/ML 10ML SOL.ORAL,MIDAZOLAM
63042,Zinnat 250mg/5ml (50mg/ml) Suspensão Oral Fr 70ml,AXETILCEFUROXIMA


In [None]:
val.sample(10)

Unnamed: 0,medicamento,substancia
7365,PIRACETAM 400MG COMP,PIRACETAM
1992,"CloNIDina 0,100 mg comprimido",CLONIDINA
5370,Maleato De Enalapril 10mg | Comprimido,ENALAPRIL MALEATO
4191,Propofol 1.000 mg/100 ml 10mg/ml,PROPOFOL
13094,METARAMINOL 10MG/ML C/ 1ML INJ,METARAMINOL
8419,Trileptal 60mg/ml Suspensão Oral 100ml,OXCARBAZEPINA
5790,MC: CARBONATO DE CALCIO 1,CARBONATO DE CALCIO
6034,CEFUROXIMA 750 MG - FA,CEFUROXIMA
6787,Empagliflozina 25mg Comprimido Revestido,empagliflozina
8042,Losartana Potássica 50mg Cp. Rev.,LOSARTANA


In [None]:
test.sample(10)

Unnamed: 0,medicamento,substancia
687,Lisodren (Mitotano) 500mg comprimido,MITOTANO
894,"DEXAMETASONA 0,1% CREME BISNAGA 10G",DEXAmetasona (tópico)
939,OLEO DE GIRASSOL 2% - 50GR - NP,OLEO DE GIRASSOL (TOPICO)
231,FENILEFRINA 10MG/ML AMP 1ML,FENILEFRINA
639,ALBENDAZOL 400MG COMPRIMIDO,ALBENDAZOL
813,METOCLOPRAMIDA INJETAVEL - 2 ML 5 MG/ML AMP,METOCLOPRAMIDA
197,"NATRILIX SR 1,5MG 30 CP C",Indapamida
523,LANSOPRAZOL CAPSULA,LANSOPRAZOL
762,OSCAL (CARBONATO DE CALCIO) 500MG COMP,CARBONATO DE CALCIO
774,MEROpenem 1g Frasco-ampola - ABL,MEROPENEM


In [None]:
# Our train dataset doesn't contain all the classes
print(' # of classes in full dataset: ', pd.concat([train, test, val]).substancia.nunique())
print(' # of classes in train dataset: ', pd.concat([train, val]).substancia.nunique())

 # of classes in full dataset:  2191
 # of classes in train dataset:  2182


#Preprocessing

In [None]:
# Define a mininum number of examples per class
min_samples_per_class = 10

# Find classes to remove
full = pd.concat([train, test, val])
class_counts = full['substancia'].value_counts()
classes_to_remove = class_counts[class_counts < min_samples_per_class].index

# Filter all dataframes
filtered_dataset = full[~full['substancia'].isin(classes_to_remove)]
train_filtered = train[~train['substancia'].isin(classes_to_remove)]
test_filtered = test[~test['substancia'].isin(classes_to_remove)]
val_filtered = val[~val['substancia'].isin(classes_to_remove)]


In [None]:
# After the filtering, our train dataset contains all classes present in the full dataset, which is good for evaluating
print(' # of classes in filtered full dataset: ', filtered_dataset.substancia.nunique())
print(' # of classes in filtered train dataset: ', train_filtered.substancia.nunique())




 # of classes in filtered full dataset:  1050
 # of classes in filtered train dataset:  1050


In [None]:
# As we don't need the val dataset for validation, we'll include it in the training
train_filtered = pd.concat([train_filtered, val_filtered])

#Pipeline and Training

In [None]:
# Separate into features and target
X_train, y_train = train_filtered.medicamento, train_filtered.substancia
X_test, y_test = test_filtered.medicamento, test_filtered.substancia

In [None]:
# Define our pipeline for evaluating different models / parameters
def evaluate_model(classifier, parameters, **kwargs):

    params = parameters | kwargs # include whatever new arguments

    pipeline = Pipeline([
        ('vect', TfidfVectorizer()),
        ('clf', classifier),
    ])

    # Start clock
    init_time = time()

    # Train and calculate score
    f1_score = pipeline.set_params(**params).fit(X_train, y_train).score(X_test, y_test, scoring='f1_macro')
    train_time = time()

    # Measure predict latency
    tokens = pipeline['vect'].transform(y_test.sample(1, random_state=0))
    pipeline['clf'].predict(tokens)
    infer_time = time()

    # Save temporary files
    joblib.dump(pipeline['clf'], 'substances_classifier.gz')
    joblib.dump(pipeline['vect'], 'substances_vectorizer.gz')

    # Get file size
    classifier_mb = os.path.getsize('substances_classifier.gz') / (1024**2)
    vectorizer_kb = os.path.getsize('substances_vectorizer.gz') / 1024

    # Delete files
    os.remove('substances_classifier.gz')
    os.remove('substances_vectorizer.gz')

    # Exclude parameters from model name
    model_name = re.sub(r"\(.*?\)", "", str(pipeline['clf']))

    # Free RAM
    del pipeline
    gc.collect()

    # Execution time
    train_seconds = train_time - init_time
    infer_seconds = infer_time - train_time

    # Calculate costs based on n1-highcpu-96
    second_cost = 3.910843 / 3600
    train_cost = train_seconds * second_cost
    predict_cost = infer_seconds * second_cost

    # Print and return 'row' of results
    row =  {
            'model':model_name,
            'max_df':str(params['vect__max_df']),
            'max_features':str(params['vect__max_features']),
            'ngram_range':str(params['vect__ngram_range']),
            'f1_score':f1_score,
            'model_size_mb':classifier_mb,
            'vectorizer_size_kb':vectorizer_kb,
            'train_minutes':train_seconds / 60,
            'train_cost':train_cost,
            'predict_seconds_one_sample':infer_seconds,
            'predict_cost_one_sample':predict_cost,
            }

    display(pd.DataFrame(row, index=[0]))
    return row







In [None]:
# Define our hyperparameter dict with general values
parameters = {
    'vect__max_features':500,
    'vect__analyzer':'char',
    'vect__strip_accents':'ascii',
    'vect__ngram_range':(1,3),
    'vect__max_df':0.5,
}

# Running the experiment

In [None]:
# Final dataframe
results = []

# Algorithms to be tested
model_list = [
                RandomForestClassifier(n_jobs=-1),
                ExtraTreesClassifier(n_jobs=-1),
                KNeighborsClassifier(n_jobs=-1),
                LogisticRegression(n_jobs=-1)
             ]


In [None]:
# Standard parameters
for model in model_list:
    result = evaluate_model(model, parameters)
    results.append(result)


Unnamed: 0,model,max_df,max_features,ngram_range,f1_score,model_size_mb,vectorizer_size_kb,train_minutes,train_cost,predict_seconds_one_sample,predict_cost_one_sample
0,RandomForestClassifier,0.5,500,"(1, 3)",0.949206,201.670169,7.329102,0.454082,0.029597,0.055236,6e-05


Unnamed: 0,model,max_df,max_features,ngram_range,f1_score,model_size_mb,vectorizer_size_kb,train_minutes,train_cost,predict_seconds_one_sample,predict_cost_one_sample
0,ExtraTreesClassifier,0.5,500,"(1, 3)",0.957672,308.20545,7.329102,0.364365,0.02375,0.060945,6.6e-05


Unnamed: 0,model,max_df,max_features,ngram_range,f1_score,model_size_mb,vectorizer_size_kb,train_minutes,train_cost,predict_seconds_one_sample,predict_cost_one_sample
0,KNeighborsClassifier,0.5,500,"(1, 3)",0.873016,29.933669,7.329102,0.115536,0.007531,0.206661,0.000225


Unnamed: 0,model,max_df,max_features,ngram_range,f1_score,model_size_mb,vectorizer_size_kb,train_minutes,train_cost,predict_seconds_one_sample,predict_cost_one_sample
0,LogisticRegression,0.5,500,"(1, 3)",0.90582,3.867176,7.329102,4.988431,0.32515,0.004061,4e-06


In [None]:
# Varying n_grams_range
n_grams_values = [(1,2), (1,4), (2,3), (2,4)]

for value in n_grams_values:
    for model in model_list:
        result = evaluate_model(model, parameters, vect__ngram_range=value)
        results.append(result)


Unnamed: 0,model,max_df,max_features,ngram_range,f1_score,model_size_mb,vectorizer_size_kb,train_minutes,train_cost,predict_seconds_one_sample,predict_cost_one_sample
0,RandomForestClassifier,0.5,500,"(1, 2)",0.959788,229.686469,7.06543,0.391074,0.02549,0.056751,6.2e-05


Unnamed: 0,model,max_df,max_features,ngram_range,f1_score,model_size_mb,vectorizer_size_kb,train_minutes,train_cost,predict_seconds_one_sample,predict_cost_one_sample
0,ExtraTreesClassifier,0.5,500,"(1, 2)",0.960847,334.491272,7.06543,0.331956,0.021637,0.056926,6.2e-05


Unnamed: 0,model,max_df,max_features,ngram_range,f1_score,model_size_mb,vectorizer_size_kb,train_minutes,train_cost,predict_seconds_one_sample,predict_cost_one_sample
0,KNeighborsClassifier,0.5,500,"(1, 2)",0.895238,20.977447,7.06543,0.082874,0.005402,0.182528,0.000198


Unnamed: 0,model,max_df,max_features,ngram_range,f1_score,model_size_mb,vectorizer_size_kb,train_minutes,train_cost,predict_seconds_one_sample,predict_cost_one_sample
0,LogisticRegression,0.5,500,"(1, 2)",0.915344,3.861627,7.06543,3.501091,0.228204,0.003681,4e-06


Unnamed: 0,model,max_df,max_features,ngram_range,f1_score,model_size_mb,vectorizer_size_kb,train_minutes,train_cost,predict_seconds_one_sample,predict_cost_one_sample
0,RandomForestClassifier,0.5,500,"(1, 4)",0.951323,205.725288,7.396484,0.497425,0.032422,0.055311,6e-05


Unnamed: 0,model,max_df,max_features,ngram_range,f1_score,model_size_mb,vectorizer_size_kb,train_minutes,train_cost,predict_seconds_one_sample,predict_cost_one_sample
0,ExtraTreesClassifier,0.5,500,"(1, 4)",0.95873,333.482723,7.396484,0.443662,0.028918,0.056105,6.1e-05


Unnamed: 0,model,max_df,max_features,ngram_range,f1_score,model_size_mb,vectorizer_size_kb,train_minutes,train_cost,predict_seconds_one_sample,predict_cost_one_sample
0,KNeighborsClassifier,0.5,500,"(1, 4)",0.837037,32.572254,7.396484,0.147099,0.009588,0.19783,0.000215


Unnamed: 0,model,max_df,max_features,ngram_range,f1_score,model_size_mb,vectorizer_size_kb,train_minutes,train_cost,predict_seconds_one_sample,predict_cost_one_sample
0,LogisticRegression,0.5,500,"(1, 4)",0.895238,3.868434,7.396484,3.843282,0.250508,0.00396,4e-06


Unnamed: 0,model,max_df,max_features,ngram_range,f1_score,model_size_mb,vectorizer_size_kb,train_minutes,train_cost,predict_seconds_one_sample,predict_cost_one_sample
0,RandomForestClassifier,0.5,500,"(2, 3)",0.950265,198.99137,7.347656,0.439501,0.028647,0.068099,7.4e-05


Unnamed: 0,model,max_df,max_features,ngram_range,f1_score,model_size_mb,vectorizer_size_kb,train_minutes,train_cost,predict_seconds_one_sample,predict_cost_one_sample
0,ExtraTreesClassifier,0.5,500,"(2, 3)",0.951323,308.554517,7.347656,0.367842,0.023976,0.064177,7e-05


Unnamed: 0,model,max_df,max_features,ngram_range,f1_score,model_size_mb,vectorizer_size_kb,train_minutes,train_cost,predict_seconds_one_sample,predict_cost_one_sample
0,KNeighborsClassifier,0.5,500,"(2, 3)",0.880423,27.281119,7.347656,0.096213,0.006271,0.19981,0.000217


Unnamed: 0,model,max_df,max_features,ngram_range,f1_score,model_size_mb,vectorizer_size_kb,train_minutes,train_cost,predict_seconds_one_sample,predict_cost_one_sample
0,LogisticRegression,0.5,500,"(2, 3)",0.913228,3.866647,7.347656,3.806568,0.248115,0.003969,4e-06


Unnamed: 0,model,max_df,max_features,ngram_range,f1_score,model_size_mb,vectorizer_size_kb,train_minutes,train_cost,predict_seconds_one_sample,predict_cost_one_sample
0,RandomForestClassifier,0.5,500,"(2, 4)",0.949206,204.348669,7.384766,0.464701,0.03029,0.058134,6.3e-05


Unnamed: 0,model,max_df,max_features,ngram_range,f1_score,model_size_mb,vectorizer_size_kb,train_minutes,train_cost,predict_seconds_one_sample,predict_cost_one_sample
0,ExtraTreesClassifier,0.5,500,"(2, 4)",0.953439,336.377186,7.384766,0.413645,0.026962,0.046707,5.1e-05


Unnamed: 0,model,max_df,max_features,ngram_range,f1_score,model_size_mb,vectorizer_size_kb,train_minutes,train_cost,predict_seconds_one_sample,predict_cost_one_sample
0,KNeighborsClassifier,0.5,500,"(2, 4)",0.833862,30.080433,7.384766,0.127358,0.008301,0.195698,0.000213


Unnamed: 0,model,max_df,max_features,ngram_range,f1_score,model_size_mb,vectorizer_size_kb,train_minutes,train_cost,predict_seconds_one_sample,predict_cost_one_sample
0,LogisticRegression,0.5,500,"(2, 4)",0.888889,3.867975,7.384766,4.483074,0.29221,0.003674,4e-06


In [None]:
# Varying max_df
max_df_values = [0.3, 0.7]

for value in max_df_values:
    for model in model_list:
        result = evaluate_model(model, parameters, vect__max_df=value)
        results.append(result)


Unnamed: 0,model,max_df,max_features,ngram_range,f1_score,model_size_mb,vectorizer_size_kb,train_minutes,train_cost,predict_seconds_one_sample,predict_cost_one_sample
0,RandomForestClassifier,0.3,500,"(1, 3)",0.948148,203.484678,7.347656,0.439638,0.028656,0.057561,6.3e-05


Unnamed: 0,model,max_df,max_features,ngram_range,f1_score,model_size_mb,vectorizer_size_kb,train_minutes,train_cost,predict_seconds_one_sample,predict_cost_one_sample
0,ExtraTreesClassifier,0.3,500,"(1, 3)",0.955556,307.297626,7.347656,0.375976,0.024506,0.058171,6.3e-05


Unnamed: 0,model,max_df,max_features,ngram_range,f1_score,model_size_mb,vectorizer_size_kb,train_minutes,train_cost,predict_seconds_one_sample,predict_cost_one_sample
0,KNeighborsClassifier,0.3,500,"(1, 3)",0.877249,24.576193,7.347656,0.113411,0.007392,0.194651,0.000211


Unnamed: 0,model,max_df,max_features,ngram_range,f1_score,model_size_mb,vectorizer_size_kb,train_minutes,train_cost,predict_seconds_one_sample,predict_cost_one_sample
0,LogisticRegression,0.3,500,"(1, 3)",0.916402,3.866444,7.347656,3.56257,0.232211,0.003872,4e-06


Unnamed: 0,model,max_df,max_features,ngram_range,f1_score,model_size_mb,vectorizer_size_kb,train_minutes,train_cost,predict_seconds_one_sample,predict_cost_one_sample
0,RandomForestClassifier,0.7,500,"(1, 3)",0.94709,202.103412,7.34082,0.490572,0.031976,0.05733,6.2e-05


Unnamed: 0,model,max_df,max_features,ngram_range,f1_score,model_size_mb,vectorizer_size_kb,train_minutes,train_cost,predict_seconds_one_sample,predict_cost_one_sample
0,ExtraTreesClassifier,0.7,500,"(1, 3)",0.956614,309.612127,7.34082,0.401228,0.026152,0.059058,6.4e-05


Unnamed: 0,model,max_df,max_features,ngram_range,f1_score,model_size_mb,vectorizer_size_kb,train_minutes,train_cost,predict_seconds_one_sample,predict_cost_one_sample
0,KNeighborsClassifier,0.7,500,"(1, 3)",0.87619,31.63539,7.34082,0.115666,0.007539,0.205576,0.000223


Unnamed: 0,model,max_df,max_features,ngram_range,f1_score,model_size_mb,vectorizer_size_kb,train_minutes,train_cost,predict_seconds_one_sample,predict_cost_one_sample
0,LogisticRegression,0.7,500,"(1, 3)",0.901587,3.867436,7.34082,4.067915,0.26515,0.003734,4e-06


In [None]:
# Varying max_features
max_feature_values = [100, 200, 300]

for value in max_feature_values:
    for model in model_list:
        result = evaluate_model(model, parameters, vect__max_features=value)
        results.append(result)


Unnamed: 0,model,max_df,max_features,ngram_range,f1_score,model_size_mb,vectorizer_size_kb,train_minutes,train_cost,predict_seconds_one_sample,predict_cost_one_sample
0,RandomForestClassifier,0.5,100,"(1, 3)",0.91746,253.861357,2.022461,0.389686,0.0254,0.058475,6.4e-05


Unnamed: 0,model,max_df,max_features,ngram_range,f1_score,model_size_mb,vectorizer_size_kb,train_minutes,train_cost,predict_seconds_one_sample,predict_cost_one_sample
0,ExtraTreesClassifier,0.5,100,"(1, 3)",0.920635,418.536488,2.022461,0.357022,0.023271,0.057125,6.2e-05


Unnamed: 0,model,max_df,max_features,ngram_range,f1_score,model_size_mb,vectorizer_size_kb,train_minutes,train_cost,predict_seconds_one_sample,predict_cost_one_sample
0,KNeighborsClassifier,0.5,100,"(1, 3)",0.78836,14.837514,2.022461,0.132203,0.008617,0.183965,0.0002


Unnamed: 0,model,max_df,max_features,ngram_range,f1_score,model_size_mb,vectorizer_size_kb,train_minutes,train_cost,predict_seconds_one_sample,predict_cost_one_sample
0,LogisticRegression,0.5,100,"(1, 3)",0.797884,0.794165,2.022461,3.427862,0.22343,0.004322,5e-06


Unnamed: 0,model,max_df,max_features,ngram_range,f1_score,model_size_mb,vectorizer_size_kb,train_minutes,train_cost,predict_seconds_one_sample,predict_cost_one_sample
0,RandomForestClassifier,0.5,200,"(1, 3)",0.940741,227.302715,3.331055,0.426354,0.02779,0.047549,5.2e-05


Unnamed: 0,model,max_df,max_features,ngram_range,f1_score,model_size_mb,vectorizer_size_kb,train_minutes,train_cost,predict_seconds_one_sample,predict_cost_one_sample
0,ExtraTreesClassifier,0.5,200,"(1, 3)",0.942857,367.718507,3.331055,0.41706,0.027184,0.066429,7.2e-05


Unnamed: 0,model,max_df,max_features,ngram_range,f1_score,model_size_mb,vectorizer_size_kb,train_minutes,train_cost,predict_seconds_one_sample,predict_cost_one_sample
0,KNeighborsClassifier,0.5,200,"(1, 3)",0.842328,21.491787,3.331055,0.117917,0.007686,0.197479,0.000215


Unnamed: 0,model,max_df,max_features,ngram_range,f1_score,model_size_mb,vectorizer_size_kb,train_minutes,train_cost,predict_seconds_one_sample,predict_cost_one_sample
0,LogisticRegression,0.5,200,"(1, 3)",0.871958,1.563274,3.331055,3.431717,0.223682,0.002801,3e-06


Unnamed: 0,model,max_df,max_features,ngram_range,f1_score,model_size_mb,vectorizer_size_kb,train_minutes,train_cost,predict_seconds_one_sample,predict_cost_one_sample
0,RandomForestClassifier,0.5,300,"(1, 3)",0.944974,210.55649,4.698242,0.440368,0.028704,0.067527,7.3e-05


Unnamed: 0,model,max_df,max_features,ngram_range,f1_score,model_size_mb,vectorizer_size_kb,train_minutes,train_cost,predict_seconds_one_sample,predict_cost_one_sample
0,ExtraTreesClassifier,0.5,300,"(1, 3)",0.951323,333.538276,4.698242,0.384993,0.025094,0.048125,5.2e-05


Unnamed: 0,model,max_df,max_features,ngram_range,f1_score,model_size_mb,vectorizer_size_kb,train_minutes,train_cost,predict_seconds_one_sample,predict_cost_one_sample
0,KNeighborsClassifier,0.5,300,"(1, 3)",0.849735,25.400258,4.698242,0.11737,0.00765,0.197527,0.000215


Unnamed: 0,model,max_df,max_features,ngram_range,f1_score,model_size_mb,vectorizer_size_kb,train_minutes,train_cost,predict_seconds_one_sample,predict_cost_one_sample
0,LogisticRegression,0.5,300,"(1, 3)",0.886772,2.331544,4.698242,3.442884,0.22441,0.003019,3e-06


In [23]:
pd.DataFrame(results).sort_values(by=['model', 'f1_score'])

Unnamed: 0,model,max_df,max_features,ngram_range,f1_score,model_size_mb,vectorizer_size_kb,train_minutes,train_cost,predict_seconds_one_sample,predict_cost_one_sample
29,ExtraTreesClassifier,0.5,100,"(1, 3)",0.920635,418.536488,2.022461,0.357022,0.023271,0.057125,6.2e-05
33,ExtraTreesClassifier,0.5,200,"(1, 3)",0.942857,367.718507,3.331055,0.41706,0.027184,0.066429,7.2e-05
13,ExtraTreesClassifier,0.5,500,"(2, 3)",0.951323,308.554517,7.347656,0.367842,0.023976,0.064177,7e-05
37,ExtraTreesClassifier,0.5,300,"(1, 3)",0.951323,333.538276,4.698242,0.384993,0.025094,0.048125,5.2e-05
17,ExtraTreesClassifier,0.5,500,"(2, 4)",0.953439,336.377186,7.384766,0.413645,0.026962,0.046707,5.1e-05
21,ExtraTreesClassifier,0.3,500,"(1, 3)",0.955556,307.297626,7.347656,0.375976,0.024506,0.058171,6.3e-05
25,ExtraTreesClassifier,0.7,500,"(1, 3)",0.956614,309.612127,7.34082,0.401228,0.026152,0.059058,6.4e-05
1,ExtraTreesClassifier,0.5,500,"(1, 3)",0.957672,308.20545,7.329102,0.364365,0.02375,0.060945,6.6e-05
9,ExtraTreesClassifier,0.5,500,"(1, 4)",0.95873,333.482723,7.396484,0.443662,0.028918,0.056105,6.1e-05
5,ExtraTreesClassifier,0.5,500,"(1, 2)",0.960847,334.491272,7.06543,0.331956,0.021637,0.056926,6.2e-05


# Save and export results

In [58]:
results_df = pd.DataFrame(results).sort_values(by=['model', 'f1_score'])
results_df.to_csv('ClassicML_Substances.csv')