In [25]:
import numpy as np
import pandas as pd
import math
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score


In [26]:
# load dataset and clean up to only data needed
df = pd.read_csv("Data/Opioid_Related_Deaths_2012-2018.csv")
df = df.dropna(subset=['Age', 'Sex','Race','DeathCity'])
df = df[['Age', 'Sex', 'Race', 'DeathCity', 'Drug_Type'
       ]]

df.head(10)

Unnamed: 0,Age,Sex,Race,DeathCity,Drug_Type
1,30.0,Female,White,DANBURY,0
2,23.0,Male,White,GREENWICH,0
3,22.0,Male,Asian,GREENWICH,1
4,23.0,Male,White,BRISTOL,0
5,21.0,Female,White,WEST HARTFORD,0
6,40.0,Male,White,EAST HARTFORD,0
7,50.0,Male,White,Other,2
8,26.0,Female,Hispanic,STRATFORD,0
9,49.0,Female,White,NEW HAVEN,0
10,50.0,Male,White,DANBURY,0


In [27]:
# build X and y matrices
X = df[["Age", "Sex", "Race","DeathCity"]]
y = df[['Drug_Type']].values.reshape(-1)

X = pd.get_dummies(X)
y


array([0, 0, 1, ..., 0, 1, 0])

In [28]:
# split to training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y)


In [29]:
# call TPOT and wait
tpot_clf = TPOTClassifier(generations=5, population_size=50, verbosity=2, n_jobs=-1,
                          max_time_mins=480, scoring='f1_micro')
tpot_clf.fit(X_train, y_train)

HBox(children=(IntProgress(value=0, description='Optimization Progress', max=50, style=ProgressStyle(descripti…

Generation 1 - Current best internal CV score: 0.5721835883171071
Generation 2 - Current best internal CV score: 0.5721835883171071
Generation 3 - Current best internal CV score: 0.5727399165507648
Generation 4 - Current best internal CV score: 0.5730180806675939
Generation 5 - Current best internal CV score: 0.5730180806675939

Best pipeline: ExtraTreesClassifier(ZeroCount(CombineDFs(PolynomialFeatures(input_matrix, degree=2, include_bias=False, interaction_only=False), input_matrix)), bootstrap=True, criterion=entropy, max_features=0.9500000000000001, min_samples_leaf=18, min_samples_split=11, n_estimators=100)


TPOTClassifier(config_dict=None, crossover_rate=0.1, cv=5,
               disable_update_check=False, early_stop=None, generations=5,
               max_eval_time_mins=5, max_time_mins=480, memory=None,
               mutation_rate=0.9, n_jobs=-1, offspring_size=None,
               periodic_checkpoint_folder=None, population_size=50,
               random_state=None, scoring='f1_micro', subsample=1.0,
               template=None, use_dask=False, verbosity=2, warm_start=False)

In [30]:
# evaluate result
y_hat_test = tpot_clf.predict(X_test)
print(f'F1: {f1_score(y_test, y_hat_test,average=None)}')
print(f'Acc: {accuracy_score(y_test, y_hat_test)}')

F1: [0.71840355 0.03100775 0.19251337]
Acc: 0.5650723025583982




In [31]:
# export model into python code
tpot_clf.export('tpot_model.py')
