In [1]:
import pandas as pd

# Location of the drug consumption CSV inside the Kaggle input directory.
DRUGS = '/kaggle/input/drug-consumption-classification/drug_consumption.csv'

# Read the CSV and use its ID column as the index of the frame.
df = pd.read_csv(
    filepath_or_buffer=DRUGS,
    index_col=['ID'],
)
# Show the first rows as the cell output.
df.head()

Unnamed: 0_level_0,Age,Gender,Education,Country,Ethnicity,Nscore,Escore,Oscore,Ascore,Cscore,...,Ecstasy,Heroin,Ketamine,Legalh,LSD,Meth,Mushrooms,Nicotine,Semer,VSA
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.49788,0.48246,-0.05921,0.96082,0.126,0.31287,-0.57545,-0.58331,-0.91699,-0.00665,...,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL2,CL0,CL0
2,-0.07854,-0.48246,1.98437,0.96082,-0.31685,-0.67825,1.93886,1.43533,0.76096,-0.14277,...,CL4,CL0,CL2,CL0,CL2,CL3,CL0,CL4,CL0,CL0
3,0.49788,-0.48246,-0.05921,0.96082,-0.31685,-0.46725,0.80523,-0.84732,-1.6209,-1.0145,...,CL0,CL0,CL0,CL0,CL0,CL0,CL1,CL0,CL0,CL0
4,-0.95197,0.48246,1.16365,0.96082,-0.31685,-0.14882,-0.80615,-0.01928,0.59042,0.58489,...,CL0,CL0,CL2,CL0,CL0,CL0,CL0,CL2,CL0,CL0
5,0.49788,0.48246,1.98437,0.96082,-0.31685,0.73545,-1.6334,-0.45174,-0.30172,1.30612,...,CL1,CL0,CL0,CL1,CL0,CL0,CL2,CL2,CL0,CL0


Our data has been normalized, and the normalized scores are not easy to interpret. Let's look at some histograms. 

In [2]:
from plotly import express

# Numeric input columns; the later cells reuse this list as the feature set.
float_columns = [
    'Age', 'Gender', 'Education', 'Country', 'Ethnicity',
    'Nscore', 'Escore', 'Oscore', 'Ascore', 'Cscore',
    'Impulsive', 'SS',
]

# One histogram per numeric column, each rendered as its own figure.
for column in float_columns:
    figure = express.histogram(data_frame=df, x=column)
    figure.show()

Much of our data looks like it has been sampled from a normal distribution.

Let's do a quick dimension reduction to see if our data clusters at all. Then we'll see if it is predictive for any of the output variables.

In [3]:
from arrow import now
from umap import UMAP

# Semer does not have enough positives to be meaningful, so it is left out of the targets.
targets = [
    'Alcohol', 'Amphet', 'Amyl', 'Benzos', 'Caff', 'Cannabis',
    'Choc', 'Coke', 'Crack', 'Ecstasy', 'Heroin', 'Ketamine',
    'Legalh', 'LSD', 'Meth', 'Mushrooms', 'Nicotine', 'VSA',
]

time_start = now()
# Reducer configured with a fixed random_state and a single job.
umap = UMAP(
    random_state=2024,
    verbose=True,
    n_jobs=1,
    low_memory=False,
    n_epochs=1000,
)
# Fit on the numeric columns and store the two embedding coordinates on df as x and y;
# the later plotting cell reads these columns.
embedding = umap.fit_transform(X=df[float_columns])
df[['x', 'y']] = embedding
express.scatter(data_frame=df, x='x', y='y').show()
# The elapsed time covers both the fit and the scatter plot above.
elapsed = now() - time_start
print('done with UMAP in {}'.format(elapsed))

2024-03-29 15:35:35.786295: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-29 15:35:35.786442: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-29 15:35:35.962016: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(low_memory=False, n_epochs=1000, n_jobs=1, random_state=2024, verbose=True)
Fri Mar 29 15:35:51 2024 Construct fuzzy simplicial set
Fri Mar 29 15:35:53 2024 Finding Nearest Neighbors
Fri Mar 29 15:35:58 2024 Finished Nearest Neighbor Search
Fri Mar 29 15:36:02 2024 Construct embedding


Epochs completed:   0%|            0/1000 [00:00]

	completed  0  /  1000 epochs
	completed  100  /  1000 epochs
	completed  200  /  1000 epochs
	completed  300  /  1000 epochs
	completed  400  /  1000 epochs
	completed  500  /  1000 epochs
	completed  600  /  1000 epochs
	completed  700  /  1000 epochs
	completed  800  /  1000 epochs
	completed  900  /  1000 epochs
Fri Mar 29 15:36:09 2024 Finished embedding


done with UMAP in 0:00:18.594392


If there is clustering here it is not obvious; let's see what happens if we color by the dependent variables.

In [4]:
import warnings

# Suppress FutureWarning messages from here on.
warnings.filterwarnings(action='ignore', category=FutureWarning)
# Plotting the whole dataset crashes the plots, so a seeded 300-row sample is drawn for performance.
sample_df = df.sample(n=300, random_state=2024)
# One scatter of the embedding per target, coloured by that target's class.
for target in targets:
    figure = express.scatter(data_frame=sample_df, x='x', y='y', color=target)
    figure.show()

If we look closely we see that the left lobe of our plot tends to be full of CL0s, while the results for the right lobe of our plot are mixed. 

Is our data predictive of the target variables? The answer appears to be a resounding maybe. Let's train a model and see what happens. We don't have much data, and we have a lot of classes for each target variable, so we can only expect so much signal in this pool of data.

In [5]:
import arrow
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

# Wall-clock timer for the whole cell: training, scoring and plotting.
time_start = arrow.now()

# Held-out scores keyed by target column name.
accuracy_results = {}
f1_results = {}
# The classes are imbalanced differently for each target, so the split is
# re-drawn per target and stratified on that target's labels.
for target in targets:
    X_train, X_test, y_train, y_test = train_test_split(
        df[float_columns],
        df[target],
        test_size=0.20,
        random_state=2024,
        stratify=df[target],
    )
    model = LogisticRegression(max_iter=100000, tol=1e-12).fit(X=X_train, y=y_train)
    # Predict once and score the same predictions with both metrics.
    y_pred = model.predict(X=X_test)
    f1_results[target] = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')
    accuracy_results[target] = accuracy_score(y_true=y_test, y_pred=y_pred)

# One bar per target: the dict keys become the 'index' column and the scores column 0.
express.histogram(data_frame=pd.DataFrame.from_dict(data=f1_results, orient='index').reset_index(), x='index', y=0, title='f1').show()
express.histogram(data_frame=pd.DataFrame.from_dict(data=accuracy_results, orient='index').reset_index(), x='index', y=0, title='accuracy').show()
# Use arrow.now() from this cell's own import so the cell does not rely on the
# bare `now` name imported by an earlier cell.
print('model done in {}'.format(arrow.now() - time_start))

model done in 0:00:01.350745


Our f1 scores and our accuracy scores are very close. Let's plot them against each other, one point per drug, to compare them directly.

In [6]:
# Turn each score dict into a two-column frame: 'index' (the target name) and the score.
f1_df = (
    pd.DataFrame.from_dict(data=f1_results, orient='index')
    .reset_index()
    .rename(columns={0: 'f1'})
)
accuracy_df = (
    pd.DataFrame.from_dict(data=accuracy_results, orient='index')
    .reset_index()
    .rename(columns={0: 'accuracy'})
)

# Join the two score frames on the target name, then plot accuracy against f1
# with the target shown on hover.
scores_df = accuracy_df.merge(how='inner', on='index', right=f1_df)
express.scatter(data_frame=scores_df, x='accuracy', y='f1', hover_name='index').show()


This tells us that there's more signal in our data for some drugs than others, which is probably not surprising.