In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Read the survey CSV, supplying the 31 column names explicitly.
# NOTE(review): because `names` is passed, pandas treats the first row of the
# file as data, and any extra leading CSV columns are used as the index --
# confirm this matches the layout of drug_consumption.csv.
df = pd.read_csv(
    "drug_consumption.csv",
    names=[
        "Age", "Gender", "Education", "Country", "Ethnicity",
        "Nscore", "Escore", "Oscore", "Ascore", "Cscore", "Impulsive", "SS",
        "Alcohol", "Amphet", "Amyl", "Benzos", "Caff", "Cannabis", "Choc",
        "Coke", "Crack", "Ecstasy", "Heroin", "Ketamine", "Legalh", "LSD",
        "Meth", "Mushrooms", "Nicotine", "Semer", "VSA",
    ],
)

In [2]:
# Substance-usage columns whose class labels are collapsed below.
arr = ["Alcohol", "Amphet", "Amyl", "Benzos", "Caff", "Cannabis", "Choc", "Coke", "Crack", "Ecstasy", "Heroin", "Ketamine", "Legalh", "LSD", "Meth", "Mushrooms", "Nicotine", "Semer", "VSA"]

# Collapse the usage classes into two string labels per substance:
# "CL0" -> "<name>_0" and "CL1".."CL6" -> "<name>_1".
# Iterate over every entry of `arr` (the previous range(0, 18) skipped the
# last column) and assign the result back to the frame: calling
# .replace(..., inplace=True) on the temporary `df[name]` does not update `df`
# when pandas Copy-on-Write is active.
for drug_name in arr:
    label_map = {"CL0": drug_name + "_0"}
    label_map.update({"CL%d" % level: drug_name + "_1" for level in range(1, 7)})
    df[drug_name] = df[drug_name].replace(label_map)

In [3]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from enum import IntEnum

# Feature matrix: the seven score columns selected from df.
X = df[
    ["Nscore", "Escore", "Oscore", "Ascore", "Cscore", "Impulsive", "SS"]
]

# Target columns; each one gets its own one-hot frame and its own model.
y_cols = [
    "Alcohol", "Amphet", "Cannabis", "Coke",
    "Ecstasy", "LSD", "Meth", "Mushrooms",
]

# Maps each target name to its 0-based position in y_cols.
y_enum = IntEnum("cols", y_cols, start=0)

# One placeholder per target, to be filled with that target's one-hot frame.
y = [None for _ in y_cols]

def one_hot_encoding(df, col):
    """One-hot encode a single column of ``df``.

    Implemented with numpy/pandas only, so it does not depend on the
    ``sparse`` keyword of scikit-learn's ``OneHotEncoder`` (renamed to
    ``sparse_output`` and later removed).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to encode.
    col : str
        Name of the column whose distinct values become indicator columns.

    Returns
    -------
    pandas.DataFrame
        Float frame with one column per distinct value of ``df[col]``, in
        sorted order; a cell is 1.0 where the row holds that value and 0.0
        otherwise. The result has a fresh 0..n-1 row index; the index of
        ``df`` is not carried over.
    """
    # Sorted distinct values, plus for every row the position of its value
    # within that sorted array (the integer label of the row).
    classes, codes = np.unique(np.asarray(df[col]), return_inverse=True)

    # Row k of the identity matrix is the indicator vector for class k, so
    # indexing it with the integer labels yields the dense one-hot matrix.
    onehot_encoded = np.eye(len(classes))[codes.reshape(-1)]

    return pd.DataFrame(onehot_encoded, columns=classes)

# Build the one-hot frame for every target and store it at the position that
# y_enum assigns to that target (IntEnum members are usable as list indices).
for col_name in y_cols:
    y[y_enum[col_name]] = one_hot_encoding(df, col_name)

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import mean_squared_error, multilabel_confusion_matrix, ConfusionMatrixDisplay, precision_score, recall_score

from math import sqrt

# One slot per target column, in y_cols order; filled with the fitted models.
models = [None for _ in y_cols]

def train(X, y):
    """Fit an MLP classifier on a train/test split and print held-out metrics.

    The data is split 80/20 with a fixed seed, a three-hidden-layer
    ``MLPClassifier`` (32, 24, 12 units; logistic activation; SGD solver) is
    fitted on the training part, and the score and RMSE on the test part are
    printed.

    Parameters
    ----------
    X : features passed to ``train_test_split`` / ``MLPClassifier.fit``.
    y : targets passed alongside ``X``.

    Returns
    -------
    MLPClassifier
        The fitted classifier.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=1
    )

    network = MLPClassifier(
        hidden_layer_sizes=(32, 24, 12),
        activation="logistic",
        solver='sgd',
        learning_rate_init=0.1,
        max_iter=1000,
        random_state=1,
    )
    network.fit(X_train, y_train)

    # Evaluate on the held-out split: score() first, then RMSE of predictions.
    predictions = network.predict(X_test)
    accuracy = network.score(X_test, y_test)
    rmse = sqrt(mean_squared_error(y_test, predictions))

    print('\nModel Accuracy:', accuracy)
    print('Model RMSE:', rmse, '\n')

    return network

# Train one classifier per target and store it in that target's own slot.
# The slot must be looked up from the current loop name `y_col`; indexing with
# a name left over from an earlier loop would write every model to the same
# position and leave the remaining slots as None.
for drug, y_col in zip(y, y_cols):
    print(y_col)
    models[y_enum[y_col].value] = train(X, drug)

# train(X, y[0])

Alcohol

Model Accuracy: 0.986737400530504
Model RMSE: 0.11516335992621968 

Amphet

Model Accuracy: 0.7082228116710876
Model RMSE: 0.5389349945790891 

Cannabis

Model Accuracy: 0.8222811671087533
Model RMSE: 0.41999115817395144 

Coke

Model Accuracy: 0.6525198938992043
Model RMSE: 0.5872202195147035 

Ecstasy

Model Accuracy: 0.726790450928382
Model RMSE: 0.5226945083618327 

LSD

Model Accuracy: 0.7374005305039788
Model RMSE: 0.5124446013922102 

Meth

Model Accuracy: 0.7745358090185677
Model RMSE: 0.4748306971768278 

Mushrooms

Model Accuracy: 0.7400530503978779
Model RMSE: 0.5059329696326117 

