In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load the survey data, supplying the 31 column names explicitly via `names=`.
df = pd.read_csv(
    "drug_consumption.csv",
    names=[
        # respondent attributes
        "Age", "Gender", "Education", "Country", "Ethnicity",
        # score columns
        "Nscore", "Escore", "Oscore", "Ascore", "Cscore", "Impulsive", "SS",
        # per-substance usage columns
        "Alcohol", "Amphet", "Amyl", "Benzos", "Caff", "Cannabis", "Choc",
        "Coke", "Crack", "Ecstasy", "Heroin", "Ketamine", "Legalh", "LSD",
        "Meth", "Mushrooms", "Nicotine", "Semer", "VSA",
    ],
)

In [3]:
# Every per-substance usage column in the frame (19 in total).
arr = [
    "Alcohol", "Amphet", "Amyl", "Benzos", "Caff", "Cannabis", "Choc",
    "Coke", "Crack", "Ecstasy", "Heroin", "Ketamine", "Legalh", "LSD",
    "Meth", "Mushrooms", "Nicotine", "Semer", "VSA",
]

# Collapse the seven usage classes into two string labels per substance:
#   "CL0"          -> "<substance>_0"
#   "CL1" .. "CL6" -> "<substance>_1"
# Iterate over the list itself so that no column is skipped (a fixed
# range(0, 18) stops one short of the 19 entries), and assign the result back
# to the frame instead of calling replace(inplace=True) on a column selection,
# which is not guaranteed to modify `df`.
for drug in arr:
    relabel = {"CL0": drug + "_0"}
    relabel.update({"CL%d" % level: drug + "_1" for level in range(1, 7)})
    df[drug] = df[drug].replace(relabel)

In [4]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from enum import IntEnum

# Model inputs: the seven score columns.
X = df[[
    "Nscore",
    "Escore",
    "Oscore",
    "Ascore",
    "Cscore",
    "Impulsive",
    "SS",
]]

# Substances that get a model each; the order here fixes the slot order used below.
y_cols = [
    "Alcohol",
    "Amphet",
    "Cannabis",
    "Coke",
    "Ecstasy",
    "LSD",
    "Meth",
    "Mushrooms",
]

# Maps a substance name to its 0-based position in y_cols.
y_enum = IntEnum("cols", y_cols, start=0)

# One placeholder per substance; filled with the encoded target frames later.
y = [None for _ in y_cols]

def one_hot_encoding(df, col):
    """One-hot encode a single column of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column to encode.
    col : str
        Name of the column whose distinct values become the output columns.

    Returns
    -------
    pandas.DataFrame
        One row per input row and one float column (0.0 / 1.0) per distinct
        value of ``df[col]``. Columns are the distinct values in sorted order,
        and the index is a fresh 0..n-1 range.

    Notes
    -----
    The encoding is done with numpy directly. This avoids passing the
    ``sparse`` keyword to scikit-learn's ``OneHotEncoder``, which newer
    scikit-learn releases no longer accept.
    """
    values = df[col].to_numpy()

    # `classes` holds the sorted distinct values; `codes` gives, for each row,
    # the position of its value inside `classes`.
    classes, codes = np.unique(values, return_inverse=True)

    # Row k of the identity matrix is the one-hot vector for class k, so
    # indexing the identity by `codes` yields the full encoded matrix.
    onehot_encoded = np.eye(len(classes))[codes]

    return pd.DataFrame(onehot_encoded, columns=classes)

# Encode each target column and store the resulting frame at the same
# position the column has in y_cols (y_enum numbers them from 0 in that order).
for slot, col_name in enumerate(y_cols):
    y[slot] = one_hot_encoding(df, col_name)

***USE THESE MODELS***

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import mean_squared_error, multilabel_confusion_matrix, ConfusionMatrixDisplay, precision_score, recall_score

from math import sqrt

# One slot per target substance, in y_cols order; filled by the training loop.
models = [None for _ in y_cols]

def train(X, y):
    """Fit an MLP classifier on a train split of (X, y) and print hold-out metrics.

    The data is split 80/20 with a fixed random state, a two-hidden-layer
    network (60 and 45 units, logistic activation, SGD solver) is fitted on
    the training part, and the classifier's score and the RMSE between the
    held-out targets and predictions are printed.

    Parameters
    ----------
    X : feature table passed to train_test_split.
    y : target table passed to train_test_split.

    Returns
    -------
    The fitted MLPClassifier.
    """
    features_fit, features_eval, targets_fit, targets_eval = train_test_split(
        X, y, test_size=0.2, random_state=1
    )

    network = MLPClassifier(
        hidden_layer_sizes=(60, 45),
        activation="logistic",
        solver='sgd',
        learning_rate_init=0.1,
        max_iter=1000,
        random_state=1,
    )
    network.fit(features_fit, targets_fit)

    predictions = network.predict(features_eval)

    # Report accuracy first, then RMSE, each on the held-out split.
    accuracy = network.score(features_eval, targets_eval)
    print('\nModel Accuracy:', accuracy)

    rmse = sqrt(mean_squared_error(targets_eval, predictions))
    print('Model RMSE:', rmse, '\n')

    return network

# Train one model per target substance and store it in the slot that matches
# the substance's position in y_cols.
for drug, y_col in zip(y, y_cols):
    print(y_col)
    # Index by the current loop target (y_col). The name `col_name` is only a
    # leftover from the earlier encoding loop and always points at its last
    # column, which would make every model overwrite the same slot.
    models[y_enum[y_col].value] = train(X, drug)

# train(X, y[0])

Alcohol

Model Accuracy: 0.986737400530504
Model RMSE: 0.11516335992621968 

Amphet

Model Accuracy: 0.6896551724137931
Model RMSE: 0.555894385328056 

Cannabis

Model Accuracy: 0.8222811671087533
Model RMSE: 0.42156711552402504 

Coke

Model Accuracy: 0.6710875331564987
Model RMSE: 0.5711916901965592 

Ecstasy

Model Accuracy: 0.7427055702917772
Model RMSE: 0.507241983384876 

LSD

Model Accuracy: 0.7029177718832891
Model RMSE: 0.54383450439427 

Meth

Model Accuracy: 0.7745358090185677
Model RMSE: 0.4734320764739993 

Mushrooms

Model Accuracy: 0.7400530503978779
Model RMSE: 0.5085476277156078 



GRID SEARCH:


In [10]:
from sklearn.model_selection import GridSearchCV

# One slot per target substance, in y_cols order; filled by the training loop.
models = [None for _ in y_cols]

def train(X, y):
    """Grid-search MLP hyper-parameters on a train split of (X, y).

    The data is split 80/20 with a fixed random state. A 3-fold GridSearchCV
    over hidden-layer layouts and initial learning rates is run on the
    training part, and the best score plus the mean/std test score of every
    parameter combination are printed.

    Parameters
    ----------
    X : feature table passed to train_test_split.
    y : target table passed to train_test_split.

    Returns
    -------
    The fitted estimator selected by the grid search (``best_estimator_``).
    """
    # Only the training part is used here; the held-out part is discarded.
    X_train, _, y_train, _ = train_test_split(X, y, random_state=1, test_size=0.2)

    # Template estimator; the grid below overrides hidden_layer_sizes and
    # learning_rate_init for each candidate.
    clf = MLPClassifier(hidden_layer_sizes=(32, 24, 12), learning_rate_init=0.1, activation="logistic", solver='sgd', max_iter=1000, random_state=1)

    # Grid Search
    hidden_layer_sizes = [(45, 36), (60, 45), (32, 24, 12), (60, 45, 24)]
    learning_rate_init = [0.1, 0.2, 0.3, 0.4]
    param_grid = dict(hidden_layer_sizes=hidden_layer_sizes, learning_rate_init=learning_rate_init)
    grid = GridSearchCV(estimator=clf, param_grid=param_grid, n_jobs=-1, cv=3)
    grid_result = grid.fit(X_train, y_train)

    # summarize results - this code was taken from a lecture example
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))

    # GridSearchCV fits clones of `clf`, so `clf` itself is never trained.
    # Return the refitted winner so callers receive a usable model.
    return grid_result.best_estimator_

# Run the grid search for each target substance and store the resulting model
# in the slot that matches the substance's position in y_cols.
for drug, y_col in zip(y, y_cols):
    print(y_col)
    # Index by the current loop target (y_col). The name `col_name` is only a
    # leftover from the earlier encoding loop and always points at its last
    # column, which would make every model overwrite the same slot.
    models[y_enum[y_col].value] = train(X, drug)
    

Alcohol
Best: 0.980766 using {'hidden_layer_sizes': (45, 36), 'learning_rate_init': 0.1}
0.980766 (0.004103) with: {'hidden_layer_sizes': (45, 36), 'learning_rate_init': 0.1}
0.980766 (0.004103) with: {'hidden_layer_sizes': (45, 36), 'learning_rate_init': 0.2}
0.980766 (0.004103) with: {'hidden_layer_sizes': (45, 36), 'learning_rate_init': 0.3}
0.980766 (0.004103) with: {'hidden_layer_sizes': (45, 36), 'learning_rate_init': 0.4}
0.980766 (0.004103) with: {'hidden_layer_sizes': (60, 45), 'learning_rate_init': 0.1}
0.980766 (0.004103) with: {'hidden_layer_sizes': (60, 45), 'learning_rate_init': 0.2}
0.980766 (0.004103) with: {'hidden_layer_sizes': (60, 45), 'learning_rate_init': 0.3}
0.980766 (0.004103) with: {'hidden_layer_sizes': (60, 45), 'learning_rate_init': 0.4}
0.980766 (0.004103) with: {'hidden_layer_sizes': (32, 24, 12), 'learning_rate_init': 0.1}
0.980766 (0.004103) with: {'hidden_layer_sizes': (32, 24, 12), 'learning_rate_init': 0.2}
0.980766 (0.004103) with: {'hidden_layer_si