In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Labels applied to the CSV columns, in file order. Because `names` is passed
# explicitly, pandas treats the first line of the file as data, not a header.
column_names = [
    "Age", "Gender", "Education", "Country", "Ethnicity",
    "Nscore", "Escore", "Oscore", "Ascore", "Cscore", "Impulsive", "SS",
    "Alcohol", "Amphet", "Amyl", "Benzos", "Caff", "Cannabis", "Choc",
    "Coke", "Crack", "Ecstasy", "Heroin", "Ketamine", "Legalh", "LSD",
    "Meth", "Mushrooms", "Nicotine", "Semer", "VSA",
]
df = pd.read_csv("drug_consumption.csv", names=column_names)

In [2]:
# Semer is a fictional drug: keep only the rows whose Semer value is 'CL0'
# and drop every respondent who reported any other usage class for it.
df = df.loc[df["Semer"] == "CL0"]

In [3]:
arr = ["Alcohol", "Amphet", "Amyl", "Benzos", "Caff", "Cannabis", "Choc", "Coke", "Crack", "Ecstasy", "Heroin", "Ketamine", "Legalh", "LSD", "Meth", "Mushrooms", "Nicotine", "Semer", "VSA"]
# Collapse the seven usage classes into two labels per drug:
#   CL0        -> "<drug>_0"  (never used)
#   CL1 .. CL6 -> "<drug>_1"  (has used)
# The drug name is embedded in the label so that one-hot encoded columns
# are self-describing.
#
# Iterate over every name in `arr` (the previous range(0, 18) stopped one
# short and left "VSA" un-recoded) and assign the result back to the frame:
# a chained `df[col].replace(..., inplace=True)` operates on a temporary
# Series and does not update `df` under pandas Copy-on-Write.
for drug in arr:
    usage_labels = {"CL0": drug + "_0"}
    for level in range(1, 7):
        usage_labels["CL" + str(level)] = drug + "_1"
    df[drug] = df[drug].replace(usage_labels)

In [4]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from enum import IntEnum

# Predictors: the seven score columns.
X = df.loc[:, ["Nscore", "Escore", "Oscore", "Ascore", "Cscore", "Impulsive", "SS"]]

# Drugs that get their own target frame and their own model.
y_cols = [
    "Alcohol", "Amphet", "Cannabis", "Coke",
    "Ecstasy", "LSD", "Meth", "Mushrooms",
]

# Maps a drug name to its 0-based position in y_cols,
# e.g. y_enum["Alcohol"].value == 0.
y_enum = IntEnum("cols", names=y_cols, start=0)

# One slot per drug; each slot is filled with that drug's one-hot encoded target.
y = [None for _ in y_cols]

def one_hot_encoding(df, col):
    """One-hot encode a single column of ``df``.

    Args:
        df: DataFrame that contains ``col``.
        col: Name of the column to encode.

    Returns:
        A new DataFrame with one float column (values 0.0 / 1.0) per distinct
        value of ``df[col]``. Columns are named after those values and appear
        in sorted order. The result has a fresh 0..n-1 index; the index of
        ``df`` is not carried over.
    """
    # pd.get_dummies replaces the former LabelEncoder + OneHotEncoder pair.
    # That code passed `sparse=False`, a keyword that was deprecated in
    # scikit-learn 1.2 and removed in 1.4, so it raises TypeError on current
    # releases. dtype=float keeps the 0.0 / 1.0 float output it produced.
    encoded = pd.get_dummies(df[col], dtype=float)

    # The encoder-based version returned a positionally indexed frame;
    # drop the source index so callers keep seeing the same shape of result.
    return encoded.reset_index(drop=True)

# One-hot encode each target drug and store it at that drug's y_enum position.
for member in y_enum:
    y[member.value] = one_hot_encoding(df, member.name)

## GRID SEARCH:

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import mean_squared_error, precision_score, recall_score
from sklearn.model_selection import GridSearchCV

# Best grid-search parameters per drug, stored in y_cols order.
model_params = [None for _ in y_cols]

def search(X, y):
    """Grid-search MLP hyperparameters for one target and print the results.

    The data is split 80/20 with a fixed random_state; only the training
    portion is passed to the 3-fold cross-validated grid search.

    Args:
        X: Input features.
        y: Target for a single drug.

    Returns:
        Dictionary of the best parameters found, with keys
        'hidden_layer_sizes' and 'learning_rate_init'.
    """
    X_train, _X_test, y_train, _y_test = train_test_split(
        X, y, test_size=0.2, random_state=1
    )

    # hidden_layer_sizes / learning_rate_init given here are only starting
    # values; the grid search overrides both for every candidate it fits.
    base_model = MLPClassifier(
        hidden_layer_sizes=(32, 24, 12),
        learning_rate_init=0.1,
        activation="logistic",
        solver='sgd',
        max_iter=1000,
        random_state=1,
    )

    # Candidate values explored by the grid search
    candidates = {
        "hidden_layer_sizes": [(45, 36), (60, 45), (32, 24, 12), (60, 45, 24)],
        "learning_rate_init": [0.1, 0.2, 0.3, 0.4],
    }
    fitted_grid = GridSearchCV(
        estimator=base_model, param_grid=candidates, n_jobs=-1, cv=3
    ).fit(X_train, y_train)

    # summarize results - this code was taken from a lecture example
    print("Best: %f using %s" % (fitted_grid.best_score_, fitted_grid.best_params_))

    cv_table = fitted_grid.cv_results_
    summary_rows = zip(
        cv_table['mean_test_score'],
        cv_table['std_test_score'],
        cv_table['params'],
    )
    for avg_score, score_std, setting in summary_rows:
        print("%f (%f) with: %r" % (avg_score, score_std, setting))

    return fitted_grid.best_params_

# Run the grid search once per drug and record its best parameters
# at that drug's y_enum position.
for y_col, drug in zip(y_cols, y):
    print(y_col)
    slot = y_enum[y_col].value
    model_params[slot] = search(X, drug)
    

Alcohol
Best: 0.981347 using {'hidden_layer_sizes': (45, 36), 'learning_rate_init': 0.1}
0.981347 (0.003394) with: {'hidden_layer_sizes': (45, 36), 'learning_rate_init': 0.1}
0.981347 (0.003394) with: {'hidden_layer_sizes': (45, 36), 'learning_rate_init': 0.2}
0.981347 (0.003394) with: {'hidden_layer_sizes': (45, 36), 'learning_rate_init': 0.3}
0.981347 (0.003394) with: {'hidden_layer_sizes': (45, 36), 'learning_rate_init': 0.4}
0.981347 (0.003394) with: {'hidden_layer_sizes': (60, 45), 'learning_rate_init': 0.1}
0.981347 (0.003394) with: {'hidden_layer_sizes': (60, 45), 'learning_rate_init': 0.2}
0.981347 (0.003394) with: {'hidden_layer_sizes': (60, 45), 'learning_rate_init': 0.3}
0.981347 (0.003394) with: {'hidden_layer_sizes': (60, 45), 'learning_rate_init': 0.4}
0.981347 (0.003394) with: {'hidden_layer_sizes': (32, 24, 12), 'learning_rate_init': 0.1}
0.981347 (0.003394) with: {'hidden_layer_sizes': (32, 24, 12), 'learning_rate_init': 0.2}
0.981347 (0.003394) with: {'hidden_layer_si

Best: 0.761472 using {'hidden_layer_sizes': (60, 45), 'learning_rate_init': 0.3}
0.758145 (0.018376) with: {'hidden_layer_sizes': (45, 36), 'learning_rate_init': 0.1}
0.760141 (0.021038) with: {'hidden_layer_sizes': (45, 36), 'learning_rate_init': 0.2}
0.727530 (0.020325) with: {'hidden_layer_sizes': (45, 36), 'learning_rate_init': 0.3}
0.724213 (0.031496) with: {'hidden_layer_sizes': (45, 36), 'learning_rate_init': 0.4}
0.759476 (0.020146) with: {'hidden_layer_sizes': (60, 45), 'learning_rate_init': 0.1}
0.760806 (0.021935) with: {'hidden_layer_sizes': (60, 45), 'learning_rate_init': 0.2}
0.761472 (0.022835) with: {'hidden_layer_sizes': (60, 45), 'learning_rate_init': 0.3}
0.717530 (0.014642) with: {'hidden_layer_sizes': (60, 45), 'learning_rate_init': 0.4}
0.760141 (0.021038) with: {'hidden_layer_sizes': (32, 24, 12), 'learning_rate_init': 0.1}
0.756814 (0.016630) with: {'hidden_layer_sizes': (32, 24, 12), 'learning_rate_init': 0.2}
0.759476 (0.020146) with: {'hidden_layer_sizes': (3

***USE THESE MODELS***

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import mean_squared_error, precision_score, recall_score

from math import sqrt

# Trained model per drug, stored in y_cols order.
models = [None for _ in y_cols]

def train(X, y, hidden_layer_sizes, learning_rate_init):
    """Fit an MLP classifier for one target and print its hold-out metrics.

    The data is split 80/20 with random_state=1. The model is fitted on the
    training part; accuracy and RMSE are printed for the held-out part.

    Args:
        X: Input features.
        y: Target for a single drug.
        hidden_layer_sizes: Hidden-layer layout passed to MLPClassifier.
        learning_rate_init: Initial learning rate passed to MLPClassifier.

    Returns:
        The fitted MLPClassifier.
    """
    split = train_test_split(X, y, test_size=0.2, random_state=1)
    X_train, X_test, y_train, y_test = split

    network = MLPClassifier(
        hidden_layer_sizes=hidden_layer_sizes,
        learning_rate_init=learning_rate_init,
        activation="logistic",
        solver='sgd',
        max_iter=1000,
        random_state=1,
    )
    network.fit(X_train, y_train)

    predictions = network.predict(X_test)

    # accuracy and RMSE on the held-out 20%
    print('\nModel Accuracy:', network.score(X_test, y_test))
    rmse = sqrt(mean_squared_error(y_test, predictions))
    print('Model RMSE:', rmse, '\n')

    return network

# Train one final model per drug with the parameters chosen by the grid search
# and store it at that drug's y_enum position.
for params, y_col, drug in zip(model_params, y_cols, y):
    print(y_col)
    models[y_enum[y_col].value] = train(
        X,
        drug,
        hidden_layer_sizes=params['hidden_layer_sizes'],
        learning_rate_init=params['learning_rate_init'],
    )


Alcohol

Model Accuracy: 0.9867021276595744
Model RMSE: 0.11531640100361064 

Amphet

Model Accuracy: 0.6675531914893617
Model RMSE: 0.5765820050180531 

Cannabis

Model Accuracy: 0.7473404255319149
Model RMSE: 0.5026525385075511 

Coke

Model Accuracy: 0.6409574468085106
Model RMSE: 0.5980909345220398 

Ecstasy

Model Accuracy: 0.675531914893617
Model RMSE: 0.5696210012862789 

LSD

Model Accuracy: 0.6941489361702128
Model RMSE: 0.5530380310880864 

Meth

Model Accuracy: 0.776595744680851
Model RMSE: 0.4726565934366609 

Mushrooms

Model Accuracy: 0.7127659574468085
Model RMSE: 0.5347001545905415 

