In [1]:
# Get dependencies and libraries
import pandas as pd
import numpy as np
import warnings
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from matplotlib import pyplot
from sklearn.metrics import classification_report

# Suppress warnings
warnings.filterwarnings("ignore")
tf.logging.set_verbosity(tf.logging.ERROR)

In [2]:
# Read the data file
names = ['id','age', 'gender', 'education', 'neuroticism', 'extraversion', 'openness',
'agreeableness','conscientiousness', 'impulsiveness', 'sensation', 'cannabis']
df = pd.read_csv('data/validation_data_personality_scores_ALL_personalities.csv', encoding='utf-8', names=names)
df = df.drop(df.index[0])

In [3]:
# Original data had cannabis usage divided into 7 categories
#     'CL0' : 'Never Used', 
#     'CL1' : 'Used over a Decade Ago'
#     'CL2' : 'Used in Last Decade'
#     'CL3' : 'Used in Last Year'
#     'CL4' : 'Used in Last Month'
#     'CL5' : 'Used in Last Week'
#     'CL6' : 'Used in Last Day'

# Cannabis used categories were combined into one category.
# 0 = never used
# 1 = used -past or present-

df.head()

Unnamed: 0,id,age,gender,education,neuroticism,extraversion,openness,agreeableness,conscientiousness,impulsiveness,sensation,cannabis
1,55235143,6,0,7,14,34,42,18,30,5,3,1
2,13435143,1,0,9,39,42,37,16,36,2,1,1
3,81345143,1,0,6,16,16,26,40,19,5,10,1
4,53955143,1,0,9,50,37,26,35,31,2,3,1
5,51775143,2,1,9,46,34,47,24,33,4,10,0


In [4]:
# Filter the columns
df_cannabis = df[['id','age', 'gender', 'education',
          'neuroticism', 'extraversion', 
          'openness', 'agreeableness',
          'conscientiousness', 'sensation', 'impulsiveness',
          'cannabis']]
df_cannabis.head()

Unnamed: 0,id,age,gender,education,neuroticism,extraversion,openness,agreeableness,conscientiousness,sensation,impulsiveness,cannabis
1,55235143,6,0,7,14,34,42,18,30,3,5,1
2,13435143,1,0,9,39,42,37,16,36,1,2,1
3,81345143,1,0,6,16,16,26,40,19,10,5,1
4,53955143,1,0,9,50,37,26,35,31,3,2,1
5,51775143,2,1,9,46,34,47,24,33,10,4,0


In [5]:
print(df_cannabis.nunique())

id                   56276
age                      6
gender                   2
education                9
neuroticism             49
extraversion            42
openness                35
agreeableness           41
conscientiousness       41
sensation               11
impulsiveness           10
cannabis                 2
dtype: int64


In [6]:
# Check column datatypes
print(df_cannabis.dtypes)

id                   object
age                  object
gender               object
education            object
neuroticism          object
extraversion         object
openness             object
agreeableness        object
conscientiousness    object
sensation            object
impulsiveness        object
cannabis             object
dtype: object


In [7]:
# Convert datatypes to int
df_cannabis["id"] = df_cannabis['id'].astype('float')
df_cannabis["age"] = df_cannabis['age'].astype('float')
df_cannabis["gender"] = df_cannabis['gender'].astype('float')
df_cannabis["education"] = df_cannabis['education'].astype('float')
df_cannabis["neuroticism"] = df_cannabis['neuroticism'].astype('float')
df_cannabis["extraversion"] = df_cannabis['extraversion'].astype('float')
df_cannabis["openness"] = df_cannabis['openness'].astype('float')
df_cannabis["agreeableness"] = df_cannabis['agreeableness'].astype('float')
df_cannabis["conscientiousness"] = df_cannabis['conscientiousness'].astype('float')
df_cannabis["cannabis"] = df_cannabis['cannabis'].astype('float')
df_cannabis["sensation"] = df_cannabis['sensation'].astype('float')
df_cannabis["impulsiveness"] = df_cannabis['impulsiveness'].astype('float')
print(df_cannabis.dtypes)

id                   float64
age                  float64
gender               float64
education            float64
neuroticism          float64
extraversion         float64
openness             float64
agreeableness        float64
conscientiousness    float64
sensation            float64
impulsiveness        float64
cannabis             float64
dtype: object


In [8]:
# Check if personality traits are independent variables
# Negative value means no correlation

import sklearn
from sklearn import preprocessing
from sklearn.preprocessing import scale
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
import sklearn.metrics as sm

import scipy
from scipy.stats import spearmanr

age = df_cannabis["age"]
gender = df_cannabis["gender"]
education = df_cannabis["education"]
neuroticism= df_cannabis["neuroticism"]
extraversion = df_cannabis["extraversion"]
openness = df_cannabis["openness"]
agreeableness = df_cannabis["agreeableness"]
conscientiousness = df_cannabis["conscientiousness"]
sensation = df_cannabis["sensation"]
impulsiveness = df_cannabis["impulsiveness"]

def spearman_coefficient(categories, category_names):
    
    for category, name in zip(categories, category_names):
        for compared_category, compared_name in zip(categories, category_names):
            spearmanr_coff, p_value = spearmanr(category, compared_category)
            print(f"Spearmen Coefficient {name}-{compared_name}: \n{spearmanr_coff}")
        
category_list = [age, gender, education, neuroticism, extraversion, openness, agreeableness, conscientiousness, sensation,
                impulsiveness]
category_names = ['age', 'gender', 'education', 'neuroticism', 'extraversion', 'openness', 
                  'agreeableness', 'conscientiousness', 'sensation', 'impulsiveness']

spearman_coefficient(category_list, category_names)

Spearmen Coefficient age-age: 
0.9999999999999999
Spearmen Coefficient age-gender: 
0.00954653099867302
Spearmen Coefficient age-education: 
0.0005147918017486068
Spearmen Coefficient age-neuroticism: 
-0.004965961159821288
Spearmen Coefficient age-extraversion: 
0.004134781981476818
Spearmen Coefficient age-openness: 
0.0006860193598205816
Spearmen Coefficient age-agreeableness: 
0.0008285203266440796
Spearmen Coefficient age-conscientiousness: 
-0.005758754773494007
Spearmen Coefficient age-sensation: 
0.002537037360789293
Spearmen Coefficient age-impulsiveness: 
0.005558935952372676
Spearmen Coefficient gender-age: 
0.00954653099867302
Spearmen Coefficient gender-gender: 
1.0
Spearmen Coefficient gender-education: 
0.0026718624040433184
Spearmen Coefficient gender-neuroticism: 
0.007747379193022096
Spearmen Coefficient gender-extraversion: 
0.004116955848352225
Spearmen Coefficient gender-openness: 
-0.0027926485066409375
Spearmen Coefficient gender-agreeableness: 
0.001936712559664

In [9]:
# Check number of values for cannabis usage (1 = used, 0 = not used)
df_cannabis['cannabis'].value_counts()

1.0    44052
0.0    12224
Name: cannabis, dtype: int64

In [10]:
# Check for nulls
df_cannabis.isnull().sum().head()

id             0
age            0
gender         0
education      0
neuroticism    0
dtype: int64

In [11]:
target = df_cannabis["cannabis"]
data = df_cannabis.drop(columns=["cannabis","id"])

data.head()

Unnamed: 0,age,gender,education,neuroticism,extraversion,openness,agreeableness,conscientiousness,sensation,impulsiveness
1,6.0,0.0,7.0,14.0,34.0,42.0,18.0,30.0,3.0,5.0
2,1.0,0.0,9.0,39.0,42.0,37.0,16.0,36.0,1.0,2.0
3,1.0,0.0,6.0,16.0,16.0,26.0,40.0,19.0,10.0,5.0
4,1.0,0.0,9.0,50.0,37.0,26.0,35.0,31.0,3.0,2.0
5,2.0,1.0,9.0,46.0,34.0,47.0,24.0,33.0,10.0,4.0


In [12]:
from sklearn.preprocessing import OneHotEncoder

columnsToEncode = ['age', 'gender', 'education', 'neuroticism', 'extraversion',
            'openness', 'agreeableness', 'conscientiousness', 'sensation', 'impulsiveness']
data_reindex = data.reset_index(drop=True)


def one_hot(df, cols):
    """
    @param df pandas DataFrame
    @param cols a list of columns to encode 
    @return a DataFrame with one-hot encoding
    """
    for each in cols:
        dummies = pd.get_dummies(df[each], prefix=each, drop_first=False)
        df = pd.concat([df, dummies], axis=1)
    return df

one_hot_data = one_hot(data_reindex, columnsToEncode)
one_hot_data = one_hot_data.drop(columns=columnsToEncode)
one_hot_data.head()

Unnamed: 0,age_1.0,age_2.0,age_3.0,age_4.0,age_5.0,age_6.0,gender_0.0,gender_1.0,education_1.0,education_2.0,...,impulsiveness_1.0,impulsiveness_2.0,impulsiveness_3.0,impulsiveness_4.0,impulsiveness_5.0,impulsiveness_6.0,impulsiveness_7.0,impulsiveness_8.0,impulsiveness_9.0,impulsiveness_10.0
0,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,1,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,1,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0


In [13]:
from sklearn.externals import joblib
import pickle

# Load and test Logistic Regression model
filename = 'cannabis_logistic_model.sav'
loaded_model = pickle.load(open(filename, 'rb'))

result = loaded_model.score(one_hot_data, target)
print(f"Accuracy: {result}")

Accuracy: 0.6698770346151113


In [14]:
# Make predictions with Logistic Regression model
model_log = loaded_model
y_predict = model_log.predict(one_hot_data)
log_report = classification_report(target, y_predict)
print(log_report)

              precision    recall  f1-score   support

         0.0       0.22      0.20      0.21     12224
         1.0       0.78      0.80      0.79     44052

   micro avg       0.67      0.67      0.67     56276
   macro avg       0.50      0.50      0.50     56276
weighted avg       0.66      0.67      0.67     56276



In [15]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for Logistic Regression model
tn, fp, fn, tp = confusion_matrix(target, y_predict).ravel()
(tn, fp, fn, tp)

(2466, 9758, 8820, 35232)

In [16]:
# Load and test SVC model
filename = 'cannabis_svc_model.sav'
loaded_model = pickle.load(open(filename, 'rb'))

result = loaded_model.score(one_hot_data, target)
print(f"Accuracy: {result}")

Accuracy: 0.594551851588599


In [17]:
# Make predictions for SVC
model_log = loaded_model
y_predict = model_log.predict(one_hot_data)
log_report = classification_report(target, y_predict)
print(log_report)

              precision    recall  f1-score   support

         0.0       0.22      0.34      0.27     12224
         1.0       0.78      0.67      0.72     44052

   micro avg       0.59      0.59      0.59     56276
   macro avg       0.50      0.50      0.49     56276
weighted avg       0.66      0.59      0.62     56276



In [18]:
# Confusion matrix for SVC model
tn, fp, fn, tp = confusion_matrix(target, y_predict).ravel()
(tn, fp, fn, tp)

(4140, 8084, 14733, 29319)