# Python for data analysis : Drug consumption case
---

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Names for the 32 columns of the data file, which ships without a header row:
# the 13 leading columns, then one "<NAME>_CONSUMPTION" column per consumed item.
columns = [
    'ID', 'AGE', 'GENDER', 'EDUCATION_LEVEL', 'COUNTRY', 'ETHNICITY',
    'NSCORE_VALUE', 'ESCORE_VALUE', 'OSCORE_VALUE', 'ASCORE_VALUE', 'CSCORE_VALUE',
    'IMPULSIVENESS', 'SENSATION_SEEING',
] + [
    f"{item}_CONSUMPTION"
    for item in (
        'ALCOHOL', 'AMPHET', 'AMYL', 'BENZOS', 'CAFFEINE', 'CANNABIS',
        'CHOCOLATE', 'COKE', 'CRACK', 'ECSTASY', 'HEROIN', 'KETAMINE',
        'LEGAL_HIGHS', 'LSD', 'METH', 'MAGIC_MUSHROOMS', 'NICOTINE',
        'SEMER', 'VSA',
    )
]

# Read the raw file straight from the UCI repository.
# Offline alternative: point read_csv at a local copy instead, e.g.
#   /datasets/drug_consumption_dataset/drug_consumption.data
dataset = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/00373/drug_consumption.data",
    header=None,
)
dataset.columns = columns

In [3]:
# Sanity check: (number of rows, number of columns) of the loaded frame.
dataset.shape

(1885, 32)

In [4]:
# Preview the first 15 rows of the loaded data, before any column is decoded.
dataset.head(15)

Unnamed: 0,ID,AGE,GENDER,EDUCATION_LEVEL,COUNTRY,ETHNICITY,NSCORE_VALUE,ESCORE_VALUE,OSCORE_VALUE,ASCORE_VALUE,...,ECSTASY_CONSUMPTION,HEROIN_CONSUMPTION,KETAMINE_CONSUMPTION,LEGAL_HIGHS_CONSUMPTION,LSD_CONSUMPTION,METH_CONSUMPTION,MAGIC_MUSHROOMS_CONSUMPTION,NICOTINE_CONSUMPTION,SEMER_CONSUMPTION,VSA_CONSUMPTION
0,1,0.49788,0.48246,-0.05921,0.96082,0.126,0.31287,-0.57545,-0.58331,-0.91699,...,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL2,CL0,CL0
1,2,-0.07854,-0.48246,1.98437,0.96082,-0.31685,-0.67825,1.93886,1.43533,0.76096,...,CL4,CL0,CL2,CL0,CL2,CL3,CL0,CL4,CL0,CL0
2,3,0.49788,-0.48246,-0.05921,0.96082,-0.31685,-0.46725,0.80523,-0.84732,-1.6209,...,CL0,CL0,CL0,CL0,CL0,CL0,CL1,CL0,CL0,CL0
3,4,-0.95197,0.48246,1.16365,0.96082,-0.31685,-0.14882,-0.80615,-0.01928,0.59042,...,CL0,CL0,CL2,CL0,CL0,CL0,CL0,CL2,CL0,CL0
4,5,0.49788,0.48246,1.98437,0.96082,-0.31685,0.73545,-1.6334,-0.45174,-0.30172,...,CL1,CL0,CL0,CL1,CL0,CL0,CL2,CL2,CL0,CL0
5,6,2.59171,0.48246,-1.22751,0.24923,-0.31685,-0.67825,-0.30033,-1.55521,2.03972,...,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL6,CL0,CL0
6,7,1.09449,-0.48246,1.16365,-0.57009,-0.31685,-0.46725,-1.09207,-0.45174,-0.30172,...,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL6,CL0,CL0
7,8,0.49788,-0.48246,-1.7379,0.96082,-0.31685,-1.32828,1.93886,-0.84732,-0.30172,...,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL0
8,9,0.49788,0.48246,-0.05921,0.24923,-0.31685,0.62967,2.57309,-0.97631,0.76096,...,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL6,CL0,CL0
9,10,1.82213,-0.48246,1.16365,0.96082,-0.31685,-0.24649,0.00332,-1.42424,0.59042,...,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL6,CL0,CL0


In [5]:
# Replace the opaque numeric codes with human-readable labels.
_AGE_LABELS = {
    -0.95197: "18-24",
    -0.07854: "25-34",
    0.49788: "35-44",
    1.09449: "45-54",
    1.82213: "55-64",
    2.59171: "65+",
}


def age(value):
    """Translate an encoded AGE value into its age-bracket label.

    The code is rounded to 5 decimal places before the lookup; anything that
    is not one of the six known codes yields "undetermined".
    """
    return _AGE_LABELS.get(round(value, 5), "undetermined")


In [6]:
_EDUCATION_LABELS = {
    -2.43591: "Left school before 16 years",
    -1.73790: "Left school at 16 years",
    -1.43719: "Left school at 17 years",
    -1.22751: "Left school at 18 years",
    -0.61113: "Some college or university, no certificate or degree",
    -0.05921: "Professional certificate/diploma",
    0.45468: "University degree",
    1.16365: "Masters degree",
    1.98437: "Doctorate degree",
}


def education_level(value):
    """Translate an encoded EDUCATION_LEVEL value into its text label.

    The code is rounded to 5 decimal places before the lookup; anything that
    is not one of the nine known codes yields "undetermined".
    """
    return _EDUCATION_LABELS.get(round(value, 5), "undetermined")

In [7]:
_COUNTRY_LABELS = {
    -0.09765: "Australia",
    0.24923: "Canada",
    -0.46841: "New Zealand",
    -0.28519: "Other",
    0.21128: "Republic of Ireland",
    0.96082: "UK",
    -0.57009: "USA",
}


def country(value):
    """Translate an encoded COUNTRY value into the country name.

    The code is rounded to 5 decimal places before the lookup; anything that
    is not one of the seven known codes yields "undetermined".
    """
    return _COUNTRY_LABELS.get(round(value, 5), "undetermined")

In [8]:
_ETHNICITY_LABELS = {
    -0.50212: "Asian",
    -1.10702: "Black",
    1.90725: "Mixed-Black/Asian",
    0.12600: "Mixed-White/Asian",
    -0.22166: "Mixed-White/Black",
    0.11440: "Other",
    -0.31685: "White",
}


def ethnicity(value):
    """Translate an encoded ETHNICITY value into its text label.

    The code is rounded to 5 decimal places before the lookup; anything that
    is not one of the seven known codes yields "undetermined".
    """
    return _ETHNICITY_LABELS.get(round(value, 5), "undetermined")

In [9]:
def score(value, score_type): 
    """Return the rank of `value` among the distinct values of a dataset column.

    The distinct values of ``dataset[score_type]`` are sorted in ascending
    order and rounded to 5 decimals; `value` is rounded the same way and the
    position of its first match is returned (0 for the smallest distinct value).

    Raises IndexError when the rounded value does not occur in the column.
    """
    distinct_sorted = np.sort(dataset[score_type].unique())
    candidates = np.around(distinct_sorted, 5)
    matches = np.flatnonzero(candidates == round(value, 5))
    return matches[0]

def n_score(value): 
    """Convert an encoded NSCORE_VALUE into an integer score starting at 12.

    The result is 12 plus the rank of `value` among the distinct NSCORE_VALUE
    entries of `dataset`, as computed by `score`. This equals indexing into a
    list ``[12, 13, ...]`` of one entry per distinct value, without rebuilding
    that list and rescanning the column on every call.

    NOTE(review): the distinct codes are numbered consecutively from 12; that
    only matches the questionnaire's real scores if no intermediate score is
    missing from the data - confirm against the dataset documentation.

    Raises IndexError (from `score`) when `value` is not found in the column.
    """
    # int(...) keeps the return a plain Python int, as the list lookup did.
    return 12 + int(score(value, "NSCORE_VALUE"))
    
def e_score(value): 
    """Convert an encoded ESCORE_VALUE into an integer score starting at 16.

    The result is 16 plus the rank of `value` among the distinct ESCORE_VALUE
    entries of `dataset`, as computed by `score`. This equals indexing into a
    list ``[16, 17, ...]`` of one entry per distinct value, without rebuilding
    that list and rescanning the column on every call.

    NOTE(review): the distinct codes are numbered consecutively from 16; that
    only matches the questionnaire's real scores if no intermediate score is
    missing from the data - confirm against the dataset documentation.

    Raises IndexError (from `score`) when `value` is not found in the column.
    """
    # int(...) keeps the return a plain Python int, as the list lookup did.
    return 16 + int(score(value, "ESCORE_VALUE"))

def o_score(value): 
    """Convert an encoded OSCORE_VALUE into an integer score starting at 24.

    The result is 24 plus the rank of `value` among the distinct OSCORE_VALUE
    entries of `dataset`, as computed by `score`. This equals indexing into a
    list ``[24, 25, ...]`` of one entry per distinct value, without rebuilding
    that list and rescanning the column on every call.

    NOTE(review): the distinct codes are numbered consecutively from 24; that
    only matches the questionnaire's real scores if no intermediate score is
    missing from the data - confirm against the dataset documentation.

    Raises IndexError (from `score`) when `value` is not found in the column.
    """
    # int(...) keeps the return a plain Python int, as the list lookup did.
    return 24 + int(score(value, "OSCORE_VALUE"))

def a_score(value): 
    """Convert an encoded ASCORE_VALUE into an integer score starting at 12.

    The result is 12 plus the rank of `value` among the distinct ASCORE_VALUE
    entries of `dataset`, as computed by `score`. This equals indexing into a
    list ``[12, 13, ...]`` of one entry per distinct value, without rebuilding
    that list and rescanning the column on every call.

    NOTE(review): the distinct codes are numbered consecutively from 12; that
    only matches the questionnaire's real scores if no intermediate score is
    missing from the data - confirm against the dataset documentation.

    Raises IndexError (from `score`) when `value` is not found in the column.
    """
    # int(...) keeps the return a plain Python int, as the list lookup did.
    return 12 + int(score(value, "ASCORE_VALUE"))

def c_score(value): 
    """Convert an encoded CSCORE_VALUE into an integer score starting at 17.

    The result is 17 plus the rank of `value` among the distinct CSCORE_VALUE
    entries of `dataset`, as computed by `score`. This equals indexing into a
    list ``[17, 18, ...]`` of one entry per distinct value, without rebuilding
    that list and rescanning the column on every call.

    NOTE(review): the distinct codes are numbered consecutively from 17; that
    only matches the questionnaire's real scores if no intermediate score is
    missing from the data - confirm against the dataset documentation.

    Raises IndexError (from `score`) when `value` is not found in the column.
    """
    # int(...) keeps the return a plain Python int, as the list lookup did.
    return 17 + int(score(value, "CSCORE_VALUE"))


In [10]:
# Decode every encoded column into its readable form, one decoder per column.
# NOTE: the columns are overwritten in place, so this cell expects the freshly
# loaded numeric data; running it a second time without reloading will fail
# (the decoders compare/round values that are by then strings).
column_decoders = {
    "GENDER": lambda sex: "F" if sex > 0 else "M",
    "AGE": age,
    "EDUCATION_LEVEL": education_level,
    "COUNTRY": country,
    "ETHNICITY": ethnicity,
    "NSCORE_VALUE": n_score,
    "OSCORE_VALUE": o_score,
    "ESCORE_VALUE": e_score,
    "ASCORE_VALUE": a_score,
    "CSCORE_VALUE": c_score,
}
for column_name, decode in column_decoders.items():
    dataset[column_name] = dataset[column_name].apply(decode)

In [11]:
# Display the decoded frame (pandas abbreviates the rendering of a frame this large).
dataset

Unnamed: 0,ID,AGE,GENDER,EDUCATION_LEVEL,COUNTRY,ETHNICITY,NSCORE_VALUE,ESCORE_VALUE,OSCORE_VALUE,ASCORE_VALUE,...,ECSTASY_CONSUMPTION,HEROIN_CONSUMPTION,KETAMINE_CONSUMPTION,LEGAL_HIGHS_CONSUMPTION,LSD_CONSUMPTION,METH_CONSUMPTION,MAGIC_MUSHROOMS_CONSUMPTION,NICOTINE_CONSUMPTION,SEMER_CONSUMPTION,VSA_CONSUMPTION
0,1,35-44,F,Professional certificate/diploma,UK,Mixed-White/Asian,39,35,40,29,...,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL2,CL0,CL0
1,2,25-34,M,Doctorate degree,UK,White,29,51,53,40,...,CL4,CL0,CL2,CL0,CL2,CL3,CL0,CL4,CL0,CL0
2,3,35-44,M,Professional certificate/diploma,UK,White,31,44,38,24,...,CL0,CL0,CL0,CL0,CL0,CL0,CL1,CL0,CL0,CL0
3,4,18-24,F,Masters degree,UK,White,34,33,44,39,...,CL0,CL0,CL2,CL0,CL0,CL0,CL0,CL2,CL0,CL0
4,5,35-44,F,Doctorate degree,UK,White,43,27,41,33,...,CL1,CL0,CL0,CL1,CL0,CL0,CL2,CL2,CL0,CL0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1880,1884,18-24,F,"Some college or university, no certificate or ...",USA,White,25,50,55,40,...,CL0,CL0,CL0,CL3,CL3,CL0,CL0,CL0,CL0,CL5
1881,1885,18-24,M,"Some college or university, no certificate or ...",USA,White,33,50,48,40,...,CL2,CL0,CL0,CL3,CL5,CL4,CL4,CL5,CL0,CL0
1882,1886,25-34,F,University degree,USA,White,47,29,35,23,...,CL4,CL0,CL2,CL0,CL2,CL0,CL2,CL6,CL0,CL0
1883,1887,18-24,F,"Some college or university, no certificate or ...",USA,White,45,25,46,24,...,CL3,CL0,CL0,CL3,CL3,CL0,CL3,CL4,CL0,CL0


In [12]:
# Plots

# Pie charts: gender split (men/women), age-group proportions, country breakdown
# Histogram: most consumed drug
# Is there a link between ethnicity and education level?
# Interactive plots : https://towardsdatascience.com/interactive-graphs-in-python-830b1e6c197f

In [13]:
# https://plotly.com/python/histograms/
# https://plotly.com/python/

!pip install plotly
import plotly.express as px
df = px.data.tips()
fig = px.histogram(df, x="total_bill", color="sex")
fig.show()

You should consider upgrading via the '/opt/venv/bin/python -m pip install --upgrade pip' command.[0m
