# Substance Abuse and Mental Health Data Archive (SAMHDA)
## 2012 Survey on Drug Use by Age Groups
_By Nick Brooks, November 2020_

**Resources:**
- https://fivethirtyeight.com/features/how-baby-boomers-get-high/
- https://github.com/fivethirtyeight/data

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import time
import math
import itertools
from wordcloud import WordCloud
from itertools import combinations

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Module-level VADER sentiment analyzer instance.
# NOTE(review): nothing in the visible cells references SIA - confirm it is still needed.
SIA = SentimentIntensityAnalyzer()

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

# Plot styling, plus a start timestamp that the final cell reads to report total runtime.
sns.set_style("whitegrid")
notebookstart = time.time()
# Raise pandas' display limits (cell width, row count, line width, column count).
pd.options.display.max_colwidth = 500
pd.options.display.max_rows = 999
pd.options.display.width = 300
pd.options.display.max_columns = 100

In [None]:
def big_bar_cloud(plot_df, plt_set, x_var, columns, figsize, custom_palette = sns.color_palette("Paired")):
    """
    Draw one seaborn bar plot per entry of `plt_set` on a grid of subplots.

    Parameters
    ----------
    plot_df : data passed to `sns.barplot` as `data` for every subplot.
    plt_set : sequence of column names; each one is the y variable of one subplot.
    x_var : column name used as the x variable of every subplot.
    columns : number of subplot columns; the row count is
        ceil(len(plt_set) / columns).
    figsize : forwarded to `plt.subplots`.
    custom_palette : sequence of colors, cycled so each subplot gets the next one.

    Grid cells beyond len(plt_set) are switched off. Nothing is returned, and
    nothing is drawn when `plt_set` is empty.
    """
    if len(plt_set) == 0:
        # A zero-row grid cannot be created; there is nothing to plot anyway.
        return

    palette = itertools.cycle(custom_palette)
    rows = math.ceil(len(plt_set)/columns)
    # squeeze=False keeps `axes` two-dimensional even for a single row or column,
    # so the grid can always be walked in row-major order via `.flat`.
    fig, axes = plt.subplots(rows, columns, figsize=figsize, squeeze=False)
    for i, ax in enumerate(axes.flat):
        if i < len(plt_set):
            col = plt_set[i]
            # Plot from the frame that was passed in, not from a module-level `df`.
            sns.barplot(data=plot_df, x=x_var, y=col, ax=ax, color=next(palette), alpha=.8)
            ax.set_title("{} by {}".format(col, x_var))
        else:
            # Hide the leftover cells of an incompletely filled last row.
            ax.axis('off')
    fig.tight_layout(pad=2)
    
    
    
def rank_correlations(df, figsize=(12,20), n_charts = 18, polyorder = 2, custom_palette = sns.color_palette("Paired", 5)):
    """
    Plot regression charts for the most strongly correlated column pairs of `df`.

    Parameters
    ----------
    df : frame whose `.corr()` matrix is ranked; also the data source of each chart.
    figsize : forwarded to `plt.subplots`.
    n_charts : maximum number of column pairs to chart (three charts per row).
    polyorder : forwarded to `sns.regplot` as `order`.
    custom_palette : sequence of colors, cycled so each chart gets the next one.

    Every unordered pair of distinct columns is considered exactly once and the
    pairs are ranked by absolute correlation coefficient, largest first; pairs
    whose coefficient is NaN are ranked last. The grid is sized from the number
    of charts actually drawn and leftover cells are switched off. Nothing is
    drawn when there are no pairs to chart. Returns None after `plt.show()`.
    """
    palette = itertools.cycle(custom_palette)

    # Rank Correlations
    corr = df.corr()
    names = list(corr.columns)
    # Walk the upper triangle by position: each pair appears once, the diagonal
    # is skipped, and distinct pairs that happen to share a coefficient value
    # are all kept.
    top_corr = [(names[i], names[j], corr.iat[i, j])
                for i, j in itertools.combinations(range(len(names)), 2)]

    def _rank_key(pair):
        coef = pair[2]
        # NaN cannot be ordered, so push it behind every real coefficient.
        return (1, 0.0) if math.isnan(coef) else (0, -abs(coef))

    top_corr.sort(key=_rank_key)

    # Plot Top Correlations
    n_plots = min(n_charts, len(top_corr))
    if n_plots <= 0:
        return
    rows = math.ceil(n_plots / 3)
    # squeeze=False keeps `axes` two-dimensional even when there is one row.
    f, axes = plt.subplots(rows, 3, figsize=figsize, sharex=False, sharey=False, squeeze=False)
    flat_axes = axes.ravel()
    for ax, (x, y, cor) in zip(flat_axes, top_corr[:n_plots]):
        sns.regplot(x=x, y=y, data=df, order=polyorder, ax=ax, color=next(palette))
        ax.set_title('{} and {}'.format(x, y))
        ax.text(0.18, 0.93,"Cor Coef: {:.2f}".format(cor),
                ha='center', va='center', transform=ax.transAxes)
    # Hide the leftover cells of an incompletely filled last row.
    for ax in flat_axes[n_plots:]:
        ax.axis('off')
    plt.tight_layout(pad=0)
    plt.show()
    
    
# Data Exploration
def custom_describe(df, value_count_n = 5):
    """
    Custom Describe Function - More Tailored to categorical type variables.

    Builds one summary row per column of `df` and prints the frame's dimensions.

    Parameters
    ----------
    df : frame to summarise.
    value_count_n : how many of the most frequent values to report per column.

    Returns
    -------
    A DataFrame indexed by "Column" with the columns "Unique" (number of
    distinct values), "Missing" (null count), "dtype", followed by
    `value_count_n` pairs of "ValCount k" (the k-th most frequent value) and
    "Occ" (its number of occurrences). Columns with fewer than `value_count_n`
    distinct values are padded with NaN. A frame without columns yields an
    empty summary.
    """
    n_slots = max(value_count_n, 0)

    # The header depends only on value_count_n, so build it once up front;
    # this also keeps it defined when `df` has no columns.
    value_count_string = []
    for rank in range(1, n_slots + 1):
        value_count_string += ["ValCount {}".format(rank), "Occ"]

    unique_count = []
    for x in df.columns:
        # Slice by the requested depth rather than a fixed 5.
        value_count = df[x].value_counts().iloc[:n_slots]

        value_count_list = []
        for value, occurrences in value_count.items():
            value_count_list += [value, occurrences]
        # Pad so every row has the same width as the header.
        value_count_list += [np.nan] * (2 * (n_slots - len(value_count)))

        unique_count.append([x,
                             df[x].nunique(),
                             df[x].isnull().sum(),
                             df[x].dtypes] + value_count_list)

    print("Dataframe Dimension: {} Rows, {} Columns".format(*df.shape))
    return pd.DataFrame(unique_count,
            columns=["Column","Unique","Missing","dtype"
                    ] + value_count_string
                       ).set_index("Column")

# Signal that the helper-function cell finished executing.
print("Helper Functions Ready")

In [None]:
categorical_cols = ['age']

# Each substance contributes a "<name>-use" and a "<name>-frequency" column,
# followed by the single column 'n'. The names (including the
# "pain-releiver" spelling) are used verbatim to index the loaded CSV.
continuous_cols = [
    "{}-{}".format(substance, measure)
    for substance in (
        'alcohol',
        'marijuana',
        'cocaine',
        'crack',
        'heroin',
        'hallucinogen',
        'inhalant',
        'pain-releiver',
        'oxycontin',
        'tranquilizer',
        'stimulant',
        'meth',
        'sedative',
    )
    for measure in ('use', 'frequency')
] + ['n']

df = pd.read_csv("/kaggle/input/fivethirtyeight-drug-use-by-age-dataset/drug-use-by-age.csv")
print("DF Shape: {} Rows, {} Columns".format(*df.shape))

# Data Cleaning
# Coerce every continuous column to numeric; values that cannot be parsed become NaN.
df[continuous_cols] = df[continuous_cols].apply(pd.to_numeric, errors='coerce')

In [None]:
# Preview up to the first 30 rows of the loaded frame.
display(df.head(30))

In [None]:
# Summarise the categorical columns with the custom helper (distinct/missing
# counts and most frequent values) ...
print("Categorical Variables")
display(custom_describe(df[categorical_cols]))
# ... and the continuous columns with pandas' describe(), transposed so each
# column becomes one row.
print("Continuous Variables")
display(df[continuous_cols].describe().T)

In [None]:
# One bar chart per continuous column with 'age' on the x axis, two charts per row.
big_bar_cloud(plot_df=df, plt_set=continuous_cols, x_var='age', columns=2, figsize=[20,32])

In [None]:
# Annotated heatmap of the pairwise correlations between the continuous columns,
# with the color scale pinned to the full [-1, 1] range.
f, ax = plt.subplots(figsize=[14,12])
ax = sns.heatmap(
    data=df[continuous_cols].corr(),
    vmin=-1,
    vmax=1,
    annot=True,
    fmt=".1f",
    cbar_kws={'label': 'Correlation Coefficient'},
    ax=ax,
)
ax.set(title="Continuous Variable Correlation Matrix")
plt.show()

In [None]:
# Regression charts for the most strongly correlated pairs of continuous columns,
# using the helper's default chart count, polynomial order and palette.
rank_correlations(df = df.loc[:,continuous_cols])

In [None]:
# Report the wall-clock time elapsed since `notebookstart` was recorded, in minutes.
print("Script Complete - Runtime: {:.2f} Minutes".format((time.time() - notebookstart) / 60))