# NSDUH Drug Sequence Analysis Part 3:  Chi-Square Analysis
## Matthew J. Beattie
## University of Oklahoma
__February 5, 2021__

This notebook does a deeper dive on Chi-Square analysis of the clusters with regard to their demographic categories. 

In [1]:
"""
Import python modules
"""
import pandas as pd
import numpy as np
import copy
import os
import pathlib, itertools
import time as timelib
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import random
from sklearn_extra import cluster as cs
from sklearn.cluster import KMeans
import pickle
import json
import pathutils as pu
import scipy.stats as stats
from math import dist
import mlflow

# Parallelize pandas with modin and dash
#import modin.pandas as pd

HOME_DIR = pathlib.Path.home()
CW_DIR = pathlib.Path.cwd()

FIGW = 12
FIGH = 5
FONTSIZE = 8
FIGURESIZE = (FIGW,FIGH)

plt.rcParams['figure.figsize'] = (FIGW, FIGH)
plt.rcParams['font.size'] = FONTSIZE

plt.rcParams['xtick.labelsize'] = FONTSIZE
plt.rcParams['ytick.labelsize'] = FONTSIZE

random.seed(660806)


### Import Data
Import data from pickle files and prepare for analysis

In [4]:
# Set parameters
datapath = 'C:/Users/mjbea/OneDrive/GitHub/abuse_sequence/Data3/'
workingpath = 'C:/Users/mjbea/OneDrive/GitHub/abuse_sequence/Code3/'
outpath = 'C:/Users/mjbea/OneDrive/GitHub/abuse_sequence/Output3/'
year = '2016_2017_2018_2019'
jsondict = datapath + 'NSDUH_field.json'
n_clusters = 11

# Setup filenames
clustpkl = workingpath + 'Kmeans_' + str(n_clusters) + 'clust_' + str(year) + '_nonullpath_clust.pkl'
demogpkl = workingpath + 'Kmeans_' + str(n_clusters) + 'clust_' + str(year) + '_nonullpath_demog.pkl'
modpkl = workingpath + 'Kmeans_' + str(n_clusters) + 'clust_' + str(year) + '_nonullpath_model.pkl'

# Read in cluster and demographic information and the model
dfclust = pd.read_pickle(clustpkl)
dfdemog = pd.read_pickle(demogpkl)
model = pickle.load(open(modpkl, 'rb'))

# Drug name and indices are called by a user-defined function.
ident, rawafuvals, afuvals, drugnames, drugorder, drugnums, drugposition, startdemog, demographics = pu.surveyvars(year)

# Decode dictionaries for NSDUH variables:
f1 = open(jsondict, 'r')
nsduhDecoder = json.load(f1)
f1.close()

In [5]:
strings = 0
ints = 0
selse = 0
for i in range(0,len(dfclust['RESPID'])):
    if type(dfclust['RESPID'][i])==str:
        strings += 1
    elif type(dfclust['RESPID'][i])==int:
        ints += 1
    else:
        selse += 1
print(strings, ints, selse)

170944 0 0


In [7]:
model.inertia_

45120796010121.57

### Review data and key variables

In [8]:
# Show cluster dataframe
dfclust['HEROIN_USER'] = dfclust.apply(lambda row: 'Heroin User' if (row['AFUVECT'][8] > 0 and row['AFUVECT'][8] < 991) else 'Not Heroin User', axis=1)
dfclust.head()

Unnamed: 0,RESPID,AFUVECT,YRWEIGHT,labels,HEROIN_USER
0,201611635143.0,"[0, 16, 15, 20, 991, 991, 991, 991, 991, 991]",204.858562,3,Not Heroin User
1,201635755143.0,"[0, 26, 16, 991, 991, 991, 991, 991, 991, 991]",2533.458396,0,Not Heroin User
2,201692675143.0,"[0, 5, 18, 32, 34, 991, 991, 991, 991, 991]",6203.973093,10,Not Heroin User
3,201659596143.0,"[0, 991, 14, 991, 991, 991, 991, 991, 991, 991]",1386.672703,2,Not Heroin User
4,201641106143.0,"[0, 991, 991, 991, 991, 991, 991, 991, 991, 991]",2384.841656,5,Not Heroin User


In [9]:
dfclust['HEROIN_USER'].value_counts()

Not Heroin User    167088
Heroin User          3856
Name: HEROIN_USER, dtype: int64

In [10]:
# Show demographic dataframe
dfdemog.head()

Unnamed: 0,RESPID,CATAG6,SVCFLAG,IRSEX,IRMARIT,NEWRACE2,EDUHIGHCAT,IRWRKSTAT,GOVTPROG,INCOME,COUTYP4,AIIND102,YRWEIGHT,labels
0,201611635143.0,3,0,2,1,1,4,4,2,4,3,2,204.858562,3
1,201635755143.0,4,0,1,1,7,1,3,2,2,1,2,2533.458396,0
2,201692675143.0,6,0,2,1,1,3,2,2,3,1,2,6203.973093,10
3,201659596143.0,3,0,1,1,5,4,4,2,2,2,2,1386.672703,2
4,201641106143.0,5,0,1,2,1,2,4,2,2,2,2,2384.841656,5


In [11]:
# Show category values for demographic variables
nsduhDecoder

{'CATAG6': {'values': {'1': 'Youth',
   '2': '18-25',
   '3': '26-34',
   '4': '35-49',
   '5': '50-64',
   '6': '65+'}},
 'SVCFLAG': {'values': {'1': 'Military service', '0': 'No military service'}},
 'AGE2': {'values': {'1': '12',
   '2': '13',
   '3': '14',
   '4': '15',
   '5': '16',
   '6': '17',
   '7': '18',
   '8': '19',
   '9': '20',
   '10': '21',
   '11': '22 or 23',
   '12': '24 or 25',
   '13': '26 - 29',
   '14': '30 - 34',
   '15': '35 - 49',
   '16': '50 - 64',
   '17': '65+'}},
 'IRSEX': {'values': {'1': 'Male', '2': 'Female'}},
 'IRMARIT': {'values': {'1': 'Married',
   '2': 'Widowed',
   '3': 'Divorced/Sep',
   '4': 'Never Married',
   '99': 'Youth'}},
 'NEWRACE2': {'values': {'1': 'NonHisp White',
   '2': 'NonHisp Black',
   '3': 'NonHisp NatAmer',
   '4': 'NonHisp HI/PI',
   '5': 'NonHisp Asian',
   '6': 'NonHisp >1 Race',
   '7': 'Hispanic'}},
 'EDUHIGHCAT': {'values': {'1': '< High School',
   '2': 'HS Grad',
   '3': 'Some Coll',
   '4': 'Coll Grad',
   '5': 'You

In [12]:
catcrossnorm = pd.crosstab(index=dfclust['labels'], columns=['Percentage'], normalize='columns')
catcrossnorm

col_0,Percentage
labels,Unnamed: 1_level_1
0,0.178205
1,0.057487
2,0.170179
3,0.205254
4,0.056674
5,0.113838
6,0.062541
7,0.055451
8,0.036217
9,0.026962


In [9]:
# Set output file variables
jsonout = outpath + 'Kmeans_' + str(n_clusters) + 'clust_' + str(year) + '_alldata_clust_stats.json'
outfile = outpath + 'Kmeans_' + str(n_clusters) + 'clust_' + str(year) + '_chi2_output.txt'
outfile2 = outpath + 'Kmeans_' + str(n_clusters) + 'clust_' + str(year) + '_crosstab_latex.txt'
csvout = outpath + 'Kmeans_' + str(n_clusters) + 'clust_' + str(year) + '_ranked_chi2.csv'

# Calculate medoids from cluster model and store center and medoids into a dataframe and dictionary for output to json
clustsmrydf = pd.DataFrame()
tempdict = {}
medoids = []
for j in range(0, len(model.cluster_centers_)):
    clustcenter = model.cluster_centers_[j]
    clust = dfclust.loc[dfclust['labels']==j,:]
    intcenter = np.array(clustcenter, dtype=int)
    if clust.shape[0] > 0:
        clust['distfrommean'] = clust.apply(lambda row: dist(np.array(row['AFUVECT']), clustcenter), axis=1)
        count = clust.shape[0]
        clustfrac = round(clust.shape[0]/dfclust.shape[0],5)
        medoid = clust.loc[clust['distfrommean'].idxmin()]['AFUVECT']
        medoids.append(medoid)
        path = pu.uncodevect(medoid, drugnames)
        clustsmrydf = clustsmrydf.append({'clustnum': j, 'count': count, 'clustfrac': clustfrac, 'center': clustcenter, 
                                          'intcenter': intcenter, 'medoid': medoid, 'path': path}, ignore_index=True)
        tempdict.update({j: {'count': count, 'clustfrac': clustfrac, 'center': clustcenter.tolist(), 'intcenter': intcenter.tolist(), 'medoid': medoid, 'path': path}})
    else:
        clustsmrydf = clustsmrydf.append({'clustnum': j, 'count': 0, 'clustfrac': 0, 'center': clustcenter, 'intcenter': intcenter, 'medoid': 'NA', 'path': 'NA'}, ignore_index=True)
        tempdict.update({j: {'count': 0, 'clustfrac': 0, 'center': clustcenter.tolist(), 'intcenter': intcenter.tolist(), 'medoid': 'NA', 'path': 'NA'}})

# Save cluster dictionary to json file
with open(jsonout,'w') as cluststats_dumped:
    json.dump(tempdict,cluststats_dumped, indent = 4, sort_keys = True)

cluststats_dumped.close()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [10]:
# Print cluster summary output leave file open for crosstabs
f = open(outfile, 'w')

# Open latex output file
f2 = open(outfile2, 'w')

print('\n******\nCLUSTER CENTERS\n******', file=f)
print(model.cluster_centers_, file=f)
print('\n******\n INTEGER CLUSTER CENTERS\n******', file=f)
print(np.array(model.cluster_centers_, dtype='int'), file=f)
print('\n******\n CLUSTER MEDOIDS\n******\n', file=f)
print(medoids, file=f)
print('\n******\nCLUSTER ITERATIONS\n******', file=f)
print(model.n_iter_, file=f)
print('\n******\nCLUSTER INERTIA\n******', file=f)
print(model.inertia_, file=f)

# Print out cluster pathways and save translation to a list
print('\n******\nKNN CLUSTER PATHWAYS\n******', file=f)
centerpaths = []
i = 0
for i in range(0, len(model.cluster_centers_)):
    path = pu.uncodevect(model.cluster_centers_[i], drugnames)
    print('Cluster ', i, '\n', path, file=f)
    centerpaths.append(path)
    
# Print out cluster pathways and save translation to a list
print('\n******\nKNN PSUDEO-CLUSTER PATHWAYS\n******', file=f)
centerpaths = []
i = 0
for i in range(0, len(model.cluster_centers_)):
    path = pu.uncodevect(model.cluster_centers_[i], drugnames)
    path = [item for item in path if item[1] < 99]
    print('Cluster ', i, '\n', path, file=f)
    centerpaths.append(path)
    
print('\n******\nPSUEDO-MEDOID CLUSTER PATHWAYS\n******', file=f)
centerpaths = []
i = 0
for i in range(0, len(model.cluster_centers_)):
    path = pu.uncodevect(medoids[i], drugnames)
    print('Cluster ', i, '\n', path, file=f)
    centerpaths.append(path)
    
print('\n******\nCLUSTER RAW COUNTS\n******', file=f)
print(dfclust['labels'].value_counts(), file=f)

print('\n******\nCLUSTER PERCENTAGES\n******', file=f)
print(pd.crosstab(index=dfclust['labels'], columns=['Percentage'], normalize='columns'), file=f)
print('Fraction of respondents by cluster:', file=f2)
print(pd.crosstab(index=dfclust['labels'], columns=['Percentage'], normalize='columns').to_latex(), file=f2)

print('\n******\nEND OF SECTION\n******\n', file=f)


### Generate categorical crosstabs
The demographic dataset needs to be converted to crosstabs for chi-square analysis

In [11]:
# Convert fields in the demographic database to category variables.
catvars = copy.deepcopy(demographics)
catvars.remove('AGE2')
catvars.remove('RESPID')

# Create a new dataframe for translation of categorical values into
# reader friendly values from the NSDUH dictionary.
# Youth dictionary values are not considered.
tmpdf = copy.deepcopy(dfdemog)
pipeprep = Pipeline([('makecats', pu.MakeCat(catvars))])
tmpdf = pipeprep.transform(tmpdf)

for fieldname in tmpdf.columns:
    newvallist = []
    if fieldname != 'RESPID' and fieldname != 'labels':
        for key, value in nsduhDecoder[fieldname]['values'].items():
            if value != "Youth":
                newvallist.append(value)
        tmpdf[fieldname].cat.categories = newvallist

### Run Chi-square analysis
Run chi2 for each demographic variable.  Calculate chi2 values for the individual cells and store their values along with the fraction of the total table chi2.  Call this last statistic _INFLUENCE_.

In [12]:
# Initialize individual chi2 dataframe
indchi2df = pd.DataFrame(columns = ['TABLE', 'VALUE', 'CLUSTER', 'OBSERVED', 'EXPECTED', 'DIFF', 'INDCHI2', 'INFLUENCE'])

print('\n*****\nCROSSTABS AND CHISQUARE TEST RESULTS\n*****\n', file=f)
for item in catvars:
    catcross = pd.crosstab(index=tmpdf['labels'], columns=tmpdf[item])
    catcrossnorm = pd.crosstab(index=tmpdf['labels'], columns=tmpdf[item], normalize='columns')
    chi2, p, dof, ex = stats.chi2_contingency(catcross)
    print('\n******', item, '******\n', file=f)
    print('CROSSTAB COLUMN PERCENTAGES', file=f)
    print(catcrossnorm, '\n', file=f)
    print(catcrossnorm.to_latex(index=False), '\n', file=f2)
    print('CHI SQUARE RESULTS', file=f)
    print('CHI2 value: ', chi2, 'p-value: ', p, 'Degrees of freedom: ', dof, '\n', file=f)

    # Write individual chi2 to table
    rowcnt, colcnt = catcross.shape[0], catcross.shape[1]
    for i in range(0, rowcnt):
        newrow = []
        for j in range(0, colcnt):
            table = item
            value = catcross.columns[j]
            observed = catcross.loc[i][j]
            expected = ex[i,j]
            diff = observed - expected
            indchi2 = (catcross.loc[i][j] - ex[i,j])**2/ex[i,j]
            dftmp = {'TABLE': table, 'VALUE': value, 'CLUSTER': i, 'OBSERVED': observed, 'EXPECTED': expected,
                     'DIFF': diff, 'INDCHI2': indchi2, 'INFLUENCE': indchi2/chi2}
            indchi2df = indchi2df.append(dftmp, ignore_index = True)
            

In [13]:
# Duplicate process for heroin use
catcross = pd.crosstab(index=dfclust['labels'], columns=dfclust['HEROIN_USER'])
catcrossnorm = pd.crosstab(index=dfclust['labels'], columns=dfclust['HEROIN_USER'], normalize='columns')
chi2, p, dof, ex = stats.chi2_contingency(catcross)
print('\n******', item, '******\n', file=f)
print('HEROIN USE CROSSTAB COLUMN PERCENTAGES', file=f)
print(catcrossnorm, '\n', file=f)
print(catcrossnorm.to_latex(index=False), '\n', file=f2)
print('CHI SQUARE RESULTS', file=f)
print('CHI2 value: ', chi2, 'p-value: ', p, 'Degrees of freedom: ', dof, '\n', file=f)

# Write individual chi2 to table for heroin use
rowcnt, colcnt = catcross.shape[0], catcross.shape[1]
for i in range(0, rowcnt):
    newrow = []
    for j in range(0, colcnt):
        table = 'HEROIN_USE'
        value = catcross.columns[j]
        observed = catcross.loc[i][j]
        expected = ex[i,j]
        diff = observed - expected
        indchi2 = (catcross.loc[i][j] - ex[i,j])**2/ex[i,j]
        dftmp = {'TABLE': table, 'VALUE': value, 'CLUSTER': i, 'OBSERVED': observed, 'EXPECTED': expected,
                 'DIFF': diff, 'INDCHI2': indchi2, 'INFLUENCE': indchi2/chi2}
        indchi2df = indchi2df.append(dftmp, ignore_index = True)


In [14]:
indchi2df.head()

Unnamed: 0,TABLE,VALUE,CLUSTER,OBSERVED,EXPECTED,DIFF,INDCHI2,INFLUENCE
0,SVCFLAG,Military service,0,27425,28674.810839,-1249.810839,54.473843,0.025564
1,SVCFLAG,No military service,0,3047,1797.189161,1249.810839,869.150096,0.407879
2,SVCFLAG,Military service,1,32644,33017.62562,-373.62562,4.227927,0.001984
3,SVCFLAG,No military service,1,2443,2069.37438,373.62562,67.458119,0.031657
4,SVCFLAG,Military service,2,18623,17854.003217,768.996783,33.121762,0.015544


### Rank the entries in the chi-square crosstab
To find the individual cells in the chi2 crosstab with the most influence, rank them by chi2 score.

In [15]:
indchi2df['rank'] = indchi2df.groupby('TABLE')['INDCHI2'].rank(method="first", ascending=False)
indchi2df.to_csv(csvout)
pd.set_option('max_rows', 100)
indchi2df.head(100)

Unnamed: 0,TABLE,VALUE,CLUSTER,OBSERVED,EXPECTED,DIFF,INDCHI2,INFLUENCE,rank
0,SVCFLAG,Military service,0,27425,28674.810839,-1249.810839,54.473843,0.025564,6.0
1,SVCFLAG,No military service,0,3047,1797.189161,1249.810839,869.150096,0.407879,1.0
2,SVCFLAG,Military service,1,32644,33017.62562,-373.62562,4.227927,0.001984,15.0
3,SVCFLAG,No military service,1,2443,2069.37438,373.62562,67.458119,0.031657,5.0
4,SVCFLAG,Military service,2,18623,17854.003217,768.996783,33.121762,0.015544,8.0
5,SVCFLAG,No military service,2,350,1118.996783,-768.996783,528.469841,0.248003,2.0
6,SVCFLAG,Military service,3,28057,27368.672829,688.327171,17.311555,0.008124,10.0
7,SVCFLAG,No military service,3,1027,1715.327171,-688.327171,276.212202,0.129622,3.0
8,SVCFLAG,Military service,4,5268,5230.19817,37.80183,0.273217,0.000128,22.0
9,SVCFLAG,No military service,4,290,327.80183,-37.80183,4.359275,0.002046,14.0


In [16]:
pd.reset_option('max_rows')

In [17]:
topindchi2 = indchi2df[indchi2df['rank']<=5]
topindchi2.head(100)

Unnamed: 0,TABLE,VALUE,CLUSTER,OBSERVED,EXPECTED,DIFF,INDCHI2,INFLUENCE,rank
1,SVCFLAG,No military service,0,3047,1797.189161,1249.810839,869.150096,0.407879,1.0
3,SVCFLAG,No military service,1,2443,2069.37438,373.62562,67.458119,0.031657,5.0
5,SVCFLAG,No military service,2,350,1118.996783,-768.996783,528.469841,0.248003,2.0
7,SVCFLAG,No military service,3,1027,1715.327171,-688.327171,276.212202,0.129622,3.0
17,SVCFLAG,No military service,8,275,587.778524,-312.778524,166.440932,0.078108,4.0
24,CATAG6,18-25,0,6698,9868.853753,-3170.853753,1018.792433,0.079182,3.0
28,CATAG6,65+,0,5363,2708.796518,2654.203482,2600.710713,0.202131,1.0
34,CATAG6,18-25,2,8382,6144.715223,2237.284777,814.59319,0.063311,5.0
62,CATAG6,50-64,7,1585,735.510249,849.489751,981.132266,0.076255,4.0
64,CATAG6,18-25,8,5194,3227.6515,1966.3485,1197.938013,0.093105,2.0


In [18]:
# Print out top individual chi2 to output
print(topindchi2, file=f)
print(topindchi2.to_latex(index=False), file=f2)

# Close print files
f.close()
f2.close()

# Log runs to mlflow
experiment_name = "Chi-Square Analysis"
mlflow.set_experiment(experiment_name)

with mlflow.start_run():
    mlflow.log_param("n_clusters", n_clusters)
    mlflow.log_param("Observations", 'All')
    
    mlflow.log_artifact(jsonout)
    mlflow.log_artifact(csvout)
    mlflow.log_artifact(outfile)
    mlflow.log_artifact(outfile2)
   
    # End mlflow run
    mlflow.end_run()        
