# NSDUH Drug Sequence Analysis Part 7:  Stable Cluster Assignment and Chi Square
## Matthew J. Beattie
## University of Oklahoma
__February 12, 2021__

### Stability Clusters Exploration
From Step 6, we choose a number of clusters, defined by connected components.  We then take the entire dataset and assign points to these clusters.  With the new clusters, we run chi-square analysis.

### Approach
* Set appropriate parameters
* Read in the cluster definitions from Step 6
* Assign data points to clusters based upon smallest distance to the center
* Run chi square analysis

In [1]:
"""
Import python modules
"""
import pandas as pd
import numpy as np
import copy
import os
import sys
import pathlib, itertools
import time
import matplotlib.pyplot as plt
from scipy.spatial.distance import euclidean
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import random
import pickle
import json
import pathutils as pu
import mlflow
import profile
import gc
import csv
import scipy.stats as stats
from math import dist

HOME_DIR = pathlib.Path.home()
CW_DIR = pathlib.Path.cwd()

FIGW = 12
FIGH = 5
FONTSIZE = 8
FIGURESIZE = (FIGW,FIGH)

plt.rcParams['figure.figsize'] = (FIGW, FIGH)
plt.rcParams['font.size'] = FONTSIZE

plt.rcParams['xtick.labelsize'] = FONTSIZE
plt.rcParams['ytick.labelsize'] = FONTSIZE


### Read in model, set file names, etc.

In [2]:
# Set working parameters
datapath = 'C:/Users/mjbea/OneDrive/GitHub/abuse_sequence/Data3/'
workingpath = 'C:/Users/mjbea/OneDrive/GitHub/abuse_sequence/Code3/'
outpath = 'C:/Users/mjbea/OneDrive/GitHub/abuse_sequence/Output3/'
hugefiles = 'C:/Users/mjbea/huge_files/'
year = '2016_2017_2018_2019'
jsondict = datapath + 'NSDUH_field.json'
clustjson = outpath + 'stableclusts_13_clust_0.6stab.json'
n_clusters = 11

# Setup filenames
clustpkl = workingpath + 'Kmeans_11clust_2016_2017_2018_2019_nonullpath_clust.pkl'
demogpkl = workingpath + 'Kmeans_11clust_2016_2017_2018_2019_nonullpath_demog.pkl'
modpkl = workingpath + 'Kmeans_11clust_2016_2017_2018_2019_nonullpath_model.pkl'
newclustpkl =  workingpath + 'stable_13_clust_2016_2017_2018_2019_newclust.pkl'

# Get translation dictionaries
stryear = '2016_2017_2018_2019'

# Drug name and indices are called by a user-defined function.
ident, rawafuvals, afuvals, drugnames, drugorder, drugnums, drugposition, startdemog, demographics = pu.surveyvars(year)

# Decode dictionaries for NSDUH variables:
f1 = open(jsondict, 'r')
nsduhDecoder = json.load(f1)
f1.close()


### Helper Utilities
These methods calculate cluster stats such as inertia, average inertia, center of cluster, and nearest medoid to center, and a descriptive AFU path that characterizes the medoid.

In [3]:

"""
getcenter(df)
Calculates and returns the center and inertia of a cluster.  Uses weights from survey.
"""
def getcenter(df):
    df['AFUARRAY'] = df.apply(lambda row: np.matrix(row['AFUVECT']).A[0],axis=1)
    center = np.average(a=df['AFUARRAY'], weights=df['YRWEIGHT'])
    df['DISTSQ'] = df.apply(lambda row: euclidean(row['AFUARRAY'],center)**2, axis=1)
    df['WTDISTSQ'] = df.apply(lambda row: row['DISTSQ'] * row['YRWEIGHT'], axis=1)
    inertia = df['WTDISTSQ'].sum()
    return center, inertia


"""
getmedoid(df)
Calculates and returns medoid of a cluster
"""
def getmedoid(df, clustcenter):
#    clustcenter, clustinertia = getcenter(df)
    df['distfrommean'] = df.apply(lambda row: euclidean(np.matrix(row['AFUVECT']).A[0], clustcenter), axis=1)
    medoidstr = df.loc[df['distfrommean'].idxmin()]['AFUVECT']
    medoid = np.matrix(medoidstr).A[0]
    return medoid


"""
arrayToPath(medoid)
Converts a medoid to a path list
"""
def arrayToPath(array):
    afupath = {}
    i = 0
    for item in array:
        pathstep = {drugposition[i]: float(item)}   # The type conversion is necessary for saving in JSON
        if item != 991:
            afupath.update(pathstep)
        i += 1
    return afupath


"""
assignnewclust(df, newcenters, numclusts)
Assigns an observation to a new component defined cluster by calculating distance to the minimum
"""
def assignnewclust(afuvect, newcentersdf, numclusts):
    numcenters = max(newcentersdf['newlabels'])
    afuarray = np.array(afuvect)
    chkcenter = np.array(newcentersdf.loc[0, 'center'])
    newlabels, newcenter = 0, chkcenter
    mindist = dist(afuarray, newcenter)

    for i in range(1,numclusts):
        if i <= numcenters:
            chkcenter = np.array(newcentersdf.loc[i, 'center'])
            chkdist = dist(afuarray, chkcenter)

            if chkdist < mindist:
                newlabels, newcenter = i, chkcenter
                mindist = chkdist

    return newlabels


### Relabel clustered complete data
Set labels of new clusters for the entire dataset equal to the ones associated with the stability clusters.

In [4]:
# Set flag on whether or not to generate new labels
setnewclust = False

# Set number of new clusters to consider:
numnewclusts = 13

# Bring in stability from last run considered:
stability = 0.60

# Read in new cluster defintions from Step 6
newclustdf = pd.read_json(clustjson).transpose()
newcentersdf = newclustdf[['center','clustfrac']]
newcentersdf['newlabels'] = newcentersdf.index

dfclust = pd.read_pickle(clustpkl)
dfdemog = pd.read_pickle(demogpkl)


if setnewclust:
    # Read in complete dataset
    dfall = pd.merge(dfclust.drop(['YRWEIGHT'], axis=1), dfdemog, on=['RESPID','labels'])

    # Get cluster centers from original model and merge into dataset
    model = pickle.load(open(modpkl, 'rb'))
    origcentersdf = pd.DataFrame()
    i = 0

    for item in model.cluster_centers_:
        origcentersdf = origcentersdf.append({'labels': i, 'origcenter': item}, ignore_index=True)
        i += 1

    dfall = pd.merge(dfall, origcentersdf, on=['labels'])

    # Assign new clusters and cluster centers to dataset
    # NOTE:  newcenter is from sample of data.  True center of new cluster is determined via total labeled dataset.
    dfall['newlabels'] = dfall.apply(lambda row: assignnewclust(row['AFUVECT'], newcentersdf, numnewclusts), axis=1)
    dfall['newcenter'] = dfall.apply(lambda row:  newcentersdf.loc[row['newlabels'], 'center'], axis=1)
    dfall.to_pickle(newclustpkl)
    
else:
    dfall = pd.read_pickle(newclustpkl)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newcentersdf['newlabels'] = newcentersdf.index


In [5]:
dfall.head()

Unnamed: 0,RESPID,AFUVECT,labels,CATAG6,SVCFLAG,IRSEX,IRMARIT,NEWRACE2,EDUHIGHCAT,IRWRKSTAT,GOVTPROG,INCOME,COUTYP4,AIIND102,YRWEIGHT,origcenter,newlabels,newcenter
0,201611635143.0,"[0, 16, 15, 20, 991, 991, 991, 991, 991, 991]",3,3,0,2,1,1,4,4,2,4,3,2,204.858562,"[0.0, 16.389404642865316, 16.65761675165035, 1...",1,"[0.0, 16.238233053523604, 36.34497839137807, 1..."
1,201661056143.0,"[0, 21, 15, 15, 991, 991, 991, 991, 991, 991]",3,4,0,1,1,7,1,1,1,3,2,2,26.197423,"[0.0, 16.389404642865316, 16.65761675165035, 1...",1,"[0.0, 16.238233053523604, 36.34497839137807, 1..."
2,201683666143.0,"[0, 18, 18, 20, 991, 991, 991, 991, 991, 991]",3,3,0,2,1,1,4,1,2,4,2,2,198.317305,"[0.0, 16.389404642865316, 16.65761675165035, 1...",1,"[0.0, 16.238233053523604, 36.34497839137807, 1..."
3,201659497143.0,"[0, 22, 16, 20, 991, 991, 991, 991, 991, 991]",3,2,0,1,4,7,3,1,2,2,1,2,472.577706,"[0.0, 16.389404642865316, 16.65761675165035, 1...",1,"[0.0, 16.238233053523604, 36.34497839137807, 1..."
4,201663899143.0,"[0, 14, 16, 15, 991, 991, 991, 991, 991, 991]",3,3,0,1,4,2,2,1,1,1,3,2,783.607113,"[0.0, 16.389404642865316, 16.65761675165035, 1...",1,"[0.0, 16.238233053523604, 36.34497839137807, 1..."


## Cluster Characterization
New stability-based clusters are described and stats are saved to a json file for later work.

In [6]:
# Set output file variables
jsonout = outpath + 'Newclust_' + str(numnewclusts) + '_clust_' + str(year) + 'stability' + str(round(stability,2)) + '_alldata_clust_stats.json'
outfile = outpath + 'Newclust_' + str(numnewclusts) + '_clust_' + str(year) + 'stability' + str(round(stability,2)) + '_chi2_output.txt'
outfile2 = outpath + 'Newclust_' + str(numnewclusts) + '_clust_' + str(year) + 'stability' + str(round(stability,2)) + '_crosstab_latex.txt'
csvout = outpath + 'Newclust_' + str(numnewclusts) + '_clust_' + str(year) + 'stability' + str(round(stability,2)) + '_ranked_chi2.csv'

# Reset the definitions of the dfclust and dfdemog dataframes to allow easy use of Step 3 code
dfclust = dfall[['RESPID', 'YRWEIGHT', 'AFUVECT', 'newlabels']]
dfclust['labels'] = dfclust.loc[:,'newlabels']
dfclust['HEROIN_USER'] = dfclust.apply(lambda row: 'Heroin User' if (row['AFUVECT'][8] > 0 and row['AFUVECT'][8] < 991) else 'Not Heroin User', axis=1)
dfclust = dfclust[['RESPID', 'YRWEIGHT', 'AFUVECT', 'labels', 'HEROIN_USER']]

dfdemog = dfall[['RESPID', 'YRWEIGHT', 'CATAG6', 'SVCFLAG', 'IRSEX', 'IRMARIT', 'NEWRACE2', 'EDUHIGHCAT', 'IRWRKSTAT', 'GOVTPROG', 'INCOME', 'COUTYP4', 'AIIND102', 'newlabels']]
dfdemog['labels'] = dfdemog.loc[:,'newlabels']
dfdemog = dfdemog[['RESPID', 'YRWEIGHT', 'CATAG6', 'SVCFLAG', 'IRSEX', 'IRMARIT', 'NEWRACE2', 'EDUHIGHCAT', 'IRWRKSTAT', 'GOVTPROG', 'INCOME', 'COUTYP4', 'AIIND102', 'labels']]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfclust['labels'] = dfclust.loc[:,'newlabels']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfclust['HEROIN_USER'] = dfclust.apply(lambda row: 'Heroin User' if (row['AFUVECT'][8] > 0 and row['AFUVECT'][8] < 991) else 'Not Heroin User', axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfdemo

In [7]:
# Generate cluster statistics and save to json and a pandas dataframe
clustsmrydf = pd.DataFrame()
tempdict = {}
population = dfclust['YRWEIGHT'].sum()

# Build dictionary of clusters
for index in range(0,numnewclusts):
    clust = dfclust.loc[dfclust['labels'] == index,:]
    clustpop = clust['YRWEIGHT'].sum()
    numnodes = clust.shape[0]
    center, inertia = getcenter(clust)
    medoid = getmedoid(clust, center)
    medoidpath = arrayToPath(medoid)
    centerpath = arrayToPath(center)
    clustfrac = round(clustpop/population,5)
    clustsmrydf = clustsmrydf.append({'clustnum': index, 'count': clustpop, 'clustfrac': clustfrac, 'center': center, 
                                      'centerpath': centerpath, 'medoid': medoid, 'medoidpath': medoidpath}, ignore_index=True)
    tempdict.update({index: {'totalpop': population, 'size': clustpop, 'center': center.tolist(), 'centerpath': centerpath,
                             'medoid': medoid.tolist(), 'medoidpath': medoidpath, 'avginertia': inertia/clustpop,
                             'clustfrac': clustfrac}})

# Save cluster dictionary to json file
with open(jsonout,'w') as cluststats_dumped:
    json.dump(tempdict,cluststats_dumped, indent = 4, sort_keys = True)

cluststats_dumped.close()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['AFUARRAY'] = df.apply(lambda row: np.matrix(row['AFUVECT']).A[0],axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['DISTSQ'] = df.apply(lambda row: euclidean(row['AFUARRAY'],center)**2, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['WTDISTSQ'] = df.apply(lambda row: row['DIS

## Chi-Square Analysis
Copied from Step 3

In [8]:
# Convert fields in the demographic database to category variables.
catvars = copy.deepcopy(demographics)
catvars.remove('AGE2')
catvars.remove('RESPID')
catvars.remove('ANALWT_C')

# Create a new dataframe for translation of categorical values into
# reader friendly values from the NSDUH dictionary.
# Youth dictionary values are not considered.
tmpdf = copy.deepcopy(dfdemog)
pipeprep = Pipeline([('makecats', pu.MakeCat(catvars))])
tmpdf = pipeprep.transform(tmpdf)

# IMPORTANT:  For this to work, the json key, value pairs must be sorted in the json file
for fieldname in tmpdf.columns:
    newvallist = []
    if fieldname != 'RESPID' and fieldname != 'labels' and fieldname != 'YRWEIGHT':
        for key, value in nsduhDecoder[fieldname]['values'].items():
            if value != "Youth":
                newvallist.append(value)
        tmpdf[fieldname].cat.categories = newvallist

In [9]:
# Open files
f = open(outfile, 'w')

# Open latex output file
f2 = open(outfile2, 'w')


# Initialize individual chi2 dataframe
indchi2df = pd.DataFrame(columns = ['TABLE', 'VALUE', 'CLUSTER', 'OBSERVED', 'EXPECTED', 'DIFF', 'INDCHI2', 'INFLUENCE'])

print('\n*****\nCROSSTABS AND CHISQUARE TEST RESULTS\n*****\n', file=f)
for item in catvars:
    catcross = pd.crosstab(index=tmpdf['labels'], columns=tmpdf[item], values=tmpdf['YRWEIGHT'], aggfunc=sum)
    catcrossnorm = pd.crosstab(index=tmpdf['labels'], columns=tmpdf[item], values=tmpdf['YRWEIGHT'], aggfunc=sum, normalize='columns')
    chi2, p, dof, ex = stats.chi2_contingency(catcross)
    print('\n******', item, '******\n', file=f)
    print('CROSSTAB COLUMN PERCENTAGES', file=f)
    print(catcrossnorm, '\n', file=f)
    print(catcrossnorm.to_latex(index=True), '\n', file=f2)
    print('CHI SQUARE RESULTS', file=f)
    print('CHI2 value: ', chi2, 'p-value: ', p, 'Degrees of freedom: ', dof, '\n', file=f)

    # Write individual chi2 to table:
    # Because there are no observations in some clusters, we need to use the index
    # to access elements in catcross.  The expected values array is still indexed
    # in sequence.  So we maintain the two separately with idx and i
    colcnt = catcross.shape[1]
    i = 0
    for idx in catcross.index.tolist():
        newrow = []
        for j in range(0, colcnt):
            table = item
            value = catcross.columns[j]
            observed = catcross.loc[idx][j]
            expected = ex[i,j]
            diff = observed - expected
            indchi2 = (catcross.loc[idx][j] - ex[i,j])**2/ex[i,j]
            dicttmp = {'TABLE': table, 'VALUE': value, 'CLUSTER': idx, 'OBSERVED': observed, 'EXPECTED': expected,
                     'DIFF': diff, 'INDCHI2': indchi2, 'INFLUENCE': indchi2/chi2}
            dftmp = pd.DataFrame([dicttmp])
            indchi2df = pd.concat([indchi2df, dftmp], ignore_index=True)
        i += 1           

  print(catcrossnorm.to_latex(index=True), '\n', file=f2)
  print(catcrossnorm.to_latex(index=True), '\n', file=f2)
  print(catcrossnorm.to_latex(index=True), '\n', file=f2)
  print(catcrossnorm.to_latex(index=True), '\n', file=f2)
  print(catcrossnorm.to_latex(index=True), '\n', file=f2)
  print(catcrossnorm.to_latex(index=True), '\n', file=f2)
  print(catcrossnorm.to_latex(index=True), '\n', file=f2)
  print(catcrossnorm.to_latex(index=True), '\n', file=f2)
  print(catcrossnorm.to_latex(index=True), '\n', file=f2)
  print(catcrossnorm.to_latex(index=True), '\n', file=f2)
  print(catcrossnorm.to_latex(index=True), '\n', file=f2)


In [10]:
indchi2df.head()

Unnamed: 0,TABLE,VALUE,CLUSTER,OBSERVED,EXPECTED,DIFF,INDCHI2,INFLUENCE
0,SVCFLAG,No military service,0,44846368.61005,47589011.2633,-2742642.65325,158063.563914,0.03824
1,SVCFLAG,Military service,0,7362859.532646,4620216.879396,2742642.65325,1628081.304359,0.393875
2,SVCFLAG,No military service,1,56081877.120107,56929727.468589,-847850.348483,12626.974436,0.003055
3,SVCFLAG,Military service,1,6374918.064483,5527067.716001,847850.348483,130059.961332,0.031465
4,SVCFLAG,No military service,2,40667378.746182,39140745.076701,1526633.669481,59544.353492,0.014405


### Rank the entries in the chi-square crosstab
To find the individual cells in the chi2 crosstab with the most influence, rank them by chi2 score, and save to a CSV file.

In [11]:
indchi2df['rank'] = indchi2df.groupby('TABLE')['INDCHI2'].rank(method="first", ascending=False)
indchi2df.to_csv(csvout)

# Generate dataframe of most influential individual chi-square elements
topindchi2 = indchi2df[indchi2df['rank']<=5]
topindchi2.head(100)

Unnamed: 0,TABLE,VALUE,CLUSTER,OBSERVED,EXPECTED,DIFF,INDCHI2,INFLUENCE,rank
0,SVCFLAG,No military service,0,44846368.61005,47589011.2633,-2742642.65325,158063.563914,0.03824,5.0
1,SVCFLAG,Military service,0,7362859.532646,4620216.879396,2742642.65325,1628081.304359,0.393875,1.0
5,SVCFLAG,Military service,2,2273376.590765,3800010.260246,-1526633.669481,613316.860003,0.148377,3.0
7,SVCFLAG,Military service,3,758118.618648,2354448.933121,-1596330.314473,1082321.403135,0.261842,2.0
11,SVCFLAG,Military service,5,608213.489074,1041712.549003,-433499.059928,180396.631622,0.043643,4.0
26,CATAG6,18-25,0,4054973.965712,7199927.136311,-3144953.170599,1373726.463894,0.06156,4.0
30,CATAG6,65+,0,17875217.351379,10652511.316381,7222706.034997,4897200.380136,0.219456,1.0
50,CATAG6,65+,4,940521.076468,3196993.639548,-2256472.56308,1592642.651817,0.07137,2.0
51,CATAG6,18-25,5,3166173.712067,1623355.492953,1542818.219114,1466276.52881,0.065708,3.0
59,CATAG6,50-64,6,4413909.015799,2655834.356441,1758074.659358,1163787.380182,0.052152,5.0


In [12]:
# Print out top individual chi2 to output
print(topindchi2, file=f)
print(topindchi2.to_latex(index=False), file=f2)

# Close print files
f.close()
f2.close()

# Log runs to mlflow
experiment_name = "New Cluster Chi-Square Analysis"
mlflow.set_experiment(experiment_name)

with mlflow.start_run():
    mlflow.log_param("numnewclusts", numnewclusts)
    mlflow.log_param("stability", stability)
    mlflow.log_param("observations", 'all')
    
    mlflow.log_artifact(jsonout)
    mlflow.log_artifact(csvout)
    mlflow.log_artifact(outfile)
    mlflow.log_artifact(outfile2)
   
    # End mlflow run
    mlflow.end_run()        

  print(topindchi2.to_latex(index=False), file=f2)


## Quick age by education analysis of the no-use cluster

In [19]:
agedf = tmpdf[tmpdf['labels']==3]
agecrossnorm = pd.crosstab(index=agedf['EDUHIGHCAT'], columns=agedf['CATAG6'], values=agedf['YRWEIGHT'], aggfunc=sum, normalize='columns')
agecrossnorm

CATAG6,18-25,26-34,35-49,50-64,65+
EDUHIGHCAT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
< High School,0.22464,0.193151,0.257467,0.257426,0.270779
HS Grad,0.377619,0.239943,0.234863,0.264092,0.327811
Some Coll,0.333456,0.221394,0.200849,0.216271,0.183298
Coll Grad,0.064284,0.345512,0.30682,0.262212,0.218112
