# NSDUH Drug Sequence Analysis Part 4a1:  Cluster File Save
## Matthew J. Beattie
## University of Oklahoma
__December 30, 2021__

### Databricks File Preparation
This notebook generates cluster labels for $B$ runs of k-means clustering (KMC) and saves the labeled data to a tab-delimited .txt file for upload into Databricks.  It also saves the demographic information to a second .txt file, although that file is not used in Databricks.


In [1]:
"""
Import python modules
"""
import pandas as pd
import numpy as np
import copy
import os
import sys
import pathlib, itertools
import time
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import random
from sklearn_extra import cluster as cs
from sklearn.cluster import KMeans
import pickle
import json
import pathutils as pu
#import scipy.stats as stats
from scipy.spatial.distance import euclidean
import mlflow
import mlflow.sklearn
from collections import Counter
import profile
import gc
import csv

# Filesystem anchors: the user's home directory and the directory the
# notebook was started from.
HOME_DIR = pathlib.Path.home()
CW_DIR = pathlib.Path.cwd()

# Default figure geometry and font size shared by every plot in the notebook.
FIGW, FIGH = 12, 5
FONTSIZE = 8
FIGURESIZE = (FIGW, FIGH)

# Apply the defaults to matplotlib in a single update.
plt.rcParams.update({
    'figure.figsize': FIGURESIZE,
    'font.size': FONTSIZE,
    'xtick.labelsize': FONTSIZE,
    'ytick.labelsize': FONTSIZE,
})


### Set parameters and file names, and read in the cluster and demographic data

In [2]:
# Directory locations on the author's machine (each ends with a slash so
# file names can be appended directly).
datapath = 'C:/Users/mjbea/OneDrive/GitHub/abuse_sequence/Data3/'
workingpath = 'C:/Users/mjbea/OneDrive/GitHub/abuse_sequence/Code3/'
outpath = 'C:/Users/mjbea/OneDrive/GitHub/abuse_sequence/Output3/'
hugefiles = 'C:/Users/mjbea/huge_files/'

# Survey years and number of clusters; both appear in the pickle file names
# below, and n_clusters is reused by later cells.
year = '2016_2017_2018_2019'
n_clusters = 11
jsondict = f'{datapath}NSDUH_field.json'

# Pickle files holding the cluster and demographic dataframes
clustpkl = f'{workingpath}Kmeans_{n_clusters}clust_{year}_nonullpath_clust.pkl'
demogpkl = f'{workingpath}Kmeans_{n_clusters}clust_{year}_nonullpath_demog.pkl'

# Load both dataframes (all columns are kept; no model object is read here).
# NOTE: pickle files execute code on load -- only read files you created.
dfclust, dfdemog = (pd.read_pickle(pkl) for pkl in (clustpkl, demogpkl))


In [3]:
dfclust.head()

Unnamed: 0,RESPID,AFUVECT,YRWEIGHT,labels
0,201611635143.0,"[0, 16, 15, 20, 991, 991, 991, 991, 991, 991]",204.858562,3
1,201635755143.0,"[0, 26, 16, 991, 991, 991, 991, 991, 991, 991]",2533.458396,0
2,201692675143.0,"[0, 5, 18, 32, 34, 991, 991, 991, 991, 991]",6203.973093,10
3,201659596143.0,"[0, 991, 14, 991, 991, 991, 991, 991, 991, 991]",1386.672703,2
4,201641106143.0,"[0, 991, 991, 991, 991, 991, 991, 991, 991, 991]",2384.841656,5


In [4]:
dfclust.shape

(170944, 4)

In [5]:
"""
calcinertia()
Calculates the total inertia of a dataset given the data, cluster centers,
and labels from the model.
"""
def calcinertia(df, centers):
    """Return the total inertia of a labelled dataset.

    Inertia is the sum, over all rows, of the squared Euclidean distance
    between the row's vector and the center of the cluster it is assigned to.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'AFUVECT' column (one equal-length numeric sequence
        per row) and a 'labels' column (the key of the assigned cluster).
        The dataframe is not modified.
    centers : indexable
        Cluster centers, looked up as ``centers[label]``; for example the
        ``cluster_centers_`` array of a fitted k-means model.

    Returns
    -------
    float
        Sum of squared distances; 0.0 when ``df`` has no rows.
    """
    # Nothing to measure -- also avoids building a malformed empty 2-D array.
    if len(df) == 0:
        return 0.0

    # Stack the per-row vectors into one (rows x dimensions) matrix.
    points = np.array(df['AFUVECT'].tolist(), dtype=float)

    # Look up each row's center through plain indexing so that any container
    # supporting centers[label] keeps working, not only NumPy arrays.
    assigned = np.array([centers[label] for label in df['labels']], dtype=float)

    # One vectorized pass replaces four row-wise DataFrame.apply calls and
    # leaves the caller's dataframe free of scratch columns.
    return float(np.square(points - assigned).sum())

### Generate clusters
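Generate _B_ cluster sets using _k-means_.  Each model is fit on a random sample (fraction _f_) of the working dataset, which is itself a random fraction of the original dataset, and is then used to label every observation in the working dataset.  The labels are added to the working dataframe as columns `labels_0` through `labels_{B-1}`, giving a dataframe with the respondent ID, the AFU vector, and the cluster assignment from each of the _B_ models.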

In [6]:
# Set clustering process parameters
B = 20   # Number of models to generate
fraction = 0.3  # Fraction of the original dataset kept as the working dataset
f = 0.8  # Fraction of the working dataset used to fit each model
n_init = 10       # k-means restarts per model
max_iter = 1000   # Iteration cap for each k-means run
tol = 0.0001      # Convergence tolerance passed to KMeans

# Reduce the master dataset if desired and keep its vectors as one array so
# every fitted model can label the whole working dataset.
# NOTE(review): the threshold is 0.9 rather than 1.0, so a fraction in
# [0.9, 1.0) skips the down-sampling -- confirm that this is intended.
if fraction < 0.9:
    dfclust = dfclust.sample(frac=fraction, replace=False)
allarray = np.array(list(dfclust['AFUVECT']))

inertialist = []
starttime = time.time()
try:
    for i in range(B):
        # Draw the sample used to fit this model
        clustsamp = dfclust.sample(frac=f, replace=False)
        clustsamp = clustsamp[['RESPID', 'AFUVECT', 'YRWEIGHT']]

        # Using k-means, fit a model on the sample, weighting each
        # respondent by YRWEIGHT
        samparray = np.array(list(clustsamp['AFUVECT']))
        weights = np.array(list(clustsamp['YRWEIGHT']))

        # `algorithm` is deliberately left at the library default: the old
        # value 'auto' was the default in older scikit-learn releases and is
        # rejected by newer ones, so omitting it works on both.
        # random_state=None keeps every model unseeded, so the B runs differ.
        model = KMeans(n_clusters=n_clusters, init='k-means++', n_init=n_init,
                       max_iter=max_iter, tol=tol, verbose=0,
                       random_state=None, copy_x=True)
        model.fit(samparray, sample_weight=weights)

        # Apply the model to the entire working dataset to generate clusters
        preds = model.predict(allarray)

        # Add this model's labels to the working dataframe
        colname = 'labels_' + str(i)
        dfclust[colname] = preds

        # Calculate the (unweighted) inertia of the labelled working dataset.
        # assign() returns a new frame, so dfclust itself is never touched
        # by calcinertia.
        tempdf = dfclust[['AFUVECT']].assign(labels=preds)
        inertialist.append(calcinertia(tempdf, model.cluster_centers_))

except Exception as err:
    # Report the failure, then re-raise so the traceback is visible and the
    # save step below is not run on a partially labelled dataframe.
    # Catching Exception (not a bare except) lets Ctrl-C / kernel interrupt
    # propagate normally.
    print('Clustering process failed --', type(err), "occurred.")
    raise

else:
    meaninertia = np.array(inertialist).mean()
    clustertime = time.time() - starttime
    print('Mean inertia is:', meaninertia, '\nExecution time was:', clustertime)


Mean inertia is: 10207984350.876812 
Execution time was: 317.11638021469116


In [7]:
# Export both dataframes for Azure Databricks as tab-delimited UTF-8 text
# without the index column.
clustcsv = f'{datapath}dfclust30pct.txt'
demogcsv = f'{datapath}dfdemog.txt'

for frame, target in ((dfclust, clustcsv), (dfdemog, demogcsv)):
    frame.to_csv(target, sep='\t', encoding='utf-8', index=False)