# NSDUH Drug Sequence Analysis Part 4b:  Stability Analysis via Loop
## Matthew J. Beattie
## University of Oklahoma
__December 4, 2021__

### Stability index creation
This script takes the KMC clusterings (which were done on a desktop) and creates a list of tuples and their stability index.  A _tuple_ is a pair of respondents who are included in the same cluster.  We aggregate all the tuples from the multiple clusterings into one file and then count the number of times each distinct tuple occurs.  This count, divided by the total number of clusterings, generates a _stability index_ for the tuple.

In [0]:
# Import the Abuse Sequence utilities functions
%run "/dbfs/FileStore/pythonfiles/pathutils.py"

In [0]:
# Import pyspark libraries
from pyspark.sql import functions as f
from pyspark.sql import SparkSession, DataFrameWriter as dfw
from pyspark.sql.types import *
from pyspark.sql.functions import udf
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, ArrayType

# Import standard Python libraries
from os.path import abspath
import matplotlib.pyplot as plt
import datetime as dt
import pandas as pd
import numpy as mp
import copy
import os
import sys
import pathlib, itertools
import time
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import random
from sklearn.cluster import KMeans
import pickle
import json
from scipy.spatial.distance import euclidean
import mlflow
import mlflow.sklearn
from collections import Counter
import profile
import gc
import csv


# Initialize Spark session
spark = SparkSession\
    .builder\
    .config("spark.databricks.delta.retentionDurationCheck.enabled", "false")\
    .enableHiveSupport()\
    .getOrCreate()

# Set Azure parameters
blob_account_name = "abuseseqstorage"
blob_container_name = "datafiles"
blob_sas_token = 'sp=racwdli&st=2021-12-04T18:25:43Z&se=2022-01-02T02:25:43Z&spr=https&sv=2020-08-04&sr=c&sig=VaqJXEF3UoKZ7r%2FitleYNo5TSqAhJwwx4cdWiZ2%2FT%2Fs%3D'

# Set miscellaneous parameters
FIGW = 12
FIGH = 5
FONTSIZE = 8
FIGURESIZE = (FIGW,FIGH)

plt.rcParams['figure.figsize'] = (FIGW, FIGH)
plt.rcParams['font.size'] = FONTSIZE

plt.rcParams['xtick.labelsize'] = FONTSIZE
plt.rcParams['ytick.labelsize'] = FONTSIZE


In [0]:
%sql
/* Create tuplecounts table

DROP TABLE IF EXISTS abuse_sequence.tuplecounts;

CREATE TABLE abuse_sequence.tuplecounts
(
  orignode STRING,
  termnode STRING,
  tuplecount INT
)
USING DELTA;

OPTIMIZE abuse_sequence.tuplecounts;
VACUUM abuse_sequence.tuplecounts;
*/

In [0]:
# Point to files in blob storage
clustercsv = 'wasbs://%s@%s.blob.core.windows.net/%s' % (blob_container_name, blob_account_name, 'dfclust.txt')
demogcsv = 'wasbs://%s@%s.blob.core.windows.net/%s' % (blob_container_name, blob_account_name, 'dfdemog.csv')
spark.conf.set('fs.azure.sas.%s.%s.blob.core.windows.net' % (blob_container_name, blob_account_name), blob_sas_token)
print('Remote blob path: ' + clustercsv)

clust_schema = StructType([
    StructField("ROWNUM", IntegerType(), False),
    StructField("RESPID", StringType(), False),
    StructField("AFUVECT", StringType(), False),
    StructField("PATHSUM", FloatType(), False),
    StructField("labels", IntegerType(), False),
    StructField("labels_0", IntegerType(), False),
    StructField("labels_1", IntegerType(), False),
    StructField("labels_2", IntegerType(), False),
    StructField("labels_3", IntegerType(), False),
    StructField("labels_4", IntegerType(), False),
    StructField("labels_5", IntegerType(), False),
    StructField("labels_6", IntegerType(), False),
    StructField("labels_7", IntegerType(), False),
    StructField("labels_8", IntegerType(), False),
    StructField("labels_9", IntegerType(), False),
    StructField("labels_10", IntegerType(), False),
    StructField("labels_11", IntegerType(), False),
    StructField("labels_12", IntegerType(), False),
    StructField("labels_13", IntegerType(), False),
    StructField("labels_14", IntegerType(), False),
    StructField("labels_15", IntegerType(), False),
    StructField("labels_16", IntegerType(), False),
    StructField("labels_17", IntegerType(), False),
    StructField("labels_18", IntegerType(), False),
    StructField("labels_19", IntegerType(), False)
])

dfclust = spark.read.load(clustercsv, format="csv", sep="\t", schema=clust_schema, header="true")
display(dfclust)

ROWNUM,RESPID,AFUVECT,PATHSUM,labels,labels_0,labels_1,labels_2,labels_3,labels_4,labels_5,labels_6,labels_7,labels_8,labels_9,labels_10,labels_11,labels_12,labels_13,labels_14,labels_15,labels_16,labels_17,labels_18,labels_19
34657,201652115410,"[0, 20, 18, 991, 991, 991, 991, 991, 991, 991]",6975.0,0,3,3,5,1,1,0,4,3,2,0,0,2,4,4,2,4,2,3,3,2
108220,201925424153,"[0, 991, 23, 991, 991, 991, 991, 15, 991, 991]",6975.0,3,1,6,2,5,3,2,0,1,0,3,4,1,0,2,0,1,1,0,2,4
6751,201662534884,"[0, 12, 14, 991, 991, 991, 991, 991, 991, 991]",6963.0,0,3,3,5,1,1,0,4,3,2,0,0,2,4,4,2,4,2,3,3,2
159774,201771125924,"[0, 10, 15, 13, 22, 29.0, 17, 24, 991, 991]",2112.0,4,2,5,1,0,2,8,7,7,8,6,1,10,3,6,4,8,4,2,7,3
146351,201667620360,"[0, 8, 15, 16, 18, 29.0, 18, 25, 991, 18]",1138.0,6,2,8,4,10,2,6,7,7,5,6,6,5,3,6,4,8,11,2,7,6
37924,201759370983,"[0, 991, 22, 991, 991, 991, 991, 991, 991, 991]",7950.0,3,1,6,2,5,3,2,0,1,0,3,4,1,0,2,0,1,1,0,2,4
164224,201974048938,"[0, 9, 7, 8, 5, 21.0, 17, 991, 21, 21]",1100.0,6,9,8,4,10,7,6,3,0,5,6,6,5,11,11,3,3,11,9,7,3
166174,201873667468,"[0, 15, 14, 15, 24, 22.0, 27, 991, 21, 24]",1153.0,6,9,8,4,10,7,6,3,0,5,6,6,5,11,11,3,3,11,9,7,3
2320,201647352193,"[0, 991, 991, 991, 991, 991, 991, 991, 991, 991]",8919.0,2,5,2,7,2,5,5,5,4,4,5,2,6,8,5,5,6,6,5,6,1
122336,201981020366,"[0, 11, 10, 13, 15, 991, 991, 991, 991, 991]",5004.0,7,11,10,9,11,9,10,11,9,10,7,11,7,7,8,10,11,5,4,4,11


In [0]:
# Convert AFUVECT from string to array of integers
df2 = dfclust.withColumn("AFUVECT",f.regexp_replace("AFUVECT", "\\[", ""))\
             .withColumn("AFUVECT",f.regexp_replace("AFUVECT", "\\]", ""))\
             .withColumn("AFUVECT",f.split(f.col("AFUVECT"),",").cast('array<int>'))
display(df2)
observations = df2.count()

ROWNUM,RESPID,AFUVECT,PATHSUM,labels,labels_0,labels_1,labels_2,labels_3,labels_4,labels_5,labels_6,labels_7,labels_8,labels_9,labels_10,labels_11,labels_12,labels_13,labels_14,labels_15,labels_16,labels_17,labels_18,labels_19
34657,201652115410,"List(0, 20, 18, 991, 991, 991, 991, 991, 991, 991)",6975.0,0,3,3,5,1,1,0,4,3,2,0,0,2,4,4,2,4,2,3,3,2
108220,201925424153,"List(0, 991, 23, 991, 991, 991, 991, 15, 991, 991)",6975.0,3,1,6,2,5,3,2,0,1,0,3,4,1,0,2,0,1,1,0,2,4
6751,201662534884,"List(0, 12, 14, 991, 991, 991, 991, 991, 991, 991)",6963.0,0,3,3,5,1,1,0,4,3,2,0,0,2,4,4,2,4,2,3,3,2
159774,201771125924,"List(0, 10, 15, 13, 22, 29, 17, 24, 991, 991)",2112.0,4,2,5,1,0,2,8,7,7,8,6,1,10,3,6,4,8,4,2,7,3
146351,201667620360,"List(0, 8, 15, 16, 18, 29, 18, 25, 991, 18)",1138.0,6,2,8,4,10,2,6,7,7,5,6,6,5,3,6,4,8,11,2,7,6
37924,201759370983,"List(0, 991, 22, 991, 991, 991, 991, 991, 991, 991)",7950.0,3,1,6,2,5,3,2,0,1,0,3,4,1,0,2,0,1,1,0,2,4
164224,201974048938,"List(0, 9, 7, 8, 5, 21, 17, 991, 21, 21)",1100.0,6,9,8,4,10,7,6,3,0,5,6,6,5,11,11,3,3,11,9,7,3
166174,201873667468,"List(0, 15, 14, 15, 24, 22, 27, 991, 21, 24)",1153.0,6,9,8,4,10,7,6,3,0,5,6,6,5,11,11,3,3,11,9,7,3
2320,201647352193,"List(0, 991, 991, 991, 991, 991, 991, 991, 991, 991)",8919.0,2,5,2,7,2,5,5,5,4,4,5,2,6,8,5,5,6,6,5,6,1
122336,201981020366,"List(0, 11, 10, 13, 15, 991, 991, 991, 991, 991)",5004.0,7,11,10,9,11,9,10,11,9,10,7,11,7,7,8,10,11,5,4,4,11


In [0]:
# Parameters from clustering
# Set clustering process parameters
B = 20   # Number of models to generate
f = 0.8  # Fraction of input dataset to use for model construction
fraction = 0.20
n_init = 10
max_iter = 1000
tol = 0.0001
n_clusters = 12

# Create a list of tuples from the cluster sets
starttime = time.time()
for b in range(0,B):
    # Initiate tuplelist
    tuplelist = []
    
    # Populate tuplelist
    print('Finding tuples for clustering', b)
    clustset = 'labels_' + str(b)
    for c in range(0,n_clusters):
        clustslice = dfclust[dfclust[clustset]==c]
        clustlist = clustslice.select('RESPID').rdd.flatMap(lambda x: x).collect()
        for i in range(0,len(clustlist)):
            for j in range(i+1,len(clustlist)):
                if clustlist[i] < clustlist[j]:
                    tuplelist.append((clustlist[i], clustlist[j]))
                else:
                    tuplelist.append((clustlist[j], clustlist[i]))

    # Convert tuplelist to dataframe and insert into permanent table
    columns = ["orignode", "termnode"]
    print('Creating dataframe from tuplelist')
    df3 = spark.createDataFrame(tuplelist, columns)
    print('tuple list size for clustering', b, 'is', df3.count())
    df3.registerTempTable('tupletbl')
    spark.sql("""
        INSERT INTO abuse_sequence.tuplecounts
        SELECT DISTINCT orignode, termnode, count(*) as tuplecount
        FROM tupletbl
        GROUP BY orignode, termnode
    """)
    tuplecountcnt = spark.sql("""select count(*) from abuse_sequence.tuplecounts""").collect()[0][0]
    print('New abuse_sequence.tuplecounts count is', tuplecountcnt)

sequencetime = time.time() - starttime


In [0]:
%sql
OPTIMIZE abuse_sequence.tuplecounts;
VACUUM abuse_sequence.tuplecounts;

path
dbfs:/user/hive/warehouse/abuse_sequence.db/tuplecounts


In [0]:
%sql
/* Create tuplestability table */

DROP TABLE IF EXISTS abuse_sequence.tuplestability;

CREATE TABLE abuse_sequence.tuplestability
(
  orignode STRING,
  termnode STRING,
  tottuples INT,
  stability FLOAT
)
USING DELTA;

INSERT INTO abuse_sequence.tuplestability
SELECT orignode, termnode, sum(tuplecount) AS tottuples, sum(tuplecount)/20 AS stability
FROM abuse_sequence.tuplecounts
GROUP BY orignode, termnode;

OPTIMIZE abuse_sequence.tuplestability;
VACUUM abuse_sequence.tuplestability;


path
dbfs:/user/hive/warehouse/abuse_sequence.db/tuplestability


In [0]:
# Save tuple stability data to csv on Azure blob
dftuplestability = spark.sql("""
    select tottuples, count(*) as tottuplescnt, count(*)/20 as stability
    from abuse_sequence.tuplestability
    group by tottuples
    order by tottuplescnt desc
""")

output_blob_folder = "%s/wrangled_data_folder" % 'stabilitycounts'

# write the dataframe as a single file to blob storage
(dftuplestability
 .coalesce(1)
 .write
 .mode("overwrite")
 .option("header", "true")
 .format("com.databricks.spark.csv")
 .save(output_blob_folder))

tuplestabilitycsv = 'wasbs:/%s@%s.blob.core.windows.net/%s' % (blob_container_name, blob_account_name, 'tuplestabilty.csv')
dftuplestability.write.option("header",True).csv(tuplestabilitycsv)


In [0]:
display(dftuplestability)

tottuples,tottuplescnt,stability
20,102509781,5125489.05
2,14357183,717859.15
1,5781509,289075.45
4,5328098,266404.9
15,5208210,260410.5
18,4246006,212300.3
5,1976250,98812.5
11,1570023,78501.15
3,1431082,71554.1
7,1222374,61118.7


In [0]:
# Save tuple stability count file to Azure blob storage
output_container_path = "wasbs://%s@%s.blob.core.windows.net" % (blob_container_name, blob_account_name)
output_blob_folder = "%s/" % output_container_path
output_file_name = 'tuplestability.csv'
final_file_name = 'tuplestabilitycounts.csv'
output_filename = output_blob_folder + output_file_name
final_filename = output_blob_folder + final_file_name

dftuplestability \
    .coalesce(1) \
    .write \
    .mode("overwrite") \
    .option("header", "true") \
    .options(delimiter='\t') \
    .format("csv") \
    .save(output_filename)

# Get the name of the CSV file that was just saved to Azure blob storage (it starts with 'part-')
files = dbutils.fs.ls(output_filename)
output_file = [x for x in files if x.name.startswith("part-")]

# Move the wrangled-data CSV file from a sub-folder (wrangled_data_folder) to the root of the blob container
# While simultaneously changing the file name
dbutils.fs.mv(output_file[0].path, final_filename)

# Remove the parquet blob
dbutils.fs.rm(output_filename, recurse=True)


In [0]:
# Log parameters and results into MLflow
uniquetuples = spark.sql("""select count(*) from abuse_sequence.tuplestability""").collect()[0][0]
maxcount = spark.sql("""select distinct max(tottuples) from abuse_sequence.tuplestability""").collect()[0][0]
maxcountcnt = spark.sql("""select count(*) from abuse_sequence.tuplestability where tottuples={}""".format(maxcount)).collect()[0][0]

with mlflow.start_run():
        mlflow.log_metric("Observations", observations)
        mlflow.log_metric("Total tuples", uniquetuples)
        mlflow.log_param("Clusterings", B)
        mlflow.log_param("Fraction of total dataset", fraction)
        mlflow.log_param("Fold fraction size", f)
        mlflow.log_metric("Most common stability", maxcount)
        mlflow.log_metric("Fraction of tuples with commonest stability", 
                          maxcountcnt/uniquetuples)
        mlflow.log_metric("Sequencing and RDD create runtime", sequencetime)
        
mlflow.end_run()

In [0]:
# Save tuple stability database to CSV file for use with NetworkX or other things
# Save tuple stability count file to Azure blob storage
output_file_name = 'tuplebigblob.csv'
final_file_name = 'tuplebigfile.csv'
output_filename = output_blob_folder + output_file_name
final_filename = output_blob_folder + final_file_name

dftuplebig = spark.sql("""select * from abuse_sequence.tuplestability""")

dftuplebig \
    .coalesce(1) \
    .write \
    .mode("overwrite") \
    .option("header", "true") \
    .format("csv") \
    .save(output_filename)

# Get the name of the CSV file that was just saved to Azure blob storage (it starts with 'part-')
files = dbutils.fs.ls(output_filename)
output_file = [x for x in files if x.name.startswith("part-")]

# Move the wrangled-data CSV file from a sub-folder (wrangled_data_folder) to the root of the blob container
# While simultaneously changing the file name
dbutils.fs.mv(output_file[0].path, final_filename)

# Remove the parquet blob
dbutils.fs.rm(output_filename, recurse=True)