In [3]:
import os
import sys
from pyspark.sql.functions import sum as _sum, max as _max, min as _min

# Preparing the data
## Ingest dictionary and source file

In [2]:
datadict = spark.read.option("sep", ",").option("header","true").csv("gs://119-f19-opioidbucket/data_dictionary.csv")

In [3]:
datadict.show()

+--------------------+--------------------+
|          ColumnName|         Description|
+--------------------+--------------------+
|     REPORTER_DEA_NO|Unique id of enti...|
|    REPORTER_BUS_ACT|Type of business ...|
|       REPORTER_NAME|Name of entity re...|
|REPORTER_ADDL_CO_...|Additional compan...|
|   REPORTER_ADDRESS1|Address of entity...|
|   REPORTER_ADDRESS2|Additional addres...|
|       REPORTER_CITY|City of entity re...|
|      REPORTER_STATE|State of entity r...|
|        REPORTER_ZIP|Zip code of entit...|
|     REPORTER_COUNTY|County of entity ...|
|        BUYER_DEA_NO|Unique id of enti...|
|       BUYER_BUS_ACT|Type of business ...|
|          BUYER_NAME|Name of entity re...|
|  BUYER_ADDL_CO_INFO|Additional compan...|
|      BUYER_ADDRESS1|Address of entity...|
|      BUYER_ADDRESS2|Additional addres...|
|          BUYER_CITY|City of entity re...|
|         BUYER_STATE|State of entity r...|
|           BUYER_ZIP|Zip code of entit...|
|        BUYER_COUNTY|County of 

In [None]:
df = spark.read.option("sep", "\t").option("header", "true").option("inferSchema", "true").csv("gs://119-f19-opioidbucket/arcos_all_washpost.tsv")
# this take a few minutes.
df.printSchema()
## ideas for speeding up:
#co-locate compute and the buckets
#more nodes; specialize?

root
 |-- REPORTER_DEA_NO: string (nullable = true)
 |-- REPORTER_BUS_ACT: string (nullable = true)
 |-- REPORTER_NAME: string (nullable = true)
 |-- REPORTER_ADDL_CO_INFO: string (nullable = true)
 |-- REPORTER_ADDRESS1: string (nullable = true)
 |-- REPORTER_ADDRESS2: string (nullable = true)
 |-- REPORTER_CITY: string (nullable = true)
 |-- REPORTER_STATE: string (nullable = true)
 |-- REPORTER_ZIP: integer (nullable = true)
 |-- REPORTER_COUNTY: string (nullable = true)
 |-- BUYER_DEA_NO: string (nullable = true)
 |-- BUYER_BUS_ACT: string (nullable = true)
 |-- BUYER_NAME: string (nullable = true)
 |-- BUYER_ADDL_CO_INFO: string (nullable = true)
 |-- BUYER_ADDRESS1: string (nullable = true)
 |-- BUYER_ADDRESS2: string (nullable = true)
 |-- BUYER_CITY: string (nullable = true)
 |-- BUYER_STATE: string (nullable = true)
 |-- BUYER_ZIP: integer (nullable = true)
 |-- BUYER_COUNTY: string (nullable = true)
 |-- TRANSACTION_CODE: string (nullable = true)
 |-- DRUG_CODE: integer (null

In [None]:
# df.count() # => 178,598,026 records!

In [None]:
totalcount = 178598026

## Paring down to NH

In [5]:
df1 = df.filter(df.BUYER_STATE == 'NH')

In [None]:
# df1.count() #757944. This took a WHILE!
NHcount = 757944

In [6]:
df1.head()

Row(REPORTER_DEA_NO=u'PB0020139', REPORTER_BUS_ACT=u'DISTRIBUTOR', REPORTER_NAME=u'BURLINGTON DRUG COMPANY', REPORTER_ADDL_CO_INFO=u'null', REPORTER_ADDRESS1=u'91 CATAMOUNT DR', REPORTER_ADDRESS2=u'null', REPORTER_CITY=u'MILTON', REPORTER_STATE=u'VT', REPORTER_ZIP=5468, REPORTER_COUNTY=u'CHITTENDEN', BUYER_DEA_NO=u'AB3017212', BUYER_BUS_ACT=u'RETAIL PHARMACY', BUYER_NAME=u'BANNON PHARMACY INC', BUYER_ADDL_CO_INFO=u'null', BUYER_ADDRESS1=u'109 PLEASANT ST', BUYER_ADDRESS2=u'null', BUYER_CITY=u'CLAREMONT', BUYER_STATE=u'NH', BUYER_ZIP=3743, BUYER_COUNTY=u'SULLIVAN', TRANSACTION_CODE=u'S', DRUG_CODE=9193, NDC_NO=u'53746011805', DRUG_NAME=u'HYDROCODONE', QUANTITY=1.0, UNIT=u'null', ACTION_INDICATOR=u'null', ORDER_FORM_NO=u'null', CORRECTION_NO=u'null', STRENGTH=u'null', TRANSACTION_DATE=9082008, CALC_BASE_WT_IN_GM=2.27025, DOSAGE_UNIT=u'500.0', TRANSACTION_ID=803008893, Product_Name=u'HYDROCODONE.BITARTRATE 7.5MG/APAP 75', Ingredient_Name=u'HYDROCODONE BITARTRATE HEMIPENTAHYDRATE', Measure

In [7]:
df1 = df1.cache()

In [34]:
### lets get sql with it###
from pyspark.sql import SQLContext

sqlContext = SQLContext(sc)

df1.createOrReplaceTempView("NHOpioids")

#specifically, we want to look at Coos County:
results = sqlContext.sql("SELECT BUYER_NAME, BUYER_ADDL_CO_INFO, BUYER_CITY,\
                         COUNT(TRANSACTION_id) AS REPORT_COUNT,\
                         SUM(DOSAGE_UNIT) AS PILL_SUM \
                         FROM NHOpioids \
                         WHERE 1=1 \
                         AND BUYER_COUNTY = 'COOS'\
                         GROUP BY BUYER_NAME,BUYER_ADDL_CO_INFO, BUYER_CITY") #instantaneous

results.printSchema()
# what I actually want to do: sort and get max value, by column name.

root
 |-- BUYER_NAME: string (nullable = true)
 |-- BUYER_ADDL_CO_INFO: string (nullable = true)
 |-- BUYER_CITY: string (nullable = true)
 |-- REPORT_COUNT: long (nullable = false)
 |-- PILL_SUM: double (nullable = true)



In [35]:
# this collect step may take a few minutes as well
# this took ~8 minutes with following settings: central, 1000GB master, 4 500GB helper nodes
results.sort(results.PILL_SUM.desc()).show(20,False)

#there are 9 pharmacies in the dataset for Coos County.
#the top hits might be likely targets for pill diversion investigation.

+-------------------------------+-------------------------------+----------+------------+---------+
|BUYER_NAME                     |BUYER_ADDL_CO_INFO             |BUYER_CITY|REPORT_COUNT|PILL_SUM |
+-------------------------------+-------------------------------+----------+------------+---------+
|RITE AID OF NEW HAMPSHIRE, INC.|RITE AID #4138                 |COLEBROOK |3831        |2383380.0|
|RITE AID OF NEW HAMPSHIRE, INC.|RITE AID #4127                 |LANCASTER |3246        |2356640.0|
|WAL-MART PHARMACY 10-2634      |null                           |GORHAM    |5323        |1555000.0|
|MAXI DRUG NORTH, INC.          |RITE AID #10287                |BERLIN    |2492        |997700.0 |
|LAPERLE'S IGA PHARMACY         |null                           |COLEBROOK |1936        |394600.0 |
|RITE AID OF NEW HAMPSHIRE INC  |RITE AID PHARMACY #4157        |GORHAM    |858         |202600.0 |
|RITE AID OF NEW HAMPSHIRE INC  |null                           |BERLIN    |410         |199700.0 |


In [36]:
##did the average dose over time present any kind of obvious trend?
strength_vals = sqlContext.sql(
                        "SELECT \
                            BUYER_NAME, BUYER_ADDL_CO_INFO, \
                            RIGHT(TRANSACTION_DATE,4) AS YEAR, \
                            SUM(DOSAGE_UNIT) AS TOTAL_PILLS, \
                            FORMAT_NUMBER(AVG(dos_str),2) AS AVG_DOSE \
                         FROM NHOpioids \
                         WHERE BUYER_COUNTY = 'COOS' \
                         GROUP BY BUYER_NAME, BUYER_ADDL_CO_INFO, YEAR\
                         ORDER BY BUYER_NAME, BUYER_ADDL_CO_INFO, YEAR DESC") #instantaneous

In [37]:
strength_vals.show(50,False)

+-------------------------------+-------------------------------+----+-----------+--------+
|BUYER_NAME                     |BUYER_ADDL_CO_INFO             |YEAR|TOTAL_PILLS|AVG_DOSE|
+-------------------------------+-------------------------------+----+-----------+--------+
|LAPERLE'S IGA PHARMACY         |null                           |2010|128700.0   |15.01   |
|LAPERLE'S IGA PHARMACY         |null                           |2009|149300.0   |16.06   |
|LAPERLE'S IGA PHARMACY         |null                           |2008|79500.0    |13.80   |
|LAPERLE'S IGA PHARMACY         |null                           |2007|33700.0    |15.00   |
|LAPERLE'S IGA PHARMACY         |null                           |2006|3400.0     |10.62   |
|MAXI DRUG NORTH, INC.          |RITE AID #10287                |2012|212540.0   |13.71   |
|MAXI DRUG NORTH, INC.          |RITE AID #10287                |2011|189160.0   |14.09   |
|MAXI DRUG NORTH, INC.          |RITE AID #10287                |2010|180600.0  

In [49]:
##rewritten with DEA ID:
strength_vals = sqlContext.sql(
                        "SELECT \
                            BUYER_NAME, BUYER_ADDL_CO_INFO, BUYER_DEA_NO, BUYER_CITY, \
                            RIGHT(TRANSACTION_DATE,4) AS YEAR, \
                            SUM(DOSAGE_UNIT) AS TOTAL_PILLS, \
                            FORMAT_NUMBER(AVG(dos_str),2) AS AVG_DOSE \
                         FROM NHOpioids \
                         WHERE BUYER_COUNTY = 'COOS' \
                         GROUP BY BUYER_NAME, BUYER_ADDL_CO_INFO, BUYER_DEA_NO, BUYER_CITY, YEAR\
                         ORDER BY BUYER_DEA_NO, YEAR DESC") #instantaneous

strength_vals.show(50,False)

+-------------------------------+-------------------------------+------------+----------+----+-----------+--------+
|BUYER_NAME                     |BUYER_ADDL_CO_INFO             |BUYER_DEA_NO|BUYER_CITY|YEAR|TOTAL_PILLS|AVG_DOSE|
+-------------------------------+-------------------------------+------------+----------+----+-----------+--------+
|MAXI DRUG NORTH, INC.          |RITE AID #10287                |BM5180601   |BERLIN    |2007|35900.0    |11.91   |
|MAXI DRUG NORTH, INC.          |RITE AID #10287                |BM5180601   |BERLIN    |2006|81300.0    |12.52   |
|RITE AID OF NEW HAMPSHIRE INC  |null                           |BR3822978   |BERLIN    |2007|73800.0    |14.56   |
|RITE AID OF NEW HAMPSHIRE INC  |null                           |BR3822978   |BERLIN    |2006|125900.0   |12.88   |
|RITE AID OF NEW HAMPSHIRE, INC.|RITE AID #4138                 |BR4157738   |COLEBROOK |2012|474950.0   |14.61   |
|RITE AID OF NEW HAMPSHIRE, INC.|RITE AID #4138                 |BR41577

In [53]:
### We've identified DEA NO. BR4157738 and BR4157841 as our high offenders.
### But clearly Colebrook, BR4157738, experienced most dramatic change over time, while Lancaster, BR4157841, rose more steadily.

### Who did they buy from? What were the changes?

sellers = sqlContext.sql(
                        "SELECT \
                            REPORTER_DEA_NO, REPORTER_NAME, REPORTER_ADDL_CO_INFO,\
                            RIGHT(TRANSACTION_DATE,4) AS YEAR, \
                            SUM(DOSAGE_UNIT) AS TOTAL_PILLS \
                         FROM NHOpioids \
                         WHERE BUYER_DEA_NO = 'BR4157738' \
                         GROUP BY REPORTER_DEA_NO, REPORTER_NAME, REPORTER_ADDL_CO_INFO, YEAR\
                         ORDER BY YEAR DESC, TOTAL_PILLS DESC")

sellers.show(50,False)

+---------------+------------------------+--------------------------------------------------+----+-----------+
|REPORTER_DEA_NO|REPORTER_NAME           |REPORTER_ADDL_CO_INFO                             |YEAR|TOTAL_PILLS|
+---------------+------------------------+--------------------------------------------------+----+-----------+
|PM0020850      |MCKESSON CORPORATION    |null                                              |2012|331540.0   |
|RE0356003      |ECKERD CORPORATION      |D/B/A RITE AID LIVERPOOL DISTRIBUTION CENTER (#55)|2012|119510.0   |
|RA0287020      |ANDA PHARMACEUTICALS INC|null                                              |2012|23800.0    |
|RA0180733      |ANDA, INC               |null                                              |2012|100.0      |
|PM0020850      |MCKESSON CORPORATION    |null                                              |2011|384860.0   |
|RE0356003      |ECKERD CORPORATION      |D/B/A RITE AID LIVERPOOL DISTRIBUTION CENTER (#55)|2011|109820.0   |
|

In [27]:
### There is no obvious change over time in the average dose strength of the pills, just the number of pills.

### Deep-dive into who the big offenders bought from.

### It looks like somes pharmacies either stopped operating or selling opioids after '07-'08,
###  which may have driven up the numbers at the remaining large chain locations (Rite Aids, Walmart)