In [1]:
import os
import sys
from pyspark.sql.functions import sum as _sum, max as _max, min as _min

# Preparing the data
## Ingest dictionary and source file

In [5]:
datadict = spark.read.option("sep", ",").option("header","true").csv("gs://119-f19-opioidbucket/data_dictionary.csv")

DataFrame[ColumnName: string, Description: string]

In [11]:
datadict.show()

+--------------------+--------------------+
|          ColumnName|         Description|
+--------------------+--------------------+
|     REPORTER_DEA_NO|Unique id of enti...|
|    REPORTER_BUS_ACT|Type of business ...|
|       REPORTER_NAME|Name of entity re...|
|REPORTER_ADDL_CO_...|Additional compan...|
|   REPORTER_ADDRESS1|Address of entity...|
|   REPORTER_ADDRESS2|Additional addres...|
|       REPORTER_CITY|City of entity re...|
|      REPORTER_STATE|State of entity r...|
|        REPORTER_ZIP|Zip code of entit...|
|     REPORTER_COUNTY|County of entity ...|
|        BUYER_DEA_NO|Unique id of enti...|
|       BUYER_BUS_ACT|Type of business ...|
|          BUYER_NAME|Name of entity re...|
|  BUYER_ADDL_CO_INFO|Additional compan...|
|      BUYER_ADDRESS1|Address of entity...|
|      BUYER_ADDRESS2|Additional addres...|
|          BUYER_CITY|City of entity re...|
|         BUYER_STATE|State of entity r...|
|           BUYER_ZIP|Zip code of entit...|
|        BUYER_COUNTY|County of 

In [None]:
df = spark.read.option("sep", "\t").option("header", "true").option("inferSchema", "true").csv("gs://119-f19-opioidbucket/arcos_all_washpost.tsv")
# this take s a minute or two.
df.printSchema()

root
 |-- REPORTER_DEA_NO: string (nullable = true)
 |-- REPORTER_BUS_ACT: string (nullable = true)
 |-- REPORTER_NAME: string (nullable = true)
 |-- REPORTER_ADDL_CO_INFO: string (nullable = true)
 |-- REPORTER_ADDRESS1: string (nullable = true)
 |-- REPORTER_ADDRESS2: string (nullable = true)
 |-- REPORTER_CITY: string (nullable = true)
 |-- REPORTER_STATE: string (nullable = true)
 |-- REPORTER_ZIP: integer (nullable = true)
 |-- REPORTER_COUNTY: string (nullable = true)
 |-- BUYER_DEA_NO: string (nullable = true)
 |-- BUYER_BUS_ACT: string (nullable = true)
 |-- BUYER_NAME: string (nullable = true)
 |-- BUYER_ADDL_CO_INFO: string (nullable = true)
 |-- BUYER_ADDRESS1: string (nullable = true)
 |-- BUYER_ADDRESS2: string (nullable = true)
 |-- BUYER_CITY: string (nullable = true)
 |-- BUYER_STATE: string (nullable = true)
 |-- BUYER_ZIP: integer (nullable = true)
 |-- BUYER_COUNTY: string (nullable = true)
 |-- TRANSACTION_CODE: string (nullable = true)
 |-- DRUG_CODE: integer (null

In [None]:
# df.count() # => 178,598,026 records!

In [None]:
totalcount = 178598026

## Paring down to NH

In [13]:
df1 = df.filter(df.BUYER_STATE == 'NH')

In [None]:
# df1.count() #757944. This took a WHILE!
NHcount = 757944

In [14]:
df1.head()

Row(REPORTER_DEA_NO=u'PB0020139', REPORTER_BUS_ACT=u'DISTRIBUTOR', REPORTER_NAME=u'BURLINGTON DRUG COMPANY', REPORTER_ADDL_CO_INFO=u'null', REPORTER_ADDRESS1=u'91 CATAMOUNT DR', REPORTER_ADDRESS2=u'null', REPORTER_CITY=u'MILTON', REPORTER_STATE=u'VT', REPORTER_ZIP=5468, REPORTER_COUNTY=u'CHITTENDEN', BUYER_DEA_NO=u'AB3017212', BUYER_BUS_ACT=u'RETAIL PHARMACY', BUYER_NAME=u'BANNON PHARMACY INC', BUYER_ADDL_CO_INFO=u'null', BUYER_ADDRESS1=u'109 PLEASANT ST', BUYER_ADDRESS2=u'null', BUYER_CITY=u'CLAREMONT', BUYER_STATE=u'NH', BUYER_ZIP=3743, BUYER_COUNTY=u'SULLIVAN', TRANSACTION_CODE=u'S', DRUG_CODE=9193, NDC_NO=u'53746011805', DRUG_NAME=u'HYDROCODONE', QUANTITY=1.0, UNIT=u'null', ACTION_INDICATOR=u'null', ORDER_FORM_NO=u'null', CORRECTION_NO=u'null', STRENGTH=u'null', TRANSACTION_DATE=9082008, CALC_BASE_WT_IN_GM=2.27025, DOSAGE_UNIT=u'500.0', TRANSACTION_ID=803008893, Product_Name=u'HYDROCODONE.BITARTRATE 7.5MG/APAP 75', Ingredient_Name=u'HYDROCODONE BITARTRATE HEMIPENTAHYDRATE', Measure

In [15]:
df1 = df1.cache()

In [54]:
### lets get sql with it###
from pyspark.sql import SQLContext

sqlContext = SQLContext(sc)

df1.createOrReplaceTempView("NHOpioids")

results = sqlContext.sql("SELECT BUYER_NAME, BUYER_ADDL_CO_INFO, \
                         COUNT(TRANSACTION_id) AS REPORT_COUNT,\
                         SUM(DOSAGE_UNIT) AS PILL_COUNT \
                         FROM NHOpioids GROUP BY BUYER_NAME,BUYER_ADDL_CO_INFO") #instantaneous

In [52]:
# results.show() #this collect step hurts! Takes a long time.
results.printSchema()
# what I actually want to do: sort and get max value, by column name.

root
 |-- BUYER_NAME: string (nullable = true)
 |-- BUYER_ADDL_CO_INFO: string (nullable = true)
 |-- YEAR: string (nullable = true)
 |-- REPORT_COUNT: long (nullable = false)
 |-- PILL_COUNT: double (nullable = true)



In [61]:
results.sort(results.PILL_COUNT.desc()).show(20,False)
# df2.sort(df2.InvoiceDateClean.desc()).show(1)
# +------------+----+------------+----------+
# |BUYER_DEA_NO|YEAR|REPORT_COUNT|PILL_COUNT|
# +------------+----+------------+----------+
# |   BA5257313|2009|         468| 1127500.0|
# +------------+----+------------+----------+

+----------------------------------+--------------------------+------------+----------+
|BUYER_NAME                        |BUYER_ADDL_CO_INFO        |REPORT_COUNT|PILL_COUNT|
+----------------------------------+--------------------------+------------+----------+
|NEIGHBORCARE OF NEW HAMPSHIRE, LLC|OMNICARE OF NEW HAMPSHIRE |3061        |5735260.0 |
|WALGREEN EASTERN CO., INC.        |DBA: WALGREENS # 03520    |6735        |4951350.0 |
|CVS MANCHESTER NH, L.L.C.         |DBA: CVS/PHARMACY  # 00639|6150        |4128500.0 |
|THE PRESCRIPTION CENTER           |null                      |18284       |4060070.0 |
|DARTMOUTH-HITCHCOCK PHARMACY      |null                      |8728        |4035460.0 |
|CVS MANCHESTER NH, L.L.C.         |DBA: CVS/PHARMACY  # 00640|4382        |3637300.0 |
|CARE PHARMACY                     |null                      |10325       |3236450.0 |
|CVS MANCHESTER NH, L.L.C.         |DBA: CVS/PHARMACY  # 02257|5460        |3170160.0 |
|CVS MANCHESTER NH, L.L.C.      

In [74]:
strength_vals = sqlContext.sql("SELECT BUYER_NAME, BUYER_ADDL_CO_INFO, \
                            RIGHT(TRANSACTION_DATE,4) AS YEAR, \
                            SUM(DOSAGE_UNIT) AS TOTAL_PILLS, \
                            FORMAT_NUMBER(AVG(dos_str),2) AS AVG_DOSE \
                         FROM NHOpioids GROUP BY BUYER_NAME, BUYER_ADDL_CO_INFO, YEAR\
                         ORDER BY TOTAL_PILLS DESC") #instantaneous

In [75]:
strength_vals.show(20,False)

+----------------------------------+--------------------------+----+-----------+--------+
|BUYER_NAME                        |BUYER_ADDL_CO_INFO        |YEAR|TOTAL_PILLS|AVG_DOSE|
+----------------------------------+--------------------------+----+-----------+--------+
|NEIGHBORCARE OF NEW HAMPSHIRE, LLC|OMNICARE OF NEW HAMPSHIRE |2009|1127500.0  |14.87   |
|NEIGHBORCARE OF NEW HAMPSHIRE, LLC|OMNICARE OF NEW HAMPSHIRE |2011|1030640.0  |16.66   |
|NEIGHBORCARE OF NEW HAMPSHIRE, LLC|OMNICARE OF NEW HAMPSHIRE |2012|1027600.0  |15.55   |
|NEIGHBORCARE OF NEW HAMPSHIRE, LLC|OMNICARE OF NEW HAMPSHIRE |2010|976540.0   |16.07   |
|WALGREEN EASTERN CO., INC.        |DBA: WALGREENS # 03520    |2011|822600.0   |13.64   |
|WALGREEN EASTERN CO., INC.        |DBA: WALGREENS # 03520    |2010|815400.0   |14.40   |
|WALGREEN EASTERN CO., INC.        |DBA: WALGREENS # 03520    |2009|724200.0   |12.67   |
|WALGREEN EASTERN CO., INC.        |DBA: WALGREENS # 03520    |2008|717400.0   |14.19   |
|WALGREEN 

In [None]:
###potential next steps: save output of something like the above to a dataframe; graph strength, counts over time. see if any correlation?
### perhaps see if correlation between pillcount, avg_dose, some combination by COUNTY vs. OD rate by county.