# Welcome!

### This notebook is the first in a series that builds the Green Cross Green Shield (GCGS) data analysis project using Medicare Data

### Note:  GCGS requires SageMaker Studio

# Start the "Data Science" Kernel
The kernel powers all of our notebook interactions.

### Click on "No Kernel" in the Upper Right
<!---  ![](img/select_kernel.png)  # Use HTML since ![]() does not support width  --->

<div>
<img src="img/select_kernel.png" width="500"/>  <!--- Use HTML since ![]() does not support width --->
</div>

### Select the `Data Science` Kernel
<!---  ![](img/select_data_science_kernel.png)  --->

<div>
<img src="img/select_data_science_kernel.png" width="500"/>
</div>

### Confirm the Kernel is Started in Upper Right
<!---  ![](img/confirm_kernel_started.png)   --->

<div>
<img src="img/confirm_kernel_started.png" width="500"/>
</div>

### NOTE:  YOU CANNOT CONTINUE UNTIL THE KERNEL IS STARTED
### ### PLEASE WAIT UNTIL THE KERNEL IS STARTED BEFORE CONTINUING!!! ###

# ----------------------------
# List of %StoreMagic local variables to avoid reuse

In [68]:
%store  
# List of %storemagic local variables from OTHER NOTEBOOKS (Avoid reuse)

Stored variables and their in-db values:
setup_gcgs_dependencies_passed             -> True
setup_gcgs_iam_roles_passed                -> True
setup_gcgs_s3_bucket_passed                -> True


# If not done, load all packages and dependencies using the 01_Setup_Dependencies.ipyn Notebook

In [69]:
setup_gcgs_dependencies_passed = True

In [70]:
%store setup_gcgs_dependencies_passed

Stored 'setup_gcgs_dependencies_passed' (bool)


# Load Packages and Create Session
### Sessions typically store the following: Credentials, AWS Region, Other configurations related to your profile
##### Like this: class sagemaker.session.Session(boto_session=None, sagemaker_client=None, sagemaker_runtime_client=None, sagemaker_featurestore_runtime_client=None, default_bucket=None, settings=<sagemaker.session_settings.SessionSettings object>)


In [71]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import boto3
import boto3.session
import sagemaker

gcgs_sess = boto3.session.Session() # Create our own session
gcgs_sess1 = boto3.Session() # MZ EXPERIMENT
gcgs_region = gcgs_sess.region_name
gcgs_sm_sess = sagemaker.Session(default_bucket="my-508-projects") # Assigned variable with sagemaker resources available
# ASSIGNED NEW DEFAULT BUCKET: "my-508-projects"
gcgs_bucket = gcgs_sm_sess.default_bucket()

s3 = boto3.Session().client(service_name="s3", region_name=gcgs_region)
s31 = gcgs_sess.client(service_name="s3", region_name=gcgs_region) # MZ EXPERIMENT

print(gcgs_sess)
print(gcgs_sess1)
print(gcgs_region)
print(gcgs_sm_sess)
print(gcgs_bucket)
print(s3)
s31

Session(region_name='us-east-1')
Session(region_name='us-east-1')
us-east-1
<sagemaker.session.Session object at 0x7f9d784f82d0>
my-508-projects
<botocore.client.S3 object at 0x7f9d4f537950>


<botocore.client.S3 at 0x7f9d4cc2ae90>

In [72]:
# CODE FOR A DEFAULT SESSION
# sqs = boto3.client('sqs')
# s3 = boto3.resource('s3')

# CODE TO CREATE OUR OWN SESSION (with low-level clients or resource clients from our custom session)
# my_session = boto3.session.Session()
# sqs = my_session.client('sqs')
# s3 = my_session.resource('s3')

In [73]:
setup_gcgs_s3_bucket_passed = False

In [74]:
print("Default bucket: {}".format(gcgs_bucket))

Default bucket: my-508-projects


# Verify S3_BUCKET Bucket Creation

In [75]:
%%bash

aws s3 ls s3://${bucket}/  # CLI command to list S3 buckets

2022-03-28 18:52:26 my-508-projects
2022-03-30 21:44:18 sagemaker-studio-850528502467-h2e8ll9c0fn
2022-03-28 18:24:25 sagemaker-studio-850528502467-rrgcrpja7gg
2022-03-28 20:50:51 sagemaker-us-east-1-850528502467


In [76]:
from botocore.client import ClientError

response = None

try:
    response = s3.head_bucket(Bucket=gcgs_bucket)
    print(response)
    setup_gcgs_s3_bucket_passed = True
except ClientError as e:
    print("[ERROR] Cannot find bucket {} in {} due to {}.".format(gcgs_bucket, response, e))

{'ResponseMetadata': {'RequestId': '9JVEX84XKWA2CVY2', 'HostId': 'EbnzELk1ozhjXgbX7KGCyEddRIS4g5/BKptihZ2Lb1G1d1/CLllfNpjaE42RVCx2tPZ15WmTHVE=', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amz-id-2': 'EbnzELk1ozhjXgbX7KGCyEddRIS4g5/BKptihZ2Lb1G1d1/CLllfNpjaE42RVCx2tPZ15WmTHVE=', 'x-amz-request-id': '9JVEX84XKWA2CVY2', 'date': 'Thu, 31 Mar 2022 00:02:47 GMT', 'x-amz-bucket-region': 'us-east-1', 'x-amz-access-point-alias': 'false', 'content-type': 'application/xml', 'server': 'AmazonS3'}, 'RetryAttempts': 0}}


In [77]:
%store setup_gcgs_s3_bucket_passed

Stored 'setup_gcgs_s3_bucket_passed' (bool)


In [78]:
%store 
# Note new variables stored for later

Stored variables and their in-db values:
setup_gcgs_dependencies_passed             -> True
setup_gcgs_iam_roles_passed                -> True
setup_gcgs_s3_bucket_passed                -> True


# Update IAM Roles and Policies

In [79]:
import time
from time import gmtime, strftime

#sagemaker_session = sagemaker.Session()
#bucket = sagemaker_session.default_bucket()
#region = boto3.Session().region_name

from botocore.config import Config
config = Config(retries={"max_attempts": 10, "mode": "adaptive"})
iam = boto3.client("iam", config=config)
print(config)
print(iam)

<botocore.config.Config object at 0x7f9d4cbd5f10>
<botocore.client.IAM object at 0x7f9d4cba4110>


## Get SageMaker Execution Role Name

In [80]:
role = sagemaker.get_execution_role()
print(role)
role_name = role.split("/")[-1]
print("Role name: {}".format(role_name))

arn:aws:iam::850528502467:role/LabRole
Role name: LabRole


In [81]:
setup_gcgs_iam_roles_passed = False

# **Pre-Requisite:  SageMaker notebook instance ExecutionRole contains `AdministratorAccess` Policy.**
_Note:  The permissions used here are for demonstration purposes only.  Please follow least-privilege security principals appropriate for your environment._

In [82]:
admin = False
gcgs_post_policies = iam.list_attached_role_policies(RoleName=role_name)["AttachedPolicies"]
print(gcgs_post_policies)
for post_policy in gcgs_post_policies:
    if post_policy["PolicyName"] == "AdministratorAccess":
        admin = True
        setup_gcgs_iam_roles_passed = True
        print("[OK] You are all set up to continue with this workshop!")
        break
    else:
        print("*************** [ERROR] SageMakerExecutionRole needs the AdministratorAccess policy attached. *****************")

[{'PolicyName': 'c50727a768849l1711245t1w850528502467-VocLabPolicy2-GOX5BAKNM5FS', 'PolicyArn': 'arn:aws:iam::850528502467:policy/c50727a768849l1711245t1w850528502467-VocLabPolicy2-GOX5BAKNM5FS'}, {'PolicyName': 'c50727a768849l1711245t1w850528502467-VocLabPolicy3-1GXR7BAXB5BR7', 'PolicyArn': 'arn:aws:iam::850528502467:policy/c50727a768849l1711245t1w850528502467-VocLabPolicy3-1GXR7BAXB5BR7'}, {'PolicyName': 'c50727a768849l1711245t1w850528502467-VocLabPolicy1-1KJRJL9IGOVSU', 'PolicyArn': 'arn:aws:iam::850528502467:policy/c50727a768849l1711245t1w850528502467-VocLabPolicy1-1KJRJL9IGOVSU'}, {'PolicyName': 'AdministratorAccess', 'PolicyArn': 'arn:aws:iam::aws:policy/AdministratorAccess'}, {'PolicyName': 'AmazonSSMManagedInstanceCore', 'PolicyArn': 'arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore'}]
*************** [ERROR] SageMakerExecutionRole needs the AdministratorAccess policy attached. *****************
*************** [ERROR] SageMakerExecutionRole needs the AdministratorAccess po

In [83]:
%store setup_gcgs_iam_roles_passed

Stored 'setup_gcgs_iam_roles_passed' (bool)


In [84]:
%store

Stored variables and their in-db values:
setup_gcgs_dependencies_passed             -> True
setup_gcgs_iam_roles_passed                -> True
setup_gcgs_s3_bucket_passed                -> True


# *Final Check*

In [85]:
# role = iam.get_role(RoleName=role_name)
post_policies = iam.list_attached_role_policies(RoleName=role_name)["AttachedPolicies"]

required_policies = [
    "AdministratorAccess",
#     "SecretsManagerReadWrite",
#     "IAMFullAccess",
#     "AmazonS3FullAccess",
#     "AmazonAthenaFullAccess",
#     "ComprehendFullAccess",
#     "AmazonEC2ContainerRegistryFullAccess",
#     "AmazonRedshiftFullAccess",
#     "AWSStepFunctionsFullAccess",
#     "AmazonSageMakerFullAccess",
#     "AmazonKinesisFullAccess",
#     "AmazonKinesisFirehoseFullAccess",
#     "AmazonKinesisAnalyticsFullAccess",
]

admin = False

for post_policy in post_policies:
    if post_policy["PolicyName"] == "AdministratorAccess":
        admin = True
        try:
            required_policies.remove(post_policy["PolicyName"])
        except:
            break
    else:
        try:
            required_policies.remove(post_policy["PolicyName"])
        except:
            pass

if not admin and len(required_policies) > 0:
    setup_gcgs_iam_roles_passed = False
    print("*************** [ERROR] RE-RUN THIS NOTEBOOK *****************")
    for required_policy in required_policies:
        print("Not Attached: {}".format(required_policy))
else:
    setup_gcgs_iam_roles_passed = True
    print("[OK] You are all set up to continue with this workshop!")

[OK] You are all set up to continue with this workshop!


In [86]:
%store setup_gcgs_iam_roles_passed

Stored 'setup_gcgs_iam_roles_passed' (bool)


In [87]:
%store

Stored variables and their in-db values:
setup_gcgs_dependencies_passed             -> True
setup_gcgs_iam_roles_passed                -> True
setup_gcgs_s3_bucket_passed                -> True


## Load the NDC (FDA) File from S3 for Transformation

In [88]:
!aws s3 cp 's3://my-508-projects/ndc-file/ndc_excel3.csv' ./data_transform/

download: s3://my-508-projects/ndc-file/ndc_excel3.csv to data_transform/ndc_excel3.csv


In [89]:
import csv
# Change from UTF-8 encoding required for Window Excel file (from Stack Exchange)
ndc = pd.read_csv(
    r"./data_transform/ndc_excel3.csv",
    encoding = "ISO-8859-1",
    engine = 'python',
    dtype={'NDC5_4':str, 'PROPRIETARYNAME':str, 'NONPROPRIETARYNAME':str}  # Must encode NDC with leading 0s as 'str'
# delimiter="\t",
# quoting=csv.QUOTE_NON##E,
# compression="gzip",
)

ndc.shape

(112259, 21)

In [90]:
ndc.head(5)

Unnamed: 0,PRODUCTID,NDC5_4,PRODUCTTYPENAME,PROPRIETARYNAME,PROPRIETARYNAMESUFFIX,NONPROPRIETARYNAME,DOSAGEFORMNAME,DOSAGEFORMNAME2,DOSAGEFORMNAME3,DOSAGEFORMNAME4,...,ROUTENAME2,ROUTENAME3,ROUTENAME4,SUBSTANCENAME,ACTIVE_NUMERATOR_STRENGTH,ACTIVE_INGRED_UNIT,PHARM_CLASSES,DEASCHEDULE,NDC_EXCLUDE_FLAG,LISTING_RECORD_CERTIFIED_THROUGH
0,0002-0800_662164fd-5ea0-4a08-bfd1-6b08bdd73342,20800,HUMAN OTC DRUG,Sterile Diluent,,diluent,INJECTION,SOLUTION,,,...,,,,WATER,1.0,mL/mL,,,N,20221231.0
1,0002-1200_480fceef-6596-4478-97de-677c155506b3,21200,HUMAN PRESCRIPTION DRUG,Amyvid,,Florbetapir F 18,INJECTION,SOLUTION,,,...,,,,FLORBETAPIR F-18,51.0,mCi/mL,"Positron Emitting Activity [MoA], Radioactive ...",,N,20221231.0
2,0002-1210_151a431b-f07b-4959-b6fa-c41ff80364c8,21210,HUMAN PRESCRIPTION DRUG,TAUVID,,Flortaucipir F-18,INJECTION,SOLUTION,,,...,,,,FLORTAUCIPIR F-18,51.0,mCi/mL,,,N,20221231.0
3,0002-1433_69bd3896-91f6-4960-8538-2880159588c6,21433,HUMAN PRESCRIPTION DRUG,Trulicity,,Dulaglutide,INJECTION,SOLUTION,,,...,,,,DULAGLUTIDE,0.75,mg/.5mL,"GLP-1 Receptor Agonist [EPC], Glucagon-Like Pe...",,N,20231231.0
4,0002-1434_69bd3896-91f6-4960-8538-2880159588c6,21434,HUMAN PRESCRIPTION DRUG,Trulicity,,Dulaglutide,INJECTION,SOLUTION,,,...,,,,DULAGLUTIDE,1.5,mg/.5mL,"GLP-1 Receptor Agonist [EPC], Glucagon-Like Pe...",,N,20231231.0


## Transform the NDC Dataframe in Python Before Uploading Back to S3 as CSV

In [91]:
df = ndc['PROPRIETARYNAME'].str.lower()
ndc = ndc.assign(prop_name_low = df)
df1 = ndc['NONPROPRIETARYNAME'].str.lower()
ndc = ndc.assign(nonprop_name_low = df1)
ndc.head(5)

Unnamed: 0,PRODUCTID,NDC5_4,PRODUCTTYPENAME,PROPRIETARYNAME,PROPRIETARYNAMESUFFIX,NONPROPRIETARYNAME,DOSAGEFORMNAME,DOSAGEFORMNAME2,DOSAGEFORMNAME3,DOSAGEFORMNAME4,...,ROUTENAME4,SUBSTANCENAME,ACTIVE_NUMERATOR_STRENGTH,ACTIVE_INGRED_UNIT,PHARM_CLASSES,DEASCHEDULE,NDC_EXCLUDE_FLAG,LISTING_RECORD_CERTIFIED_THROUGH,prop_name_low,nonprop_name_low
0,0002-0800_662164fd-5ea0-4a08-bfd1-6b08bdd73342,20800,HUMAN OTC DRUG,Sterile Diluent,,diluent,INJECTION,SOLUTION,,,...,,WATER,1.0,mL/mL,,,N,20221231.0,sterile diluent,diluent
1,0002-1200_480fceef-6596-4478-97de-677c155506b3,21200,HUMAN PRESCRIPTION DRUG,Amyvid,,Florbetapir F 18,INJECTION,SOLUTION,,,...,,FLORBETAPIR F-18,51.0,mCi/mL,"Positron Emitting Activity [MoA], Radioactive ...",,N,20221231.0,amyvid,florbetapir f 18
2,0002-1210_151a431b-f07b-4959-b6fa-c41ff80364c8,21210,HUMAN PRESCRIPTION DRUG,TAUVID,,Flortaucipir F-18,INJECTION,SOLUTION,,,...,,FLORTAUCIPIR F-18,51.0,mCi/mL,,,N,20221231.0,tauvid,flortaucipir f-18
3,0002-1433_69bd3896-91f6-4960-8538-2880159588c6,21433,HUMAN PRESCRIPTION DRUG,Trulicity,,Dulaglutide,INJECTION,SOLUTION,,,...,,DULAGLUTIDE,0.75,mg/.5mL,"GLP-1 Receptor Agonist [EPC], Glucagon-Like Pe...",,N,20231231.0,trulicity,dulaglutide
4,0002-1434_69bd3896-91f6-4960-8538-2880159588c6,21434,HUMAN PRESCRIPTION DRUG,Trulicity,,Dulaglutide,INJECTION,SOLUTION,,,...,,DULAGLUTIDE,1.5,mg/.5mL,"GLP-1 Receptor Agonist [EPC], Glucagon-Like Pe...",,N,20231231.0,trulicity,dulaglutide


In [92]:
#Drop unused columns

ndc_small = ndc [["NDC5_4",
                  "PRODUCTTYPENAME",
                  "prop_name_low",
                  "nonprop_name_low",
                  "DOSAGEFORMNAME",
                  "ROUTENAME"]]
ndc_small = ndc_small.rename(columns = {'prop_name_low': 'PROPRIETARYNAME', 
                                        'nonprop_name_low': 'NONPROPRIETARYNAME' })
print(ndc_small.shape)
ndc_small.head(5)


(112259, 6)


Unnamed: 0,NDC5_4,PRODUCTTYPENAME,PROPRIETARYNAME,NONPROPRIETARYNAME,DOSAGEFORMNAME,ROUTENAME
0,20800,HUMAN OTC DRUG,sterile diluent,diluent,INJECTION,SUBCUTANEOUS
1,21200,HUMAN PRESCRIPTION DRUG,amyvid,florbetapir f 18,INJECTION,INTRAVENOUS
2,21210,HUMAN PRESCRIPTION DRUG,tauvid,flortaucipir f-18,INJECTION,INTRAVENOUS
3,21433,HUMAN PRESCRIPTION DRUG,trulicity,dulaglutide,INJECTION,SUBCUTANEOUS
4,21434,HUMAN PRESCRIPTION DRUG,trulicity,dulaglutide,INJECTION,SUBCUTANEOUS


In [93]:
ndc_human = ndc_small[ndc_small.PRODUCTTYPENAME == "HUMAN PRESCRIPTION DRUG"]
ndc_human.shape

(51205, 6)

In [94]:
def ISGENERIC(row):
    if row['PROPRIETARYNAME'] == row['NONPROPRIETARYNAME']:
        return 1
    return 0

df=ndc_human.apply(lambda row: ISGENERIC(row), axis = 1)
print(type(df))
df.value_counts()

#There are 40778 generics in NDC

<class 'pandas.core.series.Series'>


1    40080
0    11125
dtype: int64

In [95]:
#isgen=pd.Series(df)
ndc_gen = ndc_human.assign(isgeneric = df)
ndc_gen.shape

(51205, 7)

In [96]:
#Select the forms and routes of drugs most likely to be dispensed by a pharmacy directly to patients

ndc_gen_D = ndc_gen[((ndc_gen.DOSAGEFORMNAME == "AEROSOL") 
                    | (ndc_gen.DOSAGEFORMNAME == "CAPSULE")
                    | (ndc_gen.DOSAGEFORMNAME == "GRANULE")
                    | (ndc_gen.DOSAGEFORMNAME == "INJECTION")
                    | (ndc_gen.DOSAGEFORMNAME == "LIQUID")
                    | (ndc_gen.DOSAGEFORMNAME == "PATCH")
                    | (ndc_gen.DOSAGEFORMNAME == "PILL")
                    | (ndc_gen.DOSAGEFORMNAME == "POWDER")
                    | (ndc_gen.DOSAGEFORMNAME == "SALVE")
                    | (ndc_gen.DOSAGEFORMNAME == "SUPPOSITORY")
                    | (ndc_gen.DOSAGEFORMNAME == "SUSPENSION")
                    | (ndc_gen.DOSAGEFORMNAME == "SYRUP")
                    | (ndc_gen.DOSAGEFORMNAME == "TABLET")) &
                    ((ndc_gen.ROUTENAME == "OTIC") 
                    | (ndc_gen.ROUTENAME == "CONJUNCTIVAL")
                    | (ndc_gen.ROUTENAME == "CUTANEOUS")
                    | (ndc_gen.ROUTENAME == "NASAL")
                    | (ndc_gen.ROUTENAME == "OPHTHALMIC")
                    | (ndc_gen.ROUTENAME == "ORAL")
                    | (ndc_gen.ROUTENAME == "OROPHARYNGEAL")
                    | (ndc_gen.ROUTENAME == "RECTAL")
                    | (ndc_gen.ROUTENAME == "SUBCUTANEOUS")
                    | (ndc_gen.ROUTENAME == "SUBLINGUAL")
                    | (ndc_gen.ROUTENAME == "SUBMUCOSAL")
                    | (ndc_gen.ROUTENAME == "TOPICAL")
                    | (ndc_gen.ROUTENAME == "TRANSDERMAL")
                    | (ndc_gen.ROUTENAME == "VAGINAL"))
                   ]
ndc_gen_D.shape

(37519, 7)

In [97]:
def ISORAL(row):
    if ((row['ROUTENAME'] == 'ORAL')):
#        | (row['ROUTENAME'] == 'OROPHARYNGEAL')  #These are 0
#        | (row['ROUTENAME'] == 'SUBLINGUAL')):   #These are 0
        return 1
    return 0

df=ndc_gen_D.apply(lambda row: ISORAL(row), axis = 1)
print(type(df))
df.value_counts()

#There are 40778 generics in NDC

<class 'pandas.core.series.Series'>


1    36161
0     1358
dtype: int64

In [98]:
#isgen=pd.Series(df)
ndc_gen_o = ndc_gen_D.assign(isoral = df)
ndc_gen_o.shape

(37519, 8)

In [99]:
ndc_gen_o.head(5)

Unnamed: 0,NDC5_4,PRODUCTTYPENAME,PROPRIETARYNAME,NONPROPRIETARYNAME,DOSAGEFORMNAME,ROUTENAME,isgeneric,isoral
3,21433,HUMAN PRESCRIPTION DRUG,trulicity,dulaglutide,INJECTION,SUBCUTANEOUS,0,0
4,21434,HUMAN PRESCRIPTION DRUG,trulicity,dulaglutide,INJECTION,SUBCUTANEOUS,0,0
5,21436,HUMAN PRESCRIPTION DRUG,emgality,galcanezumab-gnlm,INJECTION,SUBCUTANEOUS,0,0
6,21445,HUMAN PRESCRIPTION DRUG,taltz,ixekizumab,INJECTION,SUBCUTANEOUS,0,0
7,22236,HUMAN PRESCRIPTION DRUG,trulicity,dulaglutide,INJECTION,SUBCUTANEOUS,0,0


In [100]:
ndc_gen_o.head(5)

Unnamed: 0,NDC5_4,PRODUCTTYPENAME,PROPRIETARYNAME,NONPROPRIETARYNAME,DOSAGEFORMNAME,ROUTENAME,isgeneric,isoral
3,21433,HUMAN PRESCRIPTION DRUG,trulicity,dulaglutide,INJECTION,SUBCUTANEOUS,0,0
4,21434,HUMAN PRESCRIPTION DRUG,trulicity,dulaglutide,INJECTION,SUBCUTANEOUS,0,0
5,21436,HUMAN PRESCRIPTION DRUG,emgality,galcanezumab-gnlm,INJECTION,SUBCUTANEOUS,0,0
6,21445,HUMAN PRESCRIPTION DRUG,taltz,ixekizumab,INJECTION,SUBCUTANEOUS,0,0
7,22236,HUMAN PRESCRIPTION DRUG,trulicity,dulaglutide,INJECTION,SUBCUTANEOUS,0,0


## Re-encode the Transformed NDC df to csv and Send Back to S3

In [101]:
# PACKAGE DF AS CSV

filepath = "./data_transform/eng_ndc.csv"
ndc_gen_o.to_csv(filepath, index=False, header=True)

# COPY CSV TO S3

import time

# The timestamp is for creating a fresh folder.  Disabled here so S3 will write over with newly engineered files.
timestamp = int(time.time())
eng_df_s3_uri = gcgs_sm_sess.upload_data(bucket=gcgs_bucket, key_prefix="eng_ndc", path=filepath)
#engineered_df_s3_uri = gcgs_sm_sess.upload_data(bucket=gcgs_bucket, key_prefix="engineered_df-{}".format(timestamp), path=filepath)
print(eng_df_s3_uri)

# LIST DATA IN THE NEW S3 LOCATION

!aws s3 ls $eng_df_s3_uri


s3://my-508-projects/eng_ndc/eng_ndc.csv
2022-03-31 00:02:53    3319339 eng_ndc.csv


## Load the Drug Event File from S3 for Transformation

In [102]:
!aws s3 cp 's3://my-508-projects/drugevent/DE1_0_2008_to_2010_Prescription_Drug_Events_Sample_2.csv' ./data_transform/

download: s3://my-508-projects/drugevent/DE1_0_2008_to_2010_Prescription_Drug_Events_Sample_2.csv to data_transform/DE1_0_2008_to_2010_Prescription_Drug_Events_Sample_2.csv


In [103]:
import csv
# Change from UTF-8 encoding required for Window Excel file (from Stack Exchange)
drug_file_chunk = pd.read_csv(
    r"./data_transform/DE1_0_2008_to_2010_Prescription_Drug_Events_Sample_2.csv",
    dtype={'PROD_SRVC_ID':str},
#    dtype={'DESYNPUF_ID':str, 'PDE_ID':str, 'SRVC_DT':int, 'PROD_SRVC_ID':str,
#           'QTY_DSPNSD_NUM':int, 'DAYS_SUPLY_NUM':int, 'PTNT_PAY_AMT':int, 'TOT_RX_CST_AMT':int},
    chunksize = 500000,
#encoding = "ISO-8859-1",
#engine = 'python'
# delimiter="\t",
# quoting=csv.QUOTE_NON##E,
#compression="zip"
)
#for chunk in drug_file_chunk:
#    print(chunk)
drug_file = pd.concat(drug_file_chunk, ignore_index = True)
drug_file.shape

(5561154, 8)

In [104]:
drug_file.head(5)

Unnamed: 0,DESYNPUF_ID,PDE_ID,SRVC_DT,PROD_SRVC_ID,QTY_DSPNSD_NUM,DAYS_SUPLY_NUM,PTNT_PAY_AMT,TOT_RX_CST_AMT
0,00000B48BCF4AD29,83224466404678,20100207,185010401,30.0,30,0.0,10.0
1,00000B48BCF4AD29,83654467130740,20100312,115163303,100.0,30,0.0,30.0
2,00000B48BCF4AD29,83574462630098,20100421,117193205,20.0,20,0.0,160.0
3,00000B48BCF4AD29,83734462622581,20100427,19458016707,30.0,30,10.0,0.0
4,00000B48BCF4AD29,83594462991534,20100611,59746011109,30.0,30,0.0,0.0


## Transform the Drug Event Dataframe in Python Before Uploading Back to S3 as CSV

In [105]:
#PDE FILE COLUMNS
#DESYNPUF_ID 00000B48BCF4AD29
#PDE_ID 83224466404678
#SRVC_DT 20100207
#PROD_SRVC_ID 185010401
#QTY_DSPNSD_NUM 30.0
#DAYS_SUPLY_NUM 30
#PTNT_PAY_AMT 0.0
#TOT_RX_CST_AMT 10.0

#POTENTIAL FEATURES TO ENGINEER:
#Transform the NDC 11 digit number to 9
#Transform SRVC_DT
#Add total_cost = PTNT_PAY_AMT + TOT_RX_CST_AMT
#Add cost-per_day = total_cost/DAYS_SUPLY_NUM

In [106]:
#Count the length of the NDC numbers
foo = drug_file.PROD_SRVC_ID.astype(str).str.len().value_counts().reset_index() # reset_index is convert from series to data frame
print(foo)
#Create new column for NDC5_4 values
df= drug_file['PROD_SRVC_ID'].str.slice(0,9)
drug_file_9 = drug_file.assign(NDC5_4 = df)
drug_file_9.shape


   index  PROD_SRVC_ID
0     11       5561139
1      5            15


(5561154, 9)

In [107]:
drug_file_9.count()

DESYNPUF_ID       5561154
PDE_ID            5561154
SRVC_DT           5561154
PROD_SRVC_ID      5561154
QTY_DSPNSD_NUM    5561154
DAYS_SUPLY_NUM    5561154
PTNT_PAY_AMT      5561154
TOT_RX_CST_AMT    5561154
NDC5_4            5561154
dtype: int64

In [108]:
drug_file_9.head(10)

Unnamed: 0,DESYNPUF_ID,PDE_ID,SRVC_DT,PROD_SRVC_ID,QTY_DSPNSD_NUM,DAYS_SUPLY_NUM,PTNT_PAY_AMT,TOT_RX_CST_AMT,NDC5_4
0,00000B48BCF4AD29,83224466404678,20100207,185010401,30.0,30,0.0,10.0,1850104
1,00000B48BCF4AD29,83654467130740,20100312,115163303,100.0,30,0.0,30.0,1151633
2,00000B48BCF4AD29,83574462630098,20100421,117193205,20.0,20,0.0,160.0,1171932
3,00000B48BCF4AD29,83734462622581,20100427,19458016707,30.0,30,10.0,0.0,194580167
4,00000B48BCF4AD29,83594462991534,20100611,59746011109,30.0,30,0.0,0.0,597460111
5,0000525AB30E4DEF,83794463089433,20080113,367228196,30.0,30,30.0,70.0,3672281
6,0000525AB30E4DEF,83774462604852,20080115,49483000510,40.0,0,0.0,10.0,494830005
7,0000525AB30E4DEF,83474466939585,20080128,59628075200,30.0,30,10.0,30.0,596280752
8,0000525AB30E4DEF,83614462112487,20080214,63629174202,50.0,10,0.0,10.0,636291742
9,0000525AB30E4DEF,83074464673935,20080220,63739019515,30.0,30,10.0,10.0,637390195


In [109]:
#Transform SRVC_DT
#Add column total_cost = PTNT_PAY_AMT + TOT_RX_CST_AMT
#Add column cost-per_day = total_cost/DAYS_SUPLY_NUM

drug_file_9['total_cost'] = drug_file_9['PTNT_PAY_AMT'] + drug_file_9['TOT_RX_CST_AMT']
drug_file_9['cost_per_day'] = drug_file_9['total_cost']/drug_file_9['DAYS_SUPLY_NUM']
drug_file_9['date'] = pd.to_datetime(drug_file_9['SRVC_DT'], format = '%Y%m%d')
drug_file_9.head(5)

Unnamed: 0,DESYNPUF_ID,PDE_ID,SRVC_DT,PROD_SRVC_ID,QTY_DSPNSD_NUM,DAYS_SUPLY_NUM,PTNT_PAY_AMT,TOT_RX_CST_AMT,NDC5_4,total_cost,cost_per_day,date
0,00000B48BCF4AD29,83224466404678,20100207,185010401,30.0,30,0.0,10.0,1850104,10.0,0.333333,2010-02-07
1,00000B48BCF4AD29,83654467130740,20100312,115163303,100.0,30,0.0,30.0,1151633,30.0,1.0,2010-03-12
2,00000B48BCF4AD29,83574462630098,20100421,117193205,20.0,20,0.0,160.0,1171932,160.0,8.0,2010-04-21
3,00000B48BCF4AD29,83734462622581,20100427,19458016707,30.0,30,10.0,0.0,194580167,10.0,0.333333,2010-04-27
4,00000B48BCF4AD29,83594462991534,20100611,59746011109,30.0,30,0.0,0.0,597460111,0.0,0.0,2010-06-11


In [110]:
# Drop unused columns
#drug_file_9 = drug_file_9.drop('PDE_ID', 1)
drug_file_9 = drug_file_9.drop('QTY_DSPNSD_NUM', 1)
drug_file_9 = drug_file_9.drop('DAYS_SUPLY_NUM', 1)

In [111]:
drug_file_9 = drug_file_9.drop('SRVC_DT', 1)

In [112]:
drug_file_9 = drug_file_9.sample(frac = 0.1)

## Re-encode the Transformed Drug Event df to csv and Send Back to S3

In [113]:
# PACKAGE DF AS CSV

filepath = "./data_transform/eng_pde.csv"
drug_file_9.to_csv(filepath, index=False, header=True)

# COPY CSV TO S3

import time

# The timestamp is for creating a fresh folder.  Disabled here so S3 will write over with newly engineered files.
timestamp = int(time.time())
eng_df_s3_uri = gcgs_sm_sess.upload_data(bucket=gcgs_bucket, key_prefix="eng_pde", path=filepath)
#eng_df_s3_uri = gcgs_sm_sess.upload_data(bucket=gcgs_bucket, key_prefix="eng_df-{}".format(timestamp), path=filepath)
print(eng_df_s3_uri)

# LIST DATA IN THE NEW S3 LOCATION

!aws s3 ls $eng_df_s3_uri

s3://my-508-projects/eng_pde/eng_pde.csv
2022-03-31 00:03:26   51595373 eng_pde.csv


## Load the Three Beneficiary Files from S3 for Transformation

In [114]:
!aws s3 cp 's3://my-508-projects/bene/176589_DE1_0_2008_Beneficiary_Summary_File_Sample_2.zip' ./data_transform/
!aws s3 cp 's3://my-508-projects/bene/176629_DE1_0_2009_Beneficiary_Summary_File_Sample_2.zip' ./data_transform/
!aws s3 cp 's3://my-508-projects/bene/176581_DE1_0_2010_Beneficiary_Summary_File_Sample_2.zip' ./data_transform/

download: s3://my-508-projects/bene/176589_DE1_0_2008_Beneficiary_Summary_File_Sample_2.zip to data_transform/176589_DE1_0_2008_Beneficiary_Summary_File_Sample_2.zip
download: s3://my-508-projects/bene/176629_DE1_0_2009_Beneficiary_Summary_File_Sample_2.zip to data_transform/176629_DE1_0_2009_Beneficiary_Summary_File_Sample_2.zip
download: s3://my-508-projects/bene/176581_DE1_0_2010_Beneficiary_Summary_File_Sample_2.zip to data_transform/176581_DE1_0_2010_Beneficiary_Summary_File_Sample_2.zip


In [115]:
ben_2008 = pd.read_csv(
    r"./data_transform/176589_DE1_0_2008_Beneficiary_Summary_File_Sample_2.zip",
    dtype={'DESYNPUF_ID':str,
        'BENE_BIRTH_DT':str,
        'BENE_DEATH_DT':str,
        'BENE_SEX_IDENT_CD':int,
        'BENE_RACE_CD':int,
        'BENE_ESRD_IND':str,
        'SP_STATE_CODE':int,
        'BENE_COUNTY_CD':int,
        'BENE_HI_CVRAGE_TOT_MONS':int,
        'BENE_SMI_CVRAGE_TOT_MONS':int,
        'BENE_HMO_CVRAGE_TOT_MONS':int,
        'PLAN_CVRG_MOS_NUM':int,
        'SP_ALZHDMTA':int,
        'SP_CHF':int,
        'SP_CHRNKIDN':int,
        'SP_CNCR':int,
        'SP_COPD':int,
        'SP_DEPRESSN':int,
        'SP_DIABETES':int,
        'SP_ISCHMCHT':int,
        'SP_OSTEOPRS':int,
        'SP_RA_OA':int,
        'SP_STRKETIA':int,
        'MEDREIMB_IP':float,
        'BENRES_IP':float,
        'PPPYMT_IP':float,
        'MEDREIMB_OP':float,
        'BENRES_OP':float,
        'PPPYMT_OP':float,
        'MEDREIMB_CAR':float,
        'BENRES_CAR':float,
        'PPPYMT_CAR':float},
    compression="zip"
)
ben_2008.shape

(116395, 32)

In [116]:
ben_2008.head(5)

Unnamed: 0,DESYNPUF_ID,BENE_BIRTH_DT,BENE_DEATH_DT,BENE_SEX_IDENT_CD,BENE_RACE_CD,BENE_ESRD_IND,SP_STATE_CODE,BENE_COUNTY_CD,BENE_HI_CVRAGE_TOT_MONS,BENE_SMI_CVRAGE_TOT_MONS,...,SP_STRKETIA,MEDREIMB_IP,BENRES_IP,PPPYMT_IP,MEDREIMB_OP,BENRES_OP,PPPYMT_OP,MEDREIMB_CAR,BENRES_CAR,PPPYMT_CAR
0,00000B48BCF4AD29,19230901,,2,5,0,10,260,12,12,...,1,81000.0,3072.0,0.0,1520.0,80.0,0.0,6260.0,1520.0,0.0
1,0000525AB30E4DEF,19201001,,2,1,0,31,300,12,12,...,1,13260.0,2048.0,0.0,1760.0,670.0,0.0,3830.0,1010.0,50.0
2,00009C897C3D8372,19320101,,1,1,Y,7,70,12,12,...,2,37500.0,4096.0,0.0,100.0,160.0,0.0,1540.0,280.0,60.0
3,0001168CE43BE51B,19340901,,2,1,0,6,200,12,12,...,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0002E494BC87CE10,19140701,,1,2,0,5,200,2,2,...,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [117]:
ben_2008.insert(0,'year', 2008)
ben_2008

Unnamed: 0,year,DESYNPUF_ID,BENE_BIRTH_DT,BENE_DEATH_DT,BENE_SEX_IDENT_CD,BENE_RACE_CD,BENE_ESRD_IND,SP_STATE_CODE,BENE_COUNTY_CD,BENE_HI_CVRAGE_TOT_MONS,...,SP_STRKETIA,MEDREIMB_IP,BENRES_IP,PPPYMT_IP,MEDREIMB_OP,BENRES_OP,PPPYMT_OP,MEDREIMB_CAR,BENRES_CAR,PPPYMT_CAR
0,2008,00000B48BCF4AD29,19230901,,2,5,0,10,260,12,...,1,81000.0,3072.0,0.0,1520.0,80.0,0.0,6260.0,1520.0,0.0
1,2008,0000525AB30E4DEF,19201001,,2,1,0,31,300,12,...,1,13260.0,2048.0,0.0,1760.0,670.0,0.0,3830.0,1010.0,50.0
2,2008,00009C897C3D8372,19320101,,1,1,Y,7,70,12,...,2,37500.0,4096.0,0.0,100.0,160.0,0.0,1540.0,280.0,60.0
3,2008,0001168CE43BE51B,19340901,,2,1,0,6,200,12,...,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2008,0002E494BC87CE10,19140701,,1,2,0,5,200,2,...,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116390,2008,FFFCF02B3CE4D724,19100901,,2,1,0,34,790,12,...,2,0.0,0.0,0.0,1240.0,160.0,0.0,350.0,120.0,0.0
116391,2008,FFFE94CBE61C0479,19360601,,1,1,Y,4,70,12,...,1,15050.0,3072.0,0.0,27580.0,6810.0,0.0,5770.0,1040.0,0.0
116392,2008,FFFEAD5472FC13A2,19301201,,2,1,0,41,50,12,...,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
116393,2008,FFFF0FD89207928D,19250901,,2,1,0,39,290,12,...,2,0.0,0.0,0.0,1500.0,20.0,0.0,2530.0,770.0,0.0


In [118]:
ben_2009 = pd.read_csv(
    r"./data_transform/176629_DE1_0_2009_Beneficiary_Summary_File_Sample_2.zip",
    dtype={'DESYNPUF_ID':str,
        'BENE_BIRTH_DT':str,
        'BENE_DEATH_DT':str,
        'BENE_SEX_IDENT_CD':int,
        'BENE_RACE_CD':int,
        'BENE_ESRD_IND':str,
        'SP_STATE_CODE':int,
        'BENE_COUNTY_CD':int,
        'BENE_HI_CVRAGE_TOT_MONS':int,
        'BENE_SMI_CVRAGE_TOT_MONS':int,
        'BENE_HMO_CVRAGE_TOT_MONS':int,
        'PLAN_CVRG_MOS_NUM':int,
        'SP_ALZHDMTA':int,
        'SP_CHF':int,
        'SP_CHRNKIDN':int,
        'SP_CNCR':int,
        'SP_COPD':int,
        'SP_DEPRESSN':int,
        'SP_DIABETES':int,
        'SP_ISCHMCHT':int,
        'SP_OSTEOPRS':int,
        'SP_RA_OA':int,
        'SP_STRKETIA':int,
        'MEDREIMB_IP':float,
        'BENRES_IP':float,
        'PPPYMT_IP':float,
        'MEDREIMB_OP':float,
        'BENRES_OP':float,
        'PPPYMT_OP':float,
        'MEDREIMB_CAR':float,
        'BENRES_CAR':float,
        'PPPYMT_CAR':float},
    compression="zip"
)
ben_2009.shape

(114618, 32)

In [119]:
ben_2009.head(5)

Unnamed: 0,DESYNPUF_ID,BENE_BIRTH_DT,BENE_DEATH_DT,BENE_SEX_IDENT_CD,BENE_RACE_CD,BENE_ESRD_IND,SP_STATE_CODE,BENE_COUNTY_CD,BENE_HI_CVRAGE_TOT_MONS,BENE_SMI_CVRAGE_TOT_MONS,...,SP_STRKETIA,MEDREIMB_IP,BENRES_IP,PPPYMT_IP,MEDREIMB_OP,BENRES_OP,PPPYMT_OP,MEDREIMB_CAR,BENRES_CAR,PPPYMT_CAR
0,00000B48BCF4AD29,19230901,,2,5,0,10,260,12,12,...,2,0.0,0.0,0.0,580.0,400.0,0.0,5720.0,1530.0,520.0
1,0000525AB30E4DEF,19201001,,2,1,0,31,300,12,12,...,1,0.0,0.0,0.0,3380.0,1370.0,0.0,7970.0,2010.0,0.0
2,00009C897C3D8372,19320101,,1,1,0,7,70,12,12,...,2,0.0,0.0,0.0,2250.0,230.0,0.0,900.0,210.0,0.0
3,0001168CE43BE51B,19340901,,2,1,0,6,200,12,12,...,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0002E494BC87CE10,19140701,,1,2,0,5,200,12,12,...,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [120]:
ben_2009.insert(0,'year', 2009)
ben_2009

Unnamed: 0,year,DESYNPUF_ID,BENE_BIRTH_DT,BENE_DEATH_DT,BENE_SEX_IDENT_CD,BENE_RACE_CD,BENE_ESRD_IND,SP_STATE_CODE,BENE_COUNTY_CD,BENE_HI_CVRAGE_TOT_MONS,...,SP_STRKETIA,MEDREIMB_IP,BENRES_IP,PPPYMT_IP,MEDREIMB_OP,BENRES_OP,PPPYMT_OP,MEDREIMB_CAR,BENRES_CAR,PPPYMT_CAR
0,2009,00000B48BCF4AD29,19230901,,2,5,0,10,260,12,...,2,0.0,0.0,0.0,580.0,400.0,0.0,5720.0,1530.0,520.0
1,2009,0000525AB30E4DEF,19201001,,2,1,0,31,300,12,...,1,0.0,0.0,0.0,3380.0,1370.0,0.0,7970.0,2010.0,0.0
2,2009,00009C897C3D8372,19320101,,1,1,0,7,70,12,...,2,0.0,0.0,0.0,2250.0,230.0,0.0,900.0,210.0,0.0
3,2009,0001168CE43BE51B,19340901,,2,1,0,6,200,12,...,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2009,0002E494BC87CE10,19140701,,1,2,0,5,200,12,...,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114613,2009,FFFCF02B3CE4D724,19100901,,2,1,0,34,790,12,...,2,13630.0,2136.0,0.0,2460.0,960.0,0.0,1050.0,350.0,0.0
114614,2009,FFFE94CBE61C0479,19360601,,1,1,Y,4,70,12,...,1,0.0,0.0,0.0,12460.0,4400.0,0.0,7670.0,1870.0,0.0
114615,2009,FFFEAD5472FC13A2,19301201,,2,1,0,41,50,0,...,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
114616,2009,FFFF0FD89207928D,19250901,,2,1,0,39,290,12,...,2,0.0,0.0,0.0,500.0,200.0,0.0,2220.0,680.0,0.0


In [121]:
ben_2010 = pd.read_csv(
    r"./data_transform/176581_DE1_0_2010_Beneficiary_Summary_File_Sample_2.zip",
    dtype={'DESYNPUF_ID':str,
        'BENE_BIRTH_DT':str,
        'BENE_DEATH_DT':str,
        'BENE_SEX_IDENT_CD':int,
        'BENE_RACE_CD':int,
        'BENE_ESRD_IND':str,
        'SP_STATE_CODE':int,
        'BENE_COUNTY_CD':int,
        'BENE_HI_CVRAGE_TOT_MONS':int,
        'BENE_SMI_CVRAGE_TOT_MONS':int,
        'BENE_HMO_CVRAGE_TOT_MONS':int,
        'PLAN_CVRG_MOS_NUM':int,
        'SP_ALZHDMTA':int,
        'SP_CHF':int,
        'SP_CHRNKIDN':int,
        'SP_CNCR':int,
        'SP_COPD':int,
        'SP_DEPRESSN':int,
        'SP_DIABETES':int,
        'SP_ISCHMCHT':int,
        'SP_OSTEOPRS':int,
        'SP_RA_OA':int,
        'SP_STRKETIA':int,
        'MEDREIMB_IP':float,
        'BENRES_IP':float,
        'PPPYMT_IP':float,
        'MEDREIMB_OP':float,
        'BENRES_OP':float,
        'PPPYMT_OP':float,
        'MEDREIMB_CAR':float,
        'BENRES_CAR':float,
        'PPPYMT_CAR':float},
    compression="zip"
)
ben_2010.shape

(112845, 32)

In [122]:
ben_2010.head(5)

Unnamed: 0,DESYNPUF_ID,BENE_BIRTH_DT,BENE_DEATH_DT,BENE_SEX_IDENT_CD,BENE_RACE_CD,BENE_ESRD_IND,SP_STATE_CODE,BENE_COUNTY_CD,BENE_HI_CVRAGE_TOT_MONS,BENE_SMI_CVRAGE_TOT_MONS,...,SP_STRKETIA,MEDREIMB_IP,BENRES_IP,PPPYMT_IP,MEDREIMB_OP,BENRES_OP,PPPYMT_OP,MEDREIMB_CAR,BENRES_CAR,PPPYMT_CAR
0,00000B48BCF4AD29,19230901,,2,5,Y,10,260,12,12,...,2,0.0,0.0,0.0,600.0,30.0,0.0,3800.0,1460.0,0.0
1,0000525AB30E4DEF,19201001,,2,1,0,31,300,12,12,...,2,0.0,0.0,0.0,240.0,190.0,0.0,440.0,60.0,0.0
2,00009C897C3D8372,19320101,,1,1,0,7,70,12,12,...,2,0.0,0.0,0.0,210.0,160.0,0.0,580.0,250.0,0.0
3,0001168CE43BE51B,19340901,,2,1,0,6,200,12,0,...,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0002E494BC87CE10,19140701,,1,2,0,5,200,12,12,...,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [123]:
ben_2010.insert(0,'year', 2010)
ben_2010

Unnamed: 0,year,DESYNPUF_ID,BENE_BIRTH_DT,BENE_DEATH_DT,BENE_SEX_IDENT_CD,BENE_RACE_CD,BENE_ESRD_IND,SP_STATE_CODE,BENE_COUNTY_CD,BENE_HI_CVRAGE_TOT_MONS,...,SP_STRKETIA,MEDREIMB_IP,BENRES_IP,PPPYMT_IP,MEDREIMB_OP,BENRES_OP,PPPYMT_OP,MEDREIMB_CAR,BENRES_CAR,PPPYMT_CAR
0,2010,00000B48BCF4AD29,19230901,,2,5,Y,10,260,12,...,2,0.0,0.0,0.0,600.0,30.0,0.0,3800.0,1460.0,0.0
1,2010,0000525AB30E4DEF,19201001,,2,1,0,31,300,12,...,2,0.0,0.0,0.0,240.0,190.0,0.0,440.0,60.0,0.0
2,2010,00009C897C3D8372,19320101,,1,1,0,7,70,12,...,2,0.0,0.0,0.0,210.0,160.0,0.0,580.0,250.0,0.0
3,2010,0001168CE43BE51B,19340901,,2,1,0,6,200,12,...,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2010,0002E494BC87CE10,19140701,,1,2,0,5,200,12,...,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112840,2010,FFFCF02B3CE4D724,19100901,,2,1,0,34,790,12,...,2,0.0,0.0,0.0,0.0,0.0,0.0,350.0,130.0,0.0
112841,2010,FFFE94CBE61C0479,19360601,,1,1,Y,4,70,12,...,2,0.0,0.0,0.0,290.0,20.0,0.0,440.0,120.0,0.0
112842,2010,FFFEAD5472FC13A2,19301201,,2,1,0,41,50,0,...,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
112843,2010,FFFF0FD89207928D,19250901,,2,1,0,39,290,12,...,2,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0


In [124]:
#Concatenate the 3 benefit files
bene_transform2 = pd.concat([ben_2008, ben_2009, ben_2010], ignore_index=True, sort=False)
print(bene_transform2.shape)
bene_transform2.head(5)

(343858, 33)


Unnamed: 0,year,DESYNPUF_ID,BENE_BIRTH_DT,BENE_DEATH_DT,BENE_SEX_IDENT_CD,BENE_RACE_CD,BENE_ESRD_IND,SP_STATE_CODE,BENE_COUNTY_CD,BENE_HI_CVRAGE_TOT_MONS,...,SP_STRKETIA,MEDREIMB_IP,BENRES_IP,PPPYMT_IP,MEDREIMB_OP,BENRES_OP,PPPYMT_OP,MEDREIMB_CAR,BENRES_CAR,PPPYMT_CAR
0,2008,00000B48BCF4AD29,19230901,,2,5,0,10,260,12,...,1,81000.0,3072.0,0.0,1520.0,80.0,0.0,6260.0,1520.0,0.0
1,2008,0000525AB30E4DEF,19201001,,2,1,0,31,300,12,...,1,13260.0,2048.0,0.0,1760.0,670.0,0.0,3830.0,1010.0,50.0
2,2008,00009C897C3D8372,19320101,,1,1,Y,7,70,12,...,2,37500.0,4096.0,0.0,100.0,160.0,0.0,1540.0,280.0,60.0
3,2008,0001168CE43BE51B,19340901,,2,1,0,6,200,12,...,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2008,0002E494BC87CE10,19140701,,1,2,0,5,200,2,...,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [125]:
# PACKAGE DF AS CSV

filepath = "./data_transform/bene8_10.csv"
bene_transform2.to_csv(filepath, index=False, header=True)

# COPY CSVs TO S3

import time

# The timestamp is for creating a fresh folder.  Disabled here so S3 will write over with newly engineered files.
timestamp = int(time.time())
eng_df_s3_uri = gcgs_sm_sess.upload_data(bucket=gcgs_bucket, key_prefix="eng_bene8_10", path=filepath)
#engineered_df_s3_uri = gcgs_sm_sess.upload_data(bucket=gcgs_bucket, key_prefix="engineered_df-{}".format(timestamp), path=filepath)
print(eng_df_s3_uri)

# LIST DATA IN THE NEW S3 LOCATION

!aws s3 ls $eng_df_s3_uri

s3://my-508-projects/eng_bene8_10/bene8_10.csv
2022-03-31 00:03:36   41223956 bene8_10.csv


## Read the concatenated bene8_10 csv back into Notebook

In [126]:
!aws s3 cp 's3://my-508-projects/eng_bene8_10/bene8_10.csv' ./data_transform/

download: s3://my-508-projects/eng_bene8_10/bene8_10.csv to data_transform/bene8_10.csv


In [127]:
bene = pd.read_csv(
    r"./data_transform/bene8_10.csv",
    dtype={'year':int,
        'DESYNPUF_ID':str,
        'BENE_BIRTH_DT':str,
        'BENE_DEATH_DT':str,
        'BENE_SEX_IDENT_CD':int,
        'BENE_RACE_CD':int,
        'BENE_ESRD_IND':str,
        'SP_STATE_CODE':int,
        'BENE_COUNTY_CD':int,
        'BENE_HI_CVRAGE_TOT_MONS':int,
        'BENE_SMI_CVRAGE_TOT_MONS':int,
        'BENE_HMO_CVRAGE_TOT_MONS':int,
        'PLAN_CVRG_MOS_NUM':int,
        'SP_ALZHDMTA':int,
        'SP_CHF':int,
        'SP_CHRNKIDN':int,
        'SP_CNCR':int,
        'SP_COPD':int,
        'SP_DEPRESSN':int,
        'SP_DIABETES':int,
        'SP_ISCHMCHT':int,
        'SP_OSTEOPRS':int,
        'SP_RA_OA':int,
        'SP_STRKETIA':int,
        'MEDREIMB_IP':float,
        'BENRES_IP':float,
        'PPPYMT_IP':float,
        'MEDREIMB_OP':float,
        'BENRES_OP':float,
        'PPPYMT_OP':float,
        'MEDREIMB_CAR':float,
        'BENRES_CAR':float,
        'PPPYMT_CAR':float},
#    compression="zip"
)
print(bene.shape)
bene.head(5)

(343858, 33)


Unnamed: 0,year,DESYNPUF_ID,BENE_BIRTH_DT,BENE_DEATH_DT,BENE_SEX_IDENT_CD,BENE_RACE_CD,BENE_ESRD_IND,SP_STATE_CODE,BENE_COUNTY_CD,BENE_HI_CVRAGE_TOT_MONS,...,SP_STRKETIA,MEDREIMB_IP,BENRES_IP,PPPYMT_IP,MEDREIMB_OP,BENRES_OP,PPPYMT_OP,MEDREIMB_CAR,BENRES_CAR,PPPYMT_CAR
0,2008,00000B48BCF4AD29,19230901,,2,5,0,10,260,12,...,1,81000.0,3072.0,0.0,1520.0,80.0,0.0,6260.0,1520.0,0.0
1,2008,0000525AB30E4DEF,19201001,,2,1,0,31,300,12,...,1,13260.0,2048.0,0.0,1760.0,670.0,0.0,3830.0,1010.0,50.0
2,2008,00009C897C3D8372,19320101,,1,1,Y,7,70,12,...,2,37500.0,4096.0,0.0,100.0,160.0,0.0,1540.0,280.0,60.0
3,2008,0001168CE43BE51B,19340901,,2,1,0,6,200,12,...,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2008,0002E494BC87CE10,19140701,,1,2,0,5,200,2,...,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Transform the Beneficiary Dataframe in Python Before Uploading Back to S3 as CSV

In [128]:
!pip install --disable-pip-version-check -q datetime
import datetime as dt
today = dt.date.today()

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes


In [129]:
now = pd.to_datetime('now')
now

Timestamp('2022-03-31 00:03:40.179393')

In [130]:
#bene.insert(0,'today', 20220330)
#pd.to_datetime(bene['today'], format = '%Y%m%d')
#pd.to_datetime(bene['birth_date'], format = '%Y%m%d')
#
#def AGE(row):
#    return (row['today']-
#           row['birth_date'])
#df=bene.apply(lambda row: AGE(row), axis = 1)
#bene = bene.assign(age = df)

#now = pd.to_datetime('now')
#ages = (now - bene['birth_date']).astype('<m8[Y]')
#df=bene.apply(ages)
#bene = bene.assign(age = df)

In [131]:
#LIST OF bene COLUMNS AND TRANSFORMS

#        'DESYNPUF_ID':str,        Unique pt ID
#        'BENE_BIRTH_DT':str,      #Transform to datetime and ADD an age column and ane range column

bene['birth_date'] = pd.to_datetime(bene['BENE_BIRTH_DT'], format = '%Y%m%d')

df= (bene['year']-bene['BENE_BIRTH_DT'].str.slice(0,4).astype(int))
bene = bene.assign(age = df)

def AGECODE(row):
    if (row['age'] < 65):
        return 1
    if ((row['age'] >= 65)
       &(row['age'] <75)):
        return 2
    if ((row['age'] >= 75)
       &(row['age'] <85)):
        return 3
    if ((row['age'] >= 85)
       &(row['age'] <95)):
        return 4
    if (row['age'] >= 95):
        return 5
    return 0

df=bene.apply(lambda row: AGECODE(row), axis = 1)
print(df.value_counts())

bene = bene.assign(age_code = df)



#        'BENE_DEATH_DT':str,      #Transform to datetime
bene['death_date'] = pd.to_datetime(bene['BENE_DEATH_DT'], format = '%Y%m%d')

#        'BENE_SEX_IDENT_CD':int,  0 = Unknown, 1 = Male, 2 = Female
#        'BENE_RACE_CD':int,       0 Missing 1 White, 2 Black, 3 Other, 5 Hispanic

#        'BENE_ESRD_IND':str,      Y = the beneficiary has ESRD, 0 = the beneficiary does not have ESRD
def ESRD(row):
    if (row['BENE_ESRD_IND'] == 'Y'):
        return 1
    return 0

df=bene.apply(lambda row: ESRD(row), axis = 1)
df.value_counts()

bene = bene.assign(esrd = df)
print(bene.shape)
bene.head(5)

2    136403
3    100603
1     54108
4     42598
5     10146
dtype: int64
(343858, 38)


Unnamed: 0,year,DESYNPUF_ID,BENE_BIRTH_DT,BENE_DEATH_DT,BENE_SEX_IDENT_CD,BENE_RACE_CD,BENE_ESRD_IND,SP_STATE_CODE,BENE_COUNTY_CD,BENE_HI_CVRAGE_TOT_MONS,...,BENRES_OP,PPPYMT_OP,MEDREIMB_CAR,BENRES_CAR,PPPYMT_CAR,birth_date,age,age_code,death_date,esrd
0,2008,00000B48BCF4AD29,19230901,,2,5,0,10,260,12,...,80.0,0.0,6260.0,1520.0,0.0,1923-09-01,85,4,NaT,0
1,2008,0000525AB30E4DEF,19201001,,2,1,0,31,300,12,...,670.0,0.0,3830.0,1010.0,50.0,1920-10-01,88,4,NaT,0
2,2008,00009C897C3D8372,19320101,,1,1,Y,7,70,12,...,160.0,0.0,1540.0,280.0,60.0,1932-01-01,76,3,NaT,1
3,2008,0001168CE43BE51B,19340901,,2,1,0,6,200,12,...,0.0,0.0,0.0,0.0,0.0,1934-09-01,74,2,NaT,0
4,2008,0002E494BC87CE10,19140701,,1,2,0,5,200,2,...,0.0,0.0,0.0,0.0,0.0,1914-07-01,94,4,NaT,0


In [132]:
################################ STATE AGGREGATION #######################################

#        'SP_STATE_CODE':int,
def STATECLASS(row):
    if ((row['SP_STATE_CODE'] == 1) #southeast 01, 04, 10, 11, 18, 19, 25, 26, 34, 42, 44, 47
       |(row['SP_STATE_CODE'] == 4)
       |(row['SP_STATE_CODE'] == 10)
       |(row['SP_STATE_CODE'] == 11)
       |(row['SP_STATE_CODE'] == 18)
       |(row['SP_STATE_CODE'] == 19)
       |(row['SP_STATE_CODE'] == 25)
       |(row['SP_STATE_CODE'] == 26)
       |(row['SP_STATE_CODE'] == 34)
       |(row['SP_STATE_CODE'] == 42)
       |(row['SP_STATE_CODE'] == 44)
       |(row['SP_STATE_CODE'] == 47)):
        return 1
    if ((row['SP_STATE_CODE'] == 8) #mid_atlantic 08, 09, 21, 49, 51,
       |(row['SP_STATE_CODE'] == 9)
       |(row['SP_STATE_CODE'] == 21)
       |(row['SP_STATE_CODE'] == 49)
       |(row['SP_STATE_CODE'] == 51)):
        return 2
    if ((row['SP_STATE_CODE'] == 7) #northeast 07, 20, 22, 30, 31, 33, 39, 41
       |(row['SP_STATE_CODE'] == 20)
       |(row['SP_STATE_CODE'] == 22)
       |(row['SP_STATE_CODE'] == 30)
       |(row['SP_STATE_CODE'] == 31)
       |(row['SP_STATE_CODE'] == 33)
       |(row['SP_STATE_CODE'] == 39)
       |(row['SP_STATE_CODE'] == 41)):
        return 3
    if ((row['SP_STATE_CODE'] == 14) #great_lakes 14, 15, 23, 36, 52
       |(row['SP_STATE_CODE'] == 15)
       |(row['SP_STATE_CODE'] == 23)
       |(row['SP_STATE_CODE'] == 36)
       |(row['SP_STATE_CODE'] == 52)):
        return 4
    if ((row['SP_STATE_CODE'] == 16) #plains 16, 17, 24, 28, 35, 37, 43, 45
       |(row['SP_STATE_CODE'] == 17)
       |(row['SP_STATE_CODE'] == 24)
       |(row['SP_STATE_CODE'] == 28)
       |(row['SP_STATE_CODE'] == 35)
       |(row['SP_STATE_CODE'] == 37)
       |(row['SP_STATE_CODE'] == 43)
       |(row['SP_STATE_CODE'] == 45)):
        return 5
    if ((row['SP_STATE_CODE'] == 3) #southwest 03, 32
       |(row['SP_STATE_CODE'] == 32)):
        return 6
    if ((row['SP_STATE_CODE'] == 6) #intermountain_west 06, 13, 27, 29, 46, 53
       |(row['SP_STATE_CODE'] == 13)
       |(row['SP_STATE_CODE'] == 27)
       |(row['SP_STATE_CODE'] == 29)
       |(row['SP_STATE_CODE'] == 46)
       |(row['SP_STATE_CODE'] == 53)):
        return 7
    if ((row['SP_STATE_CODE'] == 2) #west_coast_alk_hi 02, 05, 12, 38, 50
       |(row['SP_STATE_CODE'] == 5)
       |(row['SP_STATE_CODE'] == 12)
       |(row['SP_STATE_CODE'] == 38)
       |(row['SP_STATE_CODE'] == 50)):
        return 8
    if ((row['SP_STATE_CODE'] == 1) #other 40, 48, 54-63, 97
       |(row['SP_STATE_CODE'] == 40)
       |(row['SP_STATE_CODE'] == 48)
       |(row['SP_STATE_CODE'] == 54)
       |(row['SP_STATE_CODE'] == 55)
       |(row['SP_STATE_CODE'] == 56)
       |(row['SP_STATE_CODE'] == 57)
       |(row['SP_STATE_CODE'] == 58)
       |(row['SP_STATE_CODE'] == 59)
       |(row['SP_STATE_CODE'] == 60)
       |(row['SP_STATE_CODE'] == 61)
       |(row['SP_STATE_CODE'] == 62)
       |(row['SP_STATE_CODE'] == 63)    
       |(row['SP_STATE_CODE'] == 97)):
        return 9
    return 0

df=bene.apply(lambda row: STATECLASS(row), axis = 1)
print(df.value_counts())

bene = bene.assign(region_code = df)
bene.shape

#01 = Alabama
#02 = Alaska
#03 = Arizona
#04 = Arkansas
#05 = California
#06 = Colorado
#07 = Connecticut
#08 = Delaware
#09 = District of Columbia
#10 = Florida
#11 = Georgia
#12 = Hawaii
#13 = Idaho
#14 = Illinois
#15 = Indiana
#16 = Iowa
#17 = Kansas
#18 = Kentucky
#19 = Louisiana
#20 = Maine
#21 = Maryland
#22 = Massachusetts
#23 = Michigan
#24 = Minnesota
#25 = Mississippi
#26 = Missouri
#27 = Montana
#28 = Nebraska
#29 = Nevada
#30 = New Hampshire
#31 = New Jersey
#32 = New Mexico
#33 = New York
#34 = North Carolina
#35 = North Dakota
#36 = Ohio
#37 = Oklahoma
#38 = Oregon
#39 = Pennsylvania
#40 = Puerto Rico
#41 = Rhode Island
#42 = South Carolina
#43 = South Dakota
#44 = Tennessee
#45 = Texas
#46 = Utah
#47 = Vermont
#48 = Virgin Islands
#49 = Virginia
#50 = Washington
#51 = West Virginia
#52 = Wisconsin
#53 = Wyoming
#54 = Africa
#55 = Asia
#56 = Canada
#57 = Central America and West Indies
#58 = Europe
#59 = Mexico
#60 = Oceania
#61 = Philippines
#62 = South America
#63 = US Possessions
#97 = Saipan

bene.head(5)

#SEE NEXT CELL FOR REST OF RECODING

1    93230
3    62373
4    51949
8    43606
5    42276
2    19798
7    15743
6    10098
9     4785
dtype: int64


Unnamed: 0,year,DESYNPUF_ID,BENE_BIRTH_DT,BENE_DEATH_DT,BENE_SEX_IDENT_CD,BENE_RACE_CD,BENE_ESRD_IND,SP_STATE_CODE,BENE_COUNTY_CD,BENE_HI_CVRAGE_TOT_MONS,...,PPPYMT_OP,MEDREIMB_CAR,BENRES_CAR,PPPYMT_CAR,birth_date,age,age_code,death_date,esrd,region_code
0,2008,00000B48BCF4AD29,19230901,,2,5,0,10,260,12,...,0.0,6260.0,1520.0,0.0,1923-09-01,85,4,NaT,0,1
1,2008,0000525AB30E4DEF,19201001,,2,1,0,31,300,12,...,0.0,3830.0,1010.0,50.0,1920-10-01,88,4,NaT,0,3
2,2008,00009C897C3D8372,19320101,,1,1,Y,7,70,12,...,0.0,1540.0,280.0,60.0,1932-01-01,76,3,NaT,1,3
3,2008,0001168CE43BE51B,19340901,,2,1,0,6,200,12,...,0.0,0.0,0.0,0.0,1934-09-01,74,2,NaT,0,7
4,2008,0002E494BC87CE10,19140701,,1,2,0,5,200,2,...,0.0,0.0,0.0,0.0,1914-07-01,94,4,NaT,0,8


In [133]:
################################ CONTINUOUS ENROLLMENT #######################################

#        'BENE_COUNTY_CD':int,                DELETE
#        'BENE_HI_CVRAGE_TOT_MONS':int,       Months of Part A coverage
print(bene['BENE_HI_CVRAGE_TOT_MONS'].value_counts())
#        'BENE_SMI_CVRAGE_TOT_MONS':int,      Months of Part B coverage
print(bene['BENE_SMI_CVRAGE_TOT_MONS'].value_counts())
#        'BENE_HMO_CVRAGE_TOT_MONS':int,      Total number of months of HMO coverage for the beneficiary
print(bene['BENE_HMO_CVRAGE_TOT_MONS'].value_counts())
#        'PLAN_CVRG_MOS_NUM':int,             Total number of months of part D plan coverage for the beneficiary
print(bene['PLAN_CVRG_MOS_NUM'].value_counts())

#Define people who were continuously enrolled for at least 1 full calendar year and had Part D coverage
def CONTENROLL(row):
    if ((row['BENE_HI_CVRAGE_TOT_MONS'] == 12) #mid_atlantic 08, 09, 21, 49, 51,
       &(row['BENE_SMI_CVRAGE_TOT_MONS'] == 12)
       &(row['BENE_HMO_CVRAGE_TOT_MONS'] == 0)
       &(row['PLAN_CVRG_MOS_NUM'] == 12)):
        return 1
    return 0

df=bene.apply(lambda row: CONTENROLL(row), axis = 1)
print(df.value_counts())
bene = bene.assign(cont_enroll = df)
bene.shape

################################ CHRONIC CONDITIONS #######################################

#        'SP_ALZHDMTA':int,                   0 Missing, 1 Yes, 2 No
print(bene['SP_ALZHDMTA'].value_counts())
def CC(row):
    if (row['SP_ALZHDMTA']==1):
        return 1
    return 0
df=bene.apply(lambda row: CC(row), axis = 1)
bene = bene.assign(has_alz = df)
#        'SP_CHF':int,
print(bene['SP_CHF'].value_counts())
def CC(row):
    if (row['SP_CHF']==1):
        return 1
    return 0
df=bene.apply(lambda row: CC(row), axis = 1)
bene = bene.assign(has_chf = df)
#        'SP_CHRNKIDN':int,
print(bene['SP_CHRNKIDN'].value_counts())
def CC(row):
    if (row['SP_CHRNKIDN']==1):
        return 1
    return 0
df=bene.apply(lambda row: CC(row), axis = 1)
bene = bene.assign(has_ckd = df)
#        'SP_CNCR':int,
print(bene['SP_CNCR'].value_counts())
def CC(row):
    if (row['SP_CNCR']==1):
        return 1
    return 0
df=bene.apply(lambda row: CC(row), axis = 1)
bene = bene.assign(has_can = df)
#        'SP_COPD':int,
print(bene['SP_COPD'].value_counts())
def CC(row):
    if (row['SP_COPD']==1):
        return 1
    return 0
df=bene.apply(lambda row: CC(row), axis = 1)
bene = bene.assign(has_copd = df)
#        'SP_DEPRESSN':int,
print(bene['SP_DEPRESSN'].value_counts())
def CC(row):
    if (row['SP_DEPRESSN']==1):
        return 1
    return 0
df=bene.apply(lambda row: CC(row), axis = 1)
bene = bene.assign(has_dep = df)
#        'SP_DIABETES':int,
print(bene['SP_DIABETES'].value_counts())
def CC(row):
    if (row['SP_DIABETES']==1):
        return 1
    return 0
df=bene.apply(lambda row: CC(row), axis = 1)
bene = bene.assign(has_dib = df)
#        'SP_ISCHMCHT':int,
print(bene['SP_ISCHMCHT'].value_counts())
def CC(row):
    if (row['SP_ISCHMCHT']==1):
        return 1
    return 0
df=bene.apply(lambda row: CC(row), axis = 1)
bene = bene.assign(has_ihd = df)
#        'SP_OSTEOPRS':int,
print(bene['SP_OSTEOPRS'].value_counts())
def CC(row):
    if (row['SP_OSTEOPRS']==1):
        return 1
    return 0
df=bene.apply(lambda row: CC(row), axis = 1)
bene = bene.assign(has_op = df)
#        'SP_RA_OA':int,
print(bene['SP_RA_OA'].value_counts())
def CC(row):
    if (row['SP_RA_OA']==1):
        return 1
    return 0
df=bene.apply(lambda row: CC(row), axis = 1)
bene = bene.assign(has_raoa = df)
#        'SP_STRKETIA':int,
print(bene['SP_STRKETIA'].value_counts())
def CC(row):
    if (row['SP_STRKETIA']==1):
        return 1
    return 0
df=bene.apply(lambda row: CC(row), axis = 1)
bene = bene.assign(has_stia = df)

def NUMCC(row):
    return (row['has_alz']+
           row['has_chf']+
           row['has_ckd']+
           row['has_can']+
           row['has_copd']+
           row['has_dep']+
           row['has_dia']+
           row['has_ihd']+
           row['has_op']+
           row['has_raoa']+
           row['has_stia'])
df=bene.apply(lambda row: CC(row), axis = 1)
bene = bene.assign(ccnum = df)


#        'MEDREIMB_IP':float, mean 2000, sd 7000. The sum of all Medicare fee–for–service reimbursements made during the calendar year for services covered by inpatient claims.
#        'BENRES_IP':float, mean 250, sd 700. The sum of all beneficiary fee–for–service payment obligations accrued during the calendar year for services covered by inpatient claims
#        'PPPYMT_IP':float, mean 100, sd 1800. The sum of all primary payer fee–for–service reimbursements made during the calendar year for services covered by inpatient claims

#        'MEDREIMB_OP':float,mean 700, sd 1800. The sum of all Medicare fee–for–service reimbursements made during the calendar year for services covered by outpatient claims
#        'BENRES_OP':float, mean 200, sd 500. The sum of all beneficiary fee–for–service payment obligations accrued during the calendar year for services covered by outpatient claims.
#        'PPPYMT_OP':float,mean 28, sd 300.The sum of all primary payer fee–for–service reimbursements made during the calendar year for services covered by outpatient claims.

#        'MEDREIMB_CAR':float, mean 1200, sd 1500. The sum of all Medicare fee–for–service reimbursements made during the calendar year for services covered by carrier claims.
#        'BENRES_CAR':float,mean 350, sd 400. The sum of all Medicare fee–for–service reimbursements made during the calendar year for ervices covered by carrier claims.
#        'PPPYMT_CAR':float},mean 20, sd 95. The sum of all primary payer fee–for–service reimbursements made during the calendar for services covered by carrier claims.

#Define people with very high yearly IP claims
def HIGHIP(row):
    if (row['MEDREIMB_IP'] + row['PPPYMT_IP'] > 5000):
        return 1
    return 0

df=bene.apply(lambda row: HIGHIP(row), axis = 1)
print(df.value_counts())
bene = bene.assign(high_IP = df)
bene.shape

#Define people with very high yearly OP claims
def HIGHOP(row):
    if (row['MEDREIMB_OP'] + row['PPPYMT_OP'] > 1000):
        return 1
    return 0

df=bene.apply(lambda row: HIGHOP(row), axis = 1)
print(df.value_counts())
bene = bene.assign(high_OP = df)
bene.shape
         
#Define people with very high yearly residual expenses
def HIGHBEN(row):
    if (row['BENRES_IP'] + row['BENRES_OP'] + row['BENRES_CAR'] > 500):
        return 1
    return 0

df=bene.apply(lambda row: HIGHBEN(row), axis = 1)
print(df.value_counts())
bene = bene.assign(high_ben = df)
bene.shape


#NEW COLUMNS
#birth_date
#death_date
#region (see above)
#bin_chron_cond 0, 1
#num_chron_cond 0, 1, 2, 3, >3
#beneficiary_obligations H, M, L
#total_nondrug_covered H, M, L
#drug_file_9['total_cost'] = drug_file_9['PTNT_PAY_AMT'] + drug_file_9['TOT_RX_CST_AMT']
#drug_file_9['cost_per_day'] = drug_file_9['total_cost']/drug_file_9['DAYS_SUPLY_NUM']
#total_hospital_covered H, M, L
bene.head(5)

12    314723
0      18937
11      1583
10      1303
9       1115
8        980
7        897
6        871
1        759
5        743
4        688
3        671
2        588
Name: BENE_HI_CVRAGE_TOT_MONS, dtype: int64
12    303675
0      27990
11      1781
6       1533
10      1505
9       1289
8       1191
7       1027
5        857
4        812
1        795
3        754
2        649
Name: BENE_SMI_CVRAGE_TOT_MONS, dtype: int64
0     246961
12     83335
9       1805
11      1558
10      1472
6       1303
1       1144
3       1108
8       1102
2       1081
7       1035
5        984
4        970
Name: BENE_HMO_CVRAGE_TOT_MONS, dtype: int64
12    226897
0      82401
6       4271
10      3262
11      3255
9       3242
8       3201
3       3009
7       2980
4       2925
1       2844
2       2804
5       2767
Name: PLAN_CVRG_MOS_NUM, dtype: int64
0    206543
1    137315
dtype: int64
2    275972
1     67886
Name: SP_ALZHDMTA, dtype: int64
2    242533
1    101325
Name: SP_CHF, dtype: int64
2    285

Unnamed: 0,year,DESYNPUF_ID,BENE_BIRTH_DT,BENE_DEATH_DT,BENE_SEX_IDENT_CD,BENE_RACE_CD,BENE_ESRD_IND,SP_STATE_CODE,BENE_COUNTY_CD,BENE_HI_CVRAGE_TOT_MONS,...,has_dep,has_dib,has_ihd,has_op,has_raoa,has_stia,ccnum,high_IP,high_OP,high_ben
0,2008,00000B48BCF4AD29,19230901,,2,5,0,10,260,12,...,1,1,1,1,0,1,1,1,1,1
1,2008,0000525AB30E4DEF,19201001,,2,1,0,31,300,12,...,0,1,1,0,0,1,1,1,1,1
2,2008,00009C897C3D8372,19320101,,1,1,Y,7,70,12,...,1,1,1,0,1,0,0,1,0,1
3,2008,0001168CE43BE51B,19340901,,2,1,0,6,200,12,...,0,0,0,0,0,0,0,0,0,0
4,2008,0002E494BC87CE10,19140701,,1,2,0,5,200,2,...,0,0,0,0,0,0,0,0,0,0


In [134]:
#Delete unused columns
bene = bene.drop('BENE_BIRTH_DT', 1)
bene = bene.drop('BENE_DEATH_DT', 1)
bene = bene.drop('BENE_ESRD_IND', 1)
bene = bene.drop('BENE_COUNTY_CD', 1)
bene = bene.drop('SP_ALZHDMTA', 1)
bene = bene.drop('SP_CHF', 1)
bene = bene.drop('SP_CHRNKIDN', 1)
bene = bene.drop('SP_CNCR', 1)
bene = bene.drop('SP_COPD', 1)
bene = bene.drop('SP_DEPRESSN', 1)
bene = bene.drop('SP_DIABETES', 1)
bene = bene.drop('SP_ISCHMCHT', 1)
bene = bene.drop('SP_OSTEOPRS', 1)
bene = bene.drop('SP_RA_OA', 1)
bene = bene.drop('SP_STRKETIA', 1)

In [135]:
bene.head(5)

Unnamed: 0,year,DESYNPUF_ID,BENE_SEX_IDENT_CD,BENE_RACE_CD,SP_STATE_CODE,BENE_HI_CVRAGE_TOT_MONS,BENE_SMI_CVRAGE_TOT_MONS,BENE_HMO_CVRAGE_TOT_MONS,PLAN_CVRG_MOS_NUM,MEDREIMB_IP,...,has_dep,has_dib,has_ihd,has_op,has_raoa,has_stia,ccnum,high_IP,high_OP,high_ben
0,2008,00000B48BCF4AD29,2,5,10,12,12,0,0,81000.0,...,1,1,1,1,0,1,1,1,1,1
1,2008,0000525AB30E4DEF,2,1,31,12,12,0,12,13260.0,...,0,1,1,0,0,1,1,1,1,1
2,2008,00009C897C3D8372,1,1,7,12,12,12,12,37500.0,...,1,1,1,0,1,0,0,1,0,1
3,2008,0001168CE43BE51B,2,1,6,12,12,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,2008,0002E494BC87CE10,1,2,5,2,2,12,0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [136]:
bene.dtypes

year                                 int64
DESYNPUF_ID                         object
BENE_SEX_IDENT_CD                    int64
BENE_RACE_CD                         int64
SP_STATE_CODE                        int64
BENE_HI_CVRAGE_TOT_MONS              int64
BENE_SMI_CVRAGE_TOT_MONS             int64
BENE_HMO_CVRAGE_TOT_MONS             int64
PLAN_CVRG_MOS_NUM                    int64
MEDREIMB_IP                        float64
BENRES_IP                          float64
PPPYMT_IP                          float64
MEDREIMB_OP                        float64
BENRES_OP                          float64
PPPYMT_OP                          float64
MEDREIMB_CAR                       float64
BENRES_CAR                         float64
PPPYMT_CAR                         float64
birth_date                  datetime64[ns]
age                                  int64
age_code                             int64
death_date                  datetime64[ns]
esrd                                 int64
region_code

## Re-encode the Transformed Beneficiary df to csv and Send Back to S3

In [137]:
# PACKAGE DF AS CSV

filepath = "./data_transform/eng_bene.csv"
bene.to_csv(filepath, index=False, header=True)

# COPY CSVs TO S3

import time

# The timestamp is for creating a fresh folder.  Disabled here so S3 will write over with newly engineered files.
timestamp = int(time.time())
eng_df_s3_uri = gcgs_sm_sess.upload_data(bucket=gcgs_bucket, key_prefix="eng_bene", path=filepath)
#eng_df_s3_uri = gcgs_sm_sess.upload_data(bucket=gcgs_bucket, key_prefix="eng_df-{}".format(timestamp), path=filepath)
print(eng_df_s3_uri)

# LIST DATA IN THE NEW S3 LOCATION

!aws s3 ls $eng_df_s3_uri

s3://my-508-projects/eng_bene/eng_bene.csv
2022-03-31 00:07:38   46468386 eng_bene.csv


## What's in our S3 bucket?
#### Also, create a landing folder

In [138]:
my_path = "s3://{}/eng_ndc".format(gcgs_bucket)
s3_staging_dir = "s3://{}/athena/staging".format(gcgs_bucket)
my_landing_path = "s3://{}/landing".format(gcgs_bucket)
print(my_path)
print(s3_staging_dir)
print(my_landing_path)

s3://my-508-projects/eng_ndc
s3://my-508-projects/athena/staging
s3://my-508-projects/landing


In [139]:
!aws s3 ls $my_path

                           PRE eng_ndc/


In [140]:
try:
    my_landing_path
    print("OK")
except NameError:
    print("*****************************************************************************")
    print("[ERROR] PLEASE RE-RUN THE PREVIOUS COPY TSV TO S3 NOTEBOOK ******************")
    print("[ERROR] THIS NOTEBOOK WILL NOT RUN PROPERLY. ********************************")
    print("*****************************************************************************")

OK


## Import PyAthena to Create Athena DB using SQL

In [141]:
!pip install --disable-pip-version-check -q PyAthena==2.1.0  
# Had to reinstall after "not found" errors.  Rerunning 01_setup_dependencies did not fix the issue.

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes


In [142]:
from pyathena import connect

#Athena parameters
database_name = "gen_db"
table_name = "ndc_table"
my_path = "s3://{}/eng_ndc".format(gcgs_bucket)

#Connect, create and run a statement
conn = connect(region_name=gcgs_region, s3_staging_dir=s3_staging_dir)
statement = "CREATE DATABASE IF NOT EXISTS {}".format(database_name)
print(statement)
pd.read_sql(statement, conn)

CREATE DATABASE IF NOT EXISTS gen_db


## Drop Old Athena Tables

In [143]:
conn = connect(region_name=gcgs_region, s3_staging_dir=s3_staging_dir)

pd.read_sql("DROP TABLE IF EXISTS gen_db.ndc_table", conn)
pd.read_sql("DROP TABLE IF EXISTS gen_db.pde_table", conn)
pd.read_sql("DROP TABLE IF EXISTS gen_db.bene_table", conn)

statement = "SHOW TABLES in {}".format(database_name)

df_table = pd.read_sql(statement, conn)
df_table.head(5)

Unnamed: 0,tab_name


## Verify Athena DB Creation

In [144]:
#Verify DB creation

statement = "SHOW DATABASES"

df_show = pd.read_sql(statement, conn)
df_show.head(5)

Unnamed: 0,database_name
0,default
1,gen_db


## Create SQL Statement to Create NDC Athena/Glue Table

In [145]:
#Athena parameters
database_name = "gen_db"
table_name = "ndc_table"
my_path = "s3://{}/eng_ndc".format(gcgs_bucket)

# SQL statement to execute
statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{} (
NDC5_4 string,
PRODUCTTYPENAME string, 
PROPRIETARYNAME string,
NONPROPRIETARYNAME string,
DOSAGEFORMNAME string,
ROUTENAME string,
isgeneric int,
isoral int
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\n' LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(database_name, table_name, my_path)

print(statement)

#'compressionType'='zip', 

CREATE EXTERNAL TABLE IF NOT EXISTS gen_db.ndc_table (
NDC5_4 string,
PRODUCTTYPENAME string, 
PROPRIETARYNAME string,
NONPROPRIETARYNAME string,
DOSAGEFORMNAME string,
ROUTENAME string,
isgeneric int,
isoral int
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' LOCATION 's3://my-508-projects/eng_ndc'
TBLPROPERTIES ('skip.header.line.count'='1')


## Run SQL Statement to Create Athena Table (IMPORTANT: Does NOT over write old attempts)

In [146]:
run = pd.read_sql(statement, conn)
print(run)

Empty DataFrame
Columns: []
Index: []


In [147]:
statement = "SHOW TABLES in {}".format(database_name)

df_show = pd.read_sql(statement, conn)
df_show.head(5)

Unnamed: 0,tab_name
0,ndc_table


## Run SELECT * for ndc_table to Verify Table

In [148]:
statement = """SELECT * FROM gen_db.ndc_table"""
print(statement)
df_show = pd.read_sql(statement, conn)
df_show.head(5)

SELECT * FROM gen_db.ndc_table


Unnamed: 0,ndc5_4,producttypename,proprietaryname,nonproprietaryname,dosageformname,routename,isgeneric,isoral
0,21433,HUMAN PRESCRIPTION DRUG,trulicity,dulaglutide,INJECTION,SUBCUTANEOUS,0.0,0.0
1,21434,HUMAN PRESCRIPTION DRUG,trulicity,dulaglutide,INJECTION,SUBCUTANEOUS,0.0,0.0
2,21436,HUMAN PRESCRIPTION DRUG,emgality,galcanezumab-gnlm,INJECTION,SUBCUTANEOUS,0.0,0.0
3,21445,HUMAN PRESCRIPTION DRUG,taltz,ixekizumab,INJECTION,SUBCUTANEOUS,0.0,0.0
4,22236,HUMAN PRESCRIPTION DRUG,trulicity,dulaglutide,INJECTION,SUBCUTANEOUS,0.0,0.0


## Run Test Query to Pull Up One Drug NDC

In [149]:
productndc = '581180177'

statement = """SELECT proprietaryname FROM {}.{}
    WHERE ndc5_4 = '{}' LIMIT 100""".format(
    database_name, table_name, productndc
)

print(statement)

SELECT proprietaryname FROM gen_db.ndc_table
    WHERE ndc5_4 = '581180177' LIMIT 100


In [150]:
df = pd.read_sql(statement, conn)
df.head(5)

Unnamed: 0,proprietaryname
0,buprenorphine hcl


## Run Test Query to Compare Nonproprietary and Proprietary Columns

In [151]:
statement = """SELECT COUNT(*) FROM {}.{}
    WHERE proprietaryname = nonproprietaryname LIMIT 100""".format(
    database_name, table_name
)

print(statement)

SELECT COUNT(*) FROM gen_db.ndc_table
    WHERE proprietaryname = nonproprietaryname LIMIT 100


In [152]:
df = pd.read_sql(statement, conn)
df.head(5)

Unnamed: 0,_col0
0,31204


## Select the Forms and Routes

In [153]:
#dosageformname = ('AEROSOL', 'CAPSULE', 'GRANULE', 'INJECTION', 'LIQUID', 'PATCH', 'PILL', 'POWDER', 'SALVE', 'SUPPOSITORY', 'SUSPENSION', 'SYRUP', 'TABLET')
#routename = ('OTIC', 'CONJUNCTIVAL', 'CUTANEOUS', 'NASAL', 'OPHTHALMIC', 'ORAL', 'OROPHARYNGEAL', 'RECTAL', 'SUBCUTANEOUS', 'SUBLINGUAL', 'SUBMUCOSAL', 'TOPICAL', 'TRANSDERMAL', 'VAGNAL')

#statement = """SELECT * FROM {}.{}
#    WHERE dosageformname IN {}
#    AND routename IN {}
#    LIMIT 100""".format(
#    database_name, table_name, dosageformname, routename
#)

#print(statement)
#df = pd.read_sql(statement, conn)
#df.head(5)

## Select by Form/Route and for Generics

In [154]:
#dosageformname = ('AEROSOL', 'CAPSULE', 'GRANULE', 'INJECTION', 'LIQUID', 'PATCH', 'PILL', 'POWDER', 'SALVE', 'SUPPOSITORY', 'SUSPENSION', 'SYRUP', 'TABLET')
#routename = ('OTIC', 'CONJUNCTIVAL', 'CUTANEOUS', 'NASAL', 'OPHTHALMIC', 'ORAL', 'OROPHARYNGEAL', 'RECTAL', 'SUBCUTANEOUS', 'SUBLINGUAL', 'SUBMUCOSAL', 'TOPICAL', 'TRANSDERMAL', 'VAGNAL')

#statement = """SELECT COUNT(*) FROM {}.{}
#    WHERE dosageformname IN {}
#    AND routename IN {}
#    AND proprietaryname = nonproprietaryname
#    LIMIT 100""".format(
#    database_name, table_name, dosageformname, routename
#)

#print(statement)

#df = pd.read_sql(statement, conn)
#df.head(5)

## Select by Form/Route and for Non-Generics

In [155]:
#dosageformname = ('AEROSOL', 'CAPSULE', 'GRANULE', 'INJECTION', 'LIQUID', 'PATCH', 'PILL', 'POWDER', 'SALVE', 'SUPPOSITORY', 'SUSPENSION', 'SYRUP', 'TABLET')
#routename = ('OTIC', 'CONJUNCTIVAL', 'CUTANEOUS', 'NASAL', 'OPHTHALMIC', 'ORAL', 'OROPHARYNGEAL', 'RECTAL', 'SUBCUTANEOUS', 'SUBLINGUAL', 'SUBMUCOSAL', 'TOPICAL', 'TRANSDERMAL', 'VAGNAL')

#statement = """SELECT COUNT(*) FROM {}.{}
#    WHERE dosageformname IN {}
#    AND routename IN {}
#    AND proprietaryname <> nonproprietaryname
#    LIMIT 100""".format(
#    database_name, table_name, dosageformname, routename
#)

#print(statement)

#df = pd.read_sql(statement, conn)
#df.head(5)

In [156]:
#OLD CODE FOR SEPARATE DB NO LONGER NEEDED
#Connect, create and run a statement
#conn = connect(region_name=gcgs_region, s3_staging_dir=s3_staging_dir)
#database_name = "bene_db"
#statement = "CREATE DATABASE IF NOT EXISTS {}".format(database_name)
#print(statement)
#pd.read_sql(statement, conn)

## Create New Table for PDE File

In [157]:
statement = "SHOW TABLES in {}".format(database_name)

df_table = pd.read_sql(statement, conn)
df_table.head(5)

Unnamed: 0,tab_name
0,ndc_table


In [158]:
#Athena parameters
database_name = "gen_db"
table_name = "pde_table"
my_path = "s3://{}/eng_pde".format(gcgs_bucket)

# SQL statement to execute
statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
DESYNPUF_ID string,
PDE_ID int,
PROD_SRVC_ID int,
PTNT_PAY_AMT int,
TOT_RX_CST_AMT int,
NDC5_4 string,
total_cost int,
cost_per_day float,
date date

) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\n' LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(
    database_name, table_name, my_path
)

print(statement)

pd.read_sql(statement, conn)

CREATE EXTERNAL TABLE IF NOT EXISTS gen_db.pde_table(
DESYNPUF_ID string,
PDE_ID int,
PROD_SRVC_ID int,
PTNT_PAY_AMT int,
TOT_RX_CST_AMT int,
NDC5_4 string,
total_cost int,
cost_per_day float,
date date

) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' LOCATION 's3://my-508-projects/eng_pde'
TBLPROPERTIES ('skip.header.line.count'='1')


In [159]:
statement = "SHOW TABLES in {}".format(database_name)

df_table = pd.read_sql(statement, conn)
df_table.head(5)

Unnamed: 0,tab_name
0,ndc_table
1,pde_table


In [160]:
statement = """SELECT ndc5_4, date FROM gen_db.pde_table"""
print(statement)
df_show = pd.read_sql(statement, conn)
df_show.head(5)

SELECT ndc5_4, date FROM gen_db.pde_table


Unnamed: 0,ndc5_4,date
0,3640595,2009-11-11
1,524930275,2008-12-21
2,1791205,2010-08-25
3,9046012,2010-06-19
4,172360759,2009-12-13


## Create New Table for Beneficiary File

In [161]:
statement = "SHOW TABLES in {}".format(database_name)

df_table = pd.read_sql(statement, conn)
df_table.head(5)

Unnamed: 0,tab_name
0,ndc_table
1,pde_table


In [162]:
database_name = "gen_db"
table_name = "bene_table"
my_path = "s3://{}/eng_bene".format(gcgs_bucket)

# SQL statement to execute
statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
year int,
DESYNPUF_ID string,
BENE_SEX_IDENT_CD int,
BENE_RACE_CD int,
SP_STATE_CODE int,
BENE_HI_CVRAGE_TOT_MONS int,
BENE_SMI_CVRAGE_TOT_MONS int,
BENE_HMO_CVRAGE_TOT_MONS int,
PLAN_CVRG_MOS_NUM int,
MEDREIMB_IP int,
BENRES_IP int,
PPPYMT_IP int,
MEDREIMB_OP int,
BENRES_OP int,
PPPYMT_OP int,
MEDREIMB_CAR int,
BENRES_CAR int,
PPPYMT_CAR int,
birth_date int,
age int,
age_code int,
death_date int,
esrd int,
region_code int,
cont_enroll int,
has_alz int,
has_chf int,
has_ckd int,
has_can int,
has_copd int,
has_dep int,
has_dib int,
has_ihd int,
has_op int,
has_raoa int,
has_stia int,
ccnum int,
high_IP int,
high_OP int,
high_ben int

) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\n' LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(
    database_name, table_name, my_path
)

print(statement)

pd.read_sql(statement, conn)

CREATE EXTERNAL TABLE IF NOT EXISTS gen_db.bene_table(
year int,
DESYNPUF_ID string,
BENE_SEX_IDENT_CD int,
BENE_RACE_CD int,
SP_STATE_CODE int,
BENE_HI_CVRAGE_TOT_MONS int,
BENE_SMI_CVRAGE_TOT_MONS int,
BENE_HMO_CVRAGE_TOT_MONS int,
PLAN_CVRG_MOS_NUM int,
MEDREIMB_IP int,
BENRES_IP int,
PPPYMT_IP int,
MEDREIMB_OP int,
BENRES_OP int,
PPPYMT_OP int,
MEDREIMB_CAR int,
BENRES_CAR int,
PPPYMT_CAR int,
birth_date int,
age int,
age_code int,
death_date int,
esrd int,
region_code int,
cont_enroll int,
has_alz int,
has_chf int,
has_ckd int,
has_can int,
has_copd int,
has_dep int,
has_dib int,
has_ihd int,
has_op int,
has_raoa int,
has_stia int,
ccnum int,
high_IP int,
high_OP int,
high_ben int

) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' LOCATION 's3://my-508-projects/eng_bene'
TBLPROPERTIES ('skip.header.line.count'='1')


In [163]:
statement = "SHOW TABLES in {}".format(database_name)

df_show = pd.read_sql(statement, conn)
df_show.head(5)

Unnamed: 0,tab_name
0,bene_table
1,ndc_table
2,pde_table


In [164]:
statement = "SELECT desynpuf_id, high_ben FROM gen_db.bene_table"
print(statement)
df_show = pd.read_sql(statement, conn)
df_show.head(5)

SELECT desynpuf_id, high_ben FROM gen_db.bene_table


Unnamed: 0,desynpuf_id,high_ben
0,00000B48BCF4AD29,1
1,0000525AB30E4DEF,1
2,00009C897C3D8372,1
3,0001168CE43BE51B,0
4,0002E494BC87CE10,0


In [165]:
#statement = """SELECT * FROM {}.{}
#    LIMIT 100""".format(
#    database_name, table_name
#)

#print(statement)
#pd.read_sql(statement, conn)

In [166]:
#statement = """SELECT * FROM {}.{}
#    LIMIT 100""".format(
#    database_name, table_name
#)

#print(statement)
#pd.read_sql(statement, conn)

In [167]:
#statement = """SELECT desynpuf_id, SUBSTRING(prod_srvc_id,1,9) AS NDC54 FROM {}.{}
#    LIMIT 100""".format(
#    database_name, table_name
#)

#print(statement)
#pd.read_sql(statement, conn)


In [168]:
#Keep example of join with nested select (it worked!)

#statement = """SELECT desynpuf_id, PRODUCTID, NDC5_4, PRODUCTNDC2
#FROM
#    (
#    SELECT desynpuf_id, SUBSTRING(prod_srvc_id,1,9) AS PRODUCTNDC2
#    FROM gen_db.drugevent_table
#    ) AS MCSubtable
#JOIN gen_db.ndc_table
#ON MCSubtable.PRODUCTNDC2 = gen_db.ndc_table.NDC5_4"""

#print(statement)
#pd.read_sql(statement, conn)

# Run Join Query to Pull Data for ML

In [169]:
pd.set_option("display.max_columns", None)

statement = """
SELECT 
gen_db.pde_table.DESYNPUF_ID, 
gen_db.pde_table.NDC5_4,
ptnt_pay_amt,
total_cost,
cost_per_day,

proprietaryname,
nonproprietaryname,
isgeneric,

TOT_RX_CST_AMT, 
SP_STATE_CODE, 
year,
bene_sex_ident_cd,
bene_race_cd,
age_code,
esrd,
region_code,
cont_enroll,
ccnum,
high_ben

FROM gen_db.pde_table

JOIN gen_db.ndc_table
ON gen_db.pde_table.NDC5_4 = gen_db.ndc_table.NDC5_4

JOIN gen_db.bene_table
ON gen_db.pde_table.DESYNPUF_ID = gen_db.bene_table.DESYNPUF_ID
"""

print(statement)
df_show = pd.read_sql(statement, conn)
print(df_show.shape)
df_show.head(100)


SELECT 
gen_db.pde_table.DESYNPUF_ID, 
gen_db.pde_table.NDC5_4,
ptnt_pay_amt,
total_cost,
cost_per_day,

proprietaryname,
nonproprietaryname,
isgeneric,

TOT_RX_CST_AMT, 
SP_STATE_CODE, 
year,
bene_sex_ident_cd,
bene_race_cd,
age_code,
esrd,
region_code,
cont_enroll,
ccnum,
high_ben

FROM gen_db.pde_table

JOIN gen_db.ndc_table
ON gen_db.pde_table.NDC5_4 = gen_db.ndc_table.NDC5_4

JOIN gen_db.bene_table
ON gen_db.pde_table.DESYNPUF_ID = gen_db.bene_table.DESYNPUF_ID

(94453, 19)


Unnamed: 0,DESYNPUF_ID,NDC5_4,ptnt_pay_amt,total_cost,cost_per_day,proprietaryname,nonproprietaryname,isgeneric,TOT_RX_CST_AMT,SP_STATE_CODE,year,bene_sex_ident_cd,bene_race_cd,age_code,esrd,region_code,cont_enroll,ccnum,high_ben
0,254DE5244C61F075,675440700,0,30,1.500000,loperamide hydrochloride,loperamide hydrochloride,1.0,30,10,2010,2,1,1,0,1,0,0,0
1,254DE5244C61F075,675440700,0,30,1.500000,loperamide hydrochloride,loperamide hydrochloride,1.0,30,10,2008,2,1,1,0,1,0,0,0
2,3FFDCD071F4F1AA4,000937350,0,170,5.666666,lansoprazole,lansoprazole,1.0,170,33,2009,2,1,4,0,3,0,0,1
3,3FFDCD071F4F1AA4,000937350,0,170,5.666666,lansoprazole,lansoprazole,1.0,170,33,2008,2,1,4,0,3,0,0,1
4,3FFDCD071F4F1AA4,000937350,0,170,5.666666,lansoprazole,lansoprazole,1.0,170,33,2010,2,1,4,0,3,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,315A0F24FCAC6736,005915557,0,10,0.333333,propranolol hydrochloride,propranolol hydrochloride,1.0,10,22,2008,2,1,4,0,3,1,0,0
96,B576A9BF24CBA8D5,607930855,0,10,0.111111,levoxyl,levothyroxine sodium,0.0,10,33,2008,1,2,3,0,3,1,0,1
97,B576A9BF24CBA8D5,607930855,0,10,0.111111,levoxyl,levothyroxine sodium,0.0,10,33,2010,1,2,3,0,3,1,0,1
98,B576A9BF24CBA8D5,607930855,0,10,0.111111,levoxyl,levothyroxine sodium,0.0,10,33,2009,1,2,3,1,3,1,0,1


In [170]:
# PACKAGE DF AS CSV

filepath = "./data_transform/eng_mlcsv.csv"
df_show.to_csv(filepath, index=False, header=True)

# COPY CSVs TO S3

import time

# The timestamp is for creating a fresh folder.  Disabled here so S3 will write over with newly engineered files.
timestamp = int(time.time())
eng_df_s3_uri = gcgs_sm_sess.upload_data(bucket=gcgs_bucket, key_prefix="eng_mlcsv", path=filepath)
#eng_df_s3_uri = gcgs_sm_sess.upload_data(bucket=gcgs_bucket, key_prefix="eng_df-{}".format(timestamp), path=filepath)
print(eng_df_s3_uri)

# LIST DATA IN THE NEW S3 LOCATION

!aws s3 ls $eng_df_s3_uri

s3://my-508-projects/eng_mlcsv/eng_mlcsv.csv
2022-03-31 00:11:24    9820919 eng_mlcsv.csv


In [171]:
try:
    setup_gcgs_dependencies_passed
except NameError:
    print("+++++++++++++++++++++++++++++++")
    print("[ERROR] YOU HAVE TO RUN ALL NOTEBOOKS IN THE SETUP FOLDER FIRST. You are missing Setup Dependencies.")
    print("+++++++++++++++++++++++++++++++")

In [172]:
%store

Stored variables and their in-db values:
setup_gcgs_dependencies_passed             -> True
setup_gcgs_iam_roles_passed                -> True
setup_gcgs_s3_bucket_passed                -> True


# Shutting Down Kernel To Release Resources