# Welcome!

### This notebook is the first in a series that builds the Green Cross Green Shield (GCGS) data analysis project using Medicare Data

### Note:  GCGS requires SageMaker Studio

# Start the "Data Science" Kernel
The kernel powers all of our notebook interactions.

### Click on "No Kernel" in the Upper Right
<!---  ![](img/select_kernel.png)  # Use HTML since ![]() does not support width  --->

<div>
<img src="img/select_kernel.png" width="500"/>  <!--- Use HTML since ![]() does not support width --->
</div>

### Select the `Data Science` Kernel
<!---  ![](img/select_data_science_kernel.png)  --->

<div>
<img src="img/select_data_science_kernel.png" width="500"/>
</div>

### Confirm the Kernel is Started in Upper Right
<!---  ![](img/confirm_kernel_started.png)   --->

<div>
<img src="img/confirm_kernel_started.png" width="500"/>
</div>

### NOTE:  YOU CANNOT CONTINUE UNTIL THE KERNEL IS STARTED
### ### PLEASE WAIT UNTIL THE KERNEL IS STARTED BEFORE CONTINUING!!! ###

# ----------------------------
# List of %StoreMagic local variables to avoid reuse

In [1]:
%store  
# List of %storemagic local variables from OTHER NOTEBOOKS (Avoid reuse)

Stored variables and their in-db values:
auto_ml_job_name                                      -> 'automl-dm-14-01-08-33'
autopilot_endpoint_arn                                -> 'arn:aws:sagemaker:us-east-1:850528502467:endpoint
autopilot_endpoint_name                               -> 'automl-dm-ep-14-03-53-31'
autopilot_model_arn                                   -> 'arn:aws:sagemaker:us-east-1:850528502467:model/au
autopilot_model_name                                  -> 'automl-dm-model-14-03-53-30'
autopilot_train_s3_uri                                -> 's3://sagemaker-us-east-1-850528502467/data/amazon
balanced_bias_data_jsonlines_s3_uri                   -> 's3://sagemaker-us-east-1-850528502467/bias-detect
balanced_bias_data_s3_uri                             -> 's3://sagemaker-us-east-1-850528502467/bias-detect
bias_data_s3_uri                                      -> 's3://sagemaker-us-east-1-850528502467/bias-detect
ingest_create_athena_db_passed                        -> Tr

# If not done, load all packages and dependencies using the 01_Setup_Dependencies.ipyn Notebook

In [2]:
setup_gcgs_dependencies_passed = True

In [3]:
%store setup_gcgs_dependencies_passed

Stored 'setup_gcgs_dependencies_passed' (bool)


# Load Packages and Create Session
### Sessions typically store the following: Credentials, AWS Region, Other configurations related to your profile
##### Like this: class sagemaker.session.Session(boto_session=None, sagemaker_client=None, sagemaker_runtime_client=None, sagemaker_featurestore_runtime_client=None, default_bucket=None, settings=<sagemaker.session_settings.SessionSettings object>)


In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import boto3
import boto3.session
import sagemaker

gcgs_sess = boto3.session.Session() # Create our own session
gcgs_sess1 = boto3.Session() # MZ EXPERIMENT
gcgs_region = gcgs_sess.region_name
gcgs_sm_sess = sagemaker.Session(default_bucket="my-508-projects") # Assigned variable with sagemaker resources available
# ASSIGNED NEW DEFAULT BUCKET: "my-508-projects"
gcgs_bucket = gcgs_sm_sess.default_bucket()

s3 = boto3.Session().client(service_name="s3", region_name=gcgs_region)
s31 = gcgs_sess.client(service_name="s3", region_name=gcgs_region) # MZ EXPERIMENT

print(gcgs_sess)
print(gcgs_sess1)
print(gcgs_region)
print(gcgs_sm_sess)
print(gcgs_bucket)
print(s3)
s31

Session(region_name='us-east-1')
Session(region_name='us-east-1')
us-east-1
<sagemaker.session.Session object at 0x7fe9eca45590>
my-508-projects
<botocore.client.S3 object at 0x7fe9eb6fa490>


<botocore.client.S3 at 0x7fe9eb09c290>

In [5]:
# CODE FOR A DEFAULT SESSION
# sqs = boto3.client('sqs')
# s3 = boto3.resource('s3')

# CODE TO CREATE OUR OWN SESSION (with low-level clients or resource clients from our custom session)
# my_session = boto3.session.Session()
# sqs = my_session.client('sqs')
# s3 = my_session.resource('s3')

In [6]:
setup_gcgs_s3_bucket_passed = False

In [7]:
print("Default bucket: {}".format(gcgs_bucket))

Default bucket: my-508-projects


# Verify S3_BUCKET Bucket Creation

In [8]:
%%bash

aws s3 ls s3://${bucket}/  # CLI command to list S3 buckets

2022-03-16 04:02:02 aws-athena-query-results-850528502467-us-east-1
2022-03-08 03:01:48 my-508-projects
2022-03-08 00:33:27 sagemaker-studio-850528502467-y0t6ilm83y9
2022-03-03 23:09:20 sagemaker-us-east-1-850528502467


In [9]:
from botocore.client import ClientError

response = None

try:
    response = s3.head_bucket(Bucket=gcgs_bucket)
    print(response)
    setup_gcgs_s3_bucket_passed = True
except ClientError as e:
    print("[ERROR] Cannot find bucket {} in {} due to {}.".format(gcgs_bucket, response, e))

{'ResponseMetadata': {'RequestId': 'T3P0JC8TH9PR48DT', 'HostId': '04KeGP/wLDjusyKKGQPd7NC+yO9RXelxUuT4O2OKNLZBN2c34/c366JCLfvSG3EguCFYVW6l1vM=', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amz-id-2': '04KeGP/wLDjusyKKGQPd7NC+yO9RXelxUuT4O2OKNLZBN2c34/c366JCLfvSG3EguCFYVW6l1vM=', 'x-amz-request-id': 'T3P0JC8TH9PR48DT', 'date': 'Sun, 20 Mar 2022 16:51:24 GMT', 'x-amz-bucket-region': 'us-east-1', 'x-amz-access-point-alias': 'false', 'content-type': 'application/xml', 'server': 'AmazonS3'}, 'RetryAttempts': 0}}


In [10]:
%store setup_gcgs_s3_bucket_passed

Stored 'setup_gcgs_s3_bucket_passed' (bool)


In [11]:
%store 
# Note new variables stored for later

Stored variables and their in-db values:
auto_ml_job_name                                      -> 'automl-dm-14-01-08-33'
autopilot_endpoint_arn                                -> 'arn:aws:sagemaker:us-east-1:850528502467:endpoint
autopilot_endpoint_name                               -> 'automl-dm-ep-14-03-53-31'
autopilot_model_arn                                   -> 'arn:aws:sagemaker:us-east-1:850528502467:model/au
autopilot_model_name                                  -> 'automl-dm-model-14-03-53-30'
autopilot_train_s3_uri                                -> 's3://sagemaker-us-east-1-850528502467/data/amazon
balanced_bias_data_jsonlines_s3_uri                   -> 's3://sagemaker-us-east-1-850528502467/bias-detect
balanced_bias_data_s3_uri                             -> 's3://sagemaker-us-east-1-850528502467/bias-detect
bias_data_s3_uri                                      -> 's3://sagemaker-us-east-1-850528502467/bias-detect
ingest_create_athena_db_passed                        -> Tr

# Update IAM Roles and Policies

In [12]:
import time
from time import gmtime, strftime

#sagemaker_session = sagemaker.Session()
#bucket = sagemaker_session.default_bucket()
#region = boto3.Session().region_name

from botocore.config import Config
config = Config(retries={"max_attempts": 10, "mode": "adaptive"})
iam = boto3.client("iam", config=config)
print(config)
print(iam)

<botocore.config.Config object at 0x7fe9eb074710>
<botocore.client.IAM object at 0x7fe9eaa84b10>


## Get SageMaker Execution Role Name

In [13]:
role = sagemaker.get_execution_role()
print(role)
role_name = role.split("/")[-1]
print("Role name: {}".format(role_name))

arn:aws:iam::850528502467:role/LabRole
Role name: LabRole


In [14]:
setup_gcgs_iam_roles_passed = False

# **Pre-Requisite:  SageMaker notebook instance ExecutionRole contains `AdministratorAccess` Policy.**
_Note:  The permissions used here are for demonstration purposes only.  Please follow least-privilege security principals appropriate for your environment._

In [15]:
admin = False
gcgs_post_policies = iam.list_attached_role_policies(RoleName=role_name)["AttachedPolicies"]
print(gcgs_post_policies)
for post_policy in gcgs_post_policies:
    if post_policy["PolicyName"] == "AdministratorAccess":
        admin = True
        setup_gcgs_iam_roles_passed = True
        print("[OK] You are all set up to continue with this workshop!")
        break
    else:
        print("*************** [ERROR] SageMakerExecutionRole needs the AdministratorAccess policy attached. *****************")

[{'PolicyName': 'c50727a768849l1711245t1w850528502467-VocLabPolicy1-2ET83KMVHOL2', 'PolicyArn': 'arn:aws:iam::850528502467:policy/c50727a768849l1711245t1w850528502467-VocLabPolicy1-2ET83KMVHOL2'}, {'PolicyName': 'c50727a768849l1711245t1w850528502467-VocLabPolicy2-1HMWW3L73VDPX', 'PolicyArn': 'arn:aws:iam::850528502467:policy/c50727a768849l1711245t1w850528502467-VocLabPolicy2-1HMWW3L73VDPX'}, {'PolicyName': 'c50727a768849l1711245t1w850528502467-VocLabPolicy3-16N7LPVUBPCK5', 'PolicyArn': 'arn:aws:iam::850528502467:policy/c50727a768849l1711245t1w850528502467-VocLabPolicy3-16N7LPVUBPCK5'}, {'PolicyName': 'IAMFullAccess', 'PolicyArn': 'arn:aws:iam::aws:policy/IAMFullAccess'}, {'PolicyName': 'AdministratorAccess', 'PolicyArn': 'arn:aws:iam::aws:policy/AdministratorAccess'}, {'PolicyName': 'AmazonSSMManagedInstanceCore', 'PolicyArn': 'arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore'}]
*************** [ERROR] SageMakerExecutionRole needs the AdministratorAccess policy attached. **********

In [16]:
%store setup_gcgs_iam_roles_passed

Stored 'setup_gcgs_iam_roles_passed' (bool)


In [17]:
%store

Stored variables and their in-db values:
auto_ml_job_name                                      -> 'automl-dm-14-01-08-33'
autopilot_endpoint_arn                                -> 'arn:aws:sagemaker:us-east-1:850528502467:endpoint
autopilot_endpoint_name                               -> 'automl-dm-ep-14-03-53-31'
autopilot_model_arn                                   -> 'arn:aws:sagemaker:us-east-1:850528502467:model/au
autopilot_model_name                                  -> 'automl-dm-model-14-03-53-30'
autopilot_train_s3_uri                                -> 's3://sagemaker-us-east-1-850528502467/data/amazon
balanced_bias_data_jsonlines_s3_uri                   -> 's3://sagemaker-us-east-1-850528502467/bias-detect
balanced_bias_data_s3_uri                             -> 's3://sagemaker-us-east-1-850528502467/bias-detect
bias_data_s3_uri                                      -> 's3://sagemaker-us-east-1-850528502467/bias-detect
ingest_create_athena_db_passed                        -> Tr

# *Final Check*

In [18]:
# role = iam.get_role(RoleName=role_name)
post_policies = iam.list_attached_role_policies(RoleName=role_name)["AttachedPolicies"]

required_policies = [
    "AdministratorAccess",
#     "SecretsManagerReadWrite",
#     "IAMFullAccess",
#     "AmazonS3FullAccess",
#     "AmazonAthenaFullAccess",
#     "ComprehendFullAccess",
#     "AmazonEC2ContainerRegistryFullAccess",
#     "AmazonRedshiftFullAccess",
#     "AWSStepFunctionsFullAccess",
#     "AmazonSageMakerFullAccess",
#     "AmazonKinesisFullAccess",
#     "AmazonKinesisFirehoseFullAccess",
#     "AmazonKinesisAnalyticsFullAccess",
]

admin = False

for post_policy in post_policies:
    if post_policy["PolicyName"] == "AdministratorAccess":
        admin = True
        try:
            required_policies.remove(post_policy["PolicyName"])
        except:
            break
    else:
        try:
            required_policies.remove(post_policy["PolicyName"])
        except:
            pass

if not admin and len(required_policies) > 0:
    setup_gcgs_iam_roles_passed = False
    print("*************** [ERROR] RE-RUN THIS NOTEBOOK *****************")
    for required_policy in required_policies:
        print("Not Attached: {}".format(required_policy))
else:
    setup_gcgs_iam_roles_passed = True
    print("[OK] You are all set up to continue with this workshop!")

[OK] You are all set up to continue with this workshop!


In [19]:
%store setup_gcgs_iam_roles_passed

Stored 'setup_gcgs_iam_roles_passed' (bool)


In [20]:
%store

Stored variables and their in-db values:
auto_ml_job_name                                      -> 'automl-dm-14-01-08-33'
autopilot_endpoint_arn                                -> 'arn:aws:sagemaker:us-east-1:850528502467:endpoint
autopilot_endpoint_name                               -> 'automl-dm-ep-14-03-53-31'
autopilot_model_arn                                   -> 'arn:aws:sagemaker:us-east-1:850528502467:model/au
autopilot_model_name                                  -> 'automl-dm-model-14-03-53-30'
autopilot_train_s3_uri                                -> 's3://sagemaker-us-east-1-850528502467/data/amazon
balanced_bias_data_jsonlines_s3_uri                   -> 's3://sagemaker-us-east-1-850528502467/bias-detect
balanced_bias_data_s3_uri                             -> 's3://sagemaker-us-east-1-850528502467/bias-detect
bias_data_s3_uri                                      -> 's3://sagemaker-us-east-1-850528502467/bias-detect
ingest_create_athena_db_passed                        -> Tr

# LOAD THE FILES FROM S3

### Load the NDC files from Excel

In [21]:
!aws s3 cp 's3://my-508-projects/ndc-file/ndc_excel2.csv' ./data/

fatal error: An error occurred (404) when calling the HeadObject operation: Key "ndc-file/ndc_excel2.csv" does not exist


In [22]:
import csv
# Change from UTF-8 encoding required for Window Excel file (from Stack Exchange)
ndc = pd.read_csv(
    r"./data/ndc_excel2.csv",
    encoding = "ISO-8859-1",
    engine = 'python'
# delimiter="\t",
# quoting=csv.QUOTE_NON##E,
# compression="gzip",
)
ndc.shape

(112259, 22)

In [23]:
ndc.head(5)

Unnamed: 0,PRODUCTID,PRODUCTNDCHYPH,PRODUCTNDCSTR,PRODUCTTYPENAME,PROPRIETARYNAME,PROPRIETARYNAMESUFFIX,NONPROPRIETARYNAME,DOSAGEFORMNAME,DOSAGEFORMNAME2,DOSAGEFORMNAME3,...,ROUTENAME2,ROUTENAME3,ROUTENAME4,SUBSTANCENAME,ACTIVE_NUMERATOR_STRENGTH,ACTIVE_INGRED_UNIT,PHARM_CLASSES,DEASCHEDULE,NDC_EXCLUDE_FLAG,LISTING_RECORD_CERTIFIED_THROUGH
0,0002-0800_662164fd-5ea0-4a08-bfd1-6b08bdd73342,0002-0800,20800,HUMAN OTC DRUG,Sterile Diluent,,diluent,INJECTION,SOLUTION,,...,,,,WATER,1.0,mL/mL,,,N,20221231.0
1,0002-1200_480fceef-6596-4478-97de-677c155506b3,0002-1200,21200,HUMAN PRESCRIPTION DRUG,Amyvid,,Florbetapir F 18,INJECTION,SOLUTION,,...,,,,FLORBETAPIR F-18,51.0,mCi/mL,"Positron Emitting Activity [MoA], Radioactive ...",,N,20221231.0
2,0002-1210_151a431b-f07b-4959-b6fa-c41ff80364c8,0002-1210,21210,HUMAN PRESCRIPTION DRUG,TAUVID,,Flortaucipir F-18,INJECTION,SOLUTION,,...,,,,FLORTAUCIPIR F-18,51.0,mCi/mL,,,N,20221231.0
3,0002-1433_69bd3896-91f6-4960-8538-2880159588c6,0002-1433,21433,HUMAN PRESCRIPTION DRUG,Trulicity,,Dulaglutide,INJECTION,SOLUTION,,...,,,,DULAGLUTIDE,0.75,mg/.5mL,"GLP-1 Receptor Agonist [EPC], Glucagon-Like Pe...",,N,20231231.0
4,0002-1434_69bd3896-91f6-4960-8538-2880159588c6,0002-1434,21434,HUMAN PRESCRIPTION DRUG,Trulicity,,Dulaglutide,INJECTION,SOLUTION,,...,,,,DULAGLUTIDE,1.5,mg/.5mL,"GLP-1 Receptor Agonist [EPC], Glucagon-Like Pe...",,N,20231231.0


### Load the CMS Drug File

In [24]:
!aws s3 cp 's3://my-508-projects/DE1_0_2008_to_2010_Prescription_Drug_Events_Sample_2.zip' ./data/

download: s3://my-508-projects/DE1_0_2008_to_2010_Prescription_Drug_Events_Sample_2.zip to data/DE1_0_2008_to_2010_Prescription_Drug_Events_Sample_2.zip


In [25]:
import csv
# Change from UTF-8 encoding required for Window Excel file (from Stack Exchange)
drug_file = pd.read_csv(
    r"./data/DE1_0_2008_to_2010_Prescription_Drug_Events_Sample_2.zip",
#encoding = "ISO-8859-1",
#engine = 'python'
# delimiter="\t",
# quoting=csv.QUOTE_NON##E,
    compression="zip"
)
drug_file.shape

  exec(code_obj, self.user_global_ns, self.user_ns)


(5561154, 8)

In [26]:
drug_file.head(5)

Unnamed: 0,DESYNPUF_ID,PDE_ID,SRVC_DT,PROD_SRVC_ID,QTY_DSPNSD_NUM,DAYS_SUPLY_NUM,PTNT_PAY_AMT,TOT_RX_CST_AMT
0,00000B48BCF4AD29,83224466404678,20100207,185010401,30.0,30,0.0,10.0
1,00000B48BCF4AD29,83654467130740,20100312,115163303,100.0,30,0.0,30.0
2,00000B48BCF4AD29,83574462630098,20100421,117193205,20.0,20,0.0,160.0
3,00000B48BCF4AD29,83734462622581,20100427,19458016707,30.0,30,10.0,0.0
4,00000B48BCF4AD29,83594462991534,20100611,59746011109,30.0,30,0.0,0.0


### Load the 3 Beneficiary Files

In [27]:
!aws s3 cp 's3://my-508-projects/176589_DE1_0_2008_Beneficiary_Summary_File_Sample_2.zip' ./data/
!aws s3 cp 's3://my-508-projects/176629_DE1_0_2009_Beneficiary_Summary_File_Sample_2.zip' ./data/
!aws s3 cp 's3://my-508-projects/176581_DE1_0_2010_Beneficiary_Summary_File_Sample_2.zip' ./data/

download: s3://my-508-projects/176589_DE1_0_2008_Beneficiary_Summary_File_Sample_2.zip to data/176589_DE1_0_2008_Beneficiary_Summary_File_Sample_2.zip
download: s3://my-508-projects/176629_DE1_0_2009_Beneficiary_Summary_File_Sample_2.zip to data/176629_DE1_0_2009_Beneficiary_Summary_File_Sample_2.zip
download: s3://my-508-projects/176581_DE1_0_2010_Beneficiary_Summary_File_Sample_2.zip to data/176581_DE1_0_2010_Beneficiary_Summary_File_Sample_2.zip


In [28]:
ben_2008 = pd.read_csv(
    r"./data/176589_DE1_0_2008_Beneficiary_Summary_File_Sample_2.zip",
    compression="zip"
)
ben_2008.shape

(116395, 32)

In [29]:
ben_2008.head(5)

Unnamed: 0,DESYNPUF_ID,BENE_BIRTH_DT,BENE_DEATH_DT,BENE_SEX_IDENT_CD,BENE_RACE_CD,BENE_ESRD_IND,SP_STATE_CODE,BENE_COUNTY_CD,BENE_HI_CVRAGE_TOT_MONS,BENE_SMI_CVRAGE_TOT_MONS,...,SP_STRKETIA,MEDREIMB_IP,BENRES_IP,PPPYMT_IP,MEDREIMB_OP,BENRES_OP,PPPYMT_OP,MEDREIMB_CAR,BENRES_CAR,PPPYMT_CAR
0,00000B48BCF4AD29,19230901,,2,5,0,10,260,12,12,...,1,81000.0,3072.0,0.0,1520.0,80.0,0.0,6260.0,1520.0,0.0
1,0000525AB30E4DEF,19201001,,2,1,0,31,300,12,12,...,1,13260.0,2048.0,0.0,1760.0,670.0,0.0,3830.0,1010.0,50.0
2,00009C897C3D8372,19320101,,1,1,Y,7,70,12,12,...,2,37500.0,4096.0,0.0,100.0,160.0,0.0,1540.0,280.0,60.0
3,0001168CE43BE51B,19340901,,2,1,0,6,200,12,12,...,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0002E494BC87CE10,19140701,,1,2,0,5,200,2,2,...,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
ben_2009 = pd.read_csv(
    r"./data/176629_DE1_0_2009_Beneficiary_Summary_File_Sample_2.zip",
    compression="zip"
)
ben_2009.shape

(114618, 32)

In [31]:
ben_2009.head(5)

Unnamed: 0,DESYNPUF_ID,BENE_BIRTH_DT,BENE_DEATH_DT,BENE_SEX_IDENT_CD,BENE_RACE_CD,BENE_ESRD_IND,SP_STATE_CODE,BENE_COUNTY_CD,BENE_HI_CVRAGE_TOT_MONS,BENE_SMI_CVRAGE_TOT_MONS,...,SP_STRKETIA,MEDREIMB_IP,BENRES_IP,PPPYMT_IP,MEDREIMB_OP,BENRES_OP,PPPYMT_OP,MEDREIMB_CAR,BENRES_CAR,PPPYMT_CAR
0,00000B48BCF4AD29,19230901,,2,5,0,10,260,12,12,...,2,0.0,0.0,0.0,580.0,400.0,0.0,5720.0,1530.0,520.0
1,0000525AB30E4DEF,19201001,,2,1,0,31,300,12,12,...,1,0.0,0.0,0.0,3380.0,1370.0,0.0,7970.0,2010.0,0.0
2,00009C897C3D8372,19320101,,1,1,0,7,70,12,12,...,2,0.0,0.0,0.0,2250.0,230.0,0.0,900.0,210.0,0.0
3,0001168CE43BE51B,19340901,,2,1,0,6,200,12,12,...,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0002E494BC87CE10,19140701,,1,2,0,5,200,12,12,...,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
ben_2010 = pd.read_csv(
    r"./data/176581_DE1_0_2010_Beneficiary_Summary_File_Sample_2.zip",
    compression="zip"
)
ben_2010.shape

(112845, 32)

In [33]:
ben_2010.head(5)

Unnamed: 0,DESYNPUF_ID,BENE_BIRTH_DT,BENE_DEATH_DT,BENE_SEX_IDENT_CD,BENE_RACE_CD,BENE_ESRD_IND,SP_STATE_CODE,BENE_COUNTY_CD,BENE_HI_CVRAGE_TOT_MONS,BENE_SMI_CVRAGE_TOT_MONS,...,SP_STRKETIA,MEDREIMB_IP,BENRES_IP,PPPYMT_IP,MEDREIMB_OP,BENRES_OP,PPPYMT_OP,MEDREIMB_CAR,BENRES_CAR,PPPYMT_CAR
0,00000B48BCF4AD29,19230901,,2,5,Y,10,260,12,12,...,2,0.0,0.0,0.0,600.0,30.0,0.0,3800.0,1460.0,0.0
1,0000525AB30E4DEF,19201001,,2,1,0,31,300,12,12,...,2,0.0,0.0,0.0,240.0,190.0,0.0,440.0,60.0,0.0
2,00009C897C3D8372,19320101,,1,1,0,7,70,12,12,...,2,0.0,0.0,0.0,210.0,160.0,0.0,580.0,250.0,0.0
3,0001168CE43BE51B,19340901,,2,1,0,6,200,12,0,...,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0002E494BC87CE10,19140701,,1,2,0,5,200,12,12,...,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## What's in our S3 bucket?
#### Also, create a landing folder

In [34]:
my_path = "s3://{}/ndc-file".format(gcgs_bucket)
s3_staging_dir = "s3://{}/athena/staging".format(gcgs_bucket)
my_landing_path = "s3://{}/landing".format(gcgs_bucket)
print(my_path)
print(s3_staging_dir)
print(my_landing_path)

s3://my-508-projects/ndc-file
s3://my-508-projects/athena/staging
s3://my-508-projects/landing


In [35]:
!aws s3 ls $my_path

                           PRE ndc-file/


In [36]:
try:
    my_landing_path
    print("OK")
except NameError:
    print("*****************************************************************************")
    print("[ERROR] PLEASE RE-RUN THE PREVIOUS COPY TSV TO S3 NOTEBOOK ******************")
    print("[ERROR] THIS NOTEBOOK WILL NOT RUN PROPERLY. ********************************")
    print("*****************************************************************************")

OK


## Import PyAthena and Create DB

In [37]:
from pyathena import connect

#Athena parameters
database_name = "gen_db"
table_name = "ndc_table"

#Connect, create and run a statement
conn = connect(region_name=gcgs_region, s3_staging_dir=s3_staging_dir)
statement = "CREATE DATABASE IF NOT EXISTS {}".format(database_name)
print(statement)
pd.read_sql(statement, conn)

CREATE DATABASE IF NOT EXISTS gen_db


## Verify Athena DB Creation

In [38]:
#Verify DB creation

statement = "SHOW DATABASES"

df_show = pd.read_sql(statement, conn)
df_show.head(5)

Unnamed: 0,database_name
0,bene_db
1,default
2,dsoaws
3,gen_db


## Delete Previous NDC Table to Prevent Overwrite Conflict

In [39]:
pd.read_sql("DROP TABLE IF EXISTS gen_db.ndc_table", conn)

statement = "SHOW TABLES in {}".format(database_name)

df_table = pd.read_sql(statement, conn)
df_table.head(5)

Unnamed: 0,tab_name
0,bene_2008_table
1,drugevent_table


## Create SQL Statement to Create Athena/Glue Table

In [40]:
# SQL statement to execute
statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
         PRODUCTID string,
         PRODUCTNDC string,
         PRODUCTTYPENAME string,
         PROPRIETARYNAME string,
         PROPRIETARYNAMESUFFIX string,
         NONPROPRIETARYNAME string,
         DOSAGEFORMNAME string,
         DOSAGEFORMNAME2 string,
         DOSAGEFORMNAME3 string,
         DOSAGEFORMNAME4 string,
         ROUTENAME string,
         ROUTENAME2 string,
         ROUTENAME3 string,
         ROUTENAME4 string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\n' LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(
    database_name, table_name, my_path
)

print(statement)

#'compressionType'='zip', 

CREATE EXTERNAL TABLE IF NOT EXISTS gen_db.ndc_table(
         PRODUCTID string,
         PRODUCTNDC string,
         PRODUCTTYPENAME string,
         PROPRIETARYNAME string,
         PROPRIETARYNAMESUFFIX string,
         NONPROPRIETARYNAME string,
         DOSAGEFORMNAME string,
         DOSAGEFORMNAME2 string,
         DOSAGEFORMNAME3 string,
         DOSAGEFORMNAME4 string,
         ROUTENAME string,
         ROUTENAME2 string,
         ROUTENAME3 string,
         ROUTENAME4 string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' LOCATION 's3://my-508-projects/ndc-file'
TBLPROPERTIES ('skip.header.line.count'='1')


## Run SQL Statement to Create Athena Table (IMPORTANT: Does NOT over write old attempts)

In [41]:
run = pd.read_sql(statement, conn)
print(run)

Empty DataFrame
Columns: []
Index: []


In [42]:
statement = "SHOW TABLES in {}".format(database_name)

df_show = pd.read_sql(statement, conn)
df_show.head(5)

Unnamed: 0,tab_name
0,bene_2008_table
1,drugevent_table
2,ndc_table


## Run Test Query to Pull Up One Drug NDC

In [43]:
productndc = "581180177"

statement = """SELECT * FROM {}.{}
    WHERE productndc = '{}' LIMIT 100""".format(
    database_name, table_name, productndc
)

print(statement)

SELECT * FROM gen_db.ndc_table
    WHERE productndc = '581180177' LIMIT 100


In [44]:
df = pd.read_sql(statement, conn)
df.head(5)

Unnamed: 0,productid,productndc,producttypename,proprietaryname,proprietarynamesuffix,nonproprietaryname,dosageformname,dosageformname2,dosageformname3,dosageformname4,routename,routename2,routename3,routename4
0,58118-0177_ab5e55ad-315a-d692-e053-2a95a90ae5a8,581180177,HUMAN PRESCRIPTION DRUG,Buprenorphine HCl,,Buprenorphine HCl,TABLET,,,,SUBLINGUAL,,,


## Run Test Query to Compare Nonproprietary and Proprietary Columns

In [45]:
statement = """SELECT COUNT(*) FROM {}.{}
    WHERE proprietaryname = nonproprietaryname LIMIT 100""".format(
    database_name, table_name
)

print(statement)

SELECT COUNT(*) FROM gen_db.ndc_table
    WHERE proprietaryname = nonproprietaryname LIMIT 100


In [46]:
df = pd.read_sql(statement, conn)
df.head(5)

Unnamed: 0,_col0
0,40358


## Select the Forms and Routes

In [47]:
dosageformname = ('AEROSOL', 'CAPSULE', 'GRANULE', 'INJECTION', 'LIQUID', 'PATCH', 'PILL', 'POWDER', 'SALVE', 'SUPPOSITORY', 'SUSPENSION', 'SYRUP', 'TABLET')
routename = ('OTIC', 'CONJUNCTIVAL', 'CUTANEOUS', 'NASAL', 'OPHTHALMIC', 'ORAL', 'OROPHARYNGEAL', 'RECTAL', 'SUBCUTANEOUS', 'SUBLINGUAL', 'SUBMUCOSAL', 'TOPICAL', 'TRANSDERMAL', 'VAGNAL')

statement = """SELECT * FROM {}.{}
    WHERE dosageformname IN {}
    AND routename IN {}
    LIMIT 100""".format(
    database_name, table_name, dosageformname, routename
)

print(statement)

SELECT * FROM gen_db.ndc_table
    WHERE dosageformname IN ('AEROSOL', 'CAPSULE', 'GRANULE', 'INJECTION', 'LIQUID', 'PATCH', 'PILL', 'POWDER', 'SALVE', 'SUPPOSITORY', 'SUSPENSION', 'SYRUP', 'TABLET')
    AND routename IN ('OTIC', 'CONJUNCTIVAL', 'CUTANEOUS', 'NASAL', 'OPHTHALMIC', 'ORAL', 'OROPHARYNGEAL', 'RECTAL', 'SUBCUTANEOUS', 'SUBLINGUAL', 'SUBMUCOSAL', 'TOPICAL', 'TRANSDERMAL', 'VAGNAL')
    LIMIT 100


In [48]:
df = pd.read_sql(statement, conn)
df.head(100)

Unnamed: 0,productid,productndc,producttypename,proprietaryname,proprietarynamesuffix,nonproprietaryname,dosageformname,dosageformname2,dosageformname3,dosageformname4,routename,routename2,routename3,routename4
0,0002-0800_662164fd-5ea0-4a08-bfd1-6b08bdd73342,000020800,HUMAN OTC DRUG,Sterile Diluent,,diluent,INJECTION,SOLUTION,,,SUBCUTANEOUS,,,
1,0002-1433_69bd3896-91f6-4960-8538-2880159588c6,000021433,HUMAN PRESCRIPTION DRUG,Trulicity,,Dulaglutide,INJECTION,SOLUTION,,,SUBCUTANEOUS,,,
2,0002-1434_69bd3896-91f6-4960-8538-2880159588c6,000021434,HUMAN PRESCRIPTION DRUG,Trulicity,,Dulaglutide,INJECTION,SOLUTION,,,SUBCUTANEOUS,,,
3,0002-1436_bec46346-20b5-4dbe-bac3-b8564e906941,000021436,HUMAN PRESCRIPTION DRUG,EMGALITY,,galcanezumab-gnlm,INJECTION,SOLUTION,,,SUBCUTANEOUS,,,
4,0002-1445_8ddfcec2-6a11-471d-92da-ea6a7365373f,000021445,HUMAN PRESCRIPTION DRUG,TALTZ,,ixekizumab,INJECTION,SOLUTION,,,SUBCUTANEOUS,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0003-6337_ed311e00-38c6-4457-b770-19d6faff45d7,000036337,HUMAN PRESCRIPTION DRUG,DROXIA,,HYDROXYUREA,CAPSULE,,,,ORAL,,,
96,0004-0038_d9030f4d-5514-4e98-8f17-74b879cae071,000040038,HUMAN PRESCRIPTION DRUG,Valcyte,,valganciclovir,TABLET,FILM COATED,,,ORAL,,,
97,0004-0039_d9030f4d-5514-4e98-8f17-74b879cae071,000040039,HUMAN PRESCRIPTION DRUG,Valcyte,,valganciclovir hydrochloride,POWDER,FOR SOLUTION,,,ORAL,,,
98,0004-0058_0d423291-6115-4522-88db-dcd1096bd432,000040058,HUMAN PRESCRIPTION DRUG,Klonopin,,Clonazepam,TABLET,,,,ORAL,,,


## Select by Form/Route and for Generics

In [49]:
dosageformname = ('AEROSOL', 'CAPSULE', 'GRANULE', 'INJECTION', 'LIQUID', 'PATCH', 'PILL', 'POWDER', 'SALVE', 'SUPPOSITORY', 'SUSPENSION', 'SYRUP', 'TABLET')
routename = ('OTIC', 'CONJUNCTIVAL', 'CUTANEOUS', 'NASAL', 'OPHTHALMIC', 'ORAL', 'OROPHARYNGEAL', 'RECTAL', 'SUBCUTANEOUS', 'SUBLINGUAL', 'SUBMUCOSAL', 'TOPICAL', 'TRANSDERMAL', 'VAGNAL')

statement = """SELECT COUNT(*) FROM {}.{}
    WHERE dosageformname IN {}
    AND routename IN {}
    AND proprietaryname = nonproprietaryname
    LIMIT 100""".format(
    database_name, table_name, dosageformname, routename
)

print(statement)

df = pd.read_sql(statement, conn)
df.head(100)

SELECT COUNT(*) FROM gen_db.ndc_table
    WHERE dosageformname IN ('AEROSOL', 'CAPSULE', 'GRANULE', 'INJECTION', 'LIQUID', 'PATCH', 'PILL', 'POWDER', 'SALVE', 'SUPPOSITORY', 'SUSPENSION', 'SYRUP', 'TABLET')
    AND routename IN ('OTIC', 'CONJUNCTIVAL', 'CUTANEOUS', 'NASAL', 'OPHTHALMIC', 'ORAL', 'OROPHARYNGEAL', 'RECTAL', 'SUBCUTANEOUS', 'SUBLINGUAL', 'SUBMUCOSAL', 'TOPICAL', 'TRANSDERMAL', 'VAGNAL')
    AND proprietaryname = nonproprietaryname
    LIMIT 100


Unnamed: 0,_col0
0,30881


## Select by Form/Route and for Non-Generics

In [50]:
dosageformname = ('AEROSOL', 'CAPSULE', 'GRANULE', 'INJECTION', 'LIQUID', 'PATCH', 'PILL', 'POWDER', 'SALVE', 'SUPPOSITORY', 'SUSPENSION', 'SYRUP', 'TABLET')
routename = ('OTIC', 'CONJUNCTIVAL', 'CUTANEOUS', 'NASAL', 'OPHTHALMIC', 'ORAL', 'OROPHARYNGEAL', 'RECTAL', 'SUBCUTANEOUS', 'SUBLINGUAL', 'SUBMUCOSAL', 'TOPICAL', 'TRANSDERMAL', 'VAGNAL')

statement = """SELECT COUNT(*) FROM {}.{}
    WHERE dosageformname IN {}
    AND routename IN {}
    AND proprietaryname <> nonproprietaryname
    LIMIT 100""".format(
    database_name, table_name, dosageformname, routename
)

print(statement)

df = pd.read_sql(statement, conn)
df.head(100)

SELECT COUNT(*) FROM gen_db.ndc_table
    WHERE dosageformname IN ('AEROSOL', 'CAPSULE', 'GRANULE', 'INJECTION', 'LIQUID', 'PATCH', 'PILL', 'POWDER', 'SALVE', 'SUPPOSITORY', 'SUSPENSION', 'SYRUP', 'TABLET')
    AND routename IN ('OTIC', 'CONJUNCTIVAL', 'CUTANEOUS', 'NASAL', 'OPHTHALMIC', 'ORAL', 'OROPHARYNGEAL', 'RECTAL', 'SUBCUTANEOUS', 'SUBLINGUAL', 'SUBMUCOSAL', 'TOPICAL', 'TRANSDERMAL', 'VAGNAL')
    AND proprietaryname <> nonproprietaryname
    LIMIT 100


Unnamed: 0,_col0
0,24426


In [51]:
#Connect, create and run a statement
conn = connect(region_name=gcgs_region, s3_staging_dir=s3_staging_dir)

database_name = "bene_db"
statement = "CREATE DATABASE IF NOT EXISTS {}".format(database_name)
print(statement)
pd.read_sql(statement, conn)

CREATE DATABASE IF NOT EXISTS bene_db


## Delete Old and Create New Table for 2008 Beneficiary File

In [52]:
##########################  DELETE OLD TABLE ######################################################

pd.read_sql("DROP TABLE IF EXISTS gen_db.bene_2008_table", conn)

statement = "SHOW TABLES in {}".format(database_name)

df_table = pd.read_sql(statement, conn)
df_table.head(5)

##########################  CREATE NEW TABLE ######################################################

database_name = "gen_db"
table_name = "bene_2008_table"
my_path = "s3://{}/bene-2008".format(gcgs_bucket)
# SQL statement to execute
statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
        DESYNPUF_ID string,
        BENE_BIRTH_DT string,
        BENE_DEATH_DT string,
        BENE_SEX_IDENT_CD integer,
        BENE_RACE_CD integer,
        BENE_ESRD_IND string,
        SP_STATE_CODE integer,
        BENE_COUNTY_CD integer,
        BENE_HI_CVRAGE_TOT_MONS integer,
        BENE_SMI_CVRAGE_TOT_MONS integer,
        BENE_HMO_CVRAGE_TOT_MONS integer,
        PLAN_CVRG_MOS_NUM integer,
        SP_ALZHDMTA integer,
        SP_CHF integer,
        SP_CHRNKIDN integer,
        SP_CNCR integer,
        SP_COPD integer,
        SP_DEPRESSN integer,
        SP_DIABETES integer,
        SP_ISCHMCHT integer,
        SP_OSTEOPRS integer,
        SP_RA_OA integer,
        SP_STRKETIA integer,
        MEDREIMB_IP float,
        BENRES_IP float,
        PPPYMT_IP float,
        MEDREIMB_OP float,
        BENRES_OP float,
        PPPYMT_OP float,
        MEDREIMB_CAR float,
        BENRES_CAR float,
        PPPYMT_CAR float

    
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\n' LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(
    database_name, table_name, my_path
)

print(statement)


CREATE EXTERNAL TABLE IF NOT EXISTS gen_db.bene_2008_table(
        DESYNPUF_ID string,
        BENE_BIRTH_DT string,
        BENE_DEATH_DT string,
        BENE_SEX_IDENT_CD integer,
        BENE_RACE_CD integer,
        BENE_ESRD_IND string,
        SP_STATE_CODE integer,
        BENE_COUNTY_CD integer,
        BENE_HI_CVRAGE_TOT_MONS integer,
        BENE_SMI_CVRAGE_TOT_MONS integer,
        BENE_HMO_CVRAGE_TOT_MONS integer,
        PLAN_CVRG_MOS_NUM integer,
        SP_ALZHDMTA integer,
        SP_CHF integer,
        SP_CHRNKIDN integer,
        SP_CNCR integer,
        SP_COPD integer,
        SP_DEPRESSN integer,
        SP_DIABETES integer,
        SP_ISCHMCHT integer,
        SP_OSTEOPRS integer,
        SP_RA_OA integer,
        SP_STRKETIA integer,
        MEDREIMB_IP float,
        BENRES_IP float,
        PPPYMT_IP float,
        MEDREIMB_OP float,
        BENRES_OP float,
        PPPYMT_OP float,
        MEDREIMB_CAR float,
        BENRES_CAR float,
        PPPYMT_CAR floa

In [53]:
##################################### Verify New Table in Athena ##########################
pd.read_sql(statement, conn)


##################################### Verify New Table in Athena ##########################

statement = "SHOW TABLES in {}".format(database_name)

df_show = pd.read_sql(statement, conn)
df_show.head(5)

Unnamed: 0,tab_name
0,bene_2008_table
1,drugevent_table
2,ndc_table


In [54]:
statement = """SELECT * FROM {}.{}
    LIMIT 100""".format(
    database_name, table_name
)

print(statement)
pd.read_sql(statement, conn)

SELECT * FROM gen_db.bene_2008_table
    LIMIT 100


Unnamed: 0,desynpuf_id,bene_birth_dt,bene_death_dt,bene_sex_ident_cd,bene_race_cd,bene_esrd_ind,sp_state_code,bene_county_cd,bene_hi_cvrage_tot_mons,bene_smi_cvrage_tot_mons,...,sp_strketia,medreimb_ip,benres_ip,pppymt_ip,medreimb_op,benres_op,pppymt_op,medreimb_car,benres_car,pppymt_car
0,00000B48BCF4AD29,19230901,,2,5,0,10,260,12,12,...,1,81000.0,3072.0,0.0,1520.0,80.0,0.0,6260.0,1520.0,0.0
1,0000525AB30E4DEF,19201001,,2,1,0,31,300,12,12,...,1,13260.0,2048.0,0.0,1760.0,670.0,0.0,3830.0,1010.0,50.0
2,00009C897C3D8372,19320101,,1,1,Y,7,70,12,12,...,2,37500.0,4096.0,0.0,100.0,160.0,0.0,1540.0,280.0,60.0
3,0001168CE43BE51B,19340901,,2,1,0,6,200,12,12,...,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0002E494BC87CE10,19140701,,1,2,0,5,200,2,2,...,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,002FD356DF03D45F,19430501,,2,1,0,26,380,12,12,...,2,0.0,0.0,0.0,240.0,130.0,0.0,640.0,180.0,100.0
96,0030143CD35CC0AF,19391201,,2,1,0,39,330,12,12,...,2,0.0,0.0,0.0,0.0,0.0,0.0,30.0,10.0,0.0
97,0031530C46348C97,19270901,,1,1,0,39,100,12,12,...,2,18020.0,2048.0,0.0,2350.0,590.0,0.0,300.0,80.0,0.0
98,00334D7C7D595A03,19330301,,2,1,0,23,690,12,12,...,2,0.0,0.0,0.0,0.0,0.0,0.0,470.0,120.0,0.0


## Delete Old and Create New Table for 2008 Beneficiary File

In [55]:
##########################  DELETE OLD TABLE ######################################################

pd.read_sql("DROP TABLE IF EXISTS gen_db.drugevent_table", conn)

statement = "SHOW TABLES in {}".format(database_name)

df_table = pd.read_sql(statement, conn)
df_table.head(5)

##########################  CREATE NEW TABLE ######################################################

database_name = "gen_db"
table_name = "drugevent_table"
my_path = "s3://{}/drugevent".format(gcgs_bucket)
# SQL statement to execute
statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
        DESYNPUF_ID string,
        PDE_ID string,
        SRVC_DT string,
        PROD_SRVC_ID string,
        QTY_DSPNSD_NUM integer,
        DAYS_SUPLY_NUM integer,
        PTNT_PAY_AMT float,
        TOT_RX_CST_AMT float

) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\n' LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(
    database_name, table_name, my_path
)

print(statement)

CREATE EXTERNAL TABLE IF NOT EXISTS gen_db.drugevent_table(
        DESYNPUF_ID string,
        PDE_ID string,
        SRVC_DT string,
        PROD_SRVC_ID string,
        QTY_DSPNSD_NUM integer,
        DAYS_SUPLY_NUM integer,
        PTNT_PAY_AMT float,
        TOT_RX_CST_AMT float

) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' LOCATION 's3://my-508-projects/drugevent'
TBLPROPERTIES ('skip.header.line.count'='1')


In [56]:
##################################### Verify New Table in Athena ##########################
pd.read_sql(statement, conn)


##################################### Verify New Table in Athena ##########################

statement = "SHOW TABLES in {}".format(database_name)

df_show = pd.read_sql(statement, conn)
df_show.head(5)

Unnamed: 0,tab_name
0,bene_2008_table
1,drugevent_table
2,ndc_table


In [57]:
statement = """SELECT * FROM {}.{}
    LIMIT 100""".format(
    database_name, table_name
)

print(statement)
pd.read_sql(statement, conn)

SELECT * FROM gen_db.drugevent_table
    LIMIT 100


Unnamed: 0,desynpuf_id,pde_id,srvc_dt,prod_srvc_id,qty_dspnsd_num,days_suply_num,ptnt_pay_amt,tot_rx_cst_amt
0,E1B6DB6E11A95247,83144465732434,20090114,62584009090,30,30,30.0,10.0
1,E1B6DB6E11A95247,83184462013196,20090116,36987245101,30,30,10.0,40.0
2,E1B6DB6E11A95247,83644462130597,20090129,54868482100,60,30,0.0,30.0
3,E1B6DB6E11A95247,83704464969308,20090130,51079078201,90,90,40.0,370.0
4,E1B6DB6E11A95247,83054466821122,20090203,26053016801,90,30,0.0,80.0
...,...,...,...,...,...,...,...,...
95,E1B9814FC70BA84F,83824463850158,20090417,63304042601,60,90,0.0,30.0
96,E1B9814FC70BA84F,83614464545584,20090419,51129315601,90,30,0.0,0.0
97,E1B9814FC70BA84F,83494467026945,20090430,00078047261,30,30,0.0,30.0
98,E1B9814FC70BA84F,83104465296184,20090523,36987344501,30,30,10.0,40.0


In [58]:
statement = """SELECT desynpuf_id, SUBSTRING(prod_srvc_id,1,9) AS NDC54 FROM {}.{}
    LIMIT 100""".format(
    database_name, table_name
)

print(statement)
pd.read_sql(statement, conn)


SELECT desynpuf_id, SUBSTRING(prod_srvc_id,1,9) AS NDC54 FROM gen_db.drugevent_table
    LIMIT 100


Unnamed: 0,desynpuf_id,NDC54
0,3DC7386EFAA125FC,580160683
1,3DC7386EFAA125FC,613920027
2,3DC7386EFAA125FC,001431171
3,3DC7386EFAA125FC,582631000
4,3DC7386EFAA125FC,008396618
...,...,...
95,3DCB1706864F3647,590120411
96,3DCB1706864F3647,551549650
97,3DCB1706864F3647,005015441
98,3DCB1706864F3647,609711176


In [87]:
statement = """
SELECT desynpuf_id, PRODUCTID, PRODUCTNDC, PRODUCTNDC2
FROM
    (
    SELECT desynpuf_id, SUBSTRING(prod_srvc_id,1,9) AS PRODUCTNDC2
    FROM gen_db.drugevent_table
    ) AS MCSubtable
JOIN gen_db.ndc_table
ON MCSubtable.PRODUCTNDC2 = gen_db.ndc_table.PRODUCTNDC

    """.format(
    database_name, table_name
)

print(statement)
pd.read_sql(statement, conn)


SELECT desynpuf_id, PRODUCTID, PRODUCTNDC, PRODUCTNDC2
FROM
    (
    SELECT desynpuf_id, SUBSTRING(prod_srvc_id,1,9) AS PRODUCTNDC2
    FROM gen_db.drugevent_table
    ) AS MCSubtable
JOIN gen_db.ndc_table
ON MCSubtable.PRODUCTNDC2 = gen_db.ndc_table.PRODUCTNDC

    


Unnamed: 0,desynpuf_id,PRODUCTID,PRODUCTNDC,PRODUCTNDC2
0,3EFB0F572165CF8C,60505-0141_cac2f51b-4cce-7646-8f00-0b6b03e2f3b2,605050141,605050141
1,3EFB44397DFA1F57,60760-377_b5805a6a-1580-1995-e053-2995a90a33c1,607600377,607600377
2,3EFB44397DFA1F57,68723-142_4b795804-92d5-4cd0-9747-8b510e3be5ac,687230142,687230142
3,3EFB44397DFA1F57,55289-039_d91ba9b1-07e8-32e9-e053-2a95a90a9bd1,552890039,552890039
4,3EFC1E19FDDA0150,0395-4202_b6c8bb19-c78f-4286-e053-2995a90afb17,003954202,003954202
...,...,...,...,...
470517,8F5A56BEC09F8751,62486-130_cd8ddff9-f8da-86d6-e053-2a95a90aa64c,624860130,624860130
470518,8F5A56BEC09F8751,10135-541_bb9fb25d-a126-99d5-e053-2a95a90a122b,101350541,101350541
470519,8F5A56BEC09F8751,68180-468_c604f04e-6251-4b55-9cf3-6b534605137b,681800468,681800468
470520,8F5A56BEC09F8751,63629-1742_449a89a6-c3f9-4da7-92ed-6ffa52583614,636291742,636291742


# Shutting Down Kernel To Release Resources