In [1]:
%%capture
# Install the Kaggle CLI into the kernel's environment; the %%capture cell
# magic above silences pip's output for this cell.
# NOTE(review): the package version is not pinned — consider pinning for reproducibility.
%pip install kaggle

In [2]:
from typing import Iterable, Callable, Dict, Tuple
import pandas as pd
import numpy as np
from numpy.typing import ArrayLike
import data_utils as du
import os

In [3]:
REGION = os.getenv('us-central1')
shell_output = ! gcloud config list --format 'value(core.project)' 2>/dev/null
PROJECT_ID = shell_output[0]

STORAGE_BUCKET = 'neo4j-insurance-claim-tutorial-data'
os.environ["GCLOUD_PROJECT"] = PROJECT_ID

In [4]:
# Download the healthcare provider fraud dataset archive from Kaggle into the
# local data/ directory (-p data).
# NOTE(review): assumes the kaggle CLI is installed under ~/.local/bin and that
# Kaggle API credentials are already configured — confirm.
!~/.local/bin/kaggle datasets download -d rohitrox/healthcare-provider-fraud-detection-analysis -p data

healthcare-provider-fraud-detection-analysis.zip: Skipping, found more recently modified local copy (use --force to force download)


In [5]:
# Extract every zip archive found in data/; unzip's -n flag never overwrites
# files that already exist, so re-running the cell is harmless.
!cd data && unzip -n '*.zip'

Archive:  healthcare-provider-fraud-detection-analysis.zip


In [6]:
# Build the provider table with the project's data_utils helper.
provider_df = du.make_provider_df()

# Persist it next to the notebook as provider.csv, leaving out the index column.
provider_df.to_csv(path_or_buf='provider.csv', index=False)

# Bare trailing expression: rendered as the cell's output.
provider_df

Unnamed: 0,provider,potentialFraud,potentialFraudInd
0,PRV51001,No,0
1,PRV51003,Yes,1
2,PRV51004,No,0
3,PRV51005,Yes,1
4,PRV51007,No,0
...,...,...,...
5405,PRV57759,No,0
5406,PRV57760,No,0
5407,PRV57761,No,0
5408,PRV57762,No,0


In [7]:
from google.cloud import storage

# Client plus a handle for the bucket named by STORAGE_BUCKET.
client = storage.Client()
bucket = client.bucket(STORAGE_BUCKET)

# Create the bucket in REGION only when it does not exist yet.
bucket_missing = not bucket.exists()
if bucket_missing:
    bucket.create(location=REGION)

In [8]:
# Upload the local provider.csv to the bucket under the same object name.
blob = bucket.blob(blob_name='provider.csv')
blob.upload_from_filename(filename='provider.csv')

In [9]:
# Build the claim table with the project's data_utils helper.
claim_df = du.make_claim_df()

# Persist it next to the notebook as claim.csv, leaving out the index column.
claim_df.to_csv(path_or_buf='claim.csv', index=False)

# Bare trailing expression: rendered as the cell's output.
claim_df

Unnamed: 0,beneID,claimID,claimStartDt,claimEndDt,provider,inscClaimAmtReimbursed,attendingPhysician,operatingPhysician,otherPhysician,admissionDt,...,chronicCondrheumatoidarthritis,chronicCondstroke,iPAnnualReimbursementAmt,iPAnnualDeductibleAmt,oPAnnualReimbursementAmt,oPAnnualDeductibleAmt,dobYear,isDeceased,maxDate,approxAge
0,BENE11001,CLM46614,2009-04-12,2009-04-18,PRV55912,26000,PHY390922,,,2009-04-12,...,1,1,36000,3204,60,70,1943,0,2009-12-01,67.0
1,BENE11001,CLM66048,2009-08-31,2009-09-02,PRV55907,5000,PHY318495,PHY318495,,2009-08-31,...,1,1,36000,3204,60,70,1943,0,2009-12-01,67.0
2,BENE11001,CLM68358,2009-09-17,2009-09-20,PRV56046,5000,PHY372395,,PHY324689,2009-09-17,...,1,1,36000,3204,60,70,1943,0,2009-12-01,67.0
3,BENE11011,CLM38412,2009-02-14,2009-02-22,PRV52405,5000,PHY369659,PHY392961,PHY349768,2009-02-14,...,1,1,5000,1068,250,320,1914,0,2009-12-01,95.8
4,BENE11011,CLM144521,2009-01-18,2009-01-18,PRV52314,50,PHY379398,,,NaT,...,1,1,5000,1068,250,320,1914,0,2009-12-01,95.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
558206,BENE159198,CLM510792,2009-08-06,2009-08-06,PRV53699,800,PHY364188,PHY364188,PHY385752,NaT,...,1,0,0,0,5470,1870,1952,0,2009-12-01,57.7
558207,BENE159198,CLM551294,2009-08-29,2009-08-29,PRV53702,400,PHY423019,PHY332284,,NaT,...,1,0,0,0,5470,1870,1952,0,2009-12-01,57.7
558208,BENE159198,CLM596444,2009-09-24,2009-09-24,PRV53676,60,PHY361063,,,NaT,...,1,0,0,0,5470,1870,1952,0,2009-12-01,57.7
558209,BENE159198,CLM636992,2009-10-18,2009-10-18,PRV53689,70,PHY403198,,PHY419379,NaT,...,1,0,0,0,5470,1870,1952,0,2009-12-01,57.7


In [10]:
# Upload the local claim.csv to the bucket under the same object name.
blob = bucket.blob(blob_name='claim.csv')
blob.upload_from_filename(filename='claim.csv')