## Imports

In [92]:
import requests
import pandas as pd
import numpy as np
import pyodbc 
import import_ipynb
import functions as f

## Extract

In [93]:
# create DataFrame

df_cases = pd.read_csv('./Raw/WHO-COVID19-CASES.csv')

## Transform

In [94]:
# clean column names

df_cases.columns = f.fixColumnNames(df_cases)

In [95]:
# change data types

f.changeDataType(df_cases, 'DateReported', 'datetime64')

In [96]:
# create DataFrame for the 'countries' dataset. It will be used to bring country iso code to the main DataFrame

df_countries = pd.read_csv('./Processed/COUNTRIES_DETAILED.csv')

In [97]:
# left join df_countries to df_cases to bring the 3-letter country code

df_merged = df_cases.merge(df_countries[['IsoCode', 'Alpha2']], left_on='CountryCode', right_on='Alpha2', how='left').drop(columns=['Alpha2', 'CountryCode'])

In [98]:
# replace 0 with NaN in numeric columns

df_merged[f.colTypesList(df_merged, 'int64')] = df_merged[f.colTypesList(df_merged, 'int64')].replace(0, np.nan)

AttributeError: module 'functions' has no attribute 'colTypesList'

In [99]:
# drop rows with missing 'IsoCode'

df_merged = df_merged.loc[df_merged['IsoCode'].isna() == False]

In [100]:
# reorder columns

df_merged = f.reindexCols(df_merged, [0,1,7,2,3,4,5,6])

In [101]:
# create 'Key' column by concatenating 'CountryCode' and 'DateReported'

df_merged['Key'] = df_merged['IsoCode'] + df_merged['DateReported'].astype(str).str.replace('-','')

In [102]:
# set 'Key' as an index and ensure that there are no duplicates

df_merged.set_index('Key', inplace=True, verify_integrity=True)

## Load

In [107]:
# save as parquet to a local folder

df_merged.to_parquet('./Processed/WHO-COVID19-CASES.parquet')

# upload parquet file to Azure Blob Storage container

f.uploadAzureBlob('./Processed/WHO-COVID19-CASES.parquet', 'WHO-COVID19-CASES.parquet')

AttributeError: module 'functions' has no attribute 'uploadAzureBlob'