### PART 1 - Data pipeline on a Spark cluster with only one master and one worker node. Run main() to start the ETL.

In [2]:
from pyspark.sql import DataFrame
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType
from datetime import datetime, timedelta
import numpy as np
import pandas as pd


In [3]:
## ETL - work out the date range for which Covid data is fetched
def end_date():
    """Return the day before Spark's current date as an ``MM-DD-YYYY`` string.

    The current date is read from Spark SQL (``current_date()``) and one day
    is subtracted from it before formatting.
    """
    today = spark.sql(""" select current_date() as cdate """).collect()[0]['cdate']
    yesterday = today - timedelta(days=1)
    return yesterday.strftime('%m-%d-%Y')

def start_date():
  """Return the first date (``MM-DD-YYYY``) that still has to be downloaded.

  The log at ``dbfs:/Covid_datasets/log`` holds one ``MM-DD-YYYY`` string per
  date that was already processed. The result is the day after the latest
  logged date, or ``"01-22-2020"`` when the log is missing, empty or
  unreadable.

  The logged values are parsed before taking the maximum: comparing the raw
  ``MM-DD-YYYY`` strings orders them by month first, so ``12-31-2020`` would
  wrongly beat ``01-02-2021``.
  """
  default_start = "01-22-2020"
  try:
      # dbutils raises when the log directory does not exist yet (first run).
      dbutils.fs.ls("dbfs:/Covid_datasets/log")
      rows = spark.sql("SELECT * FROM csv.`dbfs:/Covid_datasets/log`").collect()
      logged = [datetime.strptime(row[0], '%m-%d-%Y') for row in rows if row[0]]
      latest = max(logged)  # ValueError on an empty log -> fall back below
  except Exception as e:
      # Keep the original best-effort fallback, but say why it was taken.
      print("No usable download log (" + str(e) + "); starting from " + default_start)
      return default_start
  return (latest + timedelta(days=1)).strftime('%m-%d-%Y')

In [4]:
# Date range for which the data will be pulled from the GitHub repository
def dateRange():
    """Build the pandas date range running from start_date() to end_date()."""
    first_day = start_date()
    last_day = end_date()
    return pd.date_range(start=first_day, end=last_day)

In [5]:
# Transform the data: create an aggregated sum of the cases, grouped by country
def _daily_metric(df_table, metric, date):
    """Select one summed metric per country and name its column after `date`."""
    summed = 'sum(' + metric + ')'
    return df_table['Country_Region', summed].withColumnRenamed(summed, date)


def transform():
    """Aggregate each staged daily CSV by country and widen the result frames.

    Reads the global `date_range` and, for every date in it, loads
    ``file:/tmp/Covid_datasets/<MM-DD-YYYY>.csv``, sums Confirmed, Deaths and
    Recovered per ``Country_Region`` and full-joins each metric as a new
    column named after the date.

    Results are published through the globals `df_conf`, `df_death` and
    `df_recover`, which are re-created on every call.

    A file that lacks one of the expected columns is reported and skipped, so
    no column for that date is added.
    """
    global df_conf
    global df_death
    global df_recover

    # Build the key-only schema here instead of relying on a module-level
    # `schema` name, which is not guaranteed to exist on a fresh kernel.
    key_schema = StructType([StructField("Country_Region", StringType(), True)])

    ## Start from empty frames that contain only the join key
    df_conf = spark.createDataFrame([], key_schema)
    df_death = spark.createDataFrame([], key_schema)
    df_recover = spark.createDataFrame([], key_schema)

    print("IN transform")

    for d in date_range:
        date = d.strftime("%m-%d-%Y")
        file = "file:/tmp/Covid_datasets/"+str(date)+".csv"
        print('Transform file: ',file)
        df_data = spark.read.format('org.apache.spark.sql.execution.datasources.csv.CSVFileFormat') \
                 .option('header','true')\
                 .option('inferSchema','true')\
                 .load(file)

        # Files dated older than 03-23 use a different column name.
        if 'Country/Region' in df_data.columns:
            df_data = df_data.withColumnRenamed('Country/Region','Country_Region')

        try:
            df_clean = df_data['Country_Region','Confirmed','Deaths','Recovered']
        except Exception as e:
            print(e)
            # Skip this file: carrying on would either fail on an undefined
            # df_clean or silently reuse the previous day's data for this date.
            continue

        df_table = df_clean.groupBy('Country_Region').sum()

        df_conf = df_conf.join(_daily_metric(df_table, 'Confirmed', date), on="Country_Region", how='full')
        df_death = df_death.join(_daily_metric(df_table, 'Deaths', date), on="Country_Region", how='full')
        df_recover = df_recover.join(_daily_metric(df_table, 'Recovered', date), on="Country_Region", how='full')

In [6]:
def _unwrap_sum(name):
    """Return `name` without an enclosing ``sum(...)`` wrapper, if it has one."""
    if name.startswith('sum(') and name.endswith(')'):
        return name[len('sum('):-1]
    return name


def rename_columns(df):
    """Rename every ``sum(<col>)`` column of `df` back to ``<col>``.

    Only an exact ``sum(`` prefix together with a closing ``)`` is removed.
    ``str.strip('sum(')`` must not be used for this: it strips any of the
    characters ``s``, ``u``, ``m`` and ``(`` from both ends, which mangles
    names such as ``sum(mass)`` or ``summary``.

    Parameters
    ----------
    df : DataFrame-like
        Object exposing ``columns`` and ``withColumnRenamed(old, new)``.

    Returns
    -------
    The frame with the renamed columns; other columns are left untouched.
    """
    for old in df.columns:
        new = _unwrap_sum(old)
        if new != old:
            df = df.withColumnRenamed(old, new)
    return df

In [7]:
# Store the data in DBFS and add the daily updates to it
def load():
    """Append the transformed frames to their Delta tables, then compact them.

    The global frames `df_conf`, `df_death` and `df_recover` are appended
    (with schema merging) to the Covid_Confirmed, Covid_Deaths and
    Covid_Recovered tables under ``dbfs:/Covid_datasets``. Each table is a
    time series: one row per country, one column per date.
    """
    print("IN LOAD")

    def _append(frame, path):
        # mergeSchema lets the new date columns be added to the existing table.
        writer = frame.write.option("mergeSchema","true").format("delta").mode("append")
        writer.save(path)

    confirmed_path = "dbfs:/Covid_datasets/Covid_Confirmed"
    deaths_path = "dbfs:/Covid_datasets/Covid_Deaths"
    recovered_path = "dbfs:/Covid_datasets/Covid_Recovered"

    _append(df_conf, confirmed_path)
    _append(df_death, deaths_path)
    _append(df_recover, recovered_path)

    # Appending creates a second row for a country that already exists, so
    # collapse the rows per country by summing and write the table back.
    for path in (confirmed_path, deaths_path, recovered_path):
        collapsed = spark.read.format("delta").load(path).groupby('Country_Region').sum()
        # Summing renames every column to sum(<col>); restore the plain names.
        collapsed = rename_columns(collapsed)
        collapsed.write.format("delta").option("overwriteSchema","true").mode("overwrite").save(path)

    # TODO: the append-then-collapse step is a temporary solution; find a
    # better way to update the streaming data with Delta Lake (batch
    # streaming in delta format that adds to the schema).

    print('Covid_Confirmed, Covid_Deaths and Covid_Recovered tables created')


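**Note:** We are extracting data from the daily reports at
https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_daily_reports
instead of the time series at https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series,
because the latter link has cumulative numbers rather than just day-by-day counts.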

In [9]:
def extract():
    """Download the daily-report CSV for every date in the global `date_range`.

    Each file is fetched from the CSSEGISandData/COVID-19 daily reports folder
    on GitHub and stored as ``/tmp/Covid_datasets/<MM-DD-YYYY>.csv``.

    The download is done in Python rather than through a shell command so that
    a failure (for example a report that is not published yet) raises an
    exception. The failure is printed and the loop moves on to the next date;
    the success message is printed only for files that were really written.
    """
    # Local imports keep this cell self-contained.
    import os
    import shutil
    import urllib.request

    base_url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/"
    staging_dir = "/tmp/Covid_datasets"

    print('Fetching data to local cluster........')
    os.makedirs(staging_dir, exist_ok=True)

    for d in date_range:
        file_dated = d.strftime("%m-%d-%Y") + '.csv'
        url = base_url + file_dated
        target = os.path.join(staging_dir, file_dated)
        try:
            # urlopen is entered first, so an HTTP error leaves no empty file behind.
            with urllib.request.urlopen(url) as response, open(target, 'wb') as out:
                shutil.copyfileobj(response, out)
            print('Written file to databricks local filesystem: ', file_dated)
        except Exception as e:
            print('Failed to download', url, ':', e)

In [10]:
## Initial ETL pipeline 
def ETL_Pipeline():
    """Run extract, transform and load for the pending dates, then log them.

    Sets the global `date_range` from dateRange(), runs the three stages in
    order, appends the processed dates (``MM-DD-YYYY``) to the CSV log at
    ``dbfs:/Covid_datasets/log`` and finally deletes the local staging folder.
    """
    global date_range
    date_range = dateRange()

    # Stage 1: fetch every .csv in the date range from the GitHub daily reports.
    extract()
    # Stage 2: process the data of each downloaded file.
    transform()
    # Stage 3: store the resulting frames.
    load()

    # Record every date whose data has been handled in the log.
    processed_dates = [day.strftime("%m-%d-%Y") for day in date_range]
    log_frame = spark.createDataFrame(processed_dates, StringType())
    log_frame.write.format("csv").mode("append").save("dbfs:/Covid_datasets/log")

    # Recursively delete the staging files now that the data is in DBFS.
    print("Finally, freeing the space by deleting all the files download from git")
    dbutils.fs.rm("file:/tmp/Covid_datasets",True)

In [11]:
def main():
    """Prepare the storage locations and run the ETL pipeline.

    Steps:
      1. Publish the key-only `schema` (a single ``Country_Region`` string
         column) as a global, so stage functions that look the name up at
         module level can find it on a fresh kernel.
      2. Make sure ``dbfs:/Covid_datasets`` exists.
      3. Re-create the local staging folder ``file:/tmp/Covid_datasets``.
      4. Call ETL_Pipeline().
    """
    # Must be global: a local variable here is invisible to the other functions.
    global schema
    schema = StructType([StructField("Country_Region", StringType(), True)])

    ## Create a separate folder in the DBFS filesystem for the Covid19 dataset
    try:
      dbutils.fs.ls("dbfs:/Covid_datasets")
    except Exception:
      # ls raises when the folder is missing; create the folder that was checked.
      dbutils.fs.mkdirs("dbfs:/Covid_datasets")

    # Start from an empty staging area in the local filesystem.
    dbutils.fs.rm("file:/tmp/Covid_datasets",True)
    dbutils.fs.mkdirs("file:/tmp/Covid_datasets/")

    # Extract, transform and load the data from the GitHub master branch into DBFS.
    ETL_Pipeline()

In [12]:
# Start the ETL by calling main() when this code runs as the top-level module.
if __name__ == "__main__":
    main()

In [13]:
%sql
-- Spot check: show the US row of the Covid_Confirmed Delta table.
SELECT * FROM delta.`dbfs:/Covid_datasets/Covid_Confirmed` where Country_Region = 'US'

Country_Region,01-22-2020,01-23-2020,01-24-2020,01-25-2020,01-26-2020,01-27-2020,01-28-2020,01-29-2020,01-30-2020,01-31-2020,02-01-2020,02-02-2020,02-03-2020,02-04-2020,02-05-2020,02-06-2020,02-07-2020,02-08-2020,02-09-2020,02-10-2020,02-11-2020,02-12-2020,02-13-2020,02-14-2020,02-15-2020,02-16-2020,02-17-2020,02-18-2020,02-19-2020,02-20-2020,02-21-2020,02-22-2020,02-23-2020,02-24-2020,02-25-2020,02-26-2020,02-27-2020,02-28-2020,02-29-2020,03-01-2020,03-02-2020,03-03-2020,03-04-2020,03-05-2020,03-06-2020,03-07-2020,03-08-2020,03-09-2020,03-10-2020,03-11-2020,03-12-2020,03-13-2020,03-14-2020,03-15-2020,03-16-2020,03-17-2020,03-18-2020,03-19-2020,03-20-2020,03-21-2020,03-22-2020,03-23-2020,03-24-2020,03-25-2020,03-26-2020,03-27-2020,03-28-2020,03-29-2020,03-30-2020,03-31-2020,04-01-2020,04-02-2020,04-03-2020,04-04-2020
US,1,1,2,2,5,5,5,5,5,6,8,8,11,11,12,12,12,12,12,12,13,13,15,15,15,15,15,15,15,15,35,35,35,53,53,59,60,62,70,76,101,122,153,221,278,417,537,605,959,1281,1663,2179,2726,3499,4632,6421,7786,13680,19101,25493,33746,43667,53740,65778,83836,101657,121478,140886,161807,188172,213372,243453,275586,308850
