In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, to_date, col, trim, ltrim, rtrim, when, regexp_replace, monotonically_increasing_id

import os
import configparser
import pandas as pd
import psycopg2
import time

from create_tables import create_tables, drop_tables

In [2]:
# Set Postgres parameters

db_properties={}
config = configparser.ConfigParser()
config.read("parameters.cfg")
db_prop = config['postgresql_jdbc']
db_url = db_prop['url']
db_user = db_prop['username']
db_password = db_prop['password']
db_properties['username']=db_prop['username']
db_properties['password']=db_prop['password']
db_properties['url']=db_prop['url']

In [3]:
#def create_spark_session():

"""
Description: This function  creates or get (if already exists) a Spark session 

Arguments:
    None

Returns:
    spark: Spark session
"""

spark = SparkSession \
    .builder \
    .getOrCreate()
#return spark

In [4]:
conn = psycopg2.connect("host={} dbname={} user={} password={} port={}".format(*config['postgresql_sql'].values()))
cur = conn.cursor()

print("1. Dropping tables...")
start_time = time.time()
drop_tables(cur, conn)
print("--- It took %s seconds ---" % (time.time() - start_time))

print("2. Creating tables...")
start_time = time.time()
create_tables(cur, conn)
print("--- It took %s seconds ---" % (time.time() - start_time))

1. Dropping tables...
--- It took 0.05234885215759277 seconds ---
2. Creating tables...
--- It took 0.07380223274230957 seconds ---


In [5]:
# get filepath to covid data file
input_data = "data/csse_covid_19_data/csse_covid_19_daily_reports/"
covid_data = os.path.join(input_data, "*.csv")

# read covid data and write it in postgres
print ("Reading covid data file") 
df = spark.read.csv(covid_data, header=True, inferSchema=True)

print("Writing covid data to Postgres")

df.select("*") \
   .withColumn("fact_covid_id", monotonically_increasing_id()) \
   .withColumn("country", col("country_region")) \
   .withColumn("Longtitude", col("Long_").cast("float")) \
   .withColumn("Latitude", col("Lat").cast("float")) \
   .withColumn("Incident_Rate", col("Incident_Rate").cast("float")) \
   .withColumn("Case_Fatality_Ratio", col("Case_Fatality_Ratio").cast("float")) \
   .withColumn("Confirmed", col("Confirmed").cast("integer")) \
   .withColumn("Deaths", col("Deaths").cast("integer")) \
   .withColumn("Recovered", col("Recovered").cast("integer")) \
   .withColumn("Active", col("Active").cast("integer")) \
   .withColumn("Last_Update", col("Last_Update").cast("date")) \
   .drop("FIPS", "Admin2", "Long_", "Lat", "country_region") \
   .write \
   .mode("append") \
   .format("jdbc") \
   .option("url", db_url) \
   .option("dbtable", "fact_covid") \
   .option("user", db_user) \
   .option("password", db_password) \
   .save()

Reading covid data file
Writing covid data to Postgres


In [6]:
# Import countries data

print("Creating countries table")

df_countries = spark \
    .read \
    .format("csv", ) \
    .option("inferSchema", "true") \
    .option("header", "true") \
    .option("delimiter", ",") \
    .load("data/countries.csv")

df_countries.select("*") \
    .withColumn("Population", col("Population").cast("integer")) \
    .withColumn("Area", col("Area").cast("integer")) \
    .withColumn("Country", rtrim(col("Country"))) \
    .withColumn("Region", rtrim(col("Region"))) \
    .withColumn("InfantMortality", col("Infant mortality").cast("float")) \
    .withColumn("NetMigration", col("Net migration")) \
    .withColumn("Coastline", col("Coastline").cast("float")) \
    .withColumn("NetMigration", col("Net migration").cast("float")) \
    .withColumn("GDP", col("GDP").cast("float")) \
    .withColumn("Literacy", col("Literacy").cast("long")) \
    .withColumn("Phones", col("Phones").cast("float")) \
    .withColumn("Arable", col("Arable").cast("float")) \
    .withColumn("Crops", col("Crops").cast("float")) \
    .withColumn("Other", col("Other").cast("float")) \
    .drop("Net migration", "Infant mortality") \
    .write \
    .mode("append") \
    .format("jdbc") \
    .option("url", db_url) \
    .option("dbtable", "dim_countries") \
    .option("user", db_user) \
    .option("password", db_password) \
    .save()

Creating countries table


In [7]:
# Import Economic exposure data

print("Creating exposure table")

pd_df_exposure = pd.read_csv("data/exposure.csv", sep=";", encoding="ISO-8859-1", dtype=str)
pd_df_exposure = pd_df_exposure.replace({'x': None, 'No data': None, '0': None})
pd_df_exposure = pd_df_exposure.apply(lambda x: x.str.replace(',','.'))
pd_df_exposure.columns = pd_df_exposure.columns.str.strip()
pd_df_exposure.columns = pd_df_exposure.columns.str.replace("[ ]", "_")
pd_df_exposure.columns = pd_df_exposure.columns.str.replace("[,]", "")

df_exposure = spark.createDataFrame(pd_df_exposure.astype(str))

df_exposure.select("*") \
    .withColumn("Net_ODA_received_perc_of_GNI", col("Net_ODA_received_perc_of_GNI").cast("float")) \
    .withColumn("Aid_dependence", col("Aid_dependence").cast("float")) \
    .withColumn("Volume_of_remittances", col("Volume_of_remittances_in_USD_as_a_proportion_of_total_GDP_percent_2014-18").cast("float")) \
    .withColumn("Remittances", col("Remittances").cast("float")) \
    .withColumn("Food_imports_percent_of_total_merchandise_exports", col("Food_imports_percent_of_total_merchandise_exports").cast("float")) \
    .withColumn("food_import_dependence", col("food_import_dependence").cast("float")) \
    .withColumn("Fuels_ores_and_metals_exports", col("Fuels_ores_and_metals_exports").cast("float")) \
    .withColumn("primary_commodity_export_dependence", col("primary_commodity_export_dependence").cast("float")) \
    .withColumn("tourism_as_percentage_of_GDP", col("tourism_as_percentage_of_GDP").cast("float")) \
    .withColumn("tourism_dependence", col("tourism_dependence").cast("float")) \
    .withColumn("General_government_gross_debt_Percent_of_GDP_2019", col("General_government_gross_debt_Percent_of_GDP_2019").cast("float")) \
    .withColumn("Government_indeptedness", col("Government_indeptedness").cast("float")) \
    .withColumn("Total_reserves_in_months_of_imports_2018", col("Total_reserves_in_months_of_imports_2018").cast("float")) \
    .withColumn("Foreign_currency_reserves", col("Foreign_currency_reserves").cast("float")) \
    .withColumn("Foreign_direct_investment_net_inflows_percent_of_GDP", col("Foreign_direct_investment_net_inflows_percent_of_GDP").cast("float")) \
    .withColumn("Foreign_direct_investment", col("Foreign_direct_investment").cast("float")) \
    .withColumn("Covid_19_Economic_exposure_index", col("Covid_19_Economic_exposure_index").cast("float")) \
    .withColumn("Covid_19_Economic_exposure_index", col("Covid_19_Economic_exposure_index").cast("float")) \
    .withColumn("Covid_19_Economic_exposure_index_Ex_aid_and_FDI", col("Covid_19_Economic_exposure_index_Ex_aid_and_FDI").cast("float")) \
    .withColumn("Covid_19_Economic_exposure_index_Ex_aid_and_FDI_and_food_import", col("Covid_19_Economic_exposure_index_Ex_aid_and_FDI_and_food_import").cast("float")) \
    .drop("Volume_of_remittances_in_USD_as_a_proportion_of_total_GDP_percent_2014-18") \
    .write \
    .mode("append") \
    .format("jdbc") \
    .option("url", db_url) \
    .option("dbtable", "dim_exposure") \
    .option("user", db_user) \
    .option("password", db_password) \
    .save()

Creating exposure table


  pd_df_exposure.columns = pd_df_exposure.columns.str.replace("[ ]", "_")
  pd_df_exposure.columns = pd_df_exposure.columns.str.replace("[,]", "")


In [8]:
# Import Vaccination data

print("Creating vaccination table")

df_vaccination = spark \
    .read \
    .format("csv") \
    .option("inferSchema", "true") \
    .option("header", "true") \
    .option("delimiter", ",") \
    .load("data/vaccination.csv")

df_vaccination.select("*") \
    .withColumn("country", col("location")) \
    .withColumn("date", regexp_replace("date", "/", "").cast("date")) \
    .withColumn("total_vaccinations", col("total_vaccinations").cast("integer")) \
    .withColumn("people_vaccinated", col("people_vaccinated").cast("integer")) \
    .withColumn("people_fully_vaccinated", col("people_fully_vaccinated").cast("integer")) \
    .withColumn("total_boosters", col("total_boosters").cast("integer")) \
    .withColumn("daily_vaccinations_raw", col("daily_vaccinations_raw").cast("integer")) \
    .withColumn("daily_vaccinations", col("daily_vaccinations").cast("integer")) \
    .withColumn("total_vaccinations_per_hundred", col("total_vaccinations_per_hundred").cast("float")) \
    .withColumn("people_vaccinated_per_hundred", col("people_vaccinated_per_hundred").cast("float")) \
    .withColumn("people_fully_vaccinated_per_hundred", col("people_fully_vaccinated_per_hundred").cast("float")) \
    .withColumn("total_boosters_per_hundred", col("total_boosters_per_hundred").cast("float")) \
    .withColumn("daily_vaccinations_per_million", col("daily_vaccinations_per_million").cast("integer")) \
    .drop("location") \
    .write \
    .mode("append") \
    .format("jdbc") \
    .option("url", db_url) \
    .option("dbtable", "dim_vaccination") \
    .option("user", db_user) \
    .option("password", db_password) \
    .save()

Creating vaccination table
