In [142]:
import os
import re
import pyspark
# import matplotlib.pyplot as plt
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType, FloatType, DateType, DoubleType
from pyspark.sql.functions import isnan, isnull, when, count, col, to_date, udf
from pyspark.sql import functions as F 
from pyspark.sql.window import Window

from airflow import DAG
from airflow.operators.python import PythonOperator
# from airflow.hooks.postgres_hook import PostgresHook
# from airflow.hooks.S3_hook import S3Hook
from io import StringIO
from datetime import datetime
from constants import COVID_EPIDEMIOLOGY_FILE, COVID_HOSPITALIZATIONS_FILE, SP500_FILE, NASDAQ_FILE, COVID_ECONOMY_FILE, COVID_WEATHER_FILE, \
    RDS_ENDPOINT, RDS_PORT, RDS_USERNAME, RDS_PASSWORD, RDS_DB_NAME, JDBC_DRIVER
from aws_helpers import s3_read_to_spark, spark_write_to_rds, spark_read_from_rds
from spark_helpers import get_spark_session_and_context

# import psycopg2

from data_warehouse_helpers import create_date_dim_df, create_company_dim_df

# airflow standalone
# give ec2 access to s3 using iam roles

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [35]:
spark, sc = get_spark_session_and_context()

In [None]:
# spark.conf.set("spark.executor.memory", "2g")
# print(sc._conf.get('spark.executor.memory'))
# spark.conf.get("spark.executor.memory")

In [9]:
conn = psycopg2.connect(  # engine
    database=RDS_DB_NAME,
    user=RDS_USERNAME,
    password=RDS_PASSWORD,
    host=RDS_ENDPOINT,
    port=RDS_PORT
)

curr = conn.cursor()
curr.execute("SELECT date FROM sp_five_hundred LIMIT(2);")

rows = curr.fetchall()
for row in rows:
    print(row[0])

2021-03-22
2021-03-19


### Stock Data

In [21]:
sp500_df = s3_read_to_spark(SP500_FILE, spark)  # seemed simpler to read s3 into pandas then spark
nasdaq_df = s3_read_to_spark(NASDAQ_FILE, spark)

sp500_df.printSchema()
sp500_df.show(5)

root
 |-- Date: string (nullable = true)
 |-- Close/Last: double (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)

+----------+----------+-------+-------+-------+
|      Date|Close/Last|   Open|   High|    Low|
+----------+----------+-------+-------+-------+
|07/12/2024|   5615.35|5590.76|5655.56|5590.44|
|07/11/2024|   5584.54|5635.21|5642.45|5576.53|
|07/10/2024|   5633.91|5591.26|5635.39|5586.44|
|07/09/2024|   5576.98|5584.24|5590.75|5574.57|
|07/08/2024|   5572.85|5572.75|5583.11|5562.51|
+----------+----------+-------+-------+-------+
only showing top 5 rows



In [36]:
# convert date format and lowercase all table names
def format_stock_df(df):
    df = df.withColumn('Date', to_date(col('Date'), 'MM/dd/yyyy')).withColumnRenamed('Date', 'date')  # specify the given date
    df = df.withColumnRenamed('Close/Last', 'close')
    df = df.withColumnRenamed('Open', 'open')
    df = df.withColumnRenamed('High', 'high')
    df = df.withColumnRenamed('Low', 'low')
    return df

In [7]:
sp500_df = format_stock_df(sp500_df)
nasdaq_df = format_stock_df(nasdaq_df)

spark_write_to_rds(sp500_df, table_name="sp_five_hundred")
spark_write_to_rds(nasdaq_df, table_name="nasdaq")

                                                                                

In [133]:
nasdaq_companies_file = "data/stocks/nasdaq_companies.csv"
nasdaq_companies_df = spark.read.option('header', True).csv(nasdaq_companies_file)
nasdaq_companies_df.createOrReplaceTempView("nasdaq_companies")


sp500_df = spark_read_from_rds(spark, "sp_five_hundred")
sp500_df.createOrReplaceTempView('sp_five_hundred')
nasdaq_df = spark_read_from_rds(spark, "nasdaq")
nasdaq_df.createOrReplaceTempView('nasdaq')
# nasdaq_df.show()

In [144]:
nasdaq_companies_df = create_company_dim_df(spark)
nasdaq_companies_df.show(5)
spark_write_to_rds(nasdaq_companies_df, "company_dim")
# nasdaq_companies_df.createOrReplaceTempView("nasdaq_companies")

+----------+------+--------------------+-------------+--------+-------+-----------+--------------------+
|company_id|symbol|                name|      country|ipo_year| volume|     sector|            industry|
+----------+------+--------------------+-------------+--------+-------+-----------+--------------------+
|         1|     A|Agilent Technolog...|United States|    1999|1064325|Industrials|Biotechnology: La...|
|         2|    AA|Alcoa Corporation...|United States|    2016|6201684|Industrials|            Aluminum|
|         3|  AACG|ATA Creativity Gl...|        China|    2008| 203999|Real Estate|Other Consumer Se...|
|         4|  AACI|Armada Acquisitio...|United States|    2021|  18159|    Finance|        Blank Checks|
|         5| AACIU|Armada Acquisitio...|United States|    2021|    244|    Finance|        Blank Checks|
+----------+------+--------------------+-------------+--------+-------+-----------+--------------------+
only showing top 5 rows



24/07/16 22:50:45 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/07/16 22:50:45 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/07/16 22:50:45 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [145]:
date_df = create_date_dim_df(spark)
spark_write_to_rds(date_df, "date_dim")
date_df.show(5)

24/07/16 22:51:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/07/16 22:51:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/07/16 22:51:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/07/16 22:51:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/07/16 22:51:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/07/16 22:51:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/07/16 2

+-------+----------+---+-------------+-----+---------------+----+-------+
|date_id|      date|day|day_name_abbr|month|month_name_abbr|year|quarter|
+-------+----------+---+-------------+-----+---------------+----+-------+
|      1|2018-01-01|  1|          Mon|    1|            Jan|2018|      1|
|      2|2018-01-02|  2|          Tue|    1|            Jan|2018|      1|
|      3|2018-01-03|  3|          Wed|    1|            Jan|2018|      1|
|      4|2018-01-04|  4|          Thu|    1|            Jan|2018|      1|
|      5|2018-01-05|  5|          Fri|    1|            Jan|2018|      1|
+-------+----------+---+-------------+-----+---------------+----+-------+
only showing top 5 rows



In [None]:
# spark.sql("""
#     SELECT
#     FROM nasdaq
#     LEFT JOIN nasdaq_companies ON symbol
# """)
## i realized we only have 

In [212]:
schema = StructType([
    StructField('symbol', StringType(), True),
    StructField('date', DateType(), True),
    StructField('open', FloatType(), True),
    StructField('high', DoubleType(), True),
    StructField('low', DoubleType(), True),
    StructField('close', DoubleType(), True),
    StructField('adj_close', DoubleType(), True),
    StructField('volume', DoubleType(), True)
])

# stock_prices_df = spark.createDataFrame()
indiv_stocks_path = 'data/stocks/indiv_stocks'

company_files = sorted(os.listdir(indiv_stocks_path))
data = []

# df = df.withColumn(
#   "num_id", F.row_number().over(Window.orderBy( 
#               F.monotonically_increasing_id()))) 

from constants import START_DATE, END_DATE

table_name = 'stock_prices'

# raw_col_names = 
last_idx = 0
for file_name in company_files:
    symbol = file_name.rstrip('.csv')

    file_path = f"{indiv_stocks_path}/{file_name}"
    temp_df = spark.read.option('header', True).csv(file_path)

    # lowercase all names
    for column in temp_df.columns:
        new_name = column.lower()
        temp_df = temp_df.withColumnRenamed(column, new_name)
    temp_df = temp_df.withColumnRenamed('adj close', 'adj_close')  # replace with underscore
    temp_df = temp_df.withColumn('date', temp_df['date'].cast('date'))

    temp_df = temp_df.where((F.col('date') >= F.lit(START_DATE)) & (F.col('date') <= F.lit(END_DATE))).select("*")


    # cast to float
    for col in ['open', 'high', 'low', 'close', 'adj_close']:
        temp_df = temp_df.withColumn(col, temp_df[col].cast('float'))
    temp_df = temp_df.withColumn('symbol', F.lit(symbol))

    temp_df = temp_df.withColumn('price_id', last_idx + F.row_number().over(Window().orderBy('date')))
    temp_df = temp_df.select('price_id', 'symbol', 'date', 'open', 'high', 'low', 'close', 'adj_close', 'volume')

    print(last_idx)
    temp_df.show(5)
    temp_df.orderBy(F.desc('date')).show(5)
    # last_idx = temp_df.select(F.last('price_id')).head()[0]
    # last_idx = temp_df.['price_id']
    # print(temp_df.orderBy(F.desc('price_id')).show(5))  # show(5)
    # if next_idx == 1:
    #     spark_write_to_rds(temp_df, table_name)
    # else:
    #     temp_df.write.insertInto(tableName=table_name)
    if last_idx != 0:
        break
    last_idx = temp_df.select('price_id').tail(1)[0][0]

0
+--------+------+----------+-----+-----+-----+-----+---------+-------+
|price_id|symbol|      date| open| high|  low|close|adj_close| volume|
+--------+------+----------+-----+-----+-----+-----+---------+-------+
|       1|     A|2018-01-02|67.42|67.89|67.34| 67.6|64.401215|1047800|
|       2|     A|2018-01-03|67.62|69.49| 67.6|69.32|66.039856|1698900|
|       3|     A|2018-01-04|69.54|69.82|68.78| 68.8|65.544426|2230700|
|       4|     A|2018-01-05|68.73| 70.1|68.73| 69.9|  66.5924|1632500|
|       5|     A|2018-01-08|69.73|70.33|69.55|70.05|66.735306|1613400|
+--------+------+----------+-----+-----+-----+-----+---------+-------+
only showing top 5 rows



24/07/17 00:25:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/07/17 00:25:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/07/17 00:25:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/07/17 00:25:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/07/17 00:25:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/07/17 00:25:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/07/17 0

+--------+------+----------+------+------+------+------+---------+-------+
|price_id|symbol|      date|  open|  high|   low| close|adj_close| volume|
+--------+------+----------+------+------+------+------+---------+-------+
|    1509|     A|2023-12-29|139.07| 139.7|138.36|139.03|138.54736|1014400|
|    1508|     A|2023-12-28|140.54|140.81|139.65|139.77|139.04964| 892600|
|    1507|     A|2023-12-27|139.78|140.16|139.08|139.82| 139.0994|1182300|
|    1506|     A|2023-12-26|139.31|140.47|139.09|139.81|139.08943| 948400|
|    1505|     A|2023-12-22|139.61|140.36|138.79|139.57|138.85066|1204100|
+--------+------+----------+------+------+------+------+---------+-------+
only showing top 5 rows



24/07/17 00:25:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/07/17 00:25:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/07/17 00:25:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/07/17 00:25:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/07/17 00:25:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


1509
+--------+------+----------+-----+-----+------+-----+---------+-------+
|price_id|symbol|      date| open| high|   low|close|adj_close| volume|
+--------+------+----------+-----+-----+------+-----+---------+-------+
|    1510|    AA|2018-01-02|54.06|55.22| 53.91|55.17|53.676037|2928900|
|    1511|    AA|2018-01-03|54.92|55.15| 52.96| 54.5| 53.02418|4100000|
|    1512|    AA|2018-01-04|54.81|55.43|54.075| 54.7|53.218773|3555100|
|    1513|    AA|2018-01-05|54.65|54.66| 53.41|54.09| 52.62529|3371400|
|    1514|    AA|2018-01-08|53.96|56.15| 53.66| 55.0| 53.51064|5028100|
+--------+------+----------+-----+-----+------+-----+---------+-------+
only showing top 5 rows



24/07/17 00:25:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/07/17 00:25:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/07/17 00:25:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/07/17 00:25:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/07/17 00:25:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/07/17 00:25:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+--------+------+----------+-----+------+------+-----+---------+-------+
|price_id|symbol|      date| open|  high|   low|close|adj_close| volume|
+--------+------+----------+-----+------+------+-----+---------+-------+
|    3018|    AA|2023-12-29|34.31| 34.74| 33.93| 34.0| 33.79498|3294300|
|    3017|    AA|2023-12-28|34.68|34.812|34.198|34.55|34.341667|3909900|
|    3016|    AA|2023-12-27|34.23|35.035| 33.75|34.81|34.600098|5798100|
|    3015|    AA|2023-12-26|33.86| 34.12|  33.5|33.87|33.665768|4505900|
|    3014|    AA|2023-12-22|32.96| 34.44| 32.88|33.77| 33.56637|9064600|
+--------+------+----------+-----+------+------+-----+---------+-------+
only showing top 5 rows



24/07/17 00:25:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/07/17 00:25:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [12]:
hospitalizations_df = s3_read_to_spark(COVID_HOSPITALIZATIONS_FILE, spark)

In [18]:
hospitalizations_df = hospitalizations_df.withColumn('date', to_date(col('date'), 'yyyy-mm-dd'))
hospitalizations_df.show(5)
hospitalizations_df.dtypes
# spark_write_to_rds(hospitalizations_df, table_name="hospitalizations")

24/07/16 08:03:32 WARN TaskSetManager: Stage 6 contains a task of very large size (14155 KiB). The maximum recommended task size is 1000 KiB.
[Stage 6:>                                                          (0 + 1) / 1]

+----------+------------+-------------------------+--------------------------------+-----------------------------+---------------------------+----------------------------------+-------------------------------+-----------------------+------------------------------+---------------------------+
|      date|location_key|new_hospitalized_patients|cumulative_hospitalized_patients|current_hospitalized_patients|new_intensive_care_patients|cumulative_intensive_care_patients|current_intensive_care_patients|new_ventilator_patients|cumulative_ventilator_patients|current_ventilator_patients|
+----------+------------+-------------------------+--------------------------------+-----------------------------+---------------------------+----------------------------------+-------------------------------+-----------------------+------------------------------+---------------------------+
|0022-01-10|          AR|                      0.0|                             0.0|                          NaN|       

24/07/16 08:03:36 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 6 (TID 6): Attempting to kill Python Worker
                                                                                

[('date', 'date'),
 ('location_key', 'string'),
 ('new_hospitalized_patients', 'double'),
 ('cumulative_hospitalized_patients', 'double'),
 ('current_hospitalized_patients', 'double'),
 ('new_intensive_care_patients', 'double'),
 ('cumulative_intensive_care_patients', 'double'),
 ('current_intensive_care_patients', 'double'),
 ('new_ventilator_patients', 'double'),
 ('cumulative_ventilator_patients', 'double'),
 ('current_ventilator_patients', 'double')]

In [22]:
spark_write_to_rds(hospitalizations_df, table_name="hospitalizations")

24/07/16 08:06:40 WARN TaskSetManager: Stage 9 contains a task of very large size (14155 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [30]:
df = spark.read.option('header', True).csv('data/covid/us_location.csv')

In [None]:
df.select(col('location_key'))

In [None]:
# # %%configure
# # {"conf": {"spark.jars.packages": "org.mongodb.spark:mongo-spark-connector_2.11:2.3.2"}}
# data_file = "data/covid/epidemiology.json"
# temp_df = spark.read.json(data_file)

In [None]:
import json
import pymongo
from constants import MONGO_URI
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

client = MongoClient(MONGO_URI, server_api=ServerApi('1'))

covid_db = client['covid_pipeline']

epid_collection = covid_db['epidemiology']


import sys
data_file = "data/covid/epidemiology.json"
with open(data_file, "r") as f:
    json_data = json.load(f)

print(sys.getsizeof(json_data))

columns, data = json_data['columns'], json_data['data']
print(columns)
documents = []

# Create a new client and connect to the server
for entry in data:
    d = {col: entry[i] for i, col in enumerate(columns)}
    documents.append(d)

x = epid_collection.insert_many(documents)
client.close()


# spark.read.format('mongodb').load()

In [None]:
df = sp500_df

print("null count")
df.select([count(when(isnull(c), c)).alias(c) for c in df.columns]).show()

# print("nan count")  # none
# df.select([count(when(isnan(c), c)).alias(c) for c in df.columns]).show()

# Drop Nulls
cleaned_df = df.na.drop()  # subset=['gdp_usd', 'gdp_per_capita_usd']
# df.na.fill({"age": 0, "name": "unknown"}).show()  # Fill missing values


for col_name in ['new_confirmed', 'new_deceased', 'new_recovered', 'new_tested']:
    print(f"Changing {cleaned_df.where(col(col_name) < 0).select(col(col_name)).count()}, negative values in {col_name} to 0")

    cleaned_df = cleaned_df.withColumn(col_name, when(col(col_name) < 0, 0).otherwise(col(col_name)))

# print("Duplicates")  # none
duplicate_rows = cleaned_df.count() - cleaned_df.dropDuplicates().count()
print(f"Number of duplicate rows: {duplicate_rows}")
cleaned_df = cleaned_df.dropDuplicates()
cleaned_df.limit(10).show()
print(f"Reduced {df.count()} rows to {cleaned_df.count()} rows")
claned_epidemiology_df = cleaned_df

In [79]:
# https://github.com/GoogleCloudPlatform/covid-19-open-data/blob/main/docs/table-epidemiology.md

schema = StructType([
    StructField('date', DateType(), True),
    StructField('location_key', StringType(), True),
    StructField('new_confirmed', IntegerType(), True),
    StructField('new_deceased', IntegerType(), True),
    StructField('new_recovered', IntegerType(), True),
    StructField('new_tested', IntegerType(), True),
    StructField('cumulative_confirmed', IntegerType(), True),
    StructField('cumulative_deceased', IntegerType(), True),
    StructField('cumulative_recovered', IntegerType(), True),
    StructField('cumulative_tested', IntegerType(), True)
])

epidemiology_df = spark.read.format("csv").schema(schema).option("header", True).load(COVID_EPIDEMIOLOGY_FILE)

epidemiology_df.limit(5).show()

+----------+------------+-------------+------------+-------------+----------+--------------------+-------------------+--------------------+-----------------+
|      date|location_key|new_confirmed|new_deceased|new_recovered|new_tested|cumulative_confirmed|cumulative_deceased|cumulative_recovered|cumulative_tested|
+----------+------------+-------------+------------+-------------+----------+--------------------+-------------------+--------------------+-----------------+
|2020-01-01|          AD|            0|           0|         null|      null|                   0|                  0|                null|             null|
|2020-01-02|          AD|            0|           0|         null|      null|                   0|                  0|                null|             null|
|2020-01-03|          AD|            0|           0|         null|      null|                   0|                  0|                null|             null|
|2020-01-04|          AD|            0|           0|

In [80]:
df = epidemiology_df

print("null count")
df.select([count(when(isnull(c), c)).alias(c) for c in df.columns]).show()

# print("nan count")  # none
# df.select([count(when(isnan(c), c)).alias(c) for c in df.columns]).show()

# Drop Nulls
cleaned_df = df.na.drop()  # subset=['gdp_usd', 'gdp_per_capita_usd']
# df.na.fill({"age": 0, "name": "unknown"}).show()  # Fill missing values


for col_name in ['new_confirmed', 'new_deceased', 'new_recovered', 'new_tested']:
    print(f"Changing {cleaned_df.where(col(col_name) < 0).select(col(col_name)).count()}, negative values in {col_name} to 0")

    cleaned_df = cleaned_df.withColumn(col_name, when(col(col_name) < 0, 0).otherwise(col(col_name)))

# print("Duplicates")  # none
duplicate_rows = cleaned_df.count() - cleaned_df.dropDuplicates().count()
print(f"Number of duplicate rows: {duplicate_rows}")
cleaned_df = cleaned_df.dropDuplicates()
cleaned_df.limit(10).show()
print(f"Reduced {df.count()} rows to {cleaned_df.count()} rows")
claned_epidemiology_df = cleaned_df

null count


                                                                                

+----+------------+-------------+------------+-------------+----------+--------------------+-------------------+--------------------+-----------------+
|date|location_key|new_confirmed|new_deceased|new_recovered|new_tested|cumulative_confirmed|cumulative_deceased|cumulative_recovered|cumulative_tested|
+----+------------+-------------+------------+-------------+----------+--------------------+-------------------+--------------------+-----------------+
|   0|           0|        50025|      858687|      8545363|   9331336|              198780|            1051000|             8534668|          9512906|
+----+------------+-------------+------------+-------------+----------+--------------------+-------------------+--------------------+-----------------+



                                                                                

Changing 26453, negative values in new_confirmed to 0


                                                                                

Changing 6986, negative values in new_deceased to 0


                                                                                

Changing 1888, negative values in new_recovered to 0


                                                                                

Changing 2769, negative values in new_tested to 0


                                                                                

Number of duplicate rows: 0


                                                                                

+----------+------------+-------------+------------+-------------+----------+--------------------+-------------------+--------------------+-----------------+
|      date|location_key|new_confirmed|new_deceased|new_recovered|new_tested|cumulative_confirmed|cumulative_deceased|cumulative_recovered|cumulative_tested|
+----------+------------+-------------+------------+-------------+----------+--------------------+-------------------+--------------------+-----------------+
|2020-12-23|          AT|         2843|          99|         3115|     38440|              344938|               6994|              307537|          3682136|
|2021-02-07|          AT|          993|          40|         1052|    212647|              420176|               9780|              391279|         10313170|
|2021-06-08|          AT|          356|           4|          607|    412011|              641360|              12964|              620881|         45983005|
|2021-08-15|          AT|          797|           1|



Reduced 12525825 rows to 2450433 rows


                                                                                

In [35]:
# https://github.com/GoogleCloudPlatform/covid-19-open-data/blob/main/docs/table-economy.md

schema = StructType([
    StructField('location_key', StringType(), True),
    StructField('gdp_usd', LongType(), True),
    StructField('gdp_per_capita_usd', IntegerType(), True),
    StructField('human_capital_index', FloatType(), True)
])

economy_df = spark.read.format("csv").schema(schema).option("header", True).load(COVID_ECONOMY_FILE)

economy_df.limit(5).show()

+------------+------------+------------------+-------------------+
|location_key|     gdp_usd|gdp_per_capita_usd|human_capital_index|
+------------+------------+------------------+-------------------+
|          AD|  3154057987|             40886|               null|
|          AE|421142267937|             43103|              0.659|
|          AF| 19101353832|               502|              0.389|
|          AG|  1727759259|             17790|               null|
|          AL| 15278077446|              5352|              0.621|
+------------+------------+------------------+-------------------+



In [34]:
print("null count")
economy_df.select([count(when(isnull(c), c)).alias(c) for c in economy_df.columns]).show()
# print("nan count")  # none
# economy_df.select([count(when(isnan(c), c)).alias(c) for c in economy_df.columns]).show()

# Drop Nulls
cleaned_economy_df = economy_df.na.drop(subset=['gdp_usd', 'gdp_per_capita_usd'])
# df.na.fill({"age": 0, "name": "unknown"}).show()  # Fill missing values

print("TODO: still need to handle human_capital_index")

print(f"Reduced {economy_df.count()} rows to {cleaned_economy_df.count()} rows")

print("Duplicates")  # none
duplicate_rows = cleaned_economy_df.count() - cleaned_economy_df.dropDuplicates().count()
print(f"Number of duplicate rows: {duplicate_rows}")
cleaned_economy_df = cleaned_economy_df.dropDuplicates()
cleaned_economy_df.limit(10).show()

null count
+------------+-------+------------------+-------------------+
|location_key|gdp_usd|gdp_per_capita_usd|human_capital_index|
+------------+-------+------------------+-------------------+
|           0|     31|                39|                248|
+------------+-------+------------------+-------------------+

TODO: still need to handle human_capital_index
Reduced 404 rows to 334 rows
Duplicates
Number of duplicate rows: 0
+------------+------------+------------------+-------------------+
|location_key|     gdp_usd|gdp_per_capita_usd|human_capital_index|
+------------+------------+------------------+-------------------+
|       DE_ST| 73969539000|             33394|               null|
|       IT_25|457916381400|             45548|               null|
|        AT_5| 34272274000|             61832|               null|
|       IT_88| 41212125400|             25016|               null|
|          GN| 13590281808|              1064|              0.374|
|      BE_BRU| 99104176200|

[[34m2024-07-14T14:08:44.722-0600[0m] {[34mcredentials.py:[0m621} INFO[0m - Found credentials in shared credentials file: ~/.aws/credentials[0m
root
 |-- Date: string (nullable = true)
 |-- Close/Last: double (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)



[Stage 0:>                                                          (0 + 1) / 1]

+----------+----------+-------+-------+-------+
|      Date|Close/Last|   Open|   High|    Low|
+----------+----------+-------+-------+-------+
|07/12/2024|   5615.35|5590.76|5655.56|5590.44|
|07/11/2024|   5584.54|5635.21|5642.45|5576.53|
+----------+----------+-------+-------+-------+
only showing top 2 rows



                                                                                

In [81]:
# df.cache()

Below: Not in use

In [78]:
# https://github.com/GoogleCloudPlatform/covid-19-open-data/blob/main/docs/table-weather.md

schema = StructType([
    StructField('date', DateType(), True),
    StructField('location_key', StringType(), True),
    StructField('average_temperature_celsius', FloatType(), True),
    StructField('minimum_temperature_celsius', FloatType(), True),
    StructField('maximum_temperature_celsius', FloatType(), True),
    StructField('rainfall_mm', FloatType(), True),
    StructField('snowfall_mm', FloatType(), True),
    StructField('dew_point', FloatType(), True),
    StructField('relative_humidity', FloatType(), True),
])

weather_df = spark.read.format("csv").schema(schema).option("header", True).load(COVID_WEATHER_FILE)  # 

weather_df.limit(5).show()

+----------+------------+---------------------------+---------------------------+---------------------------+-----------+-----------+---------+-----------------+
|      date|location_key|average_temperature_celsius|minimum_temperature_celsius|maximum_temperature_celsius|rainfall_mm|snowfall_mm|dew_point|relative_humidity|
+----------+------------+---------------------------+---------------------------+---------------------------+-----------+-----------+---------+-----------------+
|2020-01-01|          AD|                   4.236111|                   0.138889|                   8.208333|      3.302|       null|-0.972222|         72.77305|
|2020-01-02|          AD|                      3.875|                  -0.722222|                  10.055556|   6.688667|       null|   -1.625|         70.84132|
|2020-01-03|          AD|                   4.763889|                   0.597222|                   8.402778|     5.0165|       null|-0.611111|         71.11725|
|2020-01-04|          AD|   