In [1]:
import pyspark
from azure.storage.blob import BlobServiceClient
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType, DoubleType, IntegerType, LongType
from pyspark.sql.functions import count,lit, when,col,expr, udf, avg,to_date,broadcast,regexp_replace,last,lpad,concat_ws,date_format,year,month
from pyspark.sql import Window
import sys
from pyspark import SparkContext
import pyspark.sql 

import pandas as pd
import matplotlib.pyplot as plt



STORAGEACCOUNTURL = "https://trainingbatchaccount.blob.core.windows.net"
STORAGEACCOUNTKEY = "2QPPHsAtQ8/fh33VE7wqg/ZaeJoxdq/pnevAEmCh0n32tC5eXa8dTEEwMHdD9Ff5k1/wVh97aubqgKzQSwOLnQ=="
CONTAINERNAME = "datasets"
# HOSPITALIZATION = "economy.csv"
# INDEX = "index.csv"


spark = SparkSession.builder.appName('azure').getOrCreate()
spark.conf.set(
        "fs.azure.account.key.trainingbatchaccount.blob.core.windows.net",
        STORAGEACCOUNTKEY
    
)

#-----------------------------Schema for hospitalizations dataset-------------------------------

hospitalizations_schema = StructType([StructField("date",StringType(), True),\
    StructField("location_key",StringType(), True),\
    StructField("new_hospitalized_patients", IntegerType(), True),\
    StructField("cumulative_hospitalized_patients", IntegerType(), True),\
    StructField("current_hospitalized_patients", IntegerType(), True),\
    StructField("new_intensive_care_patients", IntegerType(), True),\
    StructField("cumulative_intensive_care_patients", IntegerType(), True),\
    StructField("current_intensive_care_patients", IntegerType(), True),\
    StructField("new_ventilator_patients", StringType(), True),\
    StructField("cumulative_ventilator_patients", StringType(), True),\
    StructField("current_ventilator_patients", IntegerType(), True)])
#-----------------------------Schema for vaccination dataset-------------------------------   
vaccination_schema = StructType([StructField("date",StringType(), True),\
    StructField("location_key",StringType(), True),\
    StructField("new_persons_vaccinated", IntegerType(), True),\
    StructField("cumulative_persons_vaccinated", IntegerType(), True),\
    StructField("new_persons_fully_vaccinated", IntegerType(), True),\
    StructField("cumulative_persons_fully_vaccinated", IntegerType(), True),\
    StructField("new_vaccine_doses_administered", IntegerType(), True),\
    StructField("cumulative_vaccine_doses_administered", LongType(), True),\
    StructField("new_persons_vaccinated_pfizer", IntegerType(), True),\
    StructField("cumulative_persons_vaccinated_pfizer", IntegerType(), True),\
    StructField("new_persons_fully_vaccinated_pfizer", IntegerType(), True),\
    StructField("cumulative_persons_fully_vaccinated_pfizer", IntegerType(), True),\
    StructField("new_vaccine_doses_administered_pfizer", IntegerType(), True),\
    StructField("cumulative_vaccine_doses_administered_pfizer", IntegerType(), True),\
    StructField("new_persons_vaccinated_moderna", IntegerType(), True),\
    StructField("cumulative_persons_vaccinated_moderna", IntegerType(), True),\
    StructField("new_persons_fully_vaccinated_moderna", IntegerType(), True),\
    StructField("cumulative_persons_fully_vaccinated_moderna", IntegerType(), True),\
    StructField("new_vaccine_doses_administered_moderna", IntegerType(), True),\
    StructField("cumulative_vaccine_doses_administered_moderna", IntegerType(), True),\
    StructField("new_persons_vaccinated_janssen", IntegerType(), True),\
    StructField("cumulative_persons_vaccinated_janssen", IntegerType(), True),\
    StructField("new_persons_fully_vaccinated_janssen", IntegerType(), True),\
    StructField("cumulative_persons_fully_vaccinated_janssen", IntegerType(), True),\
    StructField("new_vaccine_doses_administered_janssen", IntegerType(), True),\
    StructField("cumulative_vaccine_doses_administered_janssen", IntegerType(), True),\
    StructField("new_persons_vaccinated_sinovac", IntegerType(), True),\
    StructField("total_persons_vaccinated_sinovac", IntegerType(), True),\
    StructField("new_persons_fully_vaccinated_sinovac", StringType(), True),\
    StructField("total_persons_fully_vaccinated_sinovac", StringType(), True),\
    StructField("new_vaccine_doses_administered_sinovac", StringType(), True),\
    StructField("total_vaccine_doses_administered_sinovac", StringType(), True)])
  #-----------------------------Schema for epidemiology dataset-------------------------------

epidemiology_schema = StructType([StructField("date",StringType(), True),\
    StructField("location_key",StringType(), True),\
    StructField("new_confirmed", IntegerType(), True),\
    StructField("new_deceased", IntegerType(), True),\
    StructField("new_recovered", IntegerType(), True),\
    StructField("new_tested", IntegerType(), True),\
    StructField("cumulative_confirmed", IntegerType(), True),\
    StructField("cumulative_deceased", IntegerType(), True),\
    StructField("cumulative_recovered", IntegerType(), True),\
    StructField("cumulative_tested", IntegerType(), True)])

  

hosp_df = spark.read.format('csv').option('header',True).schema(hospitalizations_schema).load("wasbs://datasets@trainingbatchaccount.blob.core.windows.net/hospitalizations.csv")
vacc_df = spark.read.format('csv').option('header',True).schema(vaccination_schema).load("wasbs://datasets@trainingbatchaccount.blob.core.windows.net/vaccinations.csv")
epi_df = spark.read.format('csv').option('header',True).schema(epidemiology_schema).load("wasbs://datasets@trainingbatchaccount.blob.core.windows.net/epidemiology.csv")



In [2]:
# to find shape of dataframe 
def sparkShape(dataFrame):
    return (dataFrame.count(), len(dataFrame.columns))
pyspark.sql.dataframe.DataFrame.shape = sparkShape

spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

In [3]:
from pyspark.sql import SparkSession
import functools
 

In [24]:
hosp_df.shape()

(1048563, 11)

In [4]:
epi_df.shape()

(10909296, 10)

In [5]:
vacc_df.shape()

(1783850, 32)

In [32]:
hosp_df.printSchema()

root
 |-- date: string (nullable = true)
 |-- location_key: string (nullable = true)
 |-- new_hospitalized_patients: integer (nullable = true)
 |-- cumulative_hospitalized_patients: integer (nullable = true)
 |-- current_hospitalized_patients: integer (nullable = true)
 |-- new_intensive_care_patients: integer (nullable = true)
 |-- cumulative_intensive_care_patients: integer (nullable = true)
 |-- current_intensive_care_patients: integer (nullable = true)
 |-- new_ventilator_patients: string (nullable = true)
 |-- cumulative_ventilator_patients: string (nullable = true)
 |-- current_ventilator_patients: integer (nullable = true)



In [4]:
#---------------------------------Opening hospitalization dataset and cleaning it----------------

hosp_df = hosp_df.withColumn('Date',to_date(hosp_df['date'],'dd-MM-yyyy'))
#fill empty values with zero
hosp_df = hosp_df.fillna(value = 0, subset = ['new_hospitalized_patients','current_hospitalized_patients','current_intensive_care_patients','new_ventilator_patients','cumulative_ventilator_patients','current_ventilator_patients'])
#drop unwanted column
hosp_df = hosp_df.drop('cumulative_ventilator_patients','new_ventilator_patients')
hosp_df = hosp_df.where((hosp_df.location_key == 'AR')&(hosp_df.Date >= '2021-01-01')&(hosp_df.Date <= '2021-12-31'))
# for test model 
# hosp_df = hosp_df.drop('location_key')

# hosp_df = hosp_df.filter(hosp_df["Date"].gt(lit("2021-01-01"))) 
# hosp_df= hosp_df[hosp_df['Date'] >= '01-01-2021']

In [34]:
hosp_df.printSchema()

root
 |-- Date: date (nullable = true)
 |-- new_hospitalized_patients: integer (nullable = true)
 |-- cumulative_hospitalized_patients: integer (nullable = true)
 |-- current_hospitalized_patients: integer (nullable = true)
 |-- new_intensive_care_patients: integer (nullable = true)
 |-- cumulative_intensive_care_patients: integer (nullable = true)
 |-- current_intensive_care_patients: integer (nullable = true)
 |-- current_ventilator_patients: integer (nullable = true)



In [36]:
hosp_df.toPandas().to_csv("hosp_df_bfgb1.csv")

In [None]:
 #---------------------------------Opening epidemology dataset and cleaning it----------------
    
epi_df = epi_df.withColumn('date',to_date(epi_df['date'],format='yyyy-mm-dd'))
epi_df = epi_df.na.fill(value=0)

In [5]:
 #---------------------------------Opening vaccination dataset and cleaning it----------------
    
#selecting needed columns only
vacc_df = vacc_df.select("date","location_key","new_persons_vaccinated","cumulative_persons_vaccinated","new_persons_fully_vaccinated","cumulative_persons_fully_vaccinated","new_vaccine_doses_administered","cumulative_vaccine_doses_administered")

        #Drop rows if all the values are null
vacc_df = vacc_df.na.drop(subset=["new_persons_vaccinated","cumulative_persons_vaccinated","new_persons_fully_vaccinated","cumulative_persons_fully_vaccinated","new_vaccine_doses_administered","cumulative_vaccine_doses_administered"] ,how="all")

        #fill cumulative value with previous field value (Forward Fill)
vacc_df = vacc_df.withColumn("cumulative_persons_vaccinated", last('cumulative_persons_vaccinated', True).over(Window.partitionBy('location_key').rowsBetween(-sys.maxsize, 0)))
vacc_df = vacc_df.withColumn("cumulative_persons_fully_vaccinated",last('cumulative_persons_fully_vaccinated', True).over(Window.partitionBy('location_key').rowsBetween(-sys.maxsize, 0)))
vacc_df = vacc_df.withColumn("cumulative_vaccine_doses_administered",last('cumulative_vaccine_doses_administered', True).over(Window.partitionBy('location_key').rowsBetween(-sys.maxsize, 0)))

        #replace null with zero
vacc_df = vacc_df.na.fill(value=0)

        #correct date format
vacc_df = vacc_df.withColumn('Date_v',to_date(vacc_df['Date'],format='yyyy-MM-dd'))
# vacc_df = vacc_df.withColumn('Date_1',(vacc_df['Date']))
vacc_df = vacc_df.withColumn("Date_string",vacc_df.Date.cast(StringType()))
vacc_df = vacc_df.where((vacc_df.location_key == 'AR')&(vacc_df.Date >= '2021-01-01')&(vacc_df.Date <= '2021-12-31'))
#  hosp_df.where((hosp_df.location_key == 'AR')&(hosp_df.Date >= '2021-01-01')&(hosp_df.Date <= '2021-12-31'))
    

In [23]:
hosp_df.select('Date').tail

<bound method DataFrame.tail of +----------+
|      Date|
+----------+
|2021-01-01|
|2021-01-02|
|2021-01-03|
|2021-01-04|
|2021-01-05|
|2021-01-06|
|2021-01-07|
|2021-01-08|
|2021-01-09|
|2021-01-10|
|2021-01-11|
|2021-01-12|
|2021-01-13|
|2021-01-14|
|2021-01-15|
|2021-01-16|
|2021-01-17|
|2021-01-18|
|2021-01-19|
|2021-01-20|
+----------+
only showing top 20 rows
>

In [6]:
hosp_df.printSchema()
vacc_df.printSchema()

root
 |-- Date: date (nullable = true)
 |-- location_key: string (nullable = true)
 |-- new_hospitalized_patients: integer (nullable = true)
 |-- cumulative_hospitalized_patients: integer (nullable = true)
 |-- current_hospitalized_patients: integer (nullable = true)
 |-- new_intensive_care_patients: integer (nullable = true)
 |-- cumulative_intensive_care_patients: integer (nullable = true)
 |-- current_intensive_care_patients: integer (nullable = true)
 |-- current_ventilator_patients: integer (nullable = true)

root
 |-- Date: date (nullable = true)
 |-- location_key: string (nullable = true)
 |-- new_persons_vaccinated: integer (nullable = true)
 |-- cumulative_persons_vaccinated: integer (nullable = true)
 |-- new_persons_fully_vaccinated: integer (nullable = true)
 |-- cumulative_persons_fully_vaccinated: integer (nullable = true)
 |-- new_vaccine_doses_administered: integer (nullable = true)
 |-- cumulative_vaccine_doses_administered: long (nullable = true)
 |-- Date_string: str

In [61]:
 
# # explicit function
# def unionAll(dfs):
#     return functools.reduce(lambda df1, df2: df1.union(df2.select(df1.columns)), dfs)
 

# df1 = spark.createDataFrame([[1, 1], [2, 2]], ['a', 'b'])
  
# # different column order.
# df2 = spark.createDataFrame([[3, 333], [4, 444]], ['b', 'a'])
# df3 = spark.createDataFrame([[555, 5], [666, 6]], ['b', 'a'])
  
unioned_df = hosp_df.unionAll(vacc_df)
unioned_df.show()

+----------+------------+-------------------------+--------------------------------+-----------------------------+---------------------------+----------------------------------+-------------------------------+---------------------------+
|      Date|location_key|new_hospitalized_patients|cumulative_hospitalized_patients|current_hospitalized_patients|new_intensive_care_patients|cumulative_intensive_care_patients|current_intensive_care_patients|current_ventilator_patients|
+----------+------------+-------------------------+--------------------------------+-----------------------------+---------------------------+----------------------------------+-------------------------------+---------------------------+
|2021-01-01|          AR|                      450|                          233586|                            0|                         87|                             37363|                              0|                          0|
|2021-01-02|          AR|                      5

In [62]:
unioned_df.shape()

(730, 9)

In [63]:
unioned_df.head()

Row(Date=datetime.date(2021, 1, 1), location_key='AR', new_hospitalized_patients=450, cumulative_hospitalized_patients=233586, current_hospitalized_patients=0, new_intensive_care_patients=87, cumulative_intensive_care_patients=37363, current_intensive_care_patients=0, current_ventilator_patients='0')

In [None]:
unioned_df.head()

In [30]:
# adding a date string column to each dataframe 

hosp_df = hosp_df.withColumn("Date_string",hosp_df.Date.cast(StringType()))
hosp_df = hosp_df.where(hosp_df.location_key == 'AR')
vacc_df = vacc_df.withColumn("Date_string",vacc_df.Date_v.cast(StringType()))
vacc_df = vacc_df.where(vacc_df.location_key == 'AR')

In [None]:
# aggregating date field

hosp_df= hosp_df.groupby("Date").sum()
vacc_df= vacc_df.groupby("Date").sum()
hosp_df.shape()


In [6]:
hosp_df= hosp_df.groupby("Date").sum()

In [7]:
hosp_df.shape()

(31, 8)

In [8]:
hosp_df.toPandas().to_csv('hosp_df.csv')


In [15]:
vacc_df.shape()

(365, 8)

In [2]:
from pyspark.sql import SQLContext

In [22]:
# joining 
# hosp_df
# vacc_df


# cond = ((hosp_df['Date'] == vacc_df['Date']))
# df_hosp_vacc = hosp_df.join(vacc_df, on = cond ,how = 'inner')


# df_hosp_vacc = hosp_df.join(vacc_df).filter(hosp_df.Date < "01-01-2021")
# joined_df = hosp_df.join(vacc_df).filter(hosp_df.Date == vacc_df.Date_v)
# joined_df = joined_df.drop['Date']
# joined_df= joined_df.groupby("Date").sum()

# dates_df.registerTempTable("dates")

# events_df.registerTempTable("events")

# df_hosp_vacc = spark.sql("SELECT * FROM hosp_df, vacc_df where hosp_df.Date >= 01-01-2021")

# df_hosp_vacc = hosp_df.join(vacc_df,hosp_df.Date == vacc_df.Date, "inner").show(truncate=False)
# cond = ((hosp_df['Date'] == vacc_df['Date']))
# df_hosp_vacc = hosp_df.join(vacc_df, on = cond ,how = 'outer')
# df_hosp_vacc = hosp_df.join(vacc_df,hosp_df.Date ==  vacc_df.Date,"inner")
# results.explain()
# df_hosp_vacc = hosp_df.drop(vacc_df["Date"])
df_hosp_vacc = hosp_df.join(vacc_df,hosp_df.Date ==  vacc_df.Date,"inner")


df_hosp_vacc.show(truncate=False)




+----------+------------+-------------------------+--------------------------------+-----------------------------+---------------------------+----------------------------------+-------------------------------+---------------------------+----------+------------+----------------------+-----------------------------+----------------------------+-----------------------------------+------------------------------+-------------------------------------+-----------+
|Date      |location_key|new_hospitalized_patients|cumulative_hospitalized_patients|current_hospitalized_patients|new_intensive_care_patients|cumulative_intensive_care_patients|current_intensive_care_patients|current_ventilator_patients|Date      |location_key|new_persons_vaccinated|cumulative_persons_vaccinated|new_persons_fully_vaccinated|cumulative_persons_fully_vaccinated|new_vaccine_doses_administered|cumulative_vaccine_doses_administered|Date_string|
+----------+------------+-------------------------+---------------------------

In [23]:
df_hosp_vacc.shape()

(365, 18)

In [25]:
df_hosp_vacc.toPandas().to_csv("df_hosp_vacc_18rows.csv")

In [10]:
df_hosp_vacc.printSchema()



root
 |-- Date: date (nullable = true)
 |-- location_key: string (nullable = true)
 |-- new_hospitalized_patients: integer (nullable = true)
 |-- cumulative_hospitalized_patients: integer (nullable = true)
 |-- current_hospitalized_patients: integer (nullable = true)
 |-- new_intensive_care_patients: integer (nullable = true)
 |-- cumulative_intensive_care_patients: integer (nullable = true)
 |-- current_intensive_care_patients: integer (nullable = true)
 |-- current_ventilator_patients: integer (nullable = true)



In [26]:
df_hosp_vacc = df_hosp_vacc.toPandas()

In [27]:
df_hosp_vacc.dtypes

Date                                     object
location_key                             object
new_hospitalized_patients                 int32
cumulative_hospitalized_patients          int32
current_hospitalized_patients             int32
new_intensive_care_patients               int32
cumulative_intensive_care_patients        int32
current_intensive_care_patients           int32
current_ventilator_patients               int32
Date                                     object
location_key                             object
new_persons_vaccinated                    int32
cumulative_persons_vaccinated             int32
new_persons_fully_vaccinated              int32
cumulative_persons_fully_vaccinated       int32
new_vaccine_doses_administered            int32
cumulative_vaccine_doses_administered     int64
Date_string                              object
dtype: object

In [9]:
hosp_df['Date'] = pd.to_datetime(hosp_df.Date , format = '%Y-%m-%d')
data = hosp_df.drop(['Date'], axis=1)
data.index = hosp_df.Date

In [76]:
pip install statsmodels

Collecting statsmodels
  Using cached statsmodels-0.13.2-cp310-cp310-win_amd64.whl (9.1 MB)
Collecting patsy>=0.5.2
  Using cached patsy-0.5.2-py2.py3-none-any.whl (233 kB)
Installing collected packages: patsy, statsmodels
Successfully installed patsy-0.5.2 statsmodels-0.13.2
Note: you may need to restart the kernel to use updated packages.


In [38]:
import numpy as np
n = 100
#checking stationarity
from statsmodels.tsa.vector_ar.vecm import coint_johansen
#since the test works for only 12 variables, I have randomly dropped
#in the next iteration, I would drop another and check the eigenvalues
johan_test_temp = data.drop([ 'current_hospitalized_patients'], axis=1)
johan_test_temp_dirty = johan_test_temp+0.00001*np.random.rand(365, 6)
coint_johansen(johan_test_temp_dirty,-1,1).eig


# from statsmodels.tsa.stattools import grangercausalitytests

# n = 1000
# ls = np.linspace(0, 2*np.pi, n)
# df1Clean = pd.DataFrame(np.sin(ls))
# df2Clean = pd.DataFrame(2*np.sin(ls+1))
# dfClean = pd.concat([df1Clean, df2Clean], axis=1)
# dfDirty = dfClean+0.00001*np.random.rand(n, 2)

# grangercausalitytests(dfClean, maxlag=20, verbose=False)    # Raises LinAlgError
# grangercausalitytests(dfDirty, maxlag=20, verbose=False)  

array([0.95964478, 0.6685237 , 0.39533153, 0.34395627, 0.17469942,
       0.01154349])

In [25]:
hosp_df.dtypes

Date                                  datetime64[ns]
location_key                                  object
new_hospitalized_patients                      int32
cumulative_hospitalized_patients               int32
current_hospitalized_patients                  int32
new_intensive_care_patients                    int32
cumulative_intensive_care_patients             int32
current_intensive_care_patients                int32
current_ventilator_patients                    int32
dtype: object

In [53]:
hosp_df.columns

Index(['Date', 'new_hospitalized_patients', 'cumulative_hospitalized_patients',
       'current_hospitalized_patients', 'new_intensive_care_patients',
       'cumulative_intensive_care_patients', 'current_intensive_care_patients',
       'current_ventilator_patients'],
      dtype='object')

In [54]:
# hosp_df = pd.get_dummies(hosp_df, columns=['location_key'], drop_first=True)
# hosp_df.convert_objects(convert_numeric=True)
#creating the train and validation set
train = data[:int(0.8*(len(data)))]
valid = data[int(0.8*(len(data))):]

#fit the model
from statsmodels.tsa.vector_ar.var_model import VAR

model = VAR(endog=train)
model_fit = model.fit()

# make prediction on validation
prediction = model_fit.forecast(model_fit.y, steps=len(valid))
# prediction = model_fit.forecast(model_fit(1 ,1 ,0), steps=len(valid))

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


AttributeError: 'VARResults' object has no attribute 'y'

In [None]:
# pip install statsmodels==0.12.0