In [46]:
import pyspark
from azure.storage.blob import BlobServiceClient
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.types import StructType, StructField, StringType, FloatType, DoubleType, IntegerType, LongType
from pyspark.sql.functions import count,lit, when,col,expr, udf, avg,to_date,broadcast,regexp_replace,last,lpad,concat_ws,date_format,year,month
from pyspark.sql import Window
import sys
from pyspark import SparkContext
import pyspark.sql 

import pandas as pd
import matplotlib.pyplot as plt



STORAGEACCOUNTURL = "https://trainingbatchaccount.blob.core.windows.net"
STORAGEACCOUNTKEY = "2QPPHsAtQ8/fh33VE7wqg/ZaeJoxdq/pnevAEmCh0n32tC5eXa8dTEEwMHdD9Ff5k1/wVh97aubqgKzQSwOLnQ=="
CONTAINERNAME = "datasets"
# HOSPITALIZATION = "economy.csv"
# INDEX = "index.csv"


spark = SparkSession.builder.appName('azure').getOrCreate()
spark.conf.set(
        "fs.azure.account.key.trainingbatchaccount.blob.core.windows.net",
        STORAGEACCOUNTKEY
    
)

#-----------------------------Schema for hospitalizations dataset-------------------------------

hospitalizations_schema = StructType([StructField("date",StringType(), True),\
    StructField("location_key",StringType(), True),\
    StructField("new_hospitalized_patients", IntegerType(), True),\
    StructField("cumulative_hospitalized_patients", IntegerType(), True),\
    StructField("current_hospitalized_patients", IntegerType(), True),\
    StructField("new_intensive_care_patients", IntegerType(), True),\
    StructField("cumulative_intensive_care_patients", IntegerType(), True),\
    StructField("current_intensive_care_patients", IntegerType(), True),\
    StructField("new_ventilator_patients", StringType(), True),\
    StructField("cumulative_ventilator_patients", StringType(), True),\
    StructField("current_ventilator_patients", IntegerType(), True)])
#-----------------------------Schema for vaccination dataset-------------------------------   
vaccination_schema = StructType([StructField("date",StringType(), True),\
    StructField("location_key",StringType(), True),\
    StructField("new_persons_vaccinated", IntegerType(), True),\
    StructField("cumulative_persons_vaccinated", IntegerType(), True),\
    StructField("new_persons_fully_vaccinated", IntegerType(), True),\
    StructField("cumulative_persons_fully_vaccinated", IntegerType(), True),\
    StructField("new_vaccine_doses_administered", IntegerType(), True),\
    StructField("cumulative_vaccine_doses_administered", LongType(), True),\
    StructField("new_persons_vaccinated_pfizer", IntegerType(), True),\
    StructField("cumulative_persons_vaccinated_pfizer", IntegerType(), True),\
    StructField("new_persons_fully_vaccinated_pfizer", IntegerType(), True),\
    StructField("cumulative_persons_fully_vaccinated_pfizer", IntegerType(), True),\
    StructField("new_vaccine_doses_administered_pfizer", IntegerType(), True),\
    StructField("cumulative_vaccine_doses_administered_pfizer", IntegerType(), True),\
    StructField("new_persons_vaccinated_moderna", IntegerType(), True),\
    StructField("cumulative_persons_vaccinated_moderna", IntegerType(), True),\
    StructField("new_persons_fully_vaccinated_moderna", IntegerType(), True),\
    StructField("cumulative_persons_fully_vaccinated_moderna", IntegerType(), True),\
    StructField("new_vaccine_doses_administered_moderna", IntegerType(), True),\
    StructField("cumulative_vaccine_doses_administered_moderna", IntegerType(), True),\
    StructField("new_persons_vaccinated_janssen", IntegerType(), True),\
    StructField("cumulative_persons_vaccinated_janssen", IntegerType(), True),\
    StructField("new_persons_fully_vaccinated_janssen", IntegerType(), True),\
    StructField("cumulative_persons_fully_vaccinated_janssen", IntegerType(), True),\
    StructField("new_vaccine_doses_administered_janssen", IntegerType(), True),\
    StructField("cumulative_vaccine_doses_administered_janssen", IntegerType(), True),\
    StructField("new_persons_vaccinated_sinovac", IntegerType(), True),\
    StructField("total_persons_vaccinated_sinovac", IntegerType(), True),\
    StructField("new_persons_fully_vaccinated_sinovac", StringType(), True),\
    StructField("total_persons_fully_vaccinated_sinovac", StringType(), True),\
    StructField("new_vaccine_doses_administered_sinovac", StringType(), True),\
    StructField("total_vaccine_doses_administered_sinovac", StringType(), True)])
  #-----------------------------Schema for epidemiology dataset-------------------------------

epidemiology_schema = StructType([StructField("date",StringType(), True),\
    StructField("location_key",StringType(), True),\
    StructField("new_confirmed", IntegerType(), True),\
    StructField("new_deceased", IntegerType(), True),\
    StructField("new_recovered", IntegerType(), True),\
    StructField("new_tested", IntegerType(), True),\
    StructField("cumulative_confirmed", IntegerType(), True),\
    StructField("cumulative_deceased", IntegerType(), True),\
    StructField("cumulative_recovered", IntegerType(), True),\
    StructField("cumulative_tested", IntegerType(), True)])

  

hosp_df = spark.read.format('csv').option('header',True).schema(hospitalizations_schema).load("wasbs://datasets@trainingbatchaccount.blob.core.windows.net/hospitalizations.csv")
vacc_df = spark.read.format('csv').option('header',True).schema(vaccination_schema).load("wasbs://datasets@trainingbatchaccount.blob.core.windows.net/vaccinations.csv")
epi_df = spark.read.format('csv').option('header',True).schema(epidemiology_schema).load("wasbs://datasets@trainingbatchaccount.blob.core.windows.net/epidemiology.csv")



In [47]:
# to find shape of dataframe 
def sparkShape(dataFrame):
    return (dataFrame.count(), len(dataFrame.columns))
pyspark.sql.dataframe.DataFrame.shape = sparkShape

spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

In [1]:
from pyspark.sql import SparkSession
import functools
 

In [49]:
#---------------------------------Opening hospitalization dataset and cleaning it----------------

hosp_df = hosp_df.withColumn('Date',to_date(hosp_df['date'],'dd-MM-yyyy'))
#fill empty values with zero
hosp_df = hosp_df.fillna(value = 0, subset = ['new_hospitalized_patients','current_hospitalized_patients','current_intensive_care_patients','new_ventilator_patients','cumulative_ventilator_patients','current_ventilator_patients'])
#drop unwanted column
hosp_df = hosp_df.drop('cumulative_ventilator_patients','new_ventilator_patients')
hosp_df = hosp_df.where((hosp_df.location_key == 'AR')&(hosp_df.Date >= '2021-01-01')&(hosp_df.Date <= '2021-12-31'))


In [50]:
 #---------------------------------Opening epidemology dataset and cleaning it----------------
    
# epi_df = epi_df.withColumn('date',to_date(epi_df['date'],format='yyyy-mm-dd'))
# epi_df = epi_df.na.fill(value=0)


#correct date format
epi_df = epi_df.withColumn('Date',to_date(epi_df['date'],format='yyyy-MM-dd'))
epi_df = epi_df.na.fill(value=0)
# vacc_df = vacc_df.withColumn('Date_1',(vacc_df['Date']))
# vacc_df = vacc_df.withColumn("Date_string",vacc_df.Date.cast(StringType()))
epi_df = epi_df.where((epi_df.location_key == 'AR')&(epi_df.Date >= '2021-01-01')&(epi_df.Date <= '2021-12-31'))
epi_df = epi_df.drop('location_key')

In [51]:
epi_df.tail(5)

[Row(Date=datetime.date(2021, 12, 27), new_confirmed=37421, new_deceased=31, new_recovered=0, new_tested=101951, cumulative_confirmed=5544836, cumulative_deceased=117462, cumulative_recovered=0, cumulative_tested=20840125),
 Row(Date=datetime.date(2021, 12, 28), new_confirmed=50165, new_deceased=34, new_recovered=0, new_tested=127203, cumulative_confirmed=5595001, cumulative_deceased=117496, cumulative_recovered=0, cumulative_tested=20967328),
 Row(Date=datetime.date(2021, 12, 29), new_confirmed=59711, new_deceased=33, new_recovered=0, new_tested=149689, cumulative_confirmed=5654712, cumulative_deceased=117529, cumulative_recovered=0, cumulative_tested=21117017),
 Row(Date=datetime.date(2021, 12, 30), new_confirmed=63734, new_deceased=45, new_recovered=0, new_tested=162734, cumulative_confirmed=5718446, cumulative_deceased=117574, cumulative_recovered=0, cumulative_tested=21279751),
 Row(Date=datetime.date(2021, 12, 31), new_confirmed=45805, new_deceased=26, new_recovered=0, new_tested

In [52]:
epi_df.printSchema()

root
 |-- Date: date (nullable = true)
 |-- new_confirmed: integer (nullable = true)
 |-- new_deceased: integer (nullable = true)
 |-- new_recovered: integer (nullable = true)
 |-- new_tested: integer (nullable = true)
 |-- cumulative_confirmed: integer (nullable = true)
 |-- cumulative_deceased: integer (nullable = true)
 |-- cumulative_recovered: integer (nullable = true)
 |-- cumulative_tested: integer (nullable = true)



In [53]:
 #---------------------------------Opening vaccination dataset and cleaning it----------------
    
#selecting needed columns only
vacc_df = vacc_df.select("date","location_key","new_persons_vaccinated","cumulative_persons_vaccinated","new_persons_fully_vaccinated","cumulative_persons_fully_vaccinated","new_vaccine_doses_administered","cumulative_vaccine_doses_administered")

#Drop rows if all the values are null
vacc_df = vacc_df.na.drop(subset=["new_persons_vaccinated","cumulative_persons_vaccinated","new_persons_fully_vaccinated","cumulative_persons_fully_vaccinated","new_vaccine_doses_administered","cumulative_vaccine_doses_administered"] ,how="all")

#fill cumulative value with previous field value (Forward Fill)
vacc_df = vacc_df.withColumn("cumulative_persons_vaccinated", last('cumulative_persons_vaccinated', True).over(Window.partitionBy('location_key').rowsBetween(-sys.maxsize, 0)))
vacc_df = vacc_df.withColumn("cumulative_persons_fully_vaccinated",last('cumulative_persons_fully_vaccinated', True).over(Window.partitionBy('location_key').rowsBetween(-sys.maxsize, 0)))
vacc_df = vacc_df.withColumn("cumulative_vaccine_doses_administered",last('cumulative_vaccine_doses_administered', True).over(Window.partitionBy('location_key').rowsBetween(-sys.maxsize, 0)))

#replace null with zero
vacc_df = vacc_df.na.fill(value=0)

#correct date format
vacc_df = vacc_df.withColumn('Date',to_date(vacc_df['Date'],format='yyyy-MM-dd'))
# vacc_df = vacc_df.withColumn('Date_1',(vacc_df['Date']))
# vacc_df = vacc_df.withColumn("Date_string",vacc_df.Date.cast(StringType()))
vacc_df = vacc_df.where((vacc_df.location_key == 'AR')&(vacc_df.Date >= '2021-01-01')&(vacc_df.Date <= '2021-12-31'))
#  hosp_df.where((hosp_df.location_key == 'AR')&(hosp_df.Date >= '2021-01-01')&(hosp_df.Date <= '2021-12-31'))
vacc_df = vacc_df.drop('location_key')
    

In [54]:
hosp_df.printSchema()
vacc_df.printSchema()
epi_df.printSchema()

root
 |-- Date: date (nullable = true)
 |-- location_key: string (nullable = true)
 |-- new_hospitalized_patients: integer (nullable = true)
 |-- cumulative_hospitalized_patients: integer (nullable = true)
 |-- current_hospitalized_patients: integer (nullable = true)
 |-- new_intensive_care_patients: integer (nullable = true)
 |-- cumulative_intensive_care_patients: integer (nullable = true)
 |-- current_intensive_care_patients: integer (nullable = true)
 |-- current_ventilator_patients: integer (nullable = true)

root
 |-- Date: date (nullable = true)
 |-- new_persons_vaccinated: integer (nullable = true)
 |-- cumulative_persons_vaccinated: integer (nullable = true)
 |-- new_persons_fully_vaccinated: integer (nullable = true)
 |-- cumulative_persons_fully_vaccinated: integer (nullable = true)
 |-- new_vaccine_doses_administered: integer (nullable = true)
 |-- cumulative_vaccine_doses_administered: long (nullable = true)

root
 |-- Date: date (nullable = true)
 |-- new_confirmed: integ

In [55]:
# joining 
# hosp_df
# vacc_df

df_hosp_vacc = hosp_df.join(vacc_df,['Date'],"inner")

df_hosp_vacc_epi = df_hosp_vacc.join(epi_df,['Date'],"inner")
# df_hosp_vacc = hosp_df.join(vacc_df,hosp_df.Date ==  vacc_df.Date,"inner")


# df_hosp_vacc_epi.show(truncate=False)




In [35]:
df_hosp_vacc_epi.shape()

(365, 23)

In [None]:
# df_hosp_vacc_epi = df_hosp_vacc_epi.drop('location_key')

In [37]:
df_hosp_vacc_epi.toPandas().to_csv("df_hosp_vacc_epi1.csv")

In [56]:
df_hosp_vacc_epi.printSchema()

root
 |-- Date: date (nullable = true)
 |-- location_key: string (nullable = true)
 |-- new_hospitalized_patients: integer (nullable = true)
 |-- cumulative_hospitalized_patients: integer (nullable = true)
 |-- current_hospitalized_patients: integer (nullable = true)
 |-- new_intensive_care_patients: integer (nullable = true)
 |-- cumulative_intensive_care_patients: integer (nullable = true)
 |-- current_intensive_care_patients: integer (nullable = true)
 |-- current_ventilator_patients: integer (nullable = true)
 |-- new_persons_vaccinated: integer (nullable = true)
 |-- cumulative_persons_vaccinated: integer (nullable = true)
 |-- new_persons_fully_vaccinated: integer (nullable = true)
 |-- cumulative_persons_fully_vaccinated: integer (nullable = true)
 |-- new_vaccine_doses_administered: integer (nullable = true)
 |-- cumulative_vaccine_doses_administered: long (nullable = true)
 |-- new_confirmed: integer (nullable = true)
 |-- new_deceased: integer (nullable = true)
 |-- new_recov

In [57]:
df_hosp_vacc_epi=df_hosp_vacc_epi.drop('location_key')


df_hosp_vacc_epi=df_hosp_vacc_epi.toPandas()

In [58]:
df_hosp_vacc_epi.dtypes

Date                                     object
new_hospitalized_patients                 int32
cumulative_hospitalized_patients          int32
current_hospitalized_patients             int32
new_intensive_care_patients               int32
cumulative_intensive_care_patients        int32
current_intensive_care_patients           int32
current_ventilator_patients               int32
new_persons_vaccinated                    int32
cumulative_persons_vaccinated             int32
new_persons_fully_vaccinated              int32
cumulative_persons_fully_vaccinated       int32
new_vaccine_doses_administered            int32
cumulative_vaccine_doses_administered     int64
new_confirmed                             int32
new_deceased                              int32
new_recovered                             int32
new_tested                                int32
cumulative_confirmed                      int32
cumulative_deceased                       int32
cumulative_recovered                    

In [80]:
df_hosp_vacc_epi['Date'] = pd.to_datetime(df_hosp_vacc_epi.Date , format = '%Y-%m-%d')

df_hosp_vacc_epi = df_hosp_vacc_epi.sort_values(by='Date')


data = df_hosp_vacc_epi.drop(['Date'], axis=1)
data.index = df_hosp_vacc_epi.Date

In [None]:
pip install statsmodels

In [81]:
import numpy as np
n = 100
#checking stationarity
from statsmodels.tsa.vector_ar.vecm import coint_johansen
#since the test works for only 12 variables, I have randomly dropped
#in the next iteration, I would drop another and check the eigenvalues
johan_test_temp = data.drop([ 'current_hospitalized_patients'], axis=1)
johan_test_temp_dirty = johan_test_temp+0.00001*np.random.rand(365, 20)
coint_johansen(johan_test_temp_dirty,0,1).eig


# from statsmodels.tsa.stattools import grangercausalitytests

# n = 1000
# ls = np.linspace(0, 2*np.pi, n)
# df1Clean = pd.DataFrame(np.sin(ls))
# df2Clean = pd.DataFrame(2*np.sin(ls+1))
# dfClean = pd.concat([df1Clean, df2Clean], axis=1)
# dfDirty = dfClean+0.00001*np.random.rand(n, 2)

# grangercausalitytests(dfClean, maxlag=20, verbose=False)    # Raises LinAlgError
# grangercausalitytests(dfDirty, maxlag=20, verbose=False)  



LinAlgError: Matrix is not positive definite

In [96]:
# hosp_df = pd.get_dummies(hosp_df, columns=['location_key'], drop_first=True)
# hosp_df.convert_objects(convert_numeric=True)
#creating the train and validation set
train = data[:int(0.8*(len(data)))]
valid = data[int(0.8*(len(data))):]

#fit the model
from statsmodels.tsa.vector_ar.var_model import VAR

model = VAR(endog=train)
model_fit = model.fit()

# make prediction on validation
prediction = model_fit.forecast(model_fit.endog, steps=len(valid))
# prediction = model_fit.forecast(model_fit(1 ,1 ,0), steps=len(valid))

  self._init_dates(dates, freq)


In [89]:
#converting predictions to dataframe
pred = pd.DataFrame(index=range(0,len(prediction)),columns=[df_hosp_vacc_epi.columns])
for j in range(0,21):
    for i in range(0, len(prediction)):
        pred.iloc[i][j] = prediction[i][j]



In [86]:
#check rmse
from sklearn.metrics import mean_squared_error
for i in df_hosp_vacc_epi.columns:
    print('rmse value for', i, 'is : ', np.sqrt(mean_squared_error(pred[i], valid[i])))

KeyError: 'Date'

In [92]:
#make final predictions
model = VAR(endog=data)
model_fit = model.fit()
yhat = model_fit.forecast(model_fit.endog, steps=1)
print(yhat)

[[3.98108377e+02 4.37227108e+05 0.00000000e+00 6.82837363e+01
  7.66392837e+04 0.00000000e+00 0.00000000e+00 3.36154983e+03
  5.46101555e+06 1.14467280e+04 4.69945573e+06 1.48082778e+04
  1.01604713e+07 5.66724852e+04 9.42688025e+01 0.00000000e+00
  1.37618712e+05 5.82092349e+06 1.17694269e+05 0.00000000e+00
  2.15230897e+07]]


  self._init_dates(dates, freq)


In [93]:
pred

Unnamed: 0,Date,new_hospitalized_patients,cumulative_hospitalized_patients,current_hospitalized_patients,new_intensive_care_patients,cumulative_intensive_care_patients,current_intensive_care_patients,current_ventilator_patients,new_persons_vaccinated,cumulative_persons_vaccinated,...,new_vaccine_doses_administered,cumulative_vaccine_doses_administered,new_confirmed,new_deceased,new_recovered,new_tested,cumulative_confirmed,cumulative_deceased,cumulative_recovered,cumulative_tested
0,186.644236,419811.644236,0.0,14.149853,74228.149853,0.0,0.0,33865.03188,4616143.031879,13355.603254,...,8231702.635133,-2052.045837,-6.656931,0.0,25514.933243,5294170.954163,116494.343069,0.0,18599375.933242,
1,167.277656,419978.921892,0.0,10.135797,74238.28565,0.0,0.0,28025.998941,4644169.03082,13402.483229,...,8273131.117303,-3051.250217,-19.947368,0.0,21657.412556,5291119.703946,116474.395701,0.0,18621033.345798,
2,172.796228,420151.71812,0.0,9.838718,74248.124368,0.0,0.0,24945.691149,4669114.72197,14696.311277,...,8312773.11973,-2546.943021,-30.09646,0.0,23050.292221,5288572.760925,116444.299241,0.0,18644083.638019,
3,177.748795,420329.466915,0.0,9.98989,74258.114258,0.0,0.0,23102.772513,4692217.494482,16197.608145,...,8352073.500387,-1822.578585,-40.052786,0.0,25547.494281,5286750.182341,116404.246455,0.0,18669631.132299,
4,175.766239,420505.233154,0.0,9.385337,74267.499595,0.0,0.0,21624.58822,4713842.082702,17485.743621,...,8391183.832227,-1422.986939,-49.788332,0.0,27360.334417,5285327.195401,116354.458123,0.0,18696991.466716,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,368.920768,435300.515573,0.0,68.335835,76070.87525,0.0,0.0,4079.433663,4994828.618982,14662.62059,...,10086846.64501,7148.247752,113.352761,0.0,31102.428593,5421625.175239,115358.461761,0.0,20712630.332921,
69,371.875282,435672.390855,0.0,69.264744,76140.139994,0.0,0.0,4389.518792,4999218.137773,14310.981349,...,10105547.14515,7248.941872,116.661283,0.0,31034.631061,5428874.11711,115475.123044,0.0,20743664.963982,
70,374.693918,436047.084773,0.0,70.155168,76210.295162,0.0,0.0,4701.992906,5003920.130679,13965.649529,...,10124214.787586,7345.172712,119.842907,0.0,30968.111926,5436219.289822,115594.965951,0.0,20774633.075908,
71,377.377709,436424.462482,0.0,71.007207,76281.302369,0.0,0.0,5016.073176,5008936.203855,13627.128033,...,10142857.988795,7436.986681,122.897209,0.0,30903.048735,5443656.276503,115717.86316,0.0,20805536.124643,


In [95]:
pred.to_csv("prediction_1.csv")