In [1]:
import time

import pandas as pd
import os
from prophet import Prophet
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import *
from pyspark import SparkContext

jars = "" \
       "./jars/mongo-spark-connector_2.12-3.0.1.jar," \
       "./jars/mongo-java-driver-3.12.5.jar," \
       "./jars/bson-4.0.5.jar," \
       "./jars/spark-core_2.12-3.0.1.jar," \
       "./jars/spark-sql_2.12-3.0.1.jar"

SPARK_MASTER = os.environ["SPARK_MASTER"]
DB_NAME = os.environ["DB_NAME"]
DB_HOST = os.environ["DB_HOST"]
DB_PORT = os.environ["DB_PORT"]
# SPARK_THREAD_COUNT = os.environ["SPARK_THREAD_COUNT"]
SPARK_EXECUTOR_CORES = os.environ["SPARK_EXECUTOR_CORES"]
SPARK_EXECUTOR_MEMORY = os.environ["SPARK_EXECUTOR_MEMORY"]
SPARK_INITIAL_EXECUTORS = os.environ["SPARK_INITIAL_EXECUTORS"]
SPARK_MIN_EXECUTORS = os.environ["SPARK_MIN_EXECUTORS"]
SPARK_MAX_EXECUTORS = os.environ["SPARK_MAX_EXECUTORS"]
SPARK_BACKLOG_TIMEOUT = os.environ["SPARK_BACKLOG_TIMEOUT"]
SPARK_IDLE_TIMEOUT = os.environ["SPARK_IDLE_TIMEOUT"]

GISJOIN = "GISJOIN"

spark = SparkSession \
    .builder \
    .master(SPARK_MASTER) \
    .appName("COVID-19 Time-series - PySpark") \
    .config("spark.jars", jars) \
    .config("spark.driver.extraJavaOptions", "-Dio.netty.tryReflectionSetAccessible=true") \
    .config("spark.executor.extraJavaOptions", "-Dio.netty.tryReflectionSetAccessible=true") \
    .config("park.executor.cores", SPARK_EXECUTOR_CORES) \
    .config("spark.executor.memory", SPARK_EXECUTOR_MEMORY) \
    .config("spark.dynamicAllocation.enabled", "true") \
    .config("spark.dynamicAllocation.shuffleTracking.enabled", "true") \
    .config("spark.dynamicAllocation.initialExecutors", SPARK_INITIAL_EXECUTORS) \
    .config("spark.dynamicAllocation.minExecutors", SPARK_MIN_EXECUTORS) \
    .config("spark.dynamicAllocation.maxExecutors", SPARK_MAX_EXECUTORS) \
    .config("spark.dynamicAllocation.schedulerBacklogTimeout", SPARK_BACKLOG_TIMEOUT) \
    .config("spark.dynamicAllocation.executorIdleTimeout", SPARK_IDLE_TIMEOUT) \
    .getOrCreate()

In [4]:
@pandas_udf(result_schema, PandasUDFType.GROUPED_MAP)
def predict(df0):
    # instantiate the model, configure the parameters
    print('>>> predict(): call')
    m = Prophet()
    m.fit(df0)
    df0_future = m.make_future_dataframe(periods=365)
    df0_forecast = m.predict(df0_future)

    return df0_forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']]

In [5]:
sc = spark.sparkContext
temp = sc._jsc.sc()

mongo_connection_uri = f'mongodb://{DB_HOST}:{DB_PORT}/{DB_NAME}.covid_county_formatted'

df = spark.read.format("mongo").option("uri", mongo_connection_uri).load()

df = df.select(GISJOIN, 'cases', 'deaths', 'date', 'formatted_date')

df.show()

result_schema = StructType([
    StructField("ds", DateType(), True),
    StructField("yhat", DoubleType(), True),
    StructField("yhat_lower", DoubleType(), True),
    StructField("yhat_upper", DoubleType(), True)
])


df_cases = df.select(GISJOIN, 'date', 'cases').withColumnRenamed('date', 'ds').withColumnRenamed('cases', 'y')

df_cases.show()

+--------+-----+------+----------+-------------------+
| GISJOIN|cases|deaths|      date|     formatted_date|
+--------+-----+------+----------+-------------------+
|G0100010|    0|     0|2020-03-27|2020-03-26 18:00:00|
|G0100010|    3|     0|2020-03-25|2020-03-24 18:00:00|
|G0100010|    0|     0|2020-03-28|2020-03-27 18:00:00|
|G0100010|    0|     0|2020-03-29|2020-03-28 18:00:00|
|G0100010|    1|     0|2020-03-30|2020-03-29 18:00:00|
|G0100010|    0|     0|2020-03-31|2020-03-30 18:00:00|
|G0100010|    3|     0|2020-04-01|2020-03-31 18:00:00|
|G0100010|    2|     0|2020-03-26|2020-03-25 18:00:00|
|G0100010|    0|     0|2020-04-04|2020-04-03 18:00:00|
|G0100010|    0|     0|2020-04-05|2020-04-04 18:00:00|
|G0100010|    0|     1|2020-04-06|2020-04-05 18:00:00|
|G0100010|    0|     0|2020-04-02|2020-04-01 18:00:00|
|G0100010|    0|     0|2020-04-07|2020-04-06 18:00:00|
|G0100010|    0|     0|2020-04-08|2020-04-07 18:00:00|
|G0100010|    5|     0|2020-04-09|2020-04-08 18:00:00|
|G0100010|

In [18]:
# df.select('GISJOIN')
p_df = df.toPandas()

In [19]:
p_df

Unnamed: 0,GISJOIN,cases,deaths,date,formatted_date
0,G0100010,0,0,2020-03-27,2020-03-26 18:00:00
1,G0100010,3,0,2020-03-25,2020-03-24 18:00:00
2,G0100010,0,0,2020-03-28,2020-03-27 18:00:00
3,G0100010,0,0,2020-03-29,2020-03-28 18:00:00
4,G0100010,1,0,2020-03-30,2020-03-29 18:00:00
...,...,...,...,...,...
1064945,,10,0,2021-02-24,2021-02-23 17:00:00
1064946,,8,0,2021-03-04,2021-03-03 17:00:00
1064947,,0,0,2021-03-07,2021-03-06 17:00:00
1064948,,4,0,2021-03-05,2021-03-04 17:00:00


In [23]:
gis_joins = df.select('GISJOIN').distinct().rdd.map(lambda r: r[0]).collect()

In [26]:
gis_join = gis_joins[0]

In [34]:
df0 = df.where(df.GISJOIN == gis_join)
df0

DataFrame[GISJOIN: string, cases: int, deaths: int, date: string, formatted_date: timestamp]

In [35]:
predict(df0)

TypeError: Invalid argument, not a string or column: DataFrame[GISJOIN: string, cases: int, deaths: int, date: string, formatted_date: timestamp] of type <class 'pyspark.sql.dataframe.DataFrame'>. For column literals, use 'lit', 'array', 'struct' or 'create_map' function.

In [36]:
df_cases = df.select('GISJOIN', 'date', 'cases').withColumnRenamed('date', 'ds').withColumnRenamed('cases', 'y')

results = (df_cases.groupBy('GISJOIN').apply(predict))



In [None]:
results.collect()