In [5]:
#****************************************************************************
# (C) Cloudera, Inc. 2020-2023
#  All rights reserved.
#
#  Applicable Open Source License: GNU Affero General Public License v3.0
#
#  NOTE: Cloudera open source products are modular software products
#  made up of hundreds of individual components, each of which was
#  individually copyrighted.  Each Cloudera open source product is a
#  collective work under U.S. Copyright Law. Your license to use the
#  collective work is as provided in your written agreement with
#  Cloudera.  Used apart from the collective work, this file is
#  licensed for your use pursuant to the open source license
#  identified above.
#
#  This code is provided to you pursuant a written agreement with
#  (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
#  this code. If you do not have a written agreement with Cloudera nor
#  with an authorized and properly licensed third party, you do not
#  have any rights to access nor to use this code.
#
#  Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the
#  contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
#  KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
#  WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
#  IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
#  FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
#  AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
#  ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
#  OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
#  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
#  CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
#  RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
#  BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
#  DATA.
#
#  Author(s): Paul de Fusco
#***************************************************************************/

In [6]:
import os
import numpy as np
import pandas as pd
from prophet import Prophet, serialize
from prophet.diagnostics import cross_validation, performance_metrics
import mlflow

  from .autonotebook import tqdm as notebook_tqdm
Importing plotly failed. Interactive plots will not work.


In [7]:
import cml.data_v1 as cmldata
import pyspark.pandas as ps
from pyspark.sql.functions import *



In [9]:
# Value of the PROJECT_OWNER environment variable; raises KeyError if it is unset.
# NOTE(review): not referenced by any cell visible here — confirm it is still needed.
USERNAME = os.environ["PROJECT_OWNER"]
# Name of the data connection handed to cmldata.get_connection() below.
CONNECTION_NAME = "telefonicabr-az-dl"

In [20]:
from pyspark import SparkContext

# Driver/executor sizing, registered as Spark system properties (in this order)
# before the session is requested from the data connection.
_SPARK_PROPERTIES = (
    ('spark.driver.cores', '2'),
    ('spark.driver.memory', '4g'),
    ('spark.executor.cores', '2'),
    ('spark.executor.memory', '4g'),
)
for _prop_name, _prop_value in _SPARK_PROPERTIES:
    SparkContext.setSystemProperty(_prop_name, _prop_value)

# Open the named data connection and obtain a Spark session from it.
conn = cmldata.get_connection(CONNECTION_NAME)
spark = conn.get_spark_session()

Spark Application Id:spark-application-1729905181281


In [21]:
# Select the two source columns, rename them to the `ds` / `y` names used for
# the Prophet fit further down, and cast `ds` to a date.
df = (
    spark.sql("SELECT DT_PRMR_ATCV_LNHA, COMPANY FROM SPARK_CATALOG.TELCO_MEDALLION.PRODUCTS_SILVER")
    .withColumnRenamed("COMPANY", "y")
    .withColumnRenamed("DT_PRMR_ATCV_LNHA", "ds")
    .withColumn("ds", to_date("ds"))
)
# Show the resulting column names and types.
df.printSchema()

root
 |-- ds: date (nullable = true)
 |-- y: integer (nullable = true)



In [22]:
# Keep only rows whose `ds` is after 2022-12-31, then collect the result into
# a pandas DataFrame (from here on `df` is pandas, not Spark).
df = (
    df.filter(col("ds") > "2022-12-31")
    .toPandas()
)

                                                                                

In [23]:
def extract_params(pr_model):
    """Collect the scalar-valued simple attributes of a model.

    Reads every attribute named in ``serialize.SIMPLE_ATTRIBUTES`` from
    ``pr_model`` and keeps only those whose value is an ``int``, ``float``,
    ``str`` or ``bool``.

    Args:
        pr_model: Object exposing the attributes listed in
            ``serialize.SIMPLE_ATTRIBUTES``.

    Returns:
        dict: Attribute name -> value, restricted to the scalar types above.

    Raises:
        AttributeError: If ``pr_model`` lacks one of the listed attributes.
    """
    scalar_types = (int, float, str, bool)
    simple_params = {}
    for attr_name in serialize.SIMPLE_ATTRIBUTES:
        attr_value = getattr(pr_model, attr_name)
        # Non-scalar values (lists, frames, None, ...) are skipped.
        if isinstance(attr_value, scalar_types):
            simple_params[attr_name] = attr_value
    return simple_params

In [24]:
# Select the MLflow experiment that receives this run (MLflow creates it when
# it does not exist yet, as the log output of this cell shows).
mlflow.set_experiment("prophet-forecast")
with mlflow.start_run():
    # Fit Prophet with its default settings on the pandas frame `df`.
    model = Prophet().fit(df)
    # Scalar model attributes only, so they can be logged as run parameters.
    params = extract_params(model)

    # Cross-validate the fitted model. Horizon, period and initial window are
    # given as "30 days" / "30 days" / "60 days"; thread-based parallelism is
    # requested and the progress bar is disabled.
    metrics_raw = cross_validation(
        model=model,
        horizon="30 days",
        period="30 days",
        initial="60 days",
        parallel="threads",
        disable_tqdm=True,
    )

    # Turn the cross-validation output into metrics, drop the `horizon`
    # column, and average each remaining column into a {name: mean} dict.
    cv_metrics = performance_metrics(metrics_raw)
    metrics = cv_metrics.drop(columns=["horizon"]).mean().to_dict()

    # The training data can be retrieved from the fit model for convenience
    train = model.history

    # Log the fitted model under the "prophet_model" artifact path, using the
    # first 10 `ds` values of the training history as the input example.
    model_info = mlflow.prophet.log_model(
        model, artifact_path="prophet_model", input_example=train[["ds"]].head(10)
    )
    # Attach the extracted parameters and the averaged CV metrics to the run.
    mlflow.log_params(params)
    mlflow.log_metrics(metrics)

2024/10/26 01:15:17 INFO mlflow.tracking.fluent: Experiment with name 'prophet-forecast' does not exist. Creating a new experiment.
INFO:prophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmplppovb8y/02u3gh_h.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmplppovb8y/174qlv6e.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/home/cdsw/.local/lib/python3.10/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=1991', 'data', 'file=/tmp/tmplppovb8y/02u3gh_h.json', 'init=/tmp/tmplppovb8y/174qlv6e.json', 'output', 'file=/tmp/tmplppovb8y/prophet_modeljmbvg1ou/prophet_model-20241026011518.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
01:15:18 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing

In [25]:
# Reload the Prophet model from the URI recorded in `model_info` by the logging cell.
loaded_model = mlflow.prophet.load_model(model_info.model_uri)

In [26]:
# Predict over the frame returned by make_future_dataframe(60); the printed
# dates show that it contains historical rows as well as the added periods.
forecast = loaded_model.predict(loaded_model.make_future_dataframe(60))
# Keep only the date and point-estimate columns for the last 90 rows.
forecast = forecast[["ds", "yhat"]].tail(90)
# Python f-strings interpolate with "{...}" alone; the former "${...}" printed
# a literal "$" in front of the table, so the "$" has been removed.
print(f"forecast:\n{forecast.head(30)}")

forecast:
$            ds      yhat
203 2023-11-20  5.047902
204 2023-11-21  6.014548
205 2023-11-22  6.027425
206 2023-11-24  5.633633
207 2023-11-25  5.653667
208 2023-11-26  5.584663
209 2023-11-27  5.033298
210 2023-11-28  5.999944
211 2023-11-29  6.012820
212 2023-12-01  5.619029
213 2023-12-04  5.018694
214 2023-12-06  5.998216
215 2023-12-07  5.268505
216 2023-12-08  5.604425
217 2023-12-09  5.624458
218 2023-12-10  5.555455
219 2023-12-12  5.970735
220 2023-12-13  5.983612
221 2023-12-14  5.253900
222 2023-12-17  5.540851
223 2023-12-19  5.956131
224 2023-12-20  5.969008
225 2023-12-22  5.575216
226 2023-12-23  5.595250
227 2023-12-24  5.526246
228 2023-12-25  4.974881
229 2023-12-26  5.941527
230 2023-12-28  5.224692
231 2023-12-29  5.560612
232 2023-12-31  5.511642
