In [1]:
# Initialise findspark first; the pyspark imports only happen in a later cell.
# NOTE(review): presumably this locates the local Spark installation and makes
# pyspark importable — confirm against the environment this notebook targets.
import findspark
findspark.init()

In [None]:
# Load environment settings through python-dotenv before anything else runs.
# NOTE(review): presumably reads a local .env file into the process
# environment — confirm which variables the later cells depend on.
from dotenv import load_dotenv
load_dotenv()

In [3]:

import pyspark
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
import pandas as pd
import os

# Full version string of the installed pyspark package.
SPARK_VERSION = pyspark.__version__
# "<major>.<minor>" prefix of that version; the Spark config cell uses it to
# pick the matching iceberg-spark-runtime artifact.
SPARK_MINOR_VERSION = ".".join(SPARK_VERSION.split(".", 2)[:2])
# Iceberg release used in the Maven coordinates of the Spark config cell.
ICEBERG_VERSION = "1.6.1"

In [4]:
# Collect the extra engine jars that live in ./engines next to this notebook.
# The resulting list is joined into the "spark.jars" setting further down.
import warnings

cur_dir = os.path.realpath("./")
external_libs_dir = os.path.join(cur_dir, "engines")

if os.path.isdir(external_libs_dir):
    # Keep only regular *.jar files: os.listdir also returns sub-directories
    # and stray files (.gitkeep, .DS_Store, ...) that must not end up on
    # spark.jars. Sorting makes the order reproducible, since os.listdir
    # returns entries in arbitrary order.
    external_libs_jars = sorted(
        os.path.join(external_libs_dir, f)
        for f in os.listdir(external_libs_dir)
        if f.endswith(".jar")
        and os.path.isfile(os.path.join(external_libs_dir, f))
    )
else:
    # Fail soft with a clear message instead of a bare FileNotFoundError;
    # packages resolved through spark.jars.packages are still available.
    warnings.warn(
        f"External jar directory not found: {external_libs_dir}; "
        "continuing without local jars."
    )
    external_libs_jars = []

In [5]:
# Spark settings applied to the session built in the next cell.
config = {
    # Local jars discovered in ./engines.
    "spark.jars": ",".join(external_libs_jars),
    # Both repositories are fetched over HTTPS so downloaded artifacts cannot
    # be tampered with in transit.
    "spark.jars.repositories": "https://packages.confluent.io/maven/,https://repo1.maven.org/maven2/",
    # Maven coordinates resolved at session start-up, one per line for
    # readability. Versions that must track the installed pyspark or the
    # chosen Iceberg release are derived from the constants defined above
    # instead of being repeated as literals.
    "spark.jars.packages": ",".join(
        [
            # spark-avro is released together with Spark; keep it in lockstep.
            f"org.apache.spark:spark-avro_2.12:{SPARK_VERSION}",
            "za.co.absa:abris_2.12:6.4.0",
            "com.lihaoyi:os-lib_2.12:0.8.1",
            "org.apache.kafka:kafka-clients:3.8.0",
            "io.delta:delta-spark_2.12:3.2.0",
            # Same Iceberg release as the Spark runtime artifact below.
            f"org.apache.iceberg:iceberg-aws-bundle:{ICEBERG_VERSION}",
            "org.apache.hadoop:hadoop-aws:3.2.2",
            "org.apache.hadoop:hadoop-client:3.2.2",
            "org.apache.hadoop:hadoop-client-runtime:3.2.2",
            f"org.apache.iceberg:iceberg-spark-runtime-{SPARK_MINOR_VERSION}_2.12:{ICEBERG_VERSION}",
        ]
    ),
    # Enable both the Delta and the Iceberg SQL extensions.
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension,org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
    # The built-in session catalog is backed by Delta and used by default.
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    "spark.sql.defaultCatalog": "spark_catalog"
}

In [None]:
# Local-mode Spark configuration carrying every key/value pair from `config`.
spark_config = (
    SparkConf()
    .setMaster('local')
    .setAppName("seeknal-test")
    .setAll(list(config.items()))
)

# getOrCreate() hands back an existing session when one is already running.
spark = SparkSession.builder.config(conf=spark_config).getOrCreate()

# Make sure subsequent SQL runs against the session catalog.
spark.sql("USE spark_catalog")

In [7]:
from datetime import datetime, timedelta

from seeknal.entity import Entity
from seeknal.featurestore.feature_group import (
    FeatureGroup,
    Materialization,
    OfflineMaterialization,
    OfflineStore,
    OfflineStoreEnum,
    FeatureStoreFileOutput,
    OnlineStore,
    OnlineStoreEnum,
    HistoricalFeatures,
    FeatureLookup,
    FillNull,
    GetLatestTimeStrategy,
    OnlineFeatures,
)

from seeknal.flow import *
from seeknal.featurestore.featurestore import Feature
from seeknal.common_artifact import Source, Rule, Common, Dataset
from seeknal.project import Project
from seeknal.workspace import Workspace
from seeknal.tasks.sparkengine import aggregators as G
from seeknal.tasks.sparkengine import transformers as T
from seeknal.tasks.sparkengine.transformers.spark_engine_transformers import (
    JoinTablesByExpr,
    JoinType,
    TableJoinDef,
)
from pyspark import SparkContext
from pyspark.sql import DataFrame
import pandas as pd

In [None]:
# Attach the demo project.
project = Project(
    name="demo_project",
    description="demo project",
)
project.get_or_create()

# Create the "dev" workspace, or use it if it is already there.
dev_workspace = Workspace(
    name="dev",
    description="dev workspace",
)
dev_workspace.get_or_create()

# Show which workspace is currently active.
Workspace.current()

In [9]:
# Schema of the sample data, in Spark's "name:type" DDL short form.
columns = "day:string, feature1:float, feature2:float, id:string"

# Sample rows as (day, feature1, feature2, id) tuples; days use yyyyMMdd.
vals = [
    # --- id "1" ---
    ("20190620", 1.0, 1.0, "1"),
    ("20190610", -1.0, -1.0, "1"),
    ("20190602", 50.0, 50.0, "1"),
    ("20190601", 0.0, 0.0, "1"),
    ("20190520", 22.2, 22.2, "1"),
    ("20190510", 2.0, 2.0, "1"),
    ("20190501", 2.1, 2.1, "1"),
    # --- id "2" (contains one row with missing feature values) ---
    ("20190620", 1.0, 1.0, "2"),
    ("20190710", None, None, "2"),
    ("20190602", 50.0, 50.0, "2"),
    ("20190601", 0.0, 0.0, "2"),
    ("20190520", 22.2, 22.2, "2"),
    ("20190510", 2.0, 2.0, "2"),
    ("20190501", 2.1, 2.1, "2"),
]

daily_features_1 = spark.createDataFrame(data=vals, schema=columns)

## Feature Engineering

In [None]:
# Example transformer: an ADD_DATE stage configured to read `day` with the
# yyyyMMdd pattern and emit `new_date` with the yyyy-MM-dd pattern.
format_date = T.Transformer(
    T.ClassName.ADD_DATE,
    inputCol="day",
    outputCol="new_date",
    inputDateFormat="yyyyMMdd",
    outputDateFormat="yyyy-MM-dd",
)

# SQL stage: expose `id` as `msisdn` and the reformatted date as `day`.
select_features = T.SQL(
    statement="SELECT id as msisdn, feature1, feature2, new_date as day FROM __THIS__"
)

# Assemble the task step by step, then run it on the Spark session.
pipeline = SparkEngineTask().add_input(dataframe=daily_features_1)
pipeline = pipeline.add_stage(transformer=format_date)
pipeline = pipeline.add_stage(transformer=select_features)
res = pipeline.transform(spark)
res.show()

## Save feature group to offline store

In [11]:
# Set materialization to a dummy, file-based feature store located in a
# `feature_store` folder under the notebook's working directory.
feature_store_output = FeatureStoreFileOutput(path=f"file:///{cur_dir}/feature_store")
dummy_store = OfflineStore(
    kind=OfflineStoreEnum.FILE,
    name="demo_feature_store",
    value=feature_store_output,
)

# Offline-only materialization: overwrite mode, no TTL, `day` as event time.
offline_settings = OfflineMaterialization(
    store=dummy_store,
    mode="overwrite",
    ttl=None,
)
materialization = Materialization(
    event_time_col="day",
    offline_materialization=offline_settings,
    offline=True,
)

In [None]:
# Entity keyed on `msisdn`, the column produced by the SQL stage above.
msisdn_entity = Entity(name="msisdn", join_keys=["msisdn"]).get_or_create()

# Feature group bound to that entity and to the file-based materialization.
dummy_fg = FeatureGroup(
    name="dummy_feature_group",
    entity=msisdn_entity,
    materialization=materialization,
)
dummy_fg = dummy_fg.set_dataframe(res)

# NOTE(review): set_features() is called without arguments — presumably it
# derives the feature list from the attached dataframe; confirm.
dummy_fg.set_features()
dummy_fg.get_or_create()

In [None]:
# Print the schema of the transformed dataframe that backs the feature group.
res.printSchema()

In [None]:
# Write the feature group, passing 2019-03-05 as the feature start time.
write_start_time = datetime(2019, 3, 5)
dummy_fg.set_dataframe(res).write(feature_start_time=write_start_time)

## Load feature group from offline store

In [None]:
# Reference the stored feature group by name only.
my_fg = FeatureGroup(name="dummy_feature_group")

# Look up features from that group.
fs = FeatureLookup(source=my_fg)

# Null-filling rule: value "0.0" declared with dataType "double".
fillnull = FillNull(
    value="0.0",
    dataType="double",
)

hist = HistoricalFeatures(
    lookups=[fs],
    fill_nulls=[fillnull],
)

# Same start time as the one used when the group was written.
df = hist.to_dataframe(
    feature_start_time=datetime(2019, 3, 5),
)
df.show()