# Feature-store ingestion using the Spark engine
### PySpark dataframe source & V3IO KV target

In [1]:
import os
import mlrun
import mlrun.feature_store as fs
import storey
import pandas as pd
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Name of the MLRun project this notebook works in
project_name = 'test-notebooks'

# Load the project if it already exists, otherwise create it,
# using the current directory as the project context
project = mlrun.get_or_create_project(
    project_name,
    context="./",
    user_project=True,
)

> 2022-12-06 09:35:16,268 [info] loaded project test-notebooks from MLRun DB


In [2]:
# Remote location of the sample transactions CSV
dataset_path = 'https://s3.wasabisys.com/iguazio/data/fraud-demo-mlrun-fs-docs/data.csv'

# Download the CSV (parsing 'timestamp' as datetime), order the rows by
# 'source', and keep only the first 500 rows of the sorted frame
transactions_data = (
    pd.read_csv(dataset_path, parse_dates=['timestamp'])
    .sort_values(by='source', axis=0)
    .head(500)
)

In [3]:
# Define the 'transactions' feature set: keyed by the 'source' entity,
# timestamped by the 'timestamp' column, and processed with the Spark engine
transaction_set = fs.FeatureSet(
    'transactions',
    entities=[fs.Entity('source')],
    timestamp_key='timestamp',
    engine='spark',
    description="transactions feature set",
)

# Use only the 'nosql' target; with_defaults=False is passed so the
# default targets are (presumably) not added on top of it
transaction_set.set_targets(targets=['nosql'], with_defaults=False)

In [4]:
from pyspark.sql import SparkSession

# Get or create a Spark session that runs against a local master ("local[1]")
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("SparkByExamples.com")
    .getOrCreate()
)

# Convert the pandas transactions frame into a Spark dataframe to ingest
spark_dataframe = spark.createDataFrame(transactions_data)

# Ingesting (left commented out in this notebook)
# fs.ingest(transaction_set, spark_dataframe, overwrite=True)

### S3 CSV source & target

In [5]:
# from mlrun.datastore.sources import CSVSource
# from mlrun.datastore import CSVTarget

# user_events_set = fs.FeatureSet("events",
#                                 entities=[fs.Entity("source")],
#                                 timestamp_key='timestamp', 
#                                 description="user events feature set",
#                                 engine='spark')

# s3_filepath = 's3://igz-app-lab/transactions_cut.csv'
# s3_target = 's3://igz-app-lab/spark_operator_FS/'

# user_events_source_s3 = CSVSource("user_events", path=s3_filepath)
# user_events_target_s3 = CSVTarget("user_events", path=s3_target)

# user_events_set.set_targets(targets = [user_events_target_s3], with_defaults=False)

In [6]:
# ingestion_df = fs.ingest(featureset=user_events_set, source=user_events_source_s3, overwrite=True)

In [7]:
# Second ingestion fails 
# ingestion_df = fs.ingest(featureset=user_events_set, source=user_events_source_s3, overwrite=True)

### GCS CSV source & target

In [8]:
from mlrun.datastore.sources import CSVSource
from mlrun.datastore import CSVTarget

# Define the 'events' feature set: keyed by the 'source' entity,
# timestamped by the 'timestamp' column, and processed with the Spark engine
user_events_set = fs.FeatureSet(
    "events",
    entities=[fs.Entity("source")],
    timestamp_key='timestamp',
    description="user events feature set",
    engine='spark',
)

# Google Cloud Storage locations: the input CSV file and the output path
gs_filepath = 'gs://test_bucket-igz/dani/transactions_cut.csv'
gs_target = 'gs://test_bucket-igz/dani/'

# CSV source and CSV target objects pointing at the GCS paths above
user_events_source_gs = CSVSource("user_events", path=gs_filepath)
user_events_target_gs = CSVTarget("user_events", path=gs_target)

# Use only the GCS CSV target; with_defaults=False is passed so the
# default targets are (presumably) not added on top of it
user_events_set.set_targets(
    targets=[user_events_target_gs],
    with_defaults=False,
)

In [None]:
# Ingest the GCS CSV source into the 'events' feature set, passing
# overwrite=True, and keep the value returned by fs.ingest
ingestion_df = fs.ingest(
    featureset=user_events_set,
    source=user_events_source_gs,
    overwrite=True,
)