# Feature-store ingestion using the Spark engine
Make sure the required environment variables are set: `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, and `GOOGLE_APPLICATION_CREDENTIALS`.

### PySpark dataframe source & V3IO KV target

In [1]:
import os
import mlrun
import mlrun.feature_store as fs
import storey
import pandas as pd
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Base name of the MLRun project used by every cell in this notebook
project_name = 'test-notebooks'

# Load the project from the MLRun DB, or create it if it does not exist yet.
# NOTE(review): user_project=True presumably appends the current username to
# the project name (the ingestion log shows "test-notebooks-dani") — confirm.
project = mlrun.get_or_create_project(
    project_name,
    context="./",
    user_project=True,
)

> 2022-12-08 11:05:04,452 [info] loaded project test-notebooks from MLRun DB


In [3]:
# Fetch the transactions dataset from the server, caching it locally as data.csv
# so that subsequent runs do not need to download it again.
if not os.path.exists('data.csv'):
    dataset_path = 'https://s3.wasabisys.com/iguazio/data/fraud-demo-mlrun-fs-docs/data.csv'
    # Keep only the first 500 rows (ordered by 'source') so the demo stays small.
    # reset_index gives the same 0..n-1 index that the cached branch produces.
    transactions_data = (
        pd.read_csv(dataset_path, parse_dates=['timestamp'])
        .sort_values(by='source', axis=0)[:500]
        .reset_index(drop=True)
    )
    # index=False: do not persist the pandas index. Writing it would make the
    # cached file come back with an extra 'Unnamed: 0' column, so the ingested
    # schema would differ between the first run and later (cached) runs.
    transactions_data.to_csv('data.csv', index=False)
else:
    transactions_data = pd.read_csv('data.csv', parse_dates=['timestamp'])
    # Cache files written by the earlier version of this cell still contain the
    # index as 'Unnamed: 0'; drop it if present so both branches yield the same columns.
    transactions_data = transactions_data.drop(columns=['Unnamed: 0'], errors='ignore')

In [4]:
# Define the 'transactions' feature set: keyed by the 'source' entity,
# time-indexed by 'timestamp', and processed with the Spark engine.
transaction_set = fs.FeatureSet(
    'transactions',
    entities=[fs.Entity('source')],
    timestamp_key='timestamp',
    engine='spark',
    description="transactions feature set",
)

# Write to the NoSQL target only; with_defaults=False is passed so that the
# explicit target list is used rather than the default targets.
transaction_set.set_targets(targets=['nosql'], with_defaults=False)

In [5]:
from pyspark.sql import SparkSession

# Get (or create) a Spark session running locally on a single worker thread
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("SparkByExamples.com")
    .getOrCreate()
)

# Convert the pandas transactions frame into a Spark DataFrame to ingest
spark_dataframe = spark.createDataFrame(transactions_data)

# Ingest the Spark DataFrame into the feature set, overwriting existing target data
fs.ingest(transaction_set, spark_dataframe, overwrite=True)

> 2022-12-08 11:05:51,956 [info] writing to target nosql, spark options {'path': 'v3io://projects/test-notebooks-dani/FeatureStore/transactions/nosql/sets/transactions/1670497551956_602/', 'format': 'io.iguaz.v3io.spark.sql.kv', 'key': 'source'}


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


### S3 CSV source & target

In [6]:
from mlrun.datastore.sources import CSVSource
from mlrun.datastore import CSVTarget

# S3 locations: the input CSV file and the output directory for the ingested data
s3_filepath = 's3://igz-app-lab/transactions_cut.csv'
s3_target = 's3://igz-app-lab/spark_operator_FS2/'

# CSV source and CSV target, both living on S3
user_events_source_s3 = CSVSource("user_events", path=s3_filepath)
user_events_target_s3 = CSVTarget("user_events", path=s3_target)

# Spark-engine feature set keyed by 'source' and time-indexed by 'timestamp'
user_events_set = fs.FeatureSet(
    "events2",
    entities=[fs.Entity("source")],
    timestamp_key='timestamp',
    description="user events feature set",
    engine='spark',
)

# Use only the S3 CSV target (no default targets)
user_events_set.set_targets(targets=[user_events_target_s3], with_defaults=False)

In [7]:
# Ingest the S3 CSV source into the feature set, overwriting any existing target data
ingestion_df = fs.ingest(
    featureset=user_events_set,
    source=user_events_source_s3,
    overwrite=True,
)

> 2022-12-08 11:06:06,697 [info] writing to target user_events, spark options {'path': 's3a://igz-app-lab/spark_operator_FS2/1670497566697_81/', 'format': 'csv', 'header': 'true'}


### GCS CSV source & target

In [12]:
from mlrun.datastore.sources import CSVSource
from mlrun.datastore import CSVTarget

# GCS locations: the input CSV file and the output directory for the ingested data
gs_filepath = 'gs://test_bucket-igz/dani/transactions_cut.csv'
gs_target = 'gs://test_bucket-igz/dani/test/'

# CSV source and CSV target, both living on Google Cloud Storage
user_events_source_gs = CSVSource("user_events", path=gs_filepath)
user_events_target_gs = CSVTarget("user_events", path=gs_target)

# Spark-engine feature set keyed by 'source' and time-indexed by 'timestamp'
user_events_set = fs.FeatureSet(
    "events",
    entities=[fs.Entity("source")],
    timestamp_key='timestamp',
    description="user events feature set",
    engine='spark',
)

# Use only the GCS CSV target (no default targets)
user_events_set.set_targets(targets=[user_events_target_gs], with_defaults=False)

In [None]:
# NOTE(review): the GCS ingestion call is left disabled in this notebook.
# Presumably it needs GOOGLE_APPLICATION_CREDENTIALS and access to the bucket
# above — confirm before enabling it.
# ingestion_df = fs.ingest(featureset=user_events_set, source=user_events_source_gs, overwrite=True)