# Part 1: Data ingestion


This notebook shows how to **ingest** different data sources into the **Feature Store**.

The following FeatureSets are created:
- **Transactions**: Monetary transactions between a source and a target.
- **Events**: Account events such as account login or a password change.
- **Label**: Fraud label for the data.

## Step 1 - Fetch, process and ingest the datasets

## 1.1 - Transactions

### Transactions

In [11]:
import pandas as pd
from src.date_adjust import adjust_data_timespan

# Fetch the transactions dataset from the server, keep only the first 10k rows
# (after ordering by `source`), and adjust the sample timestamps to the past 2 days.
transactions_data = (
    pd.read_csv(
        'https://s3.wasabisys.com/iguazio/data/fraud-demo-mlrun-fs-docs/data.csv',
        parse_dates=['timestamp'],
    )
    .sort_values(by='source', axis=0)
    .head(10000)
    .pipe(adjust_data_timespan, new_period='2d')
)

# Preview
transactions_data.head(3)

Unnamed: 0,step,age,gender,zipcodeOri,zipMerchant,category,amount,fraud,timestamp,source,target,device
274633,91,5,F,28007,28007,es_transportation,26.92,0,2023-06-19 10:54:09.670515000,C1022153336,M1823072687,33832bb8607545df97632a7ab02d69c4
286902,94,2,M,28007,28007,es_transportation,48.22,0,2023-06-19 10:54:27.964140913,C1006176917,M348934600,fadd829c49e74ffa86c8da3be75ada53
416998,131,3,M,28007,28007,es_transportation,17.56,0,2023-06-19 10:54:33.071310939,C1010936270,M348934600,58d0422a50bc40c89d2b4977b2f1beea


In [12]:
# Inspect the column names available in the transactions frame
transactions_data.columns

Index(['step', 'age', 'gender', 'zipcodeOri', 'zipMerchant', 'category',
       'amount', 'fraud', 'timestamp', 'source', 'target', 'device'],
      dtype='object')

### Transactions - offline df

In [13]:
# Categories for which a 14-day per-source sum is computed below
main_categories = ["es_transportation", "es_health", "es_otherservices",
       "es_food", "es_hotelservices", "es_barsandrestaurants",
       "es_tech", "es_sportsandtoys", "es_wellnessandbeauty",
       "es_hyper", "es_fashion", "es_home", "es_contents",
       "es_travel", "es_leisure"]

# Value mapping applied to `age`: 'U' becomes '0', every other value is kept as-is
age_value_mapping = {'U': '0'}

# Derive the time/age features on a NEW frame (`assign` returns a copy), so the raw
# `transactions_data` shown by the earlier cells is not mutated by this cell.
transactions_data_p = transactions_data.assign(
    timestamp_day_of_week=transactions_data['timestamp'].dt.weekday,
    timestamp_hour=transactions_data['timestamp'].dt.hour,
    age_mapped=transactions_data['age'].map(lambda x: age_value_mapping.get(x, x)),
)

# One-hot encode the categorical columns (creates `category_*` and `gender_*` columns)
transactions_data_p = pd.get_dummies(transactions_data_p, columns=['category', 'gender'])

# Timestamp-indexed frame used only for the time-bucketed group-bys below.
# It has the same rows in the same order as `transactions_data_p`, which is what
# makes the positional `.values` assignments below line up row by row.
transactions_data_for_agg = transactions_data_p.set_index(['timestamp'])

# Per-source aggregations of `amount` over fixed time buckets (pd.Grouper bins,
# not rolling windows); every row receives the aggregate of its own bucket.
windows = ['2H', '12H', '24H']
operation = ['mean', 'sum', 'count', 'max']
for window in windows:
    # The grouping depends only on the window, so build it once per window
    amount_by_window = transactions_data_for_agg.groupby(['source', pd.Grouper(freq=window)])['amount']
    for op in operation:
        transactions_data_p[f'amount_{op}_{window}'] = amount_by_window.transform(op).values

# Per-source 14-day count of transactions in each main category.
# Raises KeyError if a category never appears in the sample (no `category_<name>` column).
grouped_14d = transactions_data_for_agg.groupby(['source', pd.Grouper(freq='14D')])
for category in main_categories:
    transactions_data_p[f'{category}_sum_14D'] = grouped_14d[f'category_{category}'].transform('sum').values

# Index the final feature frame by the transaction source
transactions_data_p = transactions_data_p.set_index(['source'])
transactions_data_p.head()

Unnamed: 0_level_0,step,age,zipcodeOri,zipMerchant,amount,fraud,timestamp,target,device,timestamp_day_of_week,...,es_barsandrestaurants_sum_14D,es_tech_sum_14D,es_sportsandtoys_sum_14D,es_wellnessandbeauty_sum_14D,es_hyper_sum_14D,es_fashion_sum_14D,es_home_sum_14D,es_contents_sum_14D,es_travel_sum_14D,es_leisure_sum_14D
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C1022153336,91,5,28007,28007,26.92,0,2023-06-19 10:54:09.670515000,M1823072687,33832bb8607545df97632a7ab02d69c4,0,...,1,1,1,1,0,1,0,0,0,0
C1006176917,94,2,28007,28007,48.22,0,2023-06-19 10:54:27.964140913,M348934600,fadd829c49e74ffa86c8da3be75ada53,0,...,4,0,1,1,0,2,0,0,0,0
C1010936270,131,3,28007,28007,17.56,0,2023-06-19 10:54:33.071310939,M348934600,58d0422a50bc40c89d2b4977b2f1beea,0,...,4,0,0,6,6,0,0,0,0,0
C1033736586,108,4,28007,28007,4.5,0,2023-06-19 10:54:45.364062118,M1823072687,30b269ae55984e5584f1dd5f642ac1a3,0,...,3,2,0,1,3,0,2,0,1,0
C1019071188,72,4,28007,28007,1.83,0,2023-06-19 10:55:19.253144001,M348934600,97bee3503a984f59aa6139b59f933c0b,0,...,1,0,0,0,1,4,0,1,1,0


## 1.2 - User events

### User events - fetching

In [14]:
# Fetch the user_events dataset from the server and adjust it to the last 2 days
# so the latest aggregations show up in the online feature vectors.
user_events_data = (
    pd.read_csv(
        'https://s3.wasabisys.com/iguazio/data/fraud-demo-mlrun-fs-docs/events.csv',
        index_col=0,
        quotechar="\'",
        parse_dates=['timestamp'],
    )
    .pipe(adjust_data_timespan, new_period='2d')
)

# Preview
user_events_data.head(3)

Unnamed: 0,source,event,timestamp
45553,C137986193,password_change,2023-06-19 10:54:13.720044000
24134,C1940951230,details_change,2023-06-19 10:54:14.768226091
64444,C247537602,login,2023-06-19 10:54:16.423009103


In [15]:
# One-hot encode the `event` column and index the result by `source`
user_events_data_p = (
    pd.get_dummies(user_events_data, columns=['event'])
    .set_index(['source'])
)
user_events_data_p.head()

Unnamed: 0_level_0,timestamp,event_details_change,event_login,event_password_change
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C137986193,2023-06-19 10:54:13.720044000,0,0,1
C1940951230,2023-06-19 10:54:14.768226091,1,0,0
C247537602,2023-06-19 10:54:16.423009103,0,1,0
C470079617,2023-06-19 10:54:17.713458428,0,0,1
C1142118359,2023-06-19 10:54:18.503750830,0,1,0


## Step 2 - Create a labels data set for model training

### Label df

In [16]:
def create_labels(df):
    """Build the label frame from a transactions frame.

    Takes the `fraud` and `timestamp` columns of `df`, renames `fraud` to
    `label`, casts `timestamp` to "datetime64[ms]" and `label` to int.
    The input frame is left untouched and its index is carried over.

    :param df: DataFrame containing at least the `fraud` and `timestamp` columns.
    :returns:  New DataFrame with the columns `label` and `timestamp`.
    """
    selected = df.loc[:, ['fraud', 'timestamp']].rename(columns={"fraud": "label"})
    return selected.assign(
        timestamp=selected['timestamp'].astype("datetime64[ms]"),
        label=selected['label'].astype(int),
    )

In [17]:
# Build the label frame from the processed transactions (which are indexed by `source`)
labels_set = create_labels(transactions_data_p)
# Preview
labels_set.head()

Unnamed: 0_level_0,label,timestamp
source,Unnamed: 1_level_1,Unnamed: 2_level_1
C1022153336,0,2023-06-19 10:54:09.670
C1006176917,0,2023-06-19 10:54:27.964
C1010936270,0,2023-06-19 10:54:33.071
C1033736586,0,2023-06-19 10:54:45.364
C1019071188,0,2023-06-19 10:55:19.253


## Train

In [18]:
from src.train_sklearn import train_and_val, prepare_data_to_train

# Build the train/test split from the transaction features, the user-event features and the labels.
# NOTE(review): how the three frames are combined and split is defined in src.train_sklearn — not visible here.
X_train, X_test, y_train, y_test = prepare_data_to_train(transactions_data_p, user_events_data_p, labels_set)
# Train and validate; the cell output reports accuracy, precision, recall and F1.
# Presumably the returned object is the best estimator found by the search — confirm in src.train_sklearn.
rf_best = train_and_val(X_train, X_test, y_train, y_test)
# Display the returned model
rf_best

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Accuracy: 0.9947643979057592
Precision: 1.0
Recall: 0.5652173913043478
F1 Score: 0.7222222222222222
