# <span style="font-weight:bold; font-size: 3rem; color:#1EB182;"><img src="images/icon102.png" width="38px"></img> **Hopsworks Feature Store** </span><span style="font-weight:bold; font-size: 3rem; color:#333;">- Part 02: Feature Pipeline</span>

## 🗒️ This notebook is divided into 3 sections:
1. Parsing Data.
2. Preparing dataframes.
3. Feature Group Insertion.

## <span style='color:#ff5f27'> 📝 Imports</span>

In [1]:
import pandas as pd
from datetime import datetime
import time 
import os 

from parsing_functions import *

## <span style='color:#ff5f27'> 🧑🏻‍🏫 Dataset Preparation</span>

#### <span style='color:#ff5f27'> 🚖 Rides Data</span>

In [2]:
# Build a new batch of ride records with the project helper
# (presumably provided by the `parsing_functions` star import — confirm).
n_new_rides = 150
df_rides = generate_rides_data(n_new_rides)

# Show the generated frame as the cell output.
df_rides

Unnamed: 0,ride_id,pickup_datetime,pickup_longitude,dropoff_longitude,pickup_latitude,dropoff_latitude,passenger_count,taxi_id,driver_id
0,0ea3513dae8903e5354e58457598333c,1596576300000,-73.11600,-74.27999,41.74669,40.72651,4,150,190
1,91c7d626ae99d1fc869a300e16386e99,1607076100000,-73.72674,-73.05540,41.12311,40.66890,4,56,186
2,f140fd39672d9019b8cb78c59555e2a7,1585877300000,-73.54131,-73.09743,41.46730,41.53585,3,88,66
3,66cde0617c72fc842642700fd8905a43,1601700700000,-73.10467,-74.35972,41.46837,40.65167,4,146,99
4,c8b524cb197f7bd6774caaf37e6cbd1e,1605607900000,-73.35801,-73.13973,41.32444,41.59873,1,51,64
...,...,...,...,...,...,...,...,...,...
145,37c5f09edd202f5cc6162434406ba3ac,1579697900000,-73.31275,-72.80552,41.70019,41.14509,2,61,167
146,f15eb508ca545e32a1bce88fd7c5a65a,1606801500000,-72.81739,-74.45853,41.71455,41.03197,4,39,75
147,5b87854031ed2c9decf2f665c45a0846,1604936500000,-73.19607,-73.72320,40.92672,41.41583,4,2,132
148,068ae8d68d964742f820bec1c066fd11,1602136500000,-73.13963,-73.33528,41.38667,41.39011,1,94,105


In [3]:
# Distance between pickup and dropoff points. `distance` is a project helper
# called as (lat_a, lon_a, lat_b, lon_b); its formula and units are not
# visible in this notebook — TODO confirm in parsing_functions.
df_rides["distance"] = distance(
    *(
        df_rides[coordinate]
        for coordinate in (
            "pickup_latitude",
            "pickup_longitude",
            "dropoff_latitude",
            "dropoff_longitude",
        )
    )
)

In [4]:
# Reference coordinates of nearby airports, stored as (longitude, latitude):
# index 1 is passed to `distance` as the latitude and index 0 as the longitude.
jfk = (-73.7781, 40.6413)
ewr = (-74.1745, 40.6895)
lgr = (-73.8740, 40.7769)

# For every airport add `pickup_distance_to_<code>` and
# `dropoff_distance_to_<code>`. Columns are created airport by airport,
# pickup before dropoff, so the resulting column order is stable.
for airport_code, (airport_lon, airport_lat) in (("jfk", jfk), ("ewr", ewr), ("lgr", lgr)):
    for ride_end in ("pickup", "dropoff"):
        df_rides[f"{ride_end}_distance_to_{airport_code}"] = distance(
            airport_lat,
            airport_lon,
            df_rides[f"{ride_end}_latitude"],
            df_rides[f"{ride_end}_longitude"],
        )

In [5]:
# Interpret the raw pickup timestamps as milliseconds since the Unix epoch and
# convert them to pandas datetimes so calendar fields can be derived from them.
pickup_epoch_ms = df_rides["pickup_datetime"]
df_rides["pickup_datetime"] = pd.to_datetime(pickup_epoch_ms, unit="ms")

In [6]:
# Derive calendar features from the pickup timestamp (a datetime column at
# this point, converted in the previous cell).
# The vectorised `.dt` accessor replaces the row-by-row `.apply(lambda ...)`.
# `.dt.weekday` uses the same Monday=0 convention as `Timestamp.weekday()`.
# The explicit int64 cast keeps the column dtypes identical to what the
# per-row lambdas produced, independent of the pandas version's `.dt` dtype.
pickup_dt = df_rides["pickup_datetime"].dt
df_rides["year"] = pickup_dt.year.astype("int64")
df_rides["weekday"] = pickup_dt.weekday.astype("int64")
df_rides["hour"] = pickup_dt.hour.astype("int64")

In [7]:
# Convert the pickup datetimes back to integer milliseconds since the Unix epoch.
# Epoch arithmetic is used instead of reinterpreting the raw datetime64 values:
#   * it does not rely on `np` being in scope (numpy is never imported
#     explicitly in this notebook), and
#   * it does not assume the underlying values are stored in nanoseconds.
# If this cell is re-run after the column is already integer, the subtraction
# raises instead of silently dividing the values by 10**6 again.
unix_epoch = pd.Timestamp("1970-01-01")
df_rides["pickup_datetime"] = (
    (df_rides["pickup_datetime"] - unix_epoch) // pd.Timedelta(milliseconds=1)
).astype("int64")

In [8]:
# Persist the newly generated ride ids to a CSV file so the fares section
# below can read them back and attach them to the fares dataframe.
# The dataframe index is written alongside the ids (pandas default).
df_rides["ride_id"].to_csv("new_ride_ids.csv")

In [9]:
# Store the count/id columns as 64-bit integers before insertion.
df_rides = df_rides.astype(
    {int_column: "int64" for int_column in ("passenger_count", "taxi_id", "driver_id")}
)


#### <span style='color:#ff5f27'> 💸 Fares Data</span>

In [10]:
# Generate one fare record per ride. The row count is taken from `df_rides`
# rather than repeating the literal 150: ride ids are attached to the fares
# by row position further down, so both frames must have the same length.
df_fares = generate_fares_data(len(df_rides))

# Show the generated frame as the cell output.
df_fares

Unnamed: 0,total_fare,tip,tolls,taxi_id,driver_id
0,190,14,0,6,35
1,89,51,3,23,19
2,8,30,3,36,48
3,175,47,2,165,156
4,104,31,1,185,150
...,...,...,...,...,...
145,35,4,5,51,168
146,156,7,5,134,19
147,75,10,1,178,90
148,242,33,0,98,42


In [11]:
# Cast every fares column to 64-bit integers.
df_fares = df_fares.astype({fare_column: "int64" for fare_column in df_fares.columns})

In [12]:
# Load the ride ids written earlier for the rides feature group and attach
# them to the fares by row position (both frames use a default integer index).
# Only the `ride_id` column is read, and it is pinned to `str` so the ids are
# never subject to numeric type inference by the CSV parser.
df_fares["ride_id"] = pd.read_csv(
    "new_ride_ids.csv",
    usecols=["ride_id"],
    dtype={"ride_id": str},
)["ride_id"]

In [13]:
# Monetary columns are stored as double-precision floats.
df_fares = df_fares.astype(
    {money_column: "double" for money_column in ("tip", "tolls", "total_fare")}
)

## <span style="color:#ff5f27;"> 🔮 Connecting to Hopsworks Feature Store </span>

In [14]:
import hopsworks

# Log in to Hopsworks and keep the project handle for later use.
project = hopsworks.login()
# Handle to the project's feature store; the feature groups are retrieved from it.
fs = project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/164




Connected. Call `.close()` to terminate connection gracefully.


## <span style="color:#ff5f27;">🪄 Retrieving Feature Groups</span>

In [15]:
# Obtain handles to version 1 of the two feature groups, rides first, then fares.
rides_fg, fares_fg = (
    fs.get_or_create_feature_group(name=feature_group_name, version=1)
    for feature_group_name in ("rides_fg", "fares_fg")
)

## <span style="color:#ff5f27;">🧬 Inserting into Feature Groups</span>

In [16]:
# Insert the prepared rides dataframe into the rides feature group.
# Per the recorded output, the insert validates the data against the attached
# expectation suite, uploads the rows and launches an offline backfill job;
# the returned (job, validation report) pair is displayed as the cell output.
rides_fg.insert(df_rides)

2022-09-12 20:37:24,884 INFO: 	2 expectation(s) included in expectation_suite.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/164/fs/106/fg/631


Uploading Dataframe: 0.00% |          | Rows 0/150 | Elapsed Time: 00:00 | Remaining Time: ?

Launching offline feature group backfill job...
Backfill Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/164/jobs/named/rides_fg_1_offline_fg_backfill/executions


(<hsfs.core.job.Job at 0x1fd9169a640>,
 {
   "results": [
     {
       "success": true,
       "result": {
         "element_count": 150,
         "missing_count": 0,
         "missing_percent": 0.0,
         "unexpected_count": 0,
         "unexpected_percent": 0.0,
         "unexpected_percent_total": 0.0,
         "unexpected_percent_nonmissing": 0.0,
         "partial_unexpected_list": []
       },
       "meta": {},
       "exception_info": {
         "raised_exception": false,
         "exception_message": null,
         "exception_traceback": null
       },
       "expectation_config": {
         "meta": {
           "expectationId": 295
         },
         "kwargs": {
           "column": "pickup_longitude",
           "min_value": -74.5,
           "max_value": -72.8
         },
         "expectation_type": "expect_column_values_to_be_between"
       }
     },
     {
       "success": true,
       "result": {
         "element_count": 150,
         "missing_count": 0,
      

In [17]:
# Insert the prepared fares dataframe into the fares feature group.
# Per the recorded output, the insert validates the data against the attached
# expectation suite, uploads the rows and launches an offline backfill job;
# the returned (job, validation report) pair is displayed as the cell output.
fares_fg.insert(df_fares)

2022-09-12 20:38:31,509 INFO: 	1 expectation(s) included in expectation_suite.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/164/fs/106/fg/632


Uploading Dataframe: 0.00% |          | Rows 0/150 | Elapsed Time: 00:00 | Remaining Time: ?

Launching offline feature group backfill job...
Backfill Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/164/jobs/named/fares_fg_1_offline_fg_backfill/executions


(<hsfs.core.job.Job at 0x1fd9197c970>,
 {
   "results": [
     {
       "success": true,
       "result": {
         "element_count": 150,
         "missing_count": 0,
         "missing_percent": 0.0,
         "unexpected_count": 0,
         "unexpected_percent": 0.0,
         "unexpected_percent_total": 0.0,
         "unexpected_percent_nonmissing": 0.0,
         "partial_unexpected_list": []
       },
       "meta": {},
       "exception_info": {
         "raised_exception": false,
         "exception_message": null,
         "exception_traceback": null
       },
       "expectation_config": {
         "meta": {
           "expectationId": 297
         },
         "kwargs": {
           "column": "total_fare",
           "min_value": 3,
           "max_value": 5000
         },
         "expectation_type": "expect_column_values_to_be_between"
       }
     }
   ],
   "success": true,
   "meta": {
     "great_expectations_version": "0.15.18",
     "expectation_suite_name": "validate_on

---

## <span style="color:#ff5f27;">⏭️ **Next:** Part 03 </span>

In the next notebook, we will create a feature view and training dataset.