# <span style="font-weight:bold; font-size: 3rem; color:#1EB182;"> **Hopsworks Feature Store** </span><span style="font-weight:bold; font-size: 3rem; color:#333;">- Part 01: Load, Engineer & Connect</span>

## <span style="color:#ff5f27;"> 📝 Imports </span>

In [35]:
import hopsworks
import pandas as pd
from graphdatascience import GraphDataScience
from keys import *

In [4]:
# Template of the settings this notebook reads (presumably supplied by the local
# `keys` module imported above). Kept as an inert string so that no real
# credentials are stored in the notebook itself.
'''
NEO4J_URI = "neo4j://..."
NEO4J_USER = "..."
NEO4J_PASSWORD = "..."
DATABASE_NAME = "..."
'''

'\nNEO4J_URI = "neo4j://..."\nNEO4J_USER = "..."\nNEO4J_PASSWORD = "..."\nDATABASE_NAME = "..."\n'

## <span style="color:#ff5f27;"> 💽 Loading Graph from Neo4J </span>


In [5]:
# Create the Graph Data Science client for the target database. The URI,
# credentials and database name are not defined in this notebook; they are
# expected to come from the `keys` star-import.
gds = GraphDataScience(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD), database=DATABASE_NAME)

In [6]:
# List the graphs currently present in the GDS graph catalog (one row per graph).
gds.graph.list()

Unnamed: 0,degreeDistribution,graphName,database,memoryUsage,sizeInBytes,nodeCount,relationshipCount,configuration,density,creationTime,modificationTime,schema,schemaWithOrientation
0,"{'min': 0, 'max': 124, 'p90': 4, 'p999': 124, ...",portRoutes2,citi-supply-chain-supplychain-nov-3-2023-11-17-23,2559 KiB,2620584,729,1305,{},0.002459,2023-11-03T18:46:50.246652087+00:00,2023-11-03T18:46:50.598936154+00:00,"{'graphProperties': {}, 'nodes': {'Port': {}},...","{'graphProperties': {}, 'nodes': {'Port': {}},..."


In [7]:
# Fetch a client-side handle to the existing "portRoutes2" graph from the catalog.
port_routes = gds.graph.get("portRoutes2")

In [8]:
# Display the graph summary: name, node/relationship counts, schema, memory usage.
port_routes

Graph({'graphName': 'portRoutes2', 'nodeCount': 729, 'relationshipCount': 1305, 'database': 'citi-supply-chain-supplychain-nov-3-2023-11-17-23', 'configuration': {}, 'schema': {'graphProperties': {}, 'nodes': {'Port': {}}, 'relationships': {'HAS_TRADE_ROUTE': {'distance': 'Float (DefaultValue(NaN), PERSISTENT, Aggregation.NONE)'}}}, 'memoryUsage': '2559 KiB'})

In [9]:
# Node properties available in the graph, per node label.
port_routes.node_properties()

Port    []
dtype: object

In [10]:
# Name under which the graph is registered in the catalog.
port_routes.name()

'portRoutes2'

## <span style="color:#ff5f27;"> 🛠️ Graph Feature Engineering using Neo4J APIs</span>


In [13]:
# Stream the "distance" property of the relationships in `port_routes`, drop
# rows with missing values, relabel the four streamed columns positionally and
# keep only the two endpoints plus the distance.
# NOTE(review): the keyword is spelled `node_properties` although it selects a
# relationship property - confirm against the installed graphdatascience client.
distance_km_df = (
    gds.graph.relationshipProperty.stream(port_routes, node_properties="distance")
    .dropna()
    .set_axis(["source_node_id", "target_node_id", "relationshipType", "distance_km"], axis=1)
    .loc[:, ["source_node_id", "target_node_id", "distance_km"]]
)
distance_km_df

Unnamed: 0,source_node_id,target_node_id,distance_km
1,2358,2378,6823.546747
2,2358,2404,7622.919047
3,2358,2405,8120.530186
4,2358,2406,6841.241767
5,2358,2419,7130.264141
...,...,...,...
1295,962513,2419,7255.514237
1296,963560,2419,7448.409956
1298,978248,2410,8796.975979
1299,984591,2410,5289.952167


In [15]:
# Compute node embeddings
node_embeddings_df = gds.node2vec.stream(port_routes) 
node_embeddings_df

Unnamed: 0,nodeId,embedding
0,2357,"[0.00048003069241531193, -0.002239069202914834..."
1,2358,"[0.00034075722214765847, 0.0018543520709499717..."
2,2359,"[0.0003243751125410199, 0.0027955989353358746,..."
3,2360,"[-0.0008196087437681854, 0.0018999890889972448..."
4,2361,"[0.0004974995972588658, 0.003106697229668498, ..."
...,...,...
724,1035413,"[-0.0023392774164676666, -0.000405425991630181..."
725,1036604,"[0.001475348835811019, -0.0037449614610522985,..."
726,1044372,"[0.0038513815961778164, -0.00358191248960793, ..."
727,1044373,"[0.003850681707262993, -0.00225734687410295, 0..."


In [16]:
# Dimensionality of the embedding vectors, read off the first row.
len(node_embeddings_df["embedding"].to_numpy()[0])

128

### Shortest path between 2 example nodes

In [18]:
# Shortest path between two example node ids. No relationship weight property is
# passed, and the streamed costs increase by 1 per step, so `totalCost` here
# counts hops rather than distance.
gds.shortestPath.dijkstra.stream(port_routes, sourceNode=2406, targetNode=2404)

Unnamed: 0,index,sourceNode,targetNode,totalCost,nodeIds,costs,path
0,0,2406,2404,3.0,"[2406, 2360, 2386, 2404]","[0.0, 1.0, 2.0, 3.0]","((cost), (cost), (cost))"


In [19]:
# Same query for a second example pair of node ids.
gds.shortestPath.dijkstra.stream(port_routes, sourceNode=2358, targetNode=2378)

Unnamed: 0,index,sourceNode,targetNode,totalCost,nodeIds,costs,path
0,0,2358,2378,1.0,"[2358, 2378]","[0.0, 1.0]",((cost))


# Shortest paths for every distinct (departure, destination) trip pair

In [20]:
# Takes long: the query returns the trip attributes of every node it matches.
# A single `dropna()` keeps only rows where all returned columns are present,
# which also covers rows without a trip id.
trip_df = gds.run_cypher(
    "MATCH (p) RETURN p.trip_id, p.arrival_time, p.destination_port, p.lon, p.departure_time, p.lat, p.departure_port"
).dropna()

In [21]:
# Show the trip rows that survived the null filtering.
trip_df

Unnamed: 0,p.trip_id,p.arrival_time,p.destination_port,p.lon,p.departure_time,p.lat,p.departure_port
0,b56053ae-3dab-429b-8edd-fb5d00a294b2,2019-05-31T17:45:00.000Z,INDONESIA,110.985657,2019-05-30T22:54:30.000Z,-4.464749,IDPJG
10,50f3fa2b-5b62-44d2-b955-7da8c3464eb9,2022-02-23T03:43:00.000Z,INDONESIA,106.833300,2022-02-22T22:34:00.000Z,-6.133300,IDJKT
49,ffc87c9b-78e7-4109-bb78-d3dee96d5974,2022-04-04T12:40:00.000Z,INDONESIA,106.833300,2022-04-03T21:07:00.000Z,-6.133300,IDJKT
115,cbc7f3fa-e13f-4601-99bb-48a68c77edbe,2019-05-31T17:45:00.000Z,INDONESIA,110.985657,2019-05-30T22:54:30.000Z,-4.464749,IDPJG
119,dcb8214c-b245-46f5-935b-2003d55060b9,2020-01-21T16:17:00.000Z,CNFOC,119.663002,2020-01-21T10:21:00.000Z,25.935101,CNFOC
...,...,...,...,...,...,...,...
1874268,7200af38-9555-4fd2-b3a1-78f7cb53db03,2022-05-07T01:27:00.000Z,MYKUA,-90.258003,2022-03-27T05:50:00.000Z,29.993401,USMSY
1874269,58bd9881-1862-4df0-a38b-0f2639851e34,2022-05-07T01:30:00.000Z,MYKUA,-90.258003,2022-03-27T06:14:00.000Z,29.993401,USMSY
1874270,31993d0f-d1ec-443c-a87d-4c5d9c590364,2021-11-11T11:03:20.000Z,TRTEK,4.416700,2021-10-31T23:54:32.727Z,51.216700,BEANR
1874271,6b57c1f4-8f0d-4a7a-bc7e-f4f069dc7fc1,2021-11-11T11:03:20.000Z,TRTEK,4.416700,2021-10-31T23:54:32.727Z,51.216700,BEANR


In [22]:
# Trips whose departure and destination port are identical have no route to resolve.
trip_df = trip_df[trip_df["p.departure_port"] != trip_df["p.destination_port"]]

# Distinct (departure, destination) pairs in first-seen order. `drop_duplicates`
# replaces the former `pair in list` test, which rescanned the growing list for
# every single trip row.
existent = (
    trip_df[["p.departure_port", "p.destination_port"]]
    .drop_duplicates()
    .values.tolist()
)

# port_code -> node id. Each port is resolved with one `find_node_id` round trip
# instead of one per pair it appears in, which matters on an unstable connection.
port_node_ids = {}

distances = []
for departure_port, destination_port in existent:
    for port_code in (departure_port, destination_port):
        if port_code not in port_node_ids:
            port_node_ids[port_code] = gds.find_node_id(["Port"], {"port_code": port_code})

    shortest_path_df = gds.shortestPath.dijkstra.stream(
        port_routes,
        sourceNode=port_node_ids[departure_port],
        targetNode=port_node_ids[destination_port],
    )
    # One dict per pair, shaped column -> {row_index: value}; the per-column
    # dicts are empty when the stream returned no rows.
    distances.append(
        shortest_path_df[["sourceNode", "targetNode", "totalCost", "nodeIds", "costs"]].to_dict()
    )

Failed to read from defunct connection ResolvedIPv4Address(('104.197.246.0', 7687)) (ResolvedIPv4Address(('104.197.246.0', 7687)))
Unable to connect to the Neo4j DBMS. Trying again...
Failed to read from defunct connection ResolvedIPv4Address(('104.197.246.0', 7687)) (ResolvedIPv4Address(('104.197.246.0', 7687)))
Unable to connect to the Neo4j DBMS. Trying again...
Failed to read from defunct connection ResolvedIPv4Address(('104.197.246.0', 7687)) (ResolvedIPv4Address(('104.197.246.0', 7687)))
Unable to connect to the Neo4j DBMS. Trying again...


In [23]:
# Flatten each shortest-path result (column -> {row_index: value}) into a plain
# record holding the values of row 0.
records = []
for r in distances:
    try:
        records.append(
            {
                "sourceNode": r["sourceNode"][0],
                "targetNode": r["targetNode"][0],
                "nodeIds": r["nodeIds"][0],
                "costs": r["costs"][0],
            }
        )
    except (KeyError, IndexError):
        # A result with no rows has no entry for row 0; such pairs are skipped.
        # Any other exception is a real problem and is allowed to surface
        # instead of being silently swallowed.
        continue

In [24]:
# Build the path table, add `hops` as the length of each `nodeIds` list, and
# keep only paths whose list has more than two entries.
distances_df = (
    pd.DataFrame.from_records(records)
    .assign(hops=lambda paths: paths["nodeIds"].map(len))
    .loc[lambda paths: paths["hops"] > 2]
)

In [25]:
# Show the filtered shortest-path table.
distances_df

Unnamed: 0,sourceNode,targetNode,nodeIds,costs,hops
6,6000,2360,"[6000, 2390, 2360]","[0.0, 1.0, 2.0]",3
8,2412,2360,"[2412, 2368, 2371, 2360]","[0.0, 1.0, 2.0, 3.0]",4
16,2383,2360,"[2383, 2371, 2360]","[0.0, 1.0, 2.0]",3
17,2364,2360,"[2364, 2371, 2360]","[0.0, 1.0, 2.0]",3
19,2405,2360,"[2405, 2388, 2360]","[0.0, 1.0, 2.0]",3
...,...,...,...,...,...
1531,2389,235522,"[2389, 2364, 235522]","[0.0, 1.0, 2.0]",3
1544,2364,321227,"[2364, 2366, 321227]","[0.0, 1.0, 2.0]",3
1557,4456,517443,"[4456, 2371, 2388, 517443]","[0.0, 1.0, 2.0, 3.0]",4
1560,2389,517449,"[2389, 2371, 2388, 517449]","[0.0, 1.0, 2.0, 3.0]",4


# Add embeddings to sources and targets

In [26]:
# Source
# Work on a relabelled view instead of renaming `node_embeddings_df` in place:
# the in-place rename changed a frame that earlier cells read by its original
# column names, so those cells failed when re-run afterwards.
source_embeddings_df = node_embeddings_df.set_axis(["nodeId", "source_node_embedding"], axis=1)

# Inner-join each path to the embedding of its source node.
port_embeddings_df = source_embeddings_df.merge(distances_df, left_on=["nodeId"], right_on=["sourceNode"])
# Relabel the merged columns positionally to snake_case.
port_embeddings_df.columns = ['node_id', 'source_node_embedding', 'source_node', 'target_node', 'node_ids', 'costs', 'hops']
port_embeddings_df

Unnamed: 0,node_id,source_node_embedding,source_node,target_node,node_ids,costs,hops
0,2358,"[0.00034075722214765847, 0.0018543520709499717...",2358,2371,"[2358, 2404, 2364, 2371]","[0.0, 1.0, 2.0, 3.0]",4
1,2358,"[0.00034075722214765847, 0.0018543520709499717...",2358,2403,"[2358, 2405, 4469, 2403]","[0.0, 1.0, 2.0, 3.0]",4
2,2358,"[0.00034075722214765847, 0.0018543520709499717...",2358,2408,"[2358, 2404, 2364, 2408]","[0.0, 1.0, 2.0, 3.0]",4
3,2358,"[0.00034075722214765847, 0.0018543520709499717...",2358,2410,"[2358, 2404, 2364, 2410]","[0.0, 1.0, 2.0, 3.0]",4
4,2358,"[0.00034075722214765847, 0.0018543520709499717...",2358,2430,"[2358, 2404, 2364, 2430]","[0.0, 1.0, 2.0, 3.0]",4
...,...,...,...,...,...,...,...
315,520699,"[-0.003359671216458082, -0.0001352341059828177...",520699,2405,"[520699, 2371, 2368, 2405]","[0.0, 1.0, 2.0, 3.0]",4
316,653525,"[-0.004010097123682499, -7.068668492138386e-05...",653525,2371,"[653525, 2404, 2379, 2371]","[0.0, 1.0, 2.0, 3.0]",4
317,653525,"[-0.004010097123682499, -7.068668492138386e-05...",653525,2403,"[653525, 2404, 2379, 2403]","[0.0, 1.0, 2.0, 3.0]",4
318,653525,"[-0.004010097123682499, -7.068668492138386e-05...",653525,2405,"[653525, 2404, 2379, 2405]","[0.0, 1.0, 2.0, 3.0]",4


In [27]:
# Target
node_embeddings_df.columns = ['node_id', 'targed_node_embedding'] # rename columns for merging target side
port_embeddings_df = port_embeddings_df.merge(node_embeddings_df, left_on=["target_node"], right_on=["node_id"]) # merge target in port with embedding df
port_embeddings_df = port_embeddings_df[['source_node_embedding', 'source_node', 'target_node', 'node_ids', 'costs', 'hops', 'targed_node_embedding']]

In [28]:
# Show the paths with both source and target embeddings attached.
port_embeddings_df

Unnamed: 0,source_node_embedding,source_node,target_node,node_ids,costs,hops,targed_node_embedding
0,"[0.00034075722214765847, 0.0018543520709499717...",2358,2371,"[2358, 2404, 2364, 2371]","[0.0, 1.0, 2.0, 3.0]",4,"[0.000713341636583209, 0.0025018304586410522, ..."
1,"[0.00016638950910419226, 0.0028761376161128283...",2386,2371,"[2386, 2364, 2371]","[0.0, 1.0, 2.0]",3,"[0.000713341636583209, 0.0025018304586410522, ..."
2,"[5.6740595027804375e-05, 0.0002729971311055124...",2409,2371,"[2409, 2368, 2371]","[0.0, 1.0, 2.0]",3,"[0.000713341636583209, 0.0025018304586410522, ..."
3,"[-4.2610306991264224e-05, -0.00182178616523742...",2428,2371,"[2428, 18638, 12802, 2383, 2371]","[0.0, 1.0, 2.0, 3.0, 4.0]",5,"[0.000713341636583209, 0.0025018304586410522, ..."
4,"[-0.0027396425139158964, 0.001277837553061545,...",4461,2371,"[4461, 6001, 2371]","[0.0, 1.0, 2.0]",3,"[0.000713341636583209, 0.0025018304586410522, ..."
...,...,...,...,...,...,...,...
315,"[0.0026319425087422132, 0.0023814095184206963,...",17522,18640,"[17522, 2361, 2371, 18640]","[0.0, 1.0, 2.0, 3.0]",4,"[-0.0033289678394794464, 0.0019672370981425047..."
316,"[0.0039076837711036205, -0.0012678962666541338...",22893,2425,"[22893, 2360, 2425]","[0.0, 1.0, 2.0]",3,"[-0.00012190923007437959, 0.001790344482287764..."
317,"[0.0039076837711036205, -0.0012678962666541338...",22893,4465,"[22893, 2404, 2364, 4465]","[0.0, 1.0, 2.0, 3.0]",4,"[-0.0017999086994677782, -0.000460794253740459..."
318,"[0.0039076837711036205, -0.0012678962666541338...",22893,782708,"[22893, 2361, 15636, 2388, 782708]","[0.0, 1.0, 2.0, 3.0, 4.0]",5,"[0.0005274465074762702, 0.0019246068550273776,..."


In [29]:
# Keep a full copy for the hop/distance features built in the following cells.
emb_hops = port_embeddings_df.copy()

# Embedding features only: the two node ids (renamed to *_id) and their embeddings.
port_embeddings_df = emb_hops.loc[
    :, ["source_node", "target_node", "source_node_embedding", "targed_node_embedding"]
].rename(columns={"source_node": "source_node_id", "target_node": "target_node_id"})
port_embeddings_df

Unnamed: 0,source_node_id,target_node_id,source_node_embedding,targed_node_embedding
0,2358,2371,"[0.00034075722214765847, 0.0018543520709499717...","[0.000713341636583209, 0.0025018304586410522, ..."
1,2386,2371,"[0.00016638950910419226, 0.0028761376161128283...","[0.000713341636583209, 0.0025018304586410522, ..."
2,2409,2371,"[5.6740595027804375e-05, 0.0002729971311055124...","[0.000713341636583209, 0.0025018304586410522, ..."
3,2428,2371,"[-4.2610306991264224e-05, -0.00182178616523742...","[0.000713341636583209, 0.0025018304586410522, ..."
4,4461,2371,"[-0.0027396425139158964, 0.001277837553061545,...","[0.000713341636583209, 0.0025018304586410522, ..."
...,...,...,...,...
315,17522,18640,"[0.0026319425087422132, 0.0023814095184206963,...","[-0.0033289678394794464, 0.0019672370981425047..."
316,22893,2425,"[0.0039076837711036205, -0.0012678962666541338...","[-0.00012190923007437959, 0.001790344482287764..."
317,22893,4465,"[0.0039076837711036205, -0.0012678962666541338...","[-0.0017999086994677782, -0.000460794253740459..."
318,22893,782708,"[0.0039076837711036205, -0.0012678962666541338...","[0.0005274465074762702, 0.0019246068550273776,..."


In [30]:
# Hop features only: the two node ids, the node ids along the path and the hop count.
emb_hops = emb_hops.loc[:, ["source_node", "target_node", "node_ids", "hops"]].rename(
    columns={
        "source_node": "source_node_id",
        "target_node": "target_node_id",
        "node_ids": "hop_node_ids",
    }
)
emb_hops

Unnamed: 0,source_node_id,target_node_id,hop_node_ids,hops
0,2358,2371,"[2358, 2404, 2364, 2371]",4
1,2386,2371,"[2386, 2364, 2371]",3
2,2409,2371,"[2409, 2368, 2371]",3
3,2428,2371,"[2428, 18638, 12802, 2383, 2371]",5
4,4461,2371,"[4461, 6001, 2371]",3
...,...,...,...,...
315,17522,18640,"[17522, 2361, 2371, 18640]",4
316,22893,2425,"[22893, 2360, 2425]",3
317,22893,4465,"[22893, 2404, 2364, 4465]",4
318,22893,782708,"[22893, 2361, 15636, 2388, 782708]",5


# Example distances from source to target

In [31]:
# Look up the direct-route row for one example (source, target) pair.
distance_km_df.loc[
    distance_km_df["source_node_id"].eq(2364) & distance_km_df["target_node_id"].eq(2371)
]

Unnamed: 0,source_node_id,target_node_id,distance_km
101,2364,2371,9296.306421


In [32]:
# Show the direct-route distance table used for the per-leg lookups below.
distance_km_df

Unnamed: 0,source_node_id,target_node_id,distance_km
1,2358,2378,6823.546747
2,2358,2404,7622.919047
3,2358,2405,8120.530186
4,2358,2406,6841.241767
5,2358,2419,7130.264141
...,...,...,...
1295,962513,2419,7255.514237
1296,963560,2419,7448.409956
1298,978248,2410,8796.975979
1299,984591,2410,5289.952167


In [33]:
from itertools import islice

def sliding_window(elements, window_size):
    """Yield every run of `window_size` consecutive items of `elements`.

    Parameters
    ----------
    elements : sliceable sequence (e.g. a list of node ids)
    window_size : int
        Number of items per window.

    Yields
    ------
    Slices `elements[i:i + window_size]`, in order, each exactly `window_size`
    long. Nothing is yielded when `elements` is shorter than `window_size` or
    when `window_size` is smaller than 1.

    Notes
    -----
    The previous version used `return elements` for short inputs. Inside a
    generator that yields nothing, so an input exactly `window_size` long
    (e.g. a two-node path with `window_size=2`) silently produced no window.
    It also emitted truncated trailing windows for `window_size > 2`.
    """
    if window_size < 1:
        return
    for start in range(len(elements) - window_size + 1):
        yield elements[start:start + window_size]


def estimate_dist(x):
    """Sum the direct-route distances along a path of node ids.

    Parameters
    ----------
    x : sequence of node ids, in travel order.

    Returns
    -------
    The sum of `distance_km` over each consecutive (source, target) pair of `x`,
    looked up in the notebook-level `distance_km_df`; 0 when there is no pair.

    Raises
    ------
    IndexError
        If a consecutive pair has no row in `distance_km_df`. The exception type
        is the one the bare `.values[0]` lookup used to raise, but the message
        now names the missing leg.
    """
    distance = 0
    for leg_start, leg_end in sliding_window(x, window_size=2):
        leg_rows = distance_km_df[
            (distance_km_df.source_node_id == leg_start) & (distance_km_df.target_node_id == leg_end)
        ]
        if leg_rows.empty:
            raise IndexError(f"no distance_km recorded for leg {leg_start} -> {leg_end}")
        # If several rows match, the first one is used.
        distance += leg_rows.distance_km.values[0]
    return distance
        
# Total path distance per row: the per-leg distances summed along `hop_node_ids`.
emb_hops["distance_km"] = emb_hops["hop_node_ids"].map(estimate_dist)
emb_hops

Unnamed: 0,source_node_id,target_node_id,hop_node_ids,hops,distance_km
0,2358,2371,"[2358, 2404, 2364, 2371]",4,25736.944190
1,2386,2371,"[2386, 2364, 2371]",3,10683.848203
2,2409,2371,"[2409, 2368, 2371]",3,16504.152528
3,2428,2371,"[2428, 18638, 12802, 2383, 2371]",5,18908.143523
4,4461,2371,"[4461, 6001, 2371]",3,19436.013707
...,...,...,...,...,...
315,17522,18640,"[17522, 2361, 2371, 18640]",4,19063.979443
316,22893,2425,"[22893, 2360, 2425]",3,10698.436279
317,22893,4465,"[22893, 2404, 2364, 4465]",4,16913.838808
318,22893,782708,"[22893, 2361, 15636, 2388, 782708]",5,25803.489528


## <span style="color:#ff5f27;"> 🪄 Feature Groups Creation</span>

### Feature Groups

A `Feature Group` is a logical grouping of features; experience has shown that this grouping generally originates from the features being derived from the same data source. The `Feature Group` lets you save metadata along with the features, which defines how the Feature Store interprets them, combines them and reproduces training datasets created from them.

Generally, the features in a feature group are engineered together in an ingestion job. However, it is possible to have additional jobs to append features to an existing feature group. Furthermore, `feature groups` provide a way of defining a namespace for features, such that you can define features with the same name multiple times, but uniquely identified by the group they are contained in.

> It is important to note that `feature groups` are not groupings of features for immediate training of Machine Learning models. Instead, to ensure reusability of features, it is possible to combine features from any number of groups into training datasets.

In [36]:
# Connect to Hopsworks. When the account has several projects, the login
# prompts interactively for the one to use.
project = hopsworks.login()

# Retrieve the project's Feature Store handle, used below to create feature groups.
fs = project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

Multiple projects found. 

	 (1) marco
	 (2) quickstart_shared



Enter project to access:  1



Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/397461
Connected. Call `.close()` to terminate connection gracefully.


In [37]:
# Get or create version 1 of the "port_embeddings" feature group, keyed by the
# (source_node_id, target_node_id) pair and enabled for online serving.
# `statistics_config=False` is passed (presumably to skip statistics computation
# for the embedding columns - confirm against the Hopsworks documentation).
port_embeddings_fg = fs.get_or_create_feature_group(
    name="port_embeddings",
    description="Port embeddings",
    version=1,
    online_enabled=True,
    primary_key=["source_node_id", "target_node_id"],
    statistics_config=False
)

# Upload the source/target embedding pairs into the feature group.
port_embeddings_fg.insert(port_embeddings_df)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/397461/fs/393284/fg/460116


Uploading Dataframe: 0.00% |          | Rows 0/320 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: port_embeddings_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/397461/jobs/named/port_embeddings_1_offline_fg_materialization/executions


(<hsfs.core.job.Job at 0x2a4528fa0>, None)

In [None]:
# Get or create version 1 of the "port_distance" feature group, keyed by the
# (source_node_id, target_node_id) pair and enabled for online serving, with
# `statistics_config=True`.
port_distance_fg = fs.get_or_create_feature_group(
    name="port_distance",
    description="Distances between ports",
    version=1,
    online_enabled=True,
    primary_key=["source_node_id", "target_node_id"],
    statistics_config=True
)

# Upload the per-pair hop list, hop count and summed distance.
port_distance_fg.insert(emb_hops)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/397461/fs/393284/fg/459090


Uploading Dataframe: 0.00% |          | Rows 0/320 | Elapsed Time: 00:00 | Remaining Time: ?