# <span style="font-weight:bold; font-size: 3rem; color:#1EB182;"> **Hopsworks Feature Store** </span><span style="font-weight:bold; font-size: 3rem; color:#333;">- Part 01: Load, Engineer & Connect</span>

## <span style="color:#ff5f27;"> 📝 Imports </span>

In [None]:
import pandas as pd
from graphdatascience import GraphDataScience

In [None]:
# Connection settings template. As written, this cell is only a string literal
# and defines nothing, while the cell that constructs GraphDataScience reads
# NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD and DATABASE_NAME. Fill in the values
# and remove the enclosing triple quotes (or define the four names some other
# way) before running the rest of the notebook.
'''
NEO4J_URI = "neo4j://..."
NEO4J_USER = "..."
NEO4J_PASSWORD = "..."
DATABASE_NAME = "..."
'''

## <span style="color:#ff5f27;"> 💽 Loading Graph from Neo4J </span>


In [None]:
# Create the Graph Data Science client used by every later cell, pointing at
# NEO4J_URI with user/password authentication and the DATABASE_NAME database.
gds = GraphDataScience(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD), database=DATABASE_NAME)

In [None]:
gds.graph.list()

In [None]:
# Fetch a handle to the graph named "portRoutes2" from the GDS graph catalog.
# NOTE(review): this notebook never projects the graph itself, so it is
# presumably created beforehand — confirm it exists before running.
port_routes = gds.graph.get("portRoutes2")

In [None]:
port_routes

In [None]:
port_routes.node_properties()

In [None]:
port_routes.name()

## <span style="color:#ff5f27;"> 🛠️ Graph Feature Engineering using Neo4J APIs</span>


In [None]:
# Stream the "distance" relationship property of the projected graph as a
# DataFrame with one row per relationship.
# The property name is passed positionally: the original keyword
# (`node_properties`) does not match the relationship-property parameter name
# in current client documentation, while the second positional slot is the
# property name in every case.
distance_km_df = gds.graph.relationshipProperty.stream(port_routes, "distance")
# Discard relationships that carry no distance value.
distance_km_df = distance_km_df.dropna()
# Columns are relabelled by position (exactly four are expected); the
# relationship type is then dropped because only the node pair and its
# distance are used later.
distance_km_df.columns = ["source_node_id", "target_node_id", "relationshipType", "distance_km"]
distance_km_df = distance_km_df[["source_node_id", "target_node_id", "distance_km"]]
distance_km_df

In [None]:
# Stream node2vec embeddings for the projected graph. Later cells read the
# result as a two-column frame whose second column is `embedding`.
# NOTE(review): no random seed is passed, so the vectors presumably differ
# between runs — confirm whether reproducibility matters here.
node_embeddings_df = gds.node2vec.stream(port_routes) 
node_embeddings_df

In [None]:
len(node_embeddings_df.embedding.values[0])

In [None]:
gds.shortestPath.dijkstra.stream(port_routes, sourceNode=2406, targetNode=2404)


In [None]:
gds.shortestPath.dijkstra.stream(port_routes, sourceNode=2358, targetNode=2378)

In [None]:
# Read the trip-related attributes of every node into a DataFrame. Columns are
# named after the returned Cypher expressions (e.g. "p.trip_id").
trip_query = "MATCH (p) RETURN p.trip_id, p.arrival_time, p.destination_port, p.lon, p.departure_time, p.lat, p.departure_port"
trip_df = gds.run_cypher(trip_query)

# First drop rows without a trip id, then rows with any other missing
# attribute, so only fully populated trips remain.
trip_df = trip_df.dropna(subset=["p.trip_id"]).dropna()

In [None]:
trip_df

In [None]:
# Compute one shortest path per distinct (departure, destination) port pair.

# Trips that start and end in the same port have no route to compute.
trip_df = trip_df[trip_df["p.departure_port"] != trip_df["p.destination_port"]]

# Distinct port pairs in first-seen order. drop_duplicates replaces a linear
# `in` scan of a growing list for every trip row.
existent = (
    trip_df[["p.departure_port", "p.destination_port"]]
    .drop_duplicates()
    .values.tolist()
)

path_columns = ["sourceNode", "targetNode", "totalCost", "nodeIds", "costs"]
distances = []
for departure_port, destination_port in existent:
    # Resolve each port code to the id of its "Port" node.
    source_node_id = gds.find_node_id(["Port"], {"port_code": departure_port})
    target_node_id = gds.find_node_id(["Port"], {"port_code": destination_port})
    # NOTE(review): no relationshipWeightProperty is passed, so the path is
    # presumably chosen without using the stored distances — confirm that is
    # intended.
    route_df = gds.shortestPath.dijkstra.stream(
        port_routes,
        sourceNode=source_node_id,
        targetNode=target_node_id,
    )
    # Stored as {column -> {row index -> value}}; an empty result yields empty
    # inner dicts.
    distances.append(route_df[path_columns].to_dict())

In [None]:
# Flatten each shortest-path result into one plain record per port pair.
# Every entry of `distances` is a DataFrame.to_dict() mapping of the form
# {column -> {row index -> value}}, so `[0]` selects the first result row.
records = []
for r in distances:
    try:
        records.append({"sourceNode": r['sourceNode'][0], "targetNode": r['targetNode'][0], "nodeIds": r['nodeIds'][0], "costs": r['costs'][0]})
    except KeyError:
        # No row 0 means the query returned no path for this pair; skip it.
        # Only KeyError is caught so that unrelated failures are not hidden.
        continue

In [None]:
# One row per routed port pair; "hops" is the number of node ids on the path.
distances_df = pd.DataFrame.from_records(records)
distances_df["hops"] = distances_df["nodeIds"].map(len)
# Keep only paths that contain more than two nodes.
distances_df = distances_df.loc[distances_df["hops"] > 2]

In [None]:
distances_df

In [None]:
# Attach the source node's embedding to every route.
# The embedding frame is relabelled by position so that its vector column
# carries the "source" name into the join.
node_embeddings_df.columns = ["nodeId", "source_node_embedding"]
port_embeddings_df = pd.merge(
    node_embeddings_df,
    distances_df,
    left_on="nodeId",
    right_on="sourceNode",
)
port_embeddings_df.columns = [
    "node_id",
    "source_node_embedding",
    "source_node",
    "target_node",
    "node_ids",
    "costs",
    "hops",
]
# Relabel the same embedding frame again, this time for the target-node join
# that a later cell performs on `node_id`.
node_embeddings_df.columns = ["node_id", "targed_node_embedding"]
port_embeddings_df

In [None]:
node_embeddings_df

In [None]:
# Attach the target node's embedding to every route, then keep only the
# embedding and path columns (the join-key columns are dropped).
port_embeddings_df = pd.merge(
    port_embeddings_df,
    node_embeddings_df,
    left_on="target_node",
    right_on="node_id",
).loc[
    :,
    [
        "source_node_embedding",
        "source_node",
        "target_node",
        "node_ids",
        "costs",
        "hops",
        "targed_node_embedding",
    ],
]
# Independent copy from which the hop/distance features are derived.
emb_hops = port_embeddings_df.copy()

In [None]:
# Embedding features: the node-id pair plus both embedding vectors, with the
# id columns renamed to the names used as feature-group primary keys.
port_embeddings_df = emb_hops[
    ["source_node", "target_node", "source_node_embedding", "targed_node_embedding"]
].rename(columns={"source_node": "source_node_id", "target_node": "target_node_id"})
port_embeddings_df

In [None]:
# Hop features: the node-id pair, the node ids along the path, and the hop count.
emb_hops = emb_hops[["source_node", "target_node", "node_ids", "hops"]].rename(
    columns={
        "source_node": "source_node_id",
        "target_node": "target_node_id",
        "node_ids": "hop_node_ids",
    }
)

In [None]:
emb_hops

In [None]:
distance_km_df[(distance_km_df.source_node_id == 2364) & (distance_km_df.target_node_id ==2371)]

In [None]:
distance_km_df

In [None]:
from itertools import islice

def sliding_window(elements, window_size):
    """Yield every run of ``window_size`` consecutive items of ``elements``.

    Windows are produced left to right as slices of ``elements``, each exactly
    ``window_size`` long. A sequence shorter than ``window_size`` (or a
    ``window_size`` below 1) produces no windows; a sequence of exactly
    ``window_size`` items produces one.

    Args:
        elements: A sliceable sequence, e.g. a list of node ids.
        window_size: Number of items per window.

    Yields:
        Slices of ``elements`` of length ``window_size``.
    """
    if window_size < 1:
        return
    # The last valid start index is len(elements) - window_size, so no
    # truncated trailing window is ever produced.
    for start in range(len(elements) - window_size + 1):
        yield elements[start:start + window_size]


def estimate_dist(x):
    """Sum ``distance_km`` over consecutive node pairs along path ``x``.

    Args:
        x: Sequence of node ids describing a path, in travel order.

    Returns:
        The accumulated distance; 0 when no node pair is produced.

    Raises:
        IndexError: If a consecutive pair has no row in ``distance_km_df``.
    """
    total_km = 0
    for source_id, target_id in sliding_window(x, window_size=2):
        # Match the directed edge source -> target in the distance table.
        is_edge = (distance_km_df.source_node_id == source_id) & (distance_km_df.target_node_id == target_id)
        total_km += distance_km_df.loc[is_edge, "distance_km"].values[0]
    return total_km
        
# Total path length for every route, summed over its consecutive hop pairs.
emb_hops["distance_km"] = emb_hops["hop_node_ids"].map(estimate_dist)
emb_hops

## <span style="color:#ff5f27;"> 🪄 Feature Groups Creation</span>

### Feature Groups

A `Feature Group` is a logical grouping of features; experience has shown that this grouping generally originates from the features being derived from the same data source. A `Feature Group` lets you save metadata alongside the features, which defines how the Feature Store interprets them, combines them, and reproduces training datasets created from them.

Generally, the features in a feature group are engineered together in an ingestion job. However, it is possible to have additional jobs to append features to an existing feature group. Furthermore, `feature groups` provide a way of defining a namespace for features, such that you can define features with the same name multiple times, but uniquely identified by the group they are contained in.

> It is important to note that `feature groups` are not groupings of features for immediate training of Machine Learning models. Instead, to ensure reusability of features, it is possible to combine features from any number of groups into training datasets.

In [None]:
# Connect to the feature store service (no explicit connection arguments are
# given) and obtain the feature store handle used to create feature groups.
import hsfs
connection = hsfs.connection()
fs = connection.get_feature_store()

In [None]:
# Feature group holding the source/target embedding vectors for each routed
# port pair, keyed by the (source_node_id, target_node_id) pair.
port_embeddings_fg = fs.get_or_create_feature_group(
    name="port_embeddings",
    description="Port embeddings",
    version=1,
    online_enabled=True,
    primary_key=["source_node_id", "target_node_id"],
    # NOTE(review): presumably disables statistics for this group, unlike the
    # distance group, which passes True — confirm the difference is intended.
    statistics_config=False
)

# NOTE(review): the inserted frame carries a column spelled
# "targed_node_embedding"; that spelling becomes part of the stored schema.
port_embeddings_fg.insert(port_embeddings_df)

In [None]:
# Feature group holding the path features for each routed port pair, keyed by
# the (source_node_id, target_node_id) pair.
port_distance_fg = fs.get_or_create_feature_group(
    name="port_distance",
    description="Distances between ports",
    version=1,
    online_enabled=True,
    primary_key=["source_node_id", "target_node_id"],
    statistics_config=True
)

# emb_hops carries source_node_id, target_node_id, hop_node_ids, hops and
# distance_km at this point.
port_distance_fg.insert(emb_hops)