**NOTES:**

1.   Currently the algorithm assigns an all-zero feature vector to pages that don't have embeddings. Since the model still treats a zero vector as some sort of information, we need to revamp this by inferring the features of empty nodes from their neighbouring nodes.
2.   An OOP version of the algorithm was tested and performed worse, time-wise, compared to this current version.
3. The embeddings are not binarized; this can be done post-MVP.
4. The end results need to be evaluated, through either an SEO or a computer-science approach, and stored in a separate file:
  *   A/B testing URL sets by using https://github.com/google/tfp-causalimpact and Google Search Console data (time series data for the chosen URL test sets)
  *   Fake network modelling
5. Unit tests are intentionally not provided, since they would be overkill for our audience.




In [None]:
# Step 0: Install PyTorch Geometric
!pip install torch-geometric

# Step 1: Mount Google Drive
from google.colab import drive
import os
import pandas as pd
import duckdb
import networkx as nx
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.nn import SAGEConv
from torch_geometric.data import Data, DataLoader

# Mount Google Drive
drive.mount('/content/drive')

# Step 2: Define paths to your files
drive_root = '/content/drive/My Drive/'

# Link graph: the code below reads its 'Source' and 'Destination' columns.
csv_file_path = os.path.join(drive_root, 'FinalLinkGraph.csv')
# DuckDB database that holds the 'url_embeddings' table queried below.
db_file_path = os.path.join(drive_root, 'VectorJinaDuckDBwithFeatures.db')

# Step 3: Load the data
filtered_link_graph_df = pd.read_csv(csv_file_path)
db_connection = duckdb.connect(db_file_path)

# Load the per-URL features from the 'url_embeddings' table
# (the code below reads its 'url' and 'embedding' columns).
db_features_df = db_connection.execute("SELECT * FROM url_embeddings").fetchdf()

# Step 4: Combine the data
# Merge the CSV with the features from the DuckDB based on the Source URL.
# NOTE: the merged frames are not needed to build the graph below; they are
# kept so the joined view stays available for inspection.
merged_df_source = filtered_link_graph_df.merge(
    db_features_df,
    left_on='Source',
    right_on='url',
    how='left',
    suffixes=('_source', '_destination')
)

# Merge again for the Destination URL
merged_df = merged_df_source.merge(
    db_features_df,
    left_on='Destination',
    right_on='url',
    how='left',
    suffixes=('_source', '_destination')
)

# Step 5: Prepare the Graph for GraphSAGE
G = nx.DiGraph()  # Create a directed graph

if 'embedding' in db_features_df.columns:
    embedding_values = db_features_df['embedding'].tolist()
else:
    embedding_values = [None] * len(db_features_df)

# Determine the feature dimension from the first usable embedding instead of
# blindly reading row 0, which fails on an empty table or a null first row.
# 10 is the example fallback used when no embedding is available at all.
feature_dim = next(
    (
        len(value)
        for value in embedding_values
        if isinstance(value, (np.ndarray, list, tuple)) and len(value) > 0
    ),
    10,
)


def _embedding_or_zeros(embedding, dim):
    """Return `embedding` as a float vector of length `dim`, or zeros if unusable.

    An embedding is unusable when it is missing, is not array-like, is entirely
    NaN, or does not have exactly `dim` entries (a ragged row would otherwise
    break the construction of the feature matrix).
    """
    if isinstance(embedding, (np.ndarray, list, tuple)):
        vector = np.asarray(embedding, dtype=float)
        if vector.shape == (dim,) and not np.isnan(vector).all():
            return vector
    return np.zeros(dim)


# Add nodes with attributes, using zero vectors where features are missing
for url, embedding in zip(db_features_df['url'], embedding_values):
    G.add_node(url, features=_embedding_or_zeros(embedding, feature_dim))

# Ensure every URL that appears in the link graph is a node with a 'features'
# attribute, even if it has no row in db_features_df.
for column in ('Source', 'Destination'):
    for node in filtered_link_graph_df[column].unique():
        if node not in G:
            G.add_node(node, features=np.zeros(feature_dim))

# Add edges straight from the CSV columns; duplicates collapse in a DiGraph.
G.add_edges_from(
    zip(filtered_link_graph_df['Source'], filtered_link_graph_df['Destination'])
)

# Create the feature matrix (X) and edge list (edge_index).
# Node position in G.nodes defines the row index used everywhere below.
node_indices = {node: position for position, node in enumerate(G.nodes)}
node_features = [attrs['features'] for _, attrs in G.nodes(data=True)]
index = len(node_indices)

# Convert the feature list and edge list to torch tensors.
# The reshapes keep the expected 2-D layouts even when there are no nodes/edges.
X = torch.tensor(
    np.array(node_features).reshape(-1, feature_dim),
    dtype=torch.float,
)
edge_index = torch.tensor(
    [[node_indices[src], node_indices[dst]] for src, dst in G.edges],
    dtype=torch.long,
).reshape(-1, 2).t().contiguous()

# Step 6: Create the PyTorch Geometric Data object
data = Data(x=X, edge_index=edge_index)

# Step 7: Define the GraphSAGE model
class GraphSAGELinkPredictor(nn.Module):
    """Two-layer GraphSAGE encoder plus a linear head that scores node pairs."""

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)
        # The head receives the two endpoint embeddings concatenated.
        self.link_pred = nn.Linear(2 * out_channels, 1)

    def forward(self, x, edge_index):
        """Encode node features `x` over `edge_index` into node embeddings."""
        hidden = torch.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)

    def predict_link(self, z, edge_label_index):
        """Return one sigmoid score per column (source, destination) of `edge_label_index`."""
        source_rows = z[edge_label_index[0]]
        destination_rows = z[edge_label_index[1]]
        pair_features = torch.cat((source_rows, destination_rows), dim=1)
        raw_scores = self.link_pred(pair_features)
        return raw_scores.sigmoid().view(-1)

# Initialize the model, its loss and its optimizer.
# The encoder's input width is the width of the node feature matrix X.
input_width = X.size(1)
model = GraphSAGELinkPredictor(
    in_channels=input_width,
    hidden_channels=64,
    out_channels=32,
)
# BCELoss expects probabilities, which predict_link produces via sigmoid.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Step 8: Train the GraphSAGE model
def train(data, model, optimizer, criterion, edge_label_index, edge_labels):
    """Run one full-batch optimisation step and return the loss as a Python float.

    `model` is called on `data.x` / `data.edge_index` to get node embeddings,
    its `predict_link` scores the pairs in `edge_label_index`, and `criterion`
    compares those scores with `edge_labels`.
    """
    model.train()
    optimizer.zero_grad()

    node_embeddings = model(data.x, data.edge_index)
    pair_scores = model.predict_link(node_embeddings, edge_label_index)

    step_loss = criterion(pair_scores, edge_labels)
    step_loss.backward()
    optimizer.step()

    return step_loss.item()

# Create positive and negative samples for link prediction
def create_edge_labels(data):
    """Build the labelled node pairs used to train the link predictor.

    Every edge in ``data.edge_index`` becomes a positive example (label 1).
    An equal number of negative examples (label 0) is drawn at random from
    ordered node pairs that are neither existing edges nor self-loops, so a
    real link is never labelled as a non-link. Negatives may repeat.

    Args:
        data: object exposing ``edge_index`` (a ``2 x E`` long tensor) and
            ``num_nodes``.

    Returns:
        A tuple ``(edge_label_index, edge_labels)``. ``edge_label_index`` is a
        ``2 x (E + N)`` long tensor holding the positives followed by the
        negatives; ``edge_labels`` is the matching float vector of ones and
        zeros. ``N`` equals ``E`` unless the graph has no valid negative pair
        at all, in which case ``N`` is 0.
    """
    edge_index = data.edge_index
    num_edges = edge_index.size(1)
    num_nodes = data.num_nodes
    device = edge_index.device

    # Encode each (src, dst) pair as src * num_nodes + dst so membership in
    # the positive set can be tested with a single vectorised call.
    positive_keys = torch.unique(edge_index[0] * num_nodes + edge_index[1])

    negative_chunks = []
    missing = num_edges

    # Rejection sampling: oversample, drop self-loops and real edges, repeat.
    # On sparse graphs the first round is normally enough.
    for _ in range(10):
        if missing <= 0:
            break
        candidates = torch.randint(
            0, num_nodes, (2, 2 * missing), dtype=torch.long, device=device
        )
        candidate_keys = candidates[0] * num_nodes + candidates[1]
        is_negative = (candidates[0] != candidates[1]) & ~torch.isin(
            candidate_keys, positive_keys
        )
        accepted = candidates[:, is_negative][:, :missing]
        negative_chunks.append(accepted)
        missing -= accepted.size(1)

    if missing > 0:
        # Very dense graph: enumerate the valid negatives explicitly and draw
        # from them. This is only reached when almost every pair is an edge.
        all_keys = torch.arange(num_nodes * num_nodes, device=device)
        all_src = torch.div(all_keys, num_nodes, rounding_mode='floor')
        all_dst = all_keys % num_nodes
        valid_keys = all_keys[
            (all_src != all_dst) & ~torch.isin(all_keys, positive_keys)
        ]
        if valid_keys.numel() > 0:
            picks = valid_keys[
                torch.randint(0, valid_keys.numel(), (missing,), device=device)
            ]
            negative_chunks.append(
                torch.stack(
                    [torch.div(picks, num_nodes, rounding_mode='floor'), picks % num_nodes]
                )
            )

    if negative_chunks:
        neg_edge_index = torch.cat(negative_chunks, dim=1)
    else:
        neg_edge_index = edge_index.new_empty((2, 0))

    # Combine positive and negative pairs, positives first.
    edge_label_index = torch.cat([edge_index, neg_edge_index], dim=1)
    edge_labels = torch.cat([
        torch.ones(num_edges, device=device),
        torch.zeros(neg_edge_index.size(1), device=device),
    ])

    return edge_label_index, edge_labels

# Create edge labels for training
edge_label_index, edge_labels = create_edge_labels(data)

# Train the model (full-batch, one optimisation step per epoch)
for epoch in range(200):  # Example for 200 epochs
    loss = train(data, model, optimizer, criterion, edge_label_index, edge_labels)
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {loss:.4f}')

# Step 9: Evaluate the model and include URLs
# NOTE: the pairs scored here are the same ones used for training, so these
# probabilities are a sanity check, not a held-out evaluation.
model.eval()
with torch.no_grad():
    z = model(data.x, data.edge_index)
    link_probs = model.predict_link(z, edge_label_index)

# Get the source and destination indices for the edges
source_indices = edge_label_index[0].cpu().numpy()
destination_indices = edge_label_index[1].cpu().numpy()

# Map node row indices back to their URLs
index_to_url = {v: k for k, v in node_indices.items()}

# Print the URLs with their associated probabilities.
# Cap the preview at the number of scored pairs so small graphs do not raise
# an IndexError; adjust the 50 to print more or fewer results.
rows_to_show = min(50, link_probs.numel())
for i in range(rows_to_show):
    source_url = index_to_url[int(source_indices[i])]
    destination_url = index_to_url[int(destination_indices[i])]
    probability = link_probs[i].item()
    print(f"Source URL: {source_url}, Destination URL: {destination_url}, Probability: {probability:.4f}")



[31mERROR: Operation cancelled by user[0m[31m
[0m

KeyboardInterrupt: 

### Input URL to get recommendations

In [None]:
# Step 0: Install PyTorch Geometric
!pip install torch-geometric

# Step 1: Mount Google Drive
from google.colab import drive
import os
import pandas as pd
import duckdb
import networkx as nx
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.nn import SAGEConv
from torch_geometric.data import Data, DataLoader

# Mount Google Drive
drive.mount('/content/drive')

# Step 2: Define paths to your files
drive_root = '/content/drive/My Drive/'

# Link graph: the code below reads its 'Source' and 'Destination' columns.
csv_file_path = os.path.join(drive_root, 'FinalLinkGraph.csv')
# DuckDB database that holds the 'url_embeddings' table queried below.
db_file_path = os.path.join(drive_root, 'VectorJinaDuckDBwithFeatures.db')

# Step 3: Load the data
filtered_link_graph_df = pd.read_csv(csv_file_path)
db_connection = duckdb.connect(db_file_path)

# Load the per-URL features from the 'url_embeddings' table
# (the code below reads its 'url' and 'embedding' columns).
db_features_df = db_connection.execute("SELECT * FROM url_embeddings").fetchdf()

# Step 4: Combine the data
# Merge the CSV with the features from the DuckDB based on the Source URL.
# NOTE: the merged frames are not needed to build the graph below; they are
# kept so the joined view stays available for inspection.
merged_df_source = filtered_link_graph_df.merge(
    db_features_df,
    left_on='Source',
    right_on='url',
    how='left',
    suffixes=('_source', '_destination')
)

# Merge again for the Destination URL
merged_df = merged_df_source.merge(
    db_features_df,
    left_on='Destination',
    right_on='url',
    how='left',
    suffixes=('_source', '_destination')
)

# Step 5: Prepare the Graph for GraphSAGE
G = nx.DiGraph()  # Create a directed graph

if 'embedding' in db_features_df.columns:
    embedding_values = db_features_df['embedding'].tolist()
else:
    embedding_values = [None] * len(db_features_df)

# Determine the feature dimension from the first usable embedding instead of
# blindly reading row 0, which fails on an empty table or a null first row.
# 10 is the example fallback used when no embedding is available at all.
feature_dim = next(
    (
        len(value)
        for value in embedding_values
        if isinstance(value, (np.ndarray, list, tuple)) and len(value) > 0
    ),
    10,
)


def _embedding_or_zeros(embedding, dim):
    """Return `embedding` as a float vector of length `dim`, or zeros if unusable.

    An embedding is unusable when it is missing, is not array-like, is entirely
    NaN, or does not have exactly `dim` entries (a ragged row would otherwise
    break the construction of the feature matrix).
    """
    if isinstance(embedding, (np.ndarray, list, tuple)):
        vector = np.asarray(embedding, dtype=float)
        if vector.shape == (dim,) and not np.isnan(vector).all():
            return vector
    return np.zeros(dim)


# Add nodes with attributes, using zero vectors where features are missing
for url, embedding in zip(db_features_df['url'], embedding_values):
    G.add_node(url, features=_embedding_or_zeros(embedding, feature_dim))

# Ensure every URL that appears in the link graph is a node with a 'features'
# attribute, even if it has no row in db_features_df.
for column in ('Source', 'Destination'):
    for node in filtered_link_graph_df[column].unique():
        if node not in G:
            G.add_node(node, features=np.zeros(feature_dim))

# Add edges straight from the CSV columns; duplicates collapse in a DiGraph.
G.add_edges_from(
    zip(filtered_link_graph_df['Source'], filtered_link_graph_df['Destination'])
)

# Create the feature matrix (X) and edge list (edge_index).
# Node position in G.nodes defines the row index used everywhere below.
node_indices = {node: position for position, node in enumerate(G.nodes)}
node_features = [attrs['features'] for _, attrs in G.nodes(data=True)]
index = len(node_indices)

# Convert the feature list and edge list to torch tensors.
# The reshapes keep the expected 2-D layouts even when there are no nodes/edges.
X = torch.tensor(
    np.array(node_features).reshape(-1, feature_dim),
    dtype=torch.float,
)
edge_index = torch.tensor(
    [[node_indices[src], node_indices[dst]] for src, dst in G.edges],
    dtype=torch.long,
).reshape(-1, 2).t().contiguous()

# Step 6: Create the PyTorch Geometric Data object
data = Data(x=X, edge_index=edge_index)

# Step 7: Define the GraphSAGE model
class GraphSAGELinkPredictor(nn.Module):
    """Two-layer GraphSAGE encoder plus a linear head that scores node pairs."""

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)
        # The head receives the two endpoint embeddings concatenated.
        self.link_pred = nn.Linear(2 * out_channels, 1)

    def forward(self, x, edge_index):
        """Encode node features `x` over `edge_index` into node embeddings."""
        hidden = torch.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)

    def predict_link(self, z, edge_label_index):
        """Return one sigmoid score per column (source, destination) of `edge_label_index`."""
        source_rows = z[edge_label_index[0]]
        destination_rows = z[edge_label_index[1]]
        pair_features = torch.cat((source_rows, destination_rows), dim=1)
        raw_scores = self.link_pred(pair_features)
        return raw_scores.sigmoid().view(-1)

# Initialize the model, its loss and its optimizer.
# The encoder's input width is the width of the node feature matrix X.
input_width = X.size(1)
model = GraphSAGELinkPredictor(
    in_channels=input_width,
    hidden_channels=64,
    out_channels=32,
)
# BCELoss expects probabilities, which predict_link produces via sigmoid.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Step 8: Train the GraphSAGE model
def train(data, model, optimizer, criterion, edge_label_index, edge_labels):
    """Run one full-batch optimisation step and return the loss as a Python float.

    `model` is called on `data.x` / `data.edge_index` to get node embeddings,
    its `predict_link` scores the pairs in `edge_label_index`, and `criterion`
    compares those scores with `edge_labels`.
    """
    model.train()
    optimizer.zero_grad()

    node_embeddings = model(data.x, data.edge_index)
    pair_scores = model.predict_link(node_embeddings, edge_label_index)

    step_loss = criterion(pair_scores, edge_labels)
    step_loss.backward()
    optimizer.step()

    return step_loss.item()

# Create positive and negative samples for link prediction
def create_edge_labels(data):
    """Build the labelled node pairs used to train the link predictor.

    Every edge in ``data.edge_index`` becomes a positive example (label 1).
    An equal number of negative examples (label 0) is drawn at random from
    ordered node pairs that are neither existing edges nor self-loops, so a
    real link is never labelled as a non-link. Negatives may repeat.

    Args:
        data: object exposing ``edge_index`` (a ``2 x E`` long tensor) and
            ``num_nodes``.

    Returns:
        A tuple ``(edge_label_index, edge_labels)``. ``edge_label_index`` is a
        ``2 x (E + N)`` long tensor holding the positives followed by the
        negatives; ``edge_labels`` is the matching float vector of ones and
        zeros. ``N`` equals ``E`` unless the graph has no valid negative pair
        at all, in which case ``N`` is 0.
    """
    edge_index = data.edge_index
    num_edges = edge_index.size(1)
    num_nodes = data.num_nodes
    device = edge_index.device

    # Encode each (src, dst) pair as src * num_nodes + dst so membership in
    # the positive set can be tested with a single vectorised call.
    positive_keys = torch.unique(edge_index[0] * num_nodes + edge_index[1])

    negative_chunks = []
    missing = num_edges

    # Rejection sampling: oversample, drop self-loops and real edges, repeat.
    # On sparse graphs the first round is normally enough.
    for _ in range(10):
        if missing <= 0:
            break
        candidates = torch.randint(
            0, num_nodes, (2, 2 * missing), dtype=torch.long, device=device
        )
        candidate_keys = candidates[0] * num_nodes + candidates[1]
        is_negative = (candidates[0] != candidates[1]) & ~torch.isin(
            candidate_keys, positive_keys
        )
        accepted = candidates[:, is_negative][:, :missing]
        negative_chunks.append(accepted)
        missing -= accepted.size(1)

    if missing > 0:
        # Very dense graph: enumerate the valid negatives explicitly and draw
        # from them. This is only reached when almost every pair is an edge.
        all_keys = torch.arange(num_nodes * num_nodes, device=device)
        all_src = torch.div(all_keys, num_nodes, rounding_mode='floor')
        all_dst = all_keys % num_nodes
        valid_keys = all_keys[
            (all_src != all_dst) & ~torch.isin(all_keys, positive_keys)
        ]
        if valid_keys.numel() > 0:
            picks = valid_keys[
                torch.randint(0, valid_keys.numel(), (missing,), device=device)
            ]
            negative_chunks.append(
                torch.stack(
                    [torch.div(picks, num_nodes, rounding_mode='floor'), picks % num_nodes]
                )
            )

    if negative_chunks:
        neg_edge_index = torch.cat(negative_chunks, dim=1)
    else:
        neg_edge_index = edge_index.new_empty((2, 0))

    # Combine positive and negative pairs, positives first.
    edge_label_index = torch.cat([edge_index, neg_edge_index], dim=1)
    edge_labels = torch.cat([
        torch.ones(num_edges, device=device),
        torch.zeros(neg_edge_index.size(1), device=device),
    ])

    return edge_label_index, edge_labels

# Build the labelled (positive + negative) node pairs used for training.
labelled_pairs = create_edge_labels(data)
edge_label_index, edge_labels = labelled_pairs

# Full-batch training: one optimisation step per epoch, 200 epochs,
# reporting the loss on every tenth epoch.
for epoch in range(200):
    loss = train(
        data,
        model,
        optimizer,
        criterion,
        edge_label_index,
        edge_labels,
    )
    if not epoch % 10:
        print(f'Epoch {epoch}, Loss: {loss:.4f}')

# Step 9: Define the recommendation function
def recommend_links(input_url, model, data, top_k=10, *, exclude_existing=False):
    """Rank candidate destination pages for a link starting at `input_url`.

    Every other node in the module-level `node_indices` mapping is scored as a
    destination with `model.predict_link`, and the best `top_k` are returned.

    Args:
        input_url: URL of the source page; must be a key of `node_indices`.
        model: object providing `eval()`, `__call__(x, edge_index)` and
            `predict_link(z, edge_label_index)`.
        data: object exposing `x` and `edge_index`.
        top_k: maximum number of recommendations to return.
        exclude_existing: when True, destinations that `input_url` already
            links to in `data.edge_index` are left out.

    Returns:
        A list of at most `top_k` dicts with keys "Destination URL" and
        "Probability", ordered from most to least probable. The list is
        shorter than `top_k` when fewer candidates exist.

    Raises:
        ValueError: if `input_url` is not a node of the graph.
    """
    if input_url not in node_indices:
        raise ValueError(f"URL {input_url} not found in the graph.")

    input_index = node_indices[input_url]

    # Keep each candidate URL paired with its node index so that positions in
    # the score vector map back to the right URL (the input URL is skipped).
    candidates = [
        (url, node_index)
        for url, node_index in node_indices.items()
        if url != input_url
    ]

    if exclude_existing:
        edge_sources, edge_targets = data.edge_index[0], data.edge_index[1]
        already_linked = set(edge_targets[edge_sources == input_index].tolist())
        candidates = [
            (url, node_index)
            for url, node_index in candidates
            if node_index not in already_linked
        ]

    k = min(int(top_k), len(candidates))
    if k <= 0:
        return []

    candidate_urls = [url for url, _ in candidates]
    destination_indices = torch.tensor(
        [node_index for _, node_index in candidates],
        dtype=torch.long,
        device=data.x.device,
    )
    possible_edge_index = torch.stack(
        [torch.full_like(destination_indices, input_index), destination_indices]
    )

    model.eval()
    with torch.no_grad():
        z = model(data.x, data.edge_index)
        link_probs = model.predict_link(z, possible_edge_index)

    # topk returns the k largest scores already sorted in descending order.
    top_probs, top_positions = torch.topk(link_probs, k)

    return [
        {"Destination URL": candidate_urls[position], "Probability": probability}
        for position, probability in zip(top_positions.tolist(), top_probs.tolist())
    ]

# Example usage with user input.
# Strip surrounding whitespace: a pasted URL with a stray space or newline
# would otherwise not match any node and be reported as missing.
input_url = input("Enter a URL to get link recommendations: ").strip()

# Only the lookup can raise ValueError (unknown URL), so keep the try body
# minimal and print the results on the success path.
try:
    recommendations = recommend_links(input_url, model, data, top_k=10)
except ValueError as e:
    print(e)
else:
    print("\nTop 10 recommended links:")
    for i, rec in enumerate(recommendations, 1):
        print(f"{i}. Destination URL: {rec['Destination URL']}, Probability: {rec['Probability']:.4f}")




Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Epoch 0, Loss: 0.6995
Epoch 10, Loss: 0.6900
Epoch 20, Loss: 0.6715
Epoch 30, Loss: 0.5931
Epoch 40, Loss: 0.5470
Epoch 50, Loss: 0.5277
Epoch 60, Loss: 0.5169
Epoch 70, Loss: 0.5084
Epoch 80, Loss: 0.5016
Epoch 90, Loss: 0.4963
Epoch 100, Loss: 0.4922
Epoch 110, Loss: 0.4880
Epoch 120, Loss: 0.4845
Epoch 130, Loss: 0.4823
Epoch 140, Loss: 0.4793
Epoch 150, Loss: 0.4769
Epoch 160, Loss: 0.4751
Epoch 170, Loss: 0.4737
Epoch 180, Loss: 0.4723
Epoch 190, Loss: 0.4715
Enter a URL to get link recommendations: https://kalicube.com/learning-spaces/knowledge-nuggets/brand-seo/how-do-you-establish-a-solid-foundation-for-your-brand-online-optyoumize-podcast/

Top 10 recommended links:
1. Destination URL: https://kalicube.com/learning-spaces/, Probability: 0.9569
2. Destination URL: https://kalicube.com/learning-spaces/faq/seo-glossary/the-12-concrete-results-of-impleme