In [None]:
# Enable IPython's autoreload extension so edited modules are re-imported
# automatically (mode 2 = reload all modules before running each cell).
%load_ext autoreload
%autoreload 2

import os
# Move the working directory one level up so the relative "data/..." paths
# used later in the notebook resolve from there.
# NOTE(review): the path is relative, so re-running this cell climbs another
# level each time — restart the kernel before re-running.
os.chdir("../")
print(os.getcwd())

In [None]:
import pandas as pd
import numpy as np
import functools
import operator
import json
import matplotlib.pyplot as plt
from tqdm import tqdm

import torch_geometric as tg
import torch_geometric.transforms as T
from torch_geometric.loader import LinkNeighborLoader, NeighborLoader
from torch_geometric.data import HeteroData
import torch
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter

In [None]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

In [None]:
from kfp import dsl

In [None]:
# Minimal KFP component: builds a greeting for `name`, echoes it to the step
# log and returns it as the component output.
@dsl.component
def say_hello(name: str) -> str:
    greeting = 'Hello, {}!'.format(name)
    print(greeting)
    return greeting

# Single-step pipeline: forwards `recipient` to `say_hello` and exposes that
# step's output as the pipeline result.
@dsl.pipeline(name="HelloWorldPipeline")
def hello_pipeline(recipient: str) -> str:
    return say_hello(name=recipient).output

In [None]:
from kfp import compiler

# Serialize the pipeline definition into a YAML package in the working directory.
compiler.Compiler().compile(
    pipeline_func=hello_pipeline,
    package_path='pipeline.yaml',
)

In [None]:
from kfp.client import Client

# Connect to the Kubeflow Pipelines endpoint and submit the compiled package.
# NOTE(review): host and namespace are hard-coded for a local deployment.
client = Client(host='http://localhost:8080')
client.set_user_namespace('kubeflow')
run = client.create_run_from_pipeline_package(
    pipeline_file='pipeline.yaml',
    arguments={'recipient': 'World'},
)

In [None]:
# def load_node_csv(path, index_col, encoders=None, **kwargs):
#     df = pd.read_csv(path, index_col=index_col, **kwargs)
#     mapping = {index: i for i, index in enumerate(df.index.unique())}

#     x = None
#     if encoders is not None:
#         xs = [encoder(df[col]) for col, encoder in encoders.items()]
#         x = torch.cat(xs, dim=-1)

#     return x, mapping

In [None]:
def load_data_from_csv(path: str) -> pd.DataFrame:
    """
    Read an interaction table from a CSV file.

    The first column of the file is consumed as the DataFrame index.
    Csv file requirements:
        - `user_id` - int
        - `app_id` - int
        - `is_recommended` - int [0/1]

    Parameters:
    - path (str): Location of the CSV file to read.

    Returns:
    - pd.DataFrame: The parsed table, indexed by the file's first column.
    """
    return pd.read_csv(path, index_col=[0])

In [None]:
def load_graph(df: pd.DataFrame) -> HeteroData:
    """
    Builds a bipartite user/app graph from an interaction table.

    Every row of `df` becomes one ('user', 'recommends', 'app') edge labelled
    with the row's `is_recommended` value. `user_id` and `app_id` are used
    directly as node indices, so each node type is allocated `max(id) + 1`
    nodes. That keeps every edge endpoint in range even when the ids are not a
    contiguous 0-based range (ids that never occur simply become isolated
    nodes); for contiguous 0-based ids the node count equals the number of
    unique ids. Node features are single-column zero placeholders.

    Parameters:
        - df (pd.DataFrame): Table with integer columns `user_id` and `app_id`
          (non-negative) and a label column `is_recommended`.

    Returns:
        - HeteroData: A heterogeneous graph with node types 'user' and 'app'
          and one edge type ('user', 'recommends', 'app') carrying
          `edge_index` and `edge_label`.

    Raises:
        - ValueError: If `user_id` or `app_id` contains a negative value.

    Example:
        >>> import pandas as pd
        >>> df = pd.DataFrame({'user_id': [1, 2, 3], 'app_id': [4, 5, 6], 'is_recommended': [1, 0 ,1]})
        >>> graph = load_graph(df)
        >>> graph['user'].num_nodes, graph['app'].num_nodes
        (4, 7)
    """
    # Convert each id column once; building a tensor from a list of ndarrays
    # is slow and triggers a PyTorch warning.
    user_ids = torch.as_tensor(df['user_id'].to_numpy(), dtype=torch.long)
    app_ids = torch.as_tensor(df['app_id'].to_numpy(), dtype=torch.long)

    has_edges = user_ids.numel() > 0
    if has_edges and (int(user_ids.min()) < 0 or int(app_ids.min()) < 0):
        raise ValueError("`user_id` and `app_id` must be non-negative to be used as node indices")

    # Size the node sets from the largest id so that every index in
    # `edge_index` refers to an existing node.
    n_users = int(user_ids.max()) + 1 if has_edges else 0
    n_apps = int(app_ids.max()) + 1 if has_edges else 0

    data = HeteroData()
    data['user'].x = torch.zeros(n_users, 1)
    data['app'].x = torch.zeros(n_apps, 1)

    # Row 0 holds source (user) indices, row 1 destination (app) indices.
    data['user', 'recommends', 'app'].edge_index = torch.stack([user_ids, app_ids])
    data['user', 'recommends', 'app'].edge_label = torch.tensor(df['is_recommended'].values)

    return data

In [None]:
def transform_graph(data: HeteroData) -> HeteroData:
    """
    Runs a heterogeneous graph through the notebook's transform pipeline.

    The pipeline currently consists of a single `T.ToUndirected()` step wrapped
    in `T.Compose`, so further transforms can be appended to the list.

    Parameters:
        data (HeteroData): The graph to transform.

    Returns:
        HeteroData: The graph returned by the composed transform.

    Example:
        >>> transformed_data = transform_graph(data)
    """
    steps = [T.ToUndirected()]
    pipeline = T.Compose(steps)
    transformed = pipeline(data)
    return transformed

In [None]:
def init_edge_loader(data: HeteroData, **kwargs) -> LinkNeighborLoader:
    """
    Initializes a link-level neighbor loader over the ('user', 'recommends', 'app') edges.

    Each batch is seeded with `bs` edges of that type; around their endpoints at
    most `num_neighbors[0]` neighbors are sampled at the first hop and at most
    `num_neighbors[1]` at the second hop. The value returned by
    `next(iter(loader))` is a subgraph of `data` containing only the sampled
    edges and the nodes they touch.

    Args:
        data (HeteroData): Graph holding `edge_index` and `edge_label` for the
            ('user', 'recommends', 'app') edge type.
        **kwargs: Loader options, all required:
            - num_neighbors: neighbors to sample per hop, e.g. `[20, 10]`.
            - neg_sampl: forwarded as `neg_sampling_ratio`.
            - bs: forwarded as `batch_size`.
            - shuffle: whether seed edges are shuffled.
            Any other keyword is ignored.

    Returns:
        LinkNeighborLoader: Loader yielding sampled subgraphs with
        `edge_label_index` / `edge_label` for the seed edges.

    Raises:
        KeyError: If one of the required options is missing.

    Example:
        >>> loader = init_edge_loader(data, num_neighbors=[20, 10], neg_sampl=0.2, bs=32, shuffle=True)
    """
    required = ('num_neighbors', 'neg_sampl', 'bs', 'shuffle')
    missing = [option for option in required if option not in kwargs]
    if missing:
        # Same exception type a plain kwargs lookup would raise, but naming
        # every missing option at once.
        raise KeyError(f"init_edge_loader is missing required option(s): {', '.join(missing)}")

    edge_type = ('user', 'recommends', 'app')
    edge_store = data[edge_type]

    return LinkNeighborLoader(
        data=data,
        num_neighbors=kwargs['num_neighbors'],
        neg_sampling_ratio=kwargs['neg_sampl'],
        edge_label_index=(edge_type, edge_store.edge_index),
        edge_label=edge_store.edge_label,
        batch_size=kwargs['bs'],
        shuffle=kwargs['shuffle'],
    )

In [None]:
# Read the pre-split interaction tables (paths are relative to the working directory).
train_df, test_df = (
    load_data_from_csv(csv_path)
    for csv_path in ("data/graph_train.csv", "data/graph_test.csv")
)

In [None]:
# Turn each interaction table into its own heterogeneous graph.
train_data, test_data = load_graph(train_df), load_graph(test_df)

In [None]:
# Show the training graph as this cell's output.
train_data

In [None]:
# Show the test graph as this cell's output.
test_data

In [None]:
# Apply the shared transform pipeline to both graphs.
# NOTE(review): the names are rebound, so re-running this cell transforms the
# already-transformed graphs again.
train_data, test_data = (
    transform_graph(graph) for graph in (train_data, test_data)
)

In [None]:
# Bind the loader to its own name: assigning it to `test_data` would replace the
# HeteroData graph, and any later `init_edge_loader(test_data, ...)` call would
# then receive a loader instead of a graph and fail.
test_loader = init_edge_loader(test_data, num_neighbors=[20, 10], neg_sampl=0., bs=128, shuffle=False)

In [None]:
# Mini-batch loaders over the labelled edges: two-hop sampling (20 then 10
# neighbors), no negative sampling; only the training edges are shuffled.
train_loader = init_edge_loader(
    train_data,
    num_neighbors=[20, 10],
    neg_sampl=0.,
    bs=256,
    shuffle=True,
)
test_loader = init_edge_loader(
    test_data,
    num_neighbors=[20, 10],
    neg_sampl=0.,
    bs=256,
    shuffle=False,
)

# TODO:
- edge-level train mask
- glue the datasets together and create a separate file with the train mask indices

In [None]:
def train_fn(train_data: HeteroData, test_data: HeteroData):
    """
    Placeholder for a training entry point over a train/test graph pair.

    The function had no body, which is a syntax error; until the logic is
    written it fails loudly instead of silently doing nothing.

    Parameters:
        train_data (HeteroData): Graph intended for training.
        test_data (HeteroData): Graph intended for evaluation.

    Raises:
        NotImplementedError: Always, until the function is implemented.
    """
    raise NotImplementedError("train_fn is not implemented yet")

In [None]:
class GNN(torch.nn.Module):
    """
    Two-layer GraphSAGE encoder: SAGEConv -> ReLU -> SAGEConv.

    Args:
        hidden_channels: Output width of the first convolution.
        out_channels: Output width of the second convolution.
    """

    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        # (-1, -1) leaves the source/destination input widths unspecified so
        # they are inferred lazily from the first inputs.
        self.conv1 = tg.nn.SAGEConv(in_channels=(-1, -1), out_channels=hidden_channels)
        self.conv2 = tg.nn.SAGEConv(in_channels=(-1, -1), out_channels=out_channels)

    def forward(self, x, edge_index):
        """Encode node features `x` using the connectivity in `edge_index`."""
        hidden = self.conv1(x, edge_index)
        hidden = hidden.relu()
        return self.conv2(hidden, edge_index)
    
    
class Classifier(torch.nn.Module):
    """Scores user/app pairs by the dot product of their embeddings."""

    def forward(self, x_user, x_app, edge_label_index):
        """
        Return one score per column of `edge_label_index`.

        Row 0 of `edge_label_index` selects rows of `x_user`, row 1 selects
        rows of `x_app`; the score is the sum over the last dimension of their
        element-wise product.
        """
        user_rows = edge_label_index[0]
        app_rows = edge_label_index[1]
        pairwise = x_user[user_rows] * x_app[app_rows]
        return pairwise.sum(dim=-1)


class Model(torch.nn.Module):
    """
    Link-prediction model: heterogeneous GraphSAGE encoder plus dot-product scorer.

    Node inputs are all-zero placeholder features of width `hidden_channels`.
    They are created inside `forward`, sized to the node counts of the graph
    (or mini-batch) being scored and placed on that batch's device, so the
    model needs no global node counts and follows `batch.to(device)`.

    Args:
        hidden_channels: Width of the placeholder input features and of the
            encoder's hidden layer.
        out_channels: Width of the node embeddings produced by the encoder.
        metadata: Optional `(node_types, edge_types)` tuple handed to
            `to_hetero`, e.g. `data.metadata()`. Defaults to the schema built
            in this notebook: the 'user' / 'app' node types, the
            ('user', 'recommends', 'app') edge type and the
            ('app', 'rev_recommends', 'user') reverse type that
            `T.ToUndirected()` is expected to add.
    """

    DEFAULT_METADATA = (
        ['user', 'app'],
        [('user', 'recommends', 'app'), ('app', 'rev_recommends', 'user')],
    )

    def __init__(self, hidden_channels, out_channels, metadata=None):
        super().__init__()
        # Learnable embeddings were sketched as an alternative to zero features:
        # self.user_emb = torch.nn.Embedding(n_users, hidden_channels)
        # self.app_emb = torch.nn.Embedding(n_apps, hidden_channels)
        self.hidden_channels = hidden_channels

        if metadata is None:
            metadata = self.DEFAULT_METADATA

        # Honour the constructor arguments instead of hard-coded sizes.
        gnn = GNN(hidden_channels=hidden_channels, out_channels=out_channels)
        self.gnn = tg.nn.to_hetero(gnn, metadata, aggr='sum')

        self.clf = Classifier()

    def forward(self, data):
        """
        Score the labelled ('user', 'recommends', 'app') edges of `data`.

        Args:
            data: Heterogeneous graph or mini-batch exposing `edge_index_dict`
                and, on the ('user', 'recommends', 'app') edge type,
                `edge_label_index`.

        Returns:
            One raw score (logit) per column of `edge_label_index`.
        """
        edge_store = data['user', 'recommends', 'app']
        target_device = edge_store.edge_label_index.device

        # Zero features indexed like the nodes of `data` itself, so they line
        # up with the indices used in its `edge_index_dict`.
        x_dict = {
            node_type: torch.zeros(
                data[node_type].num_nodes, self.hidden_channels, device=target_device
            )
            for node_type in ('user', 'app')
        }

        # `x_dict` holds feature matrices of all node types
        # `edge_index_dict` holds all edge indices of all edge types
        x_dict = self.gnn(x_dict, data.edge_index_dict)
        pred = self.clf(
            x_dict["user"],
            x_dict["app"],
            edge_store.edge_label_index,
        )
        return pred
    
# Build the model and place it on the selected device in one step.
model = Model(hidden_channels=64, out_channels=32).to(device)

In [None]:
# Loss applied to the model's raw scores against the 0/1 edge labels.
# (Earlier alternative, kept for reference: torch.nn.NLLLoss())
criterion = F.binary_cross_entropy_with_logits
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)
# TensorBoard writer with its default log directory.
writer = SummaryWriter()

In [None]:
def train_batch(batch):
    """
    Run one optimisation step on a single mini-batch.

    Uses the module-level `model`, `criterion` and `optimizer`.

    Args:
        batch: Mini-batch carrying `edge_label` on the
            ('user', 'recommends', 'app') edge type.

    Returns:
        tuple: `(loss, n_predictions)` — the loss tensor for this batch and the
        number of scores the model produced.
    """
    optimizer.zero_grad()

    logits = model(batch)
    targets = batch['user', 'recommends', 'app'].edge_label.float()

    batch_loss = criterion(logits, targets)
    batch_loss.backward()
    optimizer.step()

    return batch_loss, logits.numel()

In [None]:
def train(n_epochs, print_loss=500):
    """
    Train the module-level `model` on `train_loader`.

    Every `print_loss` batches the mean loss over that window is written to
    TensorBoard under "Loss/train" and printed, then the running sum is reset.

    Args:
        n_epochs: Number of full passes over `train_loader`.
        print_loss: Logging interval in batches.
    """
    n_batches = len(train_loader)
    for epoch in range(n_epochs):
        running_loss = 0.
        for i_batch, batch in enumerate(tqdm(train_loader)):
            batch = batch.to(device)
            loss, _ = train_batch(batch)
            running_loss += loss.item()

            # Parenthesised on purpose: `%` binds tighter than `+`, so the old
            # `i_batch+1 % print_loss` never reached zero and nothing was logged.
            if (i_batch + 1) % print_loss == 0:
                last_loss = running_loss / print_loss
                writer.add_scalar("Loss/train", last_loss, epoch * n_batches + i_batch + 1)
                print(f"batch <{i_batch}> - loss: {last_loss}")
                running_loss = 0.

        #print(f"Epoch: {epoch:03d}, Loss: {running_loss / len(train_loader):.4f}")

In [None]:
# Run five full passes over `train_loader`.
train(n_epochs=5)