In [1]:
import os

import networkx as nx
import numpy as np
import pandas as pd
from dgl.data import DGLDataset

In [3]:
# Load the edge list. The first line of the file is skipped (presumably its
# own header row) and the two columns are given explicit names instead.
git_data = pd.read_csv(
    "musae_git_edges.csv",
    names=["target", "source"],
    header=None,
    skiprows=1,
    sep=",",
)
print("citations shape:", git_data.shape)

citations shape: (289003, 2)


In [25]:
# Only the last expression of a cell is displayed, so the head() result is
# discarded here and the cell output is just the shape tuple.
git_data.head()
git_data.shape

(289003, 2)

In [29]:
# Persist the renamed edge list. to_csv() is called with its default
# index=True, so the row index is written as an extra, unnamed first column
# next to "target" and "source".
git_data.to_csv("git_edges.csv")

In [21]:
import json

# Parse the feature JSON file into the in-memory object `data`.
with open('musae_git_features.json') as features_fp:
    data = json.loads(features_fp.read())

In [60]:
# Build one row per key of `data` (orient='index'). Rows with fewer values
# than the widest one are padded with NaN, which is why the trailing columns
# in the preview are empty and some columns are shown as floats.
# NOTE(review): `papers` holds the GitHub node features; the name looks like a
# carry-over from a citation-graph example - confirm before renaming.
papers = pd.DataFrame.from_dict(data, orient='index')
print(papers.shape)
papers.head()

(37700, 42)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32,33,34,35,36,37,38,39,40,41
0,1574,3773,3571,2672,2478,2534,3129,3077,1171.0,2045.0,...,,,,,,,,,,
1,1193,376,73,290,3129,1852,3077,1171,1022.0,2045.0,...,,,,,,,,,,
2,1574,3773,925,1728,2815,2963,3077,364,1171.0,536.0,...,,,,,,,,,,
3,3964,3773,4003,928,1852,3077,364,1022,3763.0,2045.0,...,,,,,,,,,,
4,1929,3773,1793,3511,1290,3129,3077,364,1171.0,1022.0,...,,,,,,,,,,


In [51]:
# Summarise column dtypes and non-null counts of `papers`.
# NOTE(review): the recorded output lists 11 fully non-null columns, so it was
# produced after the column-truncation and back-fill cells further down had
# already run (execution count 51 vs. 44 and 50). On a fresh top-to-bottom run
# this cell reports the 42-column frame with missing values instead.
papers.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37700 entries, 0 to 37699
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       37700 non-null  int64  
 1   1       37700 non-null  int64  
 2   2       37700 non-null  int64  
 3   3       37700 non-null  int64  
 4   4       37700 non-null  int64  
 5   5       37700 non-null  int64  
 6   6       37700 non-null  int64  
 7   7       37700 non-null  int64  
 8   8       37700 non-null  float64
 9   9       37700 non-null  float64
 10  10      37700 non-null  float64
dtypes: float64(3), int64(8)
memory usage: 3.5+ MB


In [44]:
# Keep only the first 11 columns (positions 0-10); all later columns are dropped.
# NOTE(review): 11 is a hard-coded cut-off - confirm the rationale.
papers = papers.iloc[:, :11]

In [50]:
# Back-fill missing values down each column: a NaN cell takes the next
# non-missing value found in a later row of the same column.
# NOTE(review): that value belongs to a different node, so this imputes one
# node's feature from another - confirm this is intended.
papers = papers.bfill(axis="rows")

In [52]:
# Persist the cleaned feature table. to_csv() is called with its default
# index=True, so the row index is written as an unnamed first column; any code
# reading papers.csv back has to account for that column.
papers.to_csv("papers.csv")

In [15]:
# Load the per-node target table (columns id, name, ml_target according to
# the preview in the next cell).
targets = pd.read_csv('musae_git_target.csv')

In [16]:
# Preview the first rows of the target table.
targets.head()

Unnamed: 0,id,name,ml_target
0,0,Eiryyy,0
1,1,shawflying,0
2,2,JpMCarrilho,1
3,3,SuhwanCha,0
4,4,sunilangadi2,1


In [17]:
import urllib.request

In [18]:
# Download the two example CSV files of the DGL tutorial into the working
# directory, then load each one into a DataFrame.
for _url, _dest in (
    ('https://data.dgl.ai/tutorial/dataset/members.csv', './members.csv'),
    ('https://data.dgl.ai/tutorial/dataset/interactions.csv', './interactions.csv'),
):
    urllib.request.urlretrieve(_url, _dest)

members = pd.read_csv('./members.csv')
interactions = pd.read_csv('./interactions.csv')

# Only the final expression of the cell is rendered, so the members preview
# is computed but not shown; the interactions preview is the cell output.
members.head()
interactions.head()

Unnamed: 0,Src,Dst,Weight
0,0,1,0.043591
1,0,2,0.282119
2,0,3,0.370293
3,0,4,0.73057
4,0,5,0.821187


In [19]:
# Preview the tutorial's node table (columns Id, Club, Age in the output).
members.head()

Unnamed: 0,Id,Club,Age
0,0,Mr. Hi,44
1,1,Mr. Hi,37
2,2,Mr. Hi,37
3,3,Mr. Hi,40
4,4,Mr. Hi,30


In [20]:
# Preview the tutorial's edge table (columns Src, Dst, Weight in the output).
interactions.head()

Unnamed: 0,Src,Dst,Weight
0,0,1,0.043591
1,0,2,0.282119
2,0,3,0.370293
3,0,4,0.73057
4,0,5,0.821187


In [26]:
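# Mapping between this notebook's data and the DGL tutorial's example data:
#   git_data (edge list)          plays the role of  interactions
#   papers   (node feature table) plays the role of  members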


In [27]:
import dgl

import torch


Using backend: pytorch


In [58]:
class GitHubDataset(DGLDataset):
    """Single-graph node-classification dataset built from local CSV files.

    The dataset contains exactly one graph, assembled in ``process()`` from:

    * ``papers.csv``           - node feature table, one row per node
    * ``git_edges.csv``        - edge list with ``source`` and ``target`` columns
    * ``musae_git_target.csv`` - node labels in the ``ml_target`` column

    The graph carries ``feat`` and ``label`` node data plus boolean
    ``train_mask`` / ``val_mask`` / ``test_mask`` node masks.
    """

    def __init__(self):
        super().__init__(name='github_dataset')

    def process(self):
        """Read the CSV files and build ``self.graph``.

        Raises:
            ValueError: if the label table and the feature table do not have
                the same number of rows.
        """
        # papers.csv was written by DataFrame.to_csv() with its default
        # index=True, so its first column is the saved row index (the node
        # id). Load it as the index instead of letting it leak into the
        # feature matrix as an extra column, and order rows by node id so
        # that row i describes node i.
        nodes_data = pd.read_csv('papers.csv', index_col=0).sort_index()
        edges_data = pd.read_csv('git_edges.csv')
        # Order labels by node id as well, so they line up with the features.
        node_target = pd.read_csv('musae_git_target.csv').sort_values('id')

        n_nodes = nodes_data.shape[0]
        if len(node_target) != n_nodes:
            raise ValueError(
                'musae_git_target.csv has %d rows but papers.csv has %d rows'
                % (len(node_target), n_nodes)
            )

        # float32 features and int64 labels are the dtypes torch layers and
        # classification losses expect; the raw frames would otherwise yield
        # float64 features and int8 category codes.
        node_features = torch.from_numpy(nodes_data.to_numpy(dtype='float32'))
        node_labels = torch.from_numpy(
            node_target['ml_target'].astype('category').cat.codes.to_numpy(dtype='int64')
        )
        edges_src = torch.from_numpy(edges_data['source'].to_numpy())
        edges_dst = torch.from_numpy(edges_data['target'].to_numpy())

        # NOTE(review): each row of the edge list becomes one directed edge
        # source -> target. If the underlying relation is symmetric, reverse
        # edges would have to be added as well - confirm against the data.
        self.graph = dgl.graph((edges_src, edges_dst), num_nodes=n_nodes)
        self.graph.ndata['feat'] = node_features
        self.graph.ndata['label'] = node_labels

        # Node-classification split by node position: the first 60% of the
        # nodes are used for training, the next 20% for validation and the
        # remainder for testing.
        n_train = int(n_nodes * 0.6)
        n_val = int(n_nodes * 0.2)
        train_mask = torch.zeros(n_nodes, dtype=torch.bool)
        val_mask = torch.zeros(n_nodes, dtype=torch.bool)
        test_mask = torch.zeros(n_nodes, dtype=torch.bool)
        train_mask[:n_train] = True
        val_mask[n_train:n_train + n_val] = True
        test_mask[n_train + n_val:] = True
        self.graph.ndata['train_mask'] = train_mask
        self.graph.ndata['val_mask'] = val_mask
        self.graph.ndata['test_mask'] = test_mask

    def __getitem__(self, i):
        """Return the only graph of the dataset.

        Raises:
            IndexError: if ``i`` is not 0 (or -1). Without this check every
                index would succeed and plain iteration over the dataset
                would never terminate.
        """
        if i not in (0, -1):
            raise IndexError('GitHubDataset contains a single graph; index %r is out of range' % (i,))
        return self.graph

    def __len__(self):
        """Number of graphs in the dataset (always 1)."""
        return 1

dataset = GitHubDataset()
graph = dataset[0]
