**Create the Train / Dev / Test files. <br> Each file is a dictionary in which each key is the ID of an author and each value is a dict with the following keys: <br> - author_embedding : the node embedding that corresponds to the author (tensor of shape (1, 128)) <br> - papers_embedding : the abstract embeddings of the author's papers (tensor of shape (10, dim)), where dim depends on the embedding model used (768 for RoBERTa, 256 for Doc2Vec) <br> - features : the graph structural features (degree, core number, average neighbor degree) plus the number of papers (tensor of shape (1, 4)) <br> - target : the target h-index (tensor of shape (1,)); present in the Train / Dev files only**

In [6]:
import pandas as pd
import numpy as np
import networkx as nx
from tqdm import tqdm_notebook as tqdm
from sklearn.utils import shuffle
import gzip
import pickle
import torch

In [3]:
def load_dataset_file(filename):
    """Read a gzip-compressed pickle from disk.

    Args:
        filename: path of the gzip file to open in binary read mode.

    Returns:
        The object produced by unpickling the decompressed content.

    Note:
        Unpickling can execute arbitrary code; only use this on files
        that come from a trusted source.
    """
    handle = gzip.open(filename, "rb")
    try:
        return pickle.load(handle)
    finally:
        # Always release the file handle, even if unpickling fails
        handle.close()
def save(object, filename, protocol = 0):
    """Saves a compressed object to disk.

    Args:
        object: the Python object to pickle.
        filename: destination path; the file is written gzip-compressed.
        protocol: pickle protocol passed to the pickler (default 0).
    """
    # The context manager closes (and flushes) the gzip stream even when
    # pickling raises, so no handle is leaked and no half-open file lingers.
    with gzip.GzipFile(filename, 'wb') as file:
        # Stream straight into the gzip file instead of first building the
        # whole pickle as one in-memory bytes object.
        pickle.dump(object, file, protocol)

# Roberta Embedding

In [None]:
# Fixed seed for the shuffle below, so the train/valid split derived from the
# shuffled order is the same on every run of the notebook
SPLIT_SEED = 42

# Load the paper's embedding
embedding_per_paper = load_dataset_file('/content/drive/MyDrive/altegrad_datachallenge/files_generated/embedding_per_paper_clean.txt')
# Load the node's embedding
embedding_per_nodes = load_dataset_file('/content/drive/MyDrive/altegrad_datachallenge/files_generated/Node2Vec.txt')
# read the file to create a dictionary with author key and paper list as value
# NOTE(review): this section reads the raw inputs from altegrad_datachallenge/
# while the Doc2Vec section reads them from altegrad_datachallenge/data/ -- confirm which location is current
papers_per_author = {}
with open("/content/drive/MyDrive/altegrad_datachallenge/author_papers.txt","r") as f:
    for l in f:
        if not l.strip():
            # tolerate blank lines (e.g. a trailing newline at the end of the file)
            continue
        # each line is "<author id>:<list of paper ids>"; split once, use both parts
        fields = l.split(":")
        # drop brackets, quotes and the newline, then split the remaining comma-separated ids
        auth_paps = [paper_id.strip() for paper_id in fields[1].replace("[","").replace("]","").replace("\n","").replace("\'","").replace("\"","").split(",")]
        papers_per_author[fields[0]] = auth_paps
# Load train set (shuffled with a fixed seed)
df_train = shuffle(pd.read_csv('/content/drive/MyDrive/altegrad_datachallenge/train.csv', dtype={'authorID': np.int64, 'h_index': np.float32}), random_state=SPLIT_SEED).reset_index(drop=True)
# Load test set
df_test = pd.read_csv('/content/drive/MyDrive/altegrad_datachallenge/test.csv', dtype={'authorID': np.int64}) 
# Load Graph
G = nx.read_edgelist('/content/drive/MyDrive/altegrad_datachallenge/collaboration_network.edgelist', delimiter=' ', nodetype=int)

In [None]:
# Per-node structural features of the collaboration graph
core_number = nx.core_number(G)
avg_neighbor_degree = nx.average_neighbor_degree(G)
# Hold out the last 10% of the rows as the validation set; the right-hand side
# is evaluated in full before either name is rebound
split_at = int(len(df_train)*0.9)
df_valid, df_train = df_train.iloc[split_at:, :], df_train.iloc[:split_at, :]

## Train

In [None]:
# Build one record per training author: node embedding, stacked paper
# embeddings, structural features and the h-index target.
train_data = {}
# Iterate the columns directly: iterrows() would upcast the int64 IDs to float64.
# Passing `total` lets the progress bar show a real percentage.
for node_id, h_index in tqdm(zip(df_train['authorID'], df_train['h_index']), total=len(df_train)):
    node_id = int(node_id)    # int key for the graph, core-number and node-embedding lookups
    author_id = str(node_id)  # str key for papers_per_author and for the output dict
    degree, core_number_, avg_neighbor_degree_ = G.degree(node_id), core_number[node_id], avg_neighbor_degree[node_id]
    author_embedding = torch.from_numpy(embedding_per_nodes[node_id].reshape(1,-1))
    papers_ids = papers_per_author[author_id]
    num_papers = len(papers_ids)
    papers_embedding = []
    for id_paper in papers_ids:
        try:
            papers_embedding.append(torch.from_numpy(embedding_per_paper[id_paper].reshape(1,-1)))
        except KeyError:
            print(f"Missing paper for {author_id}")
            # keep one row per listed paper: a zero vector stands in for the missing embedding
            papers_embedding.append(torch.zeros((1,768)))
    papers_embedding = torch.cat(papers_embedding, dim=0)
    additional_features = torch.from_numpy(np.array([degree, core_number_, avg_neighbor_degree_, num_papers]).reshape(1,-1))
    y = torch.Tensor([float(h_index)])
    train_data[author_id] = {'author_embedding': author_embedding, 'papers_embedding': papers_embedding, 'features': additional_features, 'target': y}

In [None]:
# Saving
# Use pickle protocol 4, as the test-set save does; save() otherwise falls back
# to protocol 0. load_dataset_file() reads either, since pickle.load detects
# the protocol on its own.
save(train_data, '/content/drive/MyDrive/altegrad_datachallenge/data/data.train', 4)
# Deleting (memory)
del train_data

## Validation

In [None]:
# Build one record per validation author: node embedding, stacked paper
# embeddings, structural features and the h-index target.
valid_data = {}
# Iterate the columns directly: iterrows() would upcast the int64 IDs to float64.
# Passing `total` lets the progress bar show a real percentage.
for node_id, h_index in tqdm(zip(df_valid['authorID'], df_valid['h_index']), total=len(df_valid)):
    node_id = int(node_id)    # int key for the graph, core-number and node-embedding lookups
    author_id = str(node_id)  # str key for papers_per_author and for the output dict
    degree, core_number_, avg_neighbor_degree_ = G.degree(node_id), core_number[node_id], avg_neighbor_degree[node_id]
    author_embedding = torch.from_numpy(embedding_per_nodes[node_id].reshape(1,-1))
    papers_ids = papers_per_author[author_id]
    num_papers = len(papers_ids)
    papers_embedding = []
    for id_paper in papers_ids:
        try:
            papers_embedding.append(torch.from_numpy(embedding_per_paper[id_paper].reshape(1,-1)))
        except KeyError:
            # keep one row per listed paper: a zero vector stands in for the missing embedding
            papers_embedding.append(torch.zeros((1,768)))
    papers_embedding = torch.cat(papers_embedding, dim=0)
    additional_features = torch.from_numpy(np.array([degree, core_number_, avg_neighbor_degree_, num_papers]).reshape(1,-1))
    y = torch.Tensor([float(h_index)])
    valid_data[author_id] = {'author_embedding': author_embedding, 'papers_embedding': papers_embedding, 'features': additional_features, 'target': y}

In [None]:
# Use pickle protocol 4, as the test-set save does; save() otherwise falls back
# to protocol 0. load_dataset_file() reads either protocol.
save(valid_data, '/content/drive/MyDrive/altegrad_datachallenge/data/data.valid', 4)
# Free the memory held by the validation records
del valid_data

## Test

In [None]:
# Build one record per test author: node embedding, stacked paper embeddings
# and structural features (no target for the test set).
test_data = {}
# Iterate the ID column directly instead of iterrows(), which may upcast the
# int64 IDs to float64. Passing `total` lets the bar show a real percentage.
for node_id in tqdm(df_test['authorID'], total=len(df_test)):
    node_id = int(node_id)    # int key for the graph, core-number and node-embedding lookups
    author_id = str(node_id)  # str key for papers_per_author and for the output dict
    degree, core_number_, avg_neighbor_degree_ = G.degree(node_id), core_number[node_id], avg_neighbor_degree[node_id]
    author_embedding = torch.from_numpy(embedding_per_nodes[node_id].reshape(1,-1))
    papers_ids = papers_per_author[author_id]
    num_papers = len(papers_ids)
    papers_embedding = []
    for id_paper in papers_ids:
        try:
            papers_embedding.append(torch.from_numpy(embedding_per_paper[id_paper].reshape(1,-1)))
        except KeyError:
            # keep one row per listed paper: a zero vector stands in for the missing embedding
            papers_embedding.append(torch.zeros((1,768)))
    papers_embedding = torch.cat(papers_embedding, dim=0)
    additional_features = torch.from_numpy(np.array([degree, core_number_, avg_neighbor_degree_, num_papers]).reshape(1,-1))
    test_data[author_id] = {'author_embedding': author_embedding, 'papers_embedding': papers_embedding, 'features': additional_features}

In [None]:
# Drop the inputs that are no longer needed before pickling the test set
# (names are deleted left to right, in the same order as before)
del G, df_test, embedding_per_paper, papers_per_author, core_number, avg_neighbor_degree, embedding_per_nodes

In [None]:
# Write the test records with pickle protocol 4, then release them
save(
    test_data,
    filename='/content/drive/MyDrive/altegrad_datachallenge/data/data.test',
    protocol=4,
)
del test_data

# Doc2Vec

In [None]:
# Fixed seed for the shuffle below, so the train/valid split derived from the
# shuffled order is the same on every run of the notebook
SPLIT_SEED = 42

# Load the paper's embedding
embedding_per_paper = load_dataset_file('/content/drive/MyDrive/altegrad_datachallenge/files_generated/doc2vec_paper_embedding.txt')
# Load the node's embedding
embedding_per_nodes = load_dataset_file('/content/drive/MyDrive/altegrad_datachallenge/files_generated/Node2Vec.txt')
# read the file to create a dictionary with author key and paper list as value
papers_per_author = {}
with open("/content/drive/MyDrive/altegrad_datachallenge/data/author_papers.txt","r") as f:
    for l in f:
        if not l.strip():
            # tolerate blank lines (e.g. a trailing newline at the end of the file)
            continue
        # each line is "<author id>:<list of paper ids>"; split once, use both parts
        fields = l.split(":")
        # drop brackets, quotes and the newline, then split the remaining comma-separated ids
        auth_paps = [paper_id.strip() for paper_id in fields[1].replace("[","").replace("]","").replace("\n","").replace("\'","").replace("\"","").split(",")]
        papers_per_author[fields[0]] = auth_paps
# Load train set (shuffled with a fixed seed)
df_train = shuffle(pd.read_csv('/content/drive/MyDrive/altegrad_datachallenge/data/train.csv', dtype={'authorID': np.int64, 'h_index': np.float32}), random_state=SPLIT_SEED).reset_index(drop=True)
# Load test set
df_test = pd.read_csv('/content/drive/MyDrive/altegrad_datachallenge/data/test.csv', dtype={'authorID': np.int64}) 
# Load Graph
G = nx.read_edgelist('/content/drive/MyDrive/altegrad_datachallenge/data/collaboration_network.edgelist', delimiter=' ', nodetype=int)

In [9]:
# Per-node structural features of the collaboration graph
core_number = nx.core_number(G)
avg_neighbor_degree = nx.average_neighbor_degree(G)
# Hold out the last 10% of the rows as the validation set; the right-hand side
# is evaluated in full before either name is rebound
split_at = int(len(df_train)*0.9)
df_valid, df_train = df_train.iloc[split_at:, :], df_train.iloc[:split_at, :]

## Train

In [None]:
# Build one record per training author: node embedding, stacked paper
# embeddings, structural features and the h-index target.
train_data = {}
# Iterate the columns directly: iterrows() would upcast the int64 IDs to float64.
# Passing `total` lets the progress bar show a real percentage.
for node_id, h_index in tqdm(zip(df_train['authorID'], df_train['h_index']), total=len(df_train)):
    node_id = int(node_id)    # int key for the graph, core-number and node-embedding lookups
    author_id = str(node_id)  # str key for papers_per_author and for the output dict
    degree, core_number_, avg_neighbor_degree_ = G.degree(node_id), core_number[node_id], avg_neighbor_degree[node_id]
    author_embedding = torch.from_numpy(embedding_per_nodes[node_id].reshape(1,-1))
    papers_ids = papers_per_author[author_id]
    num_papers = len(papers_ids)
    papers_embedding = []
    for id_paper in papers_ids:
        try:
            papers_embedding.append(torch.from_numpy(embedding_per_paper[id_paper].reshape(1,-1)))
        except KeyError:
            print(f"Missing paper for {author_id}")
            # keep one row per listed paper: a zero vector stands in for the missing embedding
            papers_embedding.append(torch.zeros((1,256)))
    papers_embedding = torch.cat(papers_embedding, dim=0)
    additional_features = torch.from_numpy(np.array([degree, core_number_, avg_neighbor_degree_, num_papers]).reshape(1,-1))
    y = torch.Tensor([float(h_index)])
    train_data[author_id] = {'author_embedding': author_embedding, 'papers_embedding': papers_embedding, 'features': additional_features, 'target': y}

In [11]:
# Saving
# Use pickle protocol 4, as the test-set save does; save() otherwise falls back
# to protocol 0. load_dataset_file() reads either, since pickle.load detects
# the protocol on its own.
save(train_data, '/content/drive/MyDrive/altegrad_datachallenge/data/d2v.train', 4)
# Deleting (memory)
del train_data

## Dev

In [12]:
# Build one record per validation author: node embedding, stacked paper
# embeddings, structural features and the h-index target.
valid_data = {}
# Iterate the columns directly: iterrows() would upcast the int64 IDs to float64.
# Passing `total` lets the progress bar show a real percentage.
for node_id, h_index in tqdm(zip(df_valid['authorID'], df_valid['h_index']), total=len(df_valid)):
    node_id = int(node_id)    # int key for the graph, core-number and node-embedding lookups
    author_id = str(node_id)  # str key for papers_per_author and for the output dict
    degree, core_number_, avg_neighbor_degree_ = G.degree(node_id), core_number[node_id], avg_neighbor_degree[node_id]
    author_embedding = torch.from_numpy(embedding_per_nodes[node_id].reshape(1,-1))
    papers_ids = papers_per_author[author_id]
    num_papers = len(papers_ids)
    papers_embedding = []
    for id_paper in papers_ids:
        try:
            papers_embedding.append(torch.from_numpy(embedding_per_paper[id_paper].reshape(1,-1)))
        except KeyError:
            # keep one row per listed paper: a zero vector stands in for the missing embedding
            papers_embedding.append(torch.zeros((1,256)))
    papers_embedding = torch.cat(papers_embedding, dim=0)
    additional_features = torch.from_numpy(np.array([degree, core_number_, avg_neighbor_degree_, num_papers]).reshape(1,-1))
    y = torch.Tensor([float(h_index)])
    valid_data[author_id] = {'author_embedding': author_embedding, 'papers_embedding': papers_embedding, 'features': additional_features, 'target': y}

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [14]:
# Use pickle protocol 4, as the test-set save does; save() otherwise falls back
# to protocol 0. load_dataset_file() reads either protocol.
save(valid_data, '/content/drive/MyDrive/altegrad_datachallenge/data/d2v.valid', 4)
# Free the memory held by the validation records
del valid_data

## Test

In [15]:
# Build one record per test author: node embedding, stacked paper embeddings
# and structural features (no target for the test set).
test_data = {}
# Iterate the ID column directly instead of iterrows(), which may upcast the
# int64 IDs to float64. Passing `total` lets the bar show a real percentage.
for node_id in tqdm(df_test['authorID'], total=len(df_test)):
    node_id = int(node_id)    # int key for the graph, core-number and node-embedding lookups
    author_id = str(node_id)  # str key for papers_per_author and for the output dict
    degree, core_number_, avg_neighbor_degree_ = G.degree(node_id), core_number[node_id], avg_neighbor_degree[node_id]
    author_embedding = torch.from_numpy(embedding_per_nodes[node_id].reshape(1,-1))
    papers_ids = papers_per_author[author_id]
    num_papers = len(papers_ids)
    papers_embedding = []
    for id_paper in papers_ids:
        try:
            papers_embedding.append(torch.from_numpy(embedding_per_paper[id_paper].reshape(1,-1)))
        except KeyError:
            # keep one row per listed paper: a zero vector stands in for the missing embedding
            papers_embedding.append(torch.zeros((1,256)))
    papers_embedding = torch.cat(papers_embedding, dim=0)
    additional_features = torch.from_numpy(np.array([degree, core_number_, avg_neighbor_degree_, num_papers]).reshape(1,-1))
    test_data[author_id] = {'author_embedding': author_embedding, 'papers_embedding': papers_embedding, 'features': additional_features}

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [16]:
# Drop the inputs that are no longer needed before pickling the test set
# (names are deleted left to right, in the same order as before)
del G, df_test, embedding_per_paper, papers_per_author, core_number, avg_neighbor_degree, embedding_per_nodes

In [17]:
# Write the test records with pickle protocol 4, then release them
save(
    test_data,
    filename='/content/drive/MyDrive/altegrad_datachallenge/data/d2v.test',
    protocol=4,
)
del test_data