In [1]:
import os
import sys
import pandas as pd
import numpy as np
import networkx as nx
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from sklearn.model_selection import KFold
import torch.nn.functional as F

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") #if you have a GPU with CUDA installed, this may speed up computation

In [35]:
# read training data and partition it in train and validation files
df_train = pd.read_csv('../data/train.csv', dtype={'author': np.int64, 'hindex': np.float32})
n_train = df_train.shape[0]
print(df_train.head())

       author  hindex
0  1964267543     4.0
1  2153592714    13.0
2   217158525     8.0
3  2123103677    11.0
4  2067710487     3.0


In [19]:
# load the pre-processed features    
df_features = pd.read_csv('../../data/author-embeddings/allenai-specter-pca-128-deepwalk-128.csv', header=None)
df_features.rename(columns={0 :'author_id'}, inplace=True)
df_features.set_index('author_id', inplace=True)
df_features.sort_index(inplace=True)
df_features.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,247,248,249,250,251,252,253,254,255,256
author_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1036332,-1.90955,5.90279,-5.28037,4.26569,3.34847,-4.41165,1.9131,0.69429,-0.68736,-2.67308,...,0.62562,-0.0953,0.82372,-0.58675,1.24124,0.46163,-0.13666,0.1919,0.78893,-0.65728
1101850,-2.07252,-11.67235,-6.54985,2.34372,-1.66278,-1.01753,3.29457,-2.45055,-2.13195,1.19772,...,0.10049,0.33901,0.44946,-0.63224,0.08782,-0.16558,0.26003,0.52424,-0.61639,-0.18469
1336878,-4.6321,-4.48402,4.62236,4.82063,-0.75189,0.10461,7.15607,-4.27587,-1.61411,-0.5857,...,-0.17258,-0.01662,0.34523,-0.50513,0.4984,0.80124,0.25781,-0.57674,0.15122,-0.2212
1515524,-3.98285,-4.90364,-2.15142,-0.97513,0.29883,6.15499,-2.35005,3.9363,-3.59853,5.12052,...,0.43393,-0.18901,0.13683,-0.99948,-0.06288,1.00201,0.89722,-0.69786,-0.08425,0.25832
1606427,0.15545,-5.66483,5.10379,1.02987,-1.1954,-6.62419,-3.93375,-2.3894,0.50703,0.67375,...,0.87571,-0.62939,0.79637,-0.66846,-0.2988,0.16288,0.25446,-0.22339,0.3203,0.08723


In [41]:
class AuthorDataset(Dataset):
    # The mapping file maps an author to its h-index
    def __init__(self, mapping_file):
        self.author_map = pd.read_csv(mapping_file)

    def __len__(self):
        return len(self.author_map)

    def __getitem__(self, idx):
        # Get the author id and its h-index
        author_id = self.author_map.iloc[idx, 0]
        h_index = self.author_map.iloc[idx, 1].astype(np.float32)
        features = df_features.loc[author_id,:].to_numpy(dtype=np.float32)
        return features, h_index

In [66]:
class TunableParameters:
    def __init__(self, n_hidden1, n_hidden2, use_dropout, dropout_p, lr):
        self.n_hidden1 = n_hidden1
        self.n_hidden2 = n_hidden2
        self.use_dropout = use_dropout
        self.dropout_p = dropout_p
        self.lr = lr
    
    def print_parameters(self):
        print("Tunable parameters:")
        print("########################################")
        print("Size of hidden layer 1:", self.n_hidden1)
        print("Size of hidden layer 2:", self.n_hidden2)
        print("Using dropout:", self.use_dropout)
        print("Dropout p:", self.dropout_p)
        print("Learning rate:", self.lr)

In [67]:
class MLP(nn.Module):
    def __init__(self, n_input, n_hidden1, n_hidden2, n_output, use_dropout=True, dropout_p=0.2):
        super(MLP, self).__init__()
        self.use_dropout = use_dropout
        self.dropout_p = dropout_p
        self.fc1 = torch.nn.Linear(n_input, n_hidden1)
        self.fc2 = torch.nn.Linear(n_hidden1, n_hidden2)
        if use_dropout:
            self.dropout1 = torch.nn.Dropout(p=self.dropout_p)
            self.dropout2 = torch.nn.Dropout(p=self.dropout_p)
        self.output = torch.nn.Linear(n_hidden2, n_output)  
        
    def forward(self, x, verbose=False):
        x = self.fc1(x)
        if self.use_dropout:
            x = self.dropout1(x)
        x = F.relu(x)
        x = self.fc2(x)
        if self.use_dropout:
            x = self.dropout2(x)
        x = F.relu(x)
        x = self.output(x)
        return x

In [68]:
n_epochs = 5
n_k_folds = 5

input_size = df_features.shape[1]
output_size = 1

# Set the seed for random shuffles
torch.manual_seed(7)

<torch._C.Generator at 0x7f338b9643b0>

In [69]:
parameters = [TunableParameters(512, 256, True, 0.2, 0.01), TunableParameters(512, 256, False, 0.0, 0.01), TunableParameters(512, 256, True, 0.2, 0.001), TunableParameters(256, 128, True, 0.2, 0.01)]

In [70]:
train_dataset = AuthorDataset('../../data/train.csv')

In [71]:
k_fold = KFold(n_splits=n_k_folds, shuffle=True)

In [72]:
for param in parameters:
    param.results = {}
    for fold, (train_ids, validation_ids) in enumerate(k_fold.split(train_dataset)):

        print(f'Performing fold {fold}...')
        print('##########################################')

        train_subsampler = SubsetRandomSampler(train_ids)
        validation_subsampler = SubsetRandomSampler(validation_ids)

        train_loader = DataLoader(train_dataset, batch_size=64, sampler=train_subsampler)
        validation_loader = DataLoader(train_dataset, batch_size=64, sampler=validation_subsampler)

        model = MLP(input_size, param.n_hidden1, param.n_hidden2, output_size, param.use_dropout, param.dropout_p)
        # reset weights
        for layer in model.children():
             if hasattr(layer, 'reset_parameters'):
                layer.reset_parameters()
        model.to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

        for epoch in range(0, n_epochs):
            fold_loss = 0.0
            data_size = 0

            for i, (data, target) in enumerate(train_loader):
                data, target = data.to(device), target.to(device)
                data_size += len(data)

                optimizer.zero_grad()
                output = model(data).squeeze()

                # MSE loss is used in this case
                loss = F.mse_loss(output, target)
                loss.backward()

                optimizer.step()

                fold_loss += F.mse_loss(output, target, reduction='sum')

            fold_loss /= data_size
            print(f'Epoch {epoch} completed, MSE loss: {fold_loss}')
        print("Training complete, switching to evaluation...")

        eval_loss = 0.0
        model.eval()

        with torch.no_grad():
            data_size = 0
            for i, (data, target) in enumerate(validation_loader):
                data, target = data.to(device), target.to(device)
                data_size += len(data)
                output = model(data).squeeze()
                eval_loss += F.mse_loss(output, target, reduction="sum").item()  # sum up batch loss

            eval_loss /= data_size

            print(f'MSE loss on fold {fold}: {eval_loss}')
            param.results[fold] = eval_loss  

Performing fold 0...
##########################################
Epoch 0 completed, MSE loss: 99.35980224609375
Epoch 1 completed, MSE loss: 89.04737091064453
Epoch 2 completed, MSE loss: 83.77372741699219
Epoch 3 completed, MSE loss: 82.3624038696289
Epoch 4 completed, MSE loss: 78.7078628540039
Training complete, switching to evaluation...
MSE loss on fold 0: 82.11528676340694
Performing fold 1...
##########################################


  loss = F.mse_loss(output, target)
  fold_loss += F.mse_loss(output, target, reduction='sum')


Epoch 0 completed, MSE loss: 99.43627166748047
Epoch 1 completed, MSE loss: 88.03763580322266
Epoch 2 completed, MSE loss: 84.879638671875


KeyboardInterrupt: 

In [65]:
# Print final results
print(f'KFold results for k={n_k_folds}:')
print('######################################')
for param in parameters: 
    print("Results for the following parameters:")
    param.print_parameters()
    average = 0.0
    for k, v in param.results.items():
        print(f'Fold {k}: {v}')
        average += v
    average /= len(loss_results)
    print(f'Average value: {average}')

KFold results for k=5:
######################################
Results for the following parameters:
Fold 0: 86.12413353862378
Fold 1: 85.75901460603797
Fold 2: 85.065986566745
Fold 3: 82.4649932377923
Fold 4: 87.12480092464618
Average value: 213.26946443692262
Results for the following parameters:
Fold 0: 85.5780870047439
Fold 1: 91.54656057997074
Fold 2: 124.66160683680063
Fold 3: 86.93163720263156
Fold 4: 82.61718445244571
Average value: 235.66753803829627
