In [1]:
import os
import pandas as pd
import numpy as np
import networkx as nx
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") #if you have a GPU with CUDA installed, this may speed up computation

Load datasets

In [4]:
# read training data and partition it in train and validation files
df_train = pd.read_csv('../data/train.csv', dtype={'author': np.int64, 'hindex': np.float32})
n_train = df_train.shape[0]

msk = np.random.rand(len(df_train)) < 0.8

internal_train = df_train[msk]
internal_train.to_csv('../data/internal-train.csv')
internal_validation = df_train[~msk]
internal_validation.to_csv('../data/internal-validation.csv')

In [5]:
# load the pre-processed features    
df_features = pd.read_csv('../data/author-embeddings/glove-twitter-100-deepwalk-128.csv', header=None)
df_features.rename(columns={0 :'author_id'}, inplace=True)
df_features.set_index('author_id', inplace=True)
df_features.sort_index(inplace=True)
df_features.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,219,220,221,222,223,224,225,226,227,228
author_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1036332,0.19717,0.12603,-0.07933,0.02404,0.24165,0.17292,0.01,-0.36463,0.09118,-0.21507,...,0.62562,-0.0953,0.82372,-0.58675,1.24124,0.46163,-0.13666,0.1919,0.78893,-0.65728
1101850,0.184,-0.08761,-0.02453,0.09629,0.13563,0.13203,0.08129,-0.3924,-0.05746,-0.18697,...,0.10049,0.33901,0.44946,-0.63224,0.08782,-0.16558,0.26003,0.52424,-0.61639,-0.18469
1336878,0.15419,-0.16287,-0.0088,0.12217,0.17434,0.18038,0.08272,-0.44184,-0.06329,-0.24035,...,-0.17258,-0.01662,0.34523,-0.50513,0.4984,0.80124,0.25781,-0.57674,0.15122,-0.2212
1515524,0.22254,-0.14379,-0.03459,-0.07819,0.17318,0.05151,0.00876,-0.3134,-0.12205,-0.18052,...,0.43393,-0.18901,0.13683,-0.99948,-0.06288,1.00201,0.89722,-0.69786,-0.08425,0.25832
1606427,0.25398,-0.17235,-0.03324,0.1843,0.24849,0.18632,0.12979,-0.61093,0.05607,-0.22261,...,0.87571,-0.62939,0.79637,-0.66846,-0.2988,0.16288,0.25446,-0.22339,0.3203,0.08723


In [6]:
class AuthorDataset(Dataset):
    # The mapping file maps an author to its h-index
    def __init__(self, mapping_file):
        self.author_map = pd.read_csv(mapping_file)

    def __len__(self):
        return len(self.author_map)

    def __getitem__(self, idx):
        # Get the author id and its h-index
        author_id = self.author_map.iloc[idx, 1]
        h_index = self.author_map.iloc[idx, 2].astype(np.float32)
        features = df_features.loc[author_id,:].to_numpy(dtype=np.float32)
        return features, h_index

In [49]:
class MLP(nn.Module):
    def __init__(self, n_input, n_hidden, n_output):
        super(MLP, self).__init__()
        self.fc1 = torch.nn.Linear(n_input, n_hidden)  
        self.fc2 = torch.nn.Linear(n_hidden, n_hidden)
        self.output = torch.nn.Linear(n_hidden, n_output)  
        
    def forward(self, x, verbose=False):
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        x = self.output(x)
        return x

In [62]:
def train(model, device, train_loader, optimizer, epoch):
    log_interval=100
    model.train() #set model in train mode
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        
        # MSE loss is used in this case
        loss = F.mse_loss(output, target)
        
        loss.backward()
        optimizer.step()
        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
            
def test(model, device, test_loader):
    model.eval() #set model in test mode
    test_loss = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.mse_loss(output, target, reduction="sum").item()  # sum up batch loss
    
    test_loss /= len(test_loader.dataset)

    print('\nTest set: MSE loss on test set: {:.4f}\n'.format(
        test_loss))

In [63]:
train_dataset = AuthorDataset('../data/internal-train.csv')
validation_dataset = AuthorDataset('../data/internal-validation.csv')

train_loader = DataLoader(train_dataset,batch_size=64,shuffle=True)
validation_loader = DataLoader(validation_dataset,batch_size=1000,shuffle=True)

34928


In [None]:
input_size = df_features.shape[1]
n_hidden = 256
output_size = 1

model = MLP(input_size, n_hidden, output_size)
model.to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

n_epochs = 4

for epoch in range(0, n_epochs):
    train(model, device, train_loader, optimizer, epoch)
    test(model, device, validation_loader)

  loss = F.mse_loss(output, target)




  loss = F.mse_loss(output, target)
  test_loss += F.mse_loss(output, target, reduction="sum").item()  # sum up batch loss
  test_loss += F.mse_loss(output, target, reduction="sum").item()  # sum up batch loss


34928
5582088208.0
159817.00091617042

Test set: MSE loss on test set: 159817.0009



In [41]:
df_test = pd.read_csv('../data/test.csv', index_col=0, dtype={'author': np.int64, 'hindex': np.float32}, delimiter=',')

model.eval()
for i, row in df_test.iterrows():
    author_id = row['author']
    features = df_features.loc[author_id,:].to_numpy(dtype=np.float32)
    h_index = int(round(model(torch.from_numpy(features)).item()))
    df_test.at[i, 'hindex']  = h_index

df_test = df_test.astype({'hindex':np.int32})
df_test.to_csv('../data/test-completed.csv', sep=',')