In [1]:
import os
import pandas as pd
import numpy as np
import networkx as nx
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") #if you have a GPU with CUDA installed, this may speed up computation

Load datasets

In [3]:
# read training data and partition it in train and validation files
df_train = pd.read_csv('../data/train.csv', dtype={'author': np.int64, 'hindex': np.float32})
n_train = df_train.shape[0]

msk = np.random.rand(len(df_train)) < 0.8

internal_train = df_train[msk]
internal_train.to_csv('../data/internal-train.csv')
internal_validation = df_train[~msk]
internal_validation.to_csv('../data/internal-validation.csv')

In [4]:
# load the pre-processed features    
df_features = pd.read_csv('../data/node-embeddings/deepwalk-128.emb', header=None, skiprows=1, delimiter=' ')
df_features.rename(columns={0 :'author_id'}, inplace=True)
df_features.set_index('author_id', inplace=True)
df_features.sort_index(inplace=True)
df_features.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,119,120,121,122,123,124,125,126,127,128
author_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1036332,-0.025803,0.449338,-0.130984,1.176001,-0.210165,-1.066286,0.026156,-0.777965,0.432119,0.765542,...,0.625622,-0.095299,0.823723,-0.586748,1.241237,0.46163,-0.136655,0.191903,0.788927,-0.657276
1101850,0.324045,0.031472,-0.318788,0.045695,0.675653,0.589153,-0.145144,-0.873704,0.348498,0.480307,...,0.100493,0.33901,0.449462,-0.632242,0.087819,-0.165578,0.260031,0.524244,-0.616389,-0.184688
1336878,0.046531,-0.258634,0.113372,0.500867,0.190693,-0.483764,-0.042357,-0.324904,0.652467,0.358866,...,-0.172582,-0.016618,0.345227,-0.505127,0.498401,0.80124,0.257806,-0.576735,0.151219,-0.221204
1515524,0.112803,-0.625676,0.041467,0.486069,0.86846,0.553456,-0.137142,-0.59616,-0.72739,0.086116,...,0.433929,-0.189008,0.136833,-0.999479,-0.062883,1.002005,0.897219,-0.697862,-0.084246,0.258323
1606427,0.540616,-0.197666,-0.031343,0.874454,0.397556,-0.045367,-0.392079,-1.022359,0.624348,0.319048,...,0.875712,-0.629394,0.796365,-0.668456,-0.298803,0.162881,0.25446,-0.223389,0.320299,0.087225


In [5]:
class AuthorDataset(Dataset):
    # The mapping file maps an author to its h-index
    def __init__(self, mapping_file):
        self.author_map = pd.read_csv(mapping_file)

    def __len__(self):
        return len(self.author_map)

    def __getitem__(self, idx):
        # Get the author id and its h-index
        author_id = self.author_map.iloc[idx, 1]
        h_index = self.author_map.iloc[idx, 2].astype(np.float32)
        features = df_features.loc[author_id,:].to_numpy(dtype=np.float32)
        return features, h_index

In [6]:
class MLP(nn.Module):
    def __init__(self, n_input, n_hidden1, n_hidden2, n_output, use_dropout=True):
        super(MLP, self).__init__()
        self.use_dropout = use_dropout
        self.fc1 = torch.nn.Linear(n_input, n_hidden1)
        self.fc2 = torch.nn.Linear(n_hidden1, n_hidden2)
        self.dropout1 = torch.nn.Dropout(p=0.2)
        self.dropout2 = torch.nn.Dropout(p=0.2)
        self.output = torch.nn.Linear(n_hidden2, n_output)  
        
    def forward(self, x, verbose=False):
        x = self.fc1(x)
        if self.use_dropout:
            x = self.dropout1(x)
        x = F.relu(x)
        x = self.fc2(x)
        if self.use_dropout:
            x = self.dropout2(x)
        x = F.relu(x)
        x = self.output(x)
        return x

In [14]:
def train(model, device, train_loader, optimizer, epoch):
    log_interval=100
    model.train() #set model in train mode
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data).squeeze()
        print(output.shape)
        
        # MSE loss is used in this case
        loss = F.mse_loss(output, target)
        
        loss.backward()
        optimizer.step()
        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
            
def test(model, device, test_loader):
    model.eval() #set model in test mode
    test_loss = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data).squeeze()
            test_loss += F.mse_loss(output, target, reduction="sum").item()  # sum up batch loss
    
    test_loss /= len(test_loader.dataset)

    print('\nTest set: MSE loss on test set: {:.4f}\n'.format(
        test_loss))

In [11]:
train_dataset = AuthorDataset('../data/internal-train.csv')
validation_dataset = AuthorDataset('../data/internal-validation.csv')

train_loader = DataLoader(train_dataset,batch_size=64,shuffle=True)
validation_loader = DataLoader(validation_dataset,batch_size=1000,shuffle=True)

In [15]:
input_size = df_features.shape[1]
hidden = [64, 64]
output_size = 1

model = MLP(input_size, hidden[0], hidden[1], output_size)
model.to(device)
#sgd_optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
adam_optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

print(model)

MLP(
  (fc1): Linear(in_features=128, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=64, bias=True)
  (dropout1): Dropout(p=0.2, inplace=False)
  (dropout2): Dropout(p=0.2, inplace=False)
  (output): Linear(in_features=64, out_features=1, bias=True)
)


In [16]:
n_epochs = 200

for epoch in range(0, n_epochs):
    train(model, device, train_loader, adam_optimizer, epoch)
    # test(model, device, validation_loader)

torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64

torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64

torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])


KeyboardInterrupt: 

In [54]:
df_test = pd.read_csv('../data/test.csv', index_col=0, dtype={'author': np.int64, 'hindex': np.float32}, delimiter=',')

model.eval()
for i, row in df_test.iterrows():
    author_id = row['author']
    features = df_features.loc[author_id,:].to_numpy(dtype=np.float32)
    h_index = int(round(model(torch.from_numpy(features)).item()))
    df_test.at[i, 'hindex']  = h_index

df_test = df_test.astype({'hindex':np.int32})
df_test.to_csv('../data/test-completed.csv', sep=',')

KeyboardInterrupt: 