In [1]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pathlib import Path
from collections import OrderedDict
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn import preprocessing
import matplotlib.pyplot as plt
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
import wandb

In [2]:
datafile_path = Path('C:/Users/Ni Ying/Desktop/data/network_datanew.xlsx')
datasets_root = Path('C:/Users/Ni Ying/Desktop/data/working')

In [3]:
raw_dt = pd.read_excel(datafile_path)
raw_dt.head()

Unnamed: 0,uid,type,betweenness_centrality,degree_centrality,closeness_centrality,pagerank,degree,clustering,ego_betweenness_centrality,ego_pagerank
0,1,pri,0.03727,0.000561,0.24258,0.00012,25,0.016667,0.976667,0.391487
1,12,pri,5.7e-05,0.000157,0.230282,3e-05,7,0.0,1.0,0.469596
2,18,pri,0.002311,0.003682,0.238747,0.000632,164,0.031423,0.804976,0.161289
3,22,pri,3e-06,4.5e-05,0.198369,1.3e-05,2,0.0,1.0,0.486486
4,35,pri,0.0,2.2e-05,0.196286,8e-06,1,0.0,0.0,0.5


In [4]:
data=raw_dt

In [5]:
cont_vars = ['betweenness_centrality','degree_centrality','closeness_centrality','pagerank','degree','clustering','ego_betweenness_centrality','ego_pagerank']

In [10]:
pri_data = data[data['type']=='pri']
tst_data = pri_data
off_data = data[data['type']=='off']
plat_data = data[data['type']=='plat']
tr_data = pd.concat((off_data, plat_data), ignore_index=True)

In [11]:
tr_data.head()

Unnamed: 0,uid,type,betweenness_centrality,degree_centrality,closeness_centrality,pagerank,degree,clustering,ego_betweenness_centrality,ego_pagerank
0,5719,off,0.0,2.2e-05,0.17352,7e-06,1,0.0,0.0,0.5
1,6719,off,4e-05,6.7e-05,0.218409,1.6e-05,3,0.0,1.0,0.479731
2,7179,off,0.00041,0.000135,0.216396,4e-05,6,0.066667,0.933333,0.40776
3,7207,off,0.001645,0.000651,0.240129,0.000108,29,0.08867,0.786412,0.217646
4,7257,off,0.0,2.2e-05,0.183737,1.1e-05,1,0.0,0.0,0.5


In [12]:
tst_data.head()

Unnamed: 0,uid,type,betweenness_centrality,degree_centrality,closeness_centrality,pagerank,degree,clustering,ego_betweenness_centrality,ego_pagerank
0,1,pri,0.03727,0.000561,0.24258,0.00012,25,0.016667,0.976667,0.391487
1,12,pri,5.7e-05,0.000157,0.230282,3e-05,7,0.0,1.0,0.469596
2,18,pri,0.002311,0.003682,0.238747,0.000632,164,0.031423,0.804976,0.161289
3,22,pri,3e-06,4.5e-05,0.198369,1.3e-05,2,0.0,1.0,0.486486
4,35,pri,0.0,2.2e-05,0.196286,8e-06,1,0.0,0.0,0.5


In [13]:
scaler = preprocessing.StandardScaler().fit(tr_data[cont_vars])

In [14]:
tr_data_scaled = tr_data.copy()
tr_data_scaled[cont_vars] = scaler.transform(tr_data[cont_vars])
tst_data_scaled = tst_data.copy()
tst_data_scaled[cont_vars] = scaler.transform(tst_data[cont_vars])

In [15]:
tst_data, tst_data_scaled

(           uid type  betweenness_centrality  degree_centrality  \
 0            1  pri                0.037270           0.000561   
 1           12  pri                0.000057           0.000157   
 2           18  pri                0.002311           0.003682   
 3           22  pri                0.000003           0.000045   
 4           35  pri                0.000000           0.000022   
 ...        ...  ...                     ...                ...   
 63656  1817611  pri                0.000000           0.000022   
 63657  1817678  pri                0.000000           0.000000   
 63658  1817710  pri                0.000000           0.000000   
 63659  1817712  pri                0.000000           0.000000   
 63660  1817890  pri                0.000000           0.000000   
 
        closeness_centrality  pagerank  degree  clustering  \
 0                  0.242580  0.000120      25    0.016667   
 1                  0.230282  0.000030       7    0.000000   
 2      

In [16]:
tr_data, tr_data_scaled

(          uid  type  betweenness_centrality  degree_centrality  \
 0        5719   off                0.000000           0.000022   
 1        6719   off                0.000040           0.000067   
 2        7179   off                0.000410           0.000135   
 3        7207   off                0.001645           0.000651   
 4        7257   off                0.000000           0.000022   
 ...       ...   ...                     ...                ...   
 3767  1808066  plat                0.000000           0.000000   
 3768  1808828  plat                0.000000           0.000000   
 3769  1809348  plat                0.000000           0.000000   
 3770  1823090  plat                0.000000           0.000000   
 3771  1825048  plat                0.000000           0.000000   
 
       closeness_centrality  pagerank  degree  clustering  \
 0                 0.173520  0.000007       1    0.000000   
 1                 0.218409  0.000016       3    0.000000   
 2         

In [17]:
tr_data_scaled.to_csv(datasets_root/'sn_ad_train.csv', index=False)
tst_data_scaled.to_csv(datasets_root/'sn_ad_test.csv', index=False)

In [45]:
class TSDataset(Dataset):
    def __init__(self, split, cont_vars=None, lbl_as_feat=True):
        """
        split: 'train' if we want to get data from the training examples, 'test' for
        test examples, or 'both' to merge the training and test sets and return samples
        from either.
        cont_vars: List of continuous variables to return as features. If None, returns
        all continuous variables available.
        cat_vars: Same as above, but for categorical variables.
        lbl_as_feat: Set to True when training a VAE -- the labels (temperature values)
        will be included as another dimension of the data. Set to False when training
        a model to predict temperatures.
        """
        super().__init__()
        assert split in ['train', 'test', 'both']
        self.lbl_as_feat = lbl_as_feat
        if split == 'train':
            self.df = pd.read_csv(datasets_root/'sn_ad_train.csv')
        elif split == 'test':
            self.df = pd.read_csv(datasets_root/'sn_ad_test.csv')
        else:
            df1 = pd.read_csv(datasets_root/'sn_ad_train.csv')
            df2 = pd.read_csv(datasets_root/'sn_ad_test.csv')
            self.df = pd.concat((df1, df2), ignore_index=True)
        
        if cont_vars:
            self.cont_vars = cont_vars
        else:  # if no list provided, use all available
            self.cont_vars = ['betweenness_centrality','degree_centrality','closeness_centrality','pagerank','degree','clustering','ego_betweenness_centrality','ego_pagerank']
        
        # Finally, make two Numpy arrays for continuous and categorical
        # variables, respectively:
        if self.lbl_as_feat:
            self.cont = self.df[self.cont_vars].copy().to_numpy(dtype=np.float32)
        else:
            self.cont = self.df[self.cont_vars].copy().to_numpy(dtype=np.float32)
            self.lbl = self.df['type'].copy().to_numpy(dtype=np.float32)
        #self.cat = self.df[self.cat_vars].copy().to_numpy(dtype=np.int64)
            
    def __getitem__(self, idx):
        if self.lbl_as_feat:  # for VAE training
            return torch.tensor(self.cont[idx])
        else:  # for supervised prediction
            return torch.tensor(self.cont[idx]), torch.tensor(self.lbl[idx])
    
    def __len__(self):
        return self.df.shape[0]

In [46]:
ds = TSDataset(split='both', cont_vars=['betweenness_centrality','degree_centrality','closeness_centrality','pagerank','degree','clustering','ego_betweenness_centrality','ego_pagerank'], lbl_as_feat=True)
print(len(ds))
it = iter(ds)
for _ in range(10):
    print(next(it))

63663
tensor([-0.0598, -0.0570,  1.1951, -0.0423, -0.0570, -0.1211, -0.5435,  1.4220])
tensor([-0.0416, -0.0080,  1.6818, -0.0126, -0.0080, -0.1211,  1.8869,  1.3335])
tensor([0.1260, 0.0655, 1.6600, 0.0609, 0.0655, 0.9456, 1.7249, 1.0192])
tensor([0.6852, 0.6292, 1.9174, 0.2701, 0.6292, 1.2977, 1.3678, 0.1891])
tensor([-0.0598, -0.0570,  1.3059, -0.0300, -0.0570, -0.1211, -0.5435,  1.4220])
tensor([-0.0598, -0.0570,  1.3059, -0.0300, -0.0570, -0.1211, -0.5435,  1.4220])
tensor([-0.0598, -0.0570,  1.2993, -0.0434, -0.0570, -0.1211, -0.5435,  1.4220])
tensor([-0.0408, -0.0325,  1.6541, -0.0156, -0.0325, -0.1211,  1.8869,  1.3630])
tensor([-0.0598, -0.0815, -0.6865, -0.0629, -0.0815, -0.1211, -0.5435, -0.7613])
tensor([-0.0598, -0.0570,  1.1691, -0.0392, -0.0570, -0.1211, -0.5435,  1.4220])


In [47]:
class Layer(nn.Module):
    '''
    A single fully connected layer with optional batch normalisation and activation.
    '''
    def __init__(self, in_dim, out_dim, bn = True):
        super().__init__()
        layers = [nn.Linear(in_dim, out_dim)]
        if bn: layers.append(nn.BatchNorm1d(out_dim))
        layers.append(nn.LeakyReLU(0.1, inplace=True))
        self.block = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.block(x)

    
class Encoder(nn.Module):
    '''
    The encoder part of our VAE. Takes a data sample and returns the mean and the log-variance of the 
    latent vector's distribution.
    '''
    def __init__(self, hparams):
        super().__init__()

        self.embeds = nn.ModuleList([
            nn.Embedding(n_cats, emb_size) for (n_cats, emb_size) in hparams.embedding_sizes
        ])
        # The input to the first layer is the concatenation of all embedding vectors and continuous
        # values
        in_dim = sum(emb.embedding_dim for emb in self.embeds) + len(hparams.cont_vars)
        layer_dims = [in_dim] + [int(s) for s in hparams.layer_sizes.split(',')]
        bn = hparams.batch_norm
        self.layers = nn.Sequential(
            *[Layer(layer_dims[i], layer_dims[i + 1], bn) for i in range(len(layer_dims) - 1)],
        )
        self.mu = nn.Linear(layer_dims[-1], hparams.latent_dim)
        self.logvar = nn.Linear(layer_dims[-1], hparams.latent_dim)
    
    def forward(self, x_cont):
        x = x_cont
        h = self.layers(x)
        mu_ = self.mu(h)
        logvar_ = self.logvar(h)
        return mu_, logvar_, x  # we return the concatenated input vector for use in loss fn
    

class Decoder(nn.Module):
    '''
    The decoder part of our VAE. Takes a latent vector (sampled from the distribution learned by the 
    encoder) and converts it back to a reconstructed data sample.
    '''
    def __init__(self, hparams):
        super().__init__()
#         self.final_activ = hparams.final_activ
        hidden_dims = [hparams.latent_dim] + [int(s) for s in reversed(hparams.layer_sizes.split(','))]
        out_dim = sum(emb_size for _, emb_size in hparams.embedding_sizes) + len(hparams.cont_vars)
        bn = hparams.batch_norm
        self.layers = nn.Sequential(
            *[Layer(hidden_dims[i], hidden_dims[i + 1], bn) for i in range(len(hidden_dims) - 1)],
        )
        self.reconstructed = nn.Linear(hidden_dims[-1], out_dim)
        
    def forward(self, z):
        h = self.layers(z)
        recon = self.reconstructed(h)
        return recon

In [48]:
class VAE(pl.LightningModule):
    def __init__(self, hparams):
        super().__init__()
        if isinstance(hparams, dict):
            hparams = Namespace(**hparams)
        self.hparams = hparams
        self.encoder = Encoder(hparams)
        self.decoder = Decoder(hparams)
        self.stdev = hparams.stdev
        self.kld_beta = hparams.kld_beta
        self.lr = hparams.lr
        self.wd = hparams.weight_decay
        
    def reparameterize(self, mu, logvar):
        '''
        The reparameterisation trick allows us to backpropagate through the encoder.
        '''
        if self.training:
            std = torch.exp(0.5 * logvar)
            eps = torch.randn_like(std) * self.stdev
            return eps * std + mu
        else:
            return mu
        
    def forward(self, batch):
        x_cont = batch
        #assert x_cat.dtype == torch.int64
        mu, logvar, x = self.encoder(x_cont)
        z = self.reparameterize(mu, logvar)
        recon = self.decoder(z)
        return recon, mu, logvar, x
        
    def loss_function(self, obs, recon, mu, logvar):
#         recon_loss = F.mse_loss(recon, obs, reduction='sum')
        recon_loss = F.smooth_l1_loss(recon, obs, reduction='sum')
        kld = -0.5 * torch.sum(1 + logvar - mu ** 2 - logvar.exp())
        return recon_loss, kld
                               
    def training_step(self, batch, batch_idx):
        recon, mu, logvar, x = self.forward(batch)
        # The loss function compares the concatenated input vector including
        # embeddings to the reconstructed vector
        recon_loss, kld = self.loss_function(x, recon, mu, logvar)
        loss = recon_loss + self.kld_beta * kld

        self.log('total_tr_loss', loss.mean(dim=0), on_step=True, prog_bar=True, 
                 logger=True)
        self.log('recon_loss', recon_loss.mean(dim=0), on_step=True, prog_bar=True, 
                 logger=True)
        self.log('kld', kld.mean(dim=0), on_step=True, prog_bar=True, logger=True)
        return loss
    
    def test_step(self, batch, batch_idx):
        recon, mu, logvar, x = self.forward(batch)
        recon_loss, kld = self.loss_function(x, recon, mu, logvar)
        loss = recon_loss + self.kld_beta * kld
        self.log('test_loss', loss)
        return loss
        
    def configure_optimizers(self):
        opt = torch.optim.AdamW(self.parameters(), lr=self.lr, 
                                weight_decay=self.hparams.weight_decay, 
                                eps=1e-4)
        sch = torch.optim.lr_scheduler.MultiplicativeLR(opt, lr_lambda=lambda epoch: 0.95)
        return opt
    
    def train_dataloader(self):
        dataset = TSDataset('train', cont_vars=self.hparams.cont_vars, 
                            lbl_as_feat=True)
        return DataLoader(dataset, batch_size=self.hparams.batch_size, num_workers=0)
    
    def test_dataloader(self):
        dataset = TSDataset('test', cont_vars=self.hparams.cont_vars,
                            lbl_as_feat=True)
        return DataLoader(dataset, batch_size=self.hparams.batch_size, num_workers=0)   

In [49]:

hparams = OrderedDict(
    run='img_vars_embsz8_latsz32_bsz64_lay64-128-64_ep50',
    cont_vars = cont_vars,
    embedding_sizes = [(0, 14)],
    latent_dim = 32,
    layer_sizes = '64,128,64',
    batch_norm = True,
    stdev = 0.01,
    kld_beta = 0.2,
    lr = 0.01,
    weight_decay = 1e-5,
    batch_size = 64,
    epochs = 50,
)

In [50]:
from argparse import Namespace
# Simulate a Namespace with the defined hyperparameters
hparams = Namespace(**hparams)

In [51]:
torch.cuda.empty_cache()
model = VAE(hparams)
logger = WandbLogger(name=hparams.run, project='sn_ad', version=hparams.run)

In [52]:
ckpt_callback = pl.callbacks.ModelCheckpoint(filepath='C:/Users/Ni Ying/Desktop/data/sn_ad_vae_weights')
# Replace argument logger by None if you don't have a WandB account (and don't want to create one)
trainer = pl.Trainer(gpus=None,logger=logger, max_epochs=hparams.epochs, 
                     auto_lr_find=True, benchmark=True, callbacks=[ckpt_callback],
                     gradient_clip_val=1.
                     )
trainer.tune(model)

GPU available: False, used: False
TPU available: None, using: 0 TPU cores

  | Name    | Type    | Params
------------------------------------
0 | encoder | Encoder | 22.7 K
1 | decoder | Decoder | 20.6 K
------------------------------------
43.4 K    Trainable params
0         Non-trainable params
43.4 K    Total params


HBox(children=(FloatProgress(value=0.0, description='Finding best initial lr', style=ProgressStyle(description…

RuntimeError: mat1 and mat2 shapes cannot be multiplied (64x8 and 22x64)