In [303]:
import os
import torch
import warnings
import numpy as np
import pandas as pd
from enum import Enum
from torch import nn
from torch.autograd import Variable
from torch.utils.data import Dataset
import torchvision.transforms as transforms
warnings.simplefilter('ignore')

In [138]:
### load title name entity tags and short hand token
IMPROT_FILE = 'validData_Reduced.csv'
ETET_GRAPHS = 'TemporalGraphData\ETETGraphs'
STANDARD_TITLE_FILES = ['tech_jobs.csv', 'non_tech_jobs.csv']
IMPORT_PATH = ''
START_YR, END_YR = 2009, 2022

In [150]:
### import stuffs
z_raw = np.load(os.path.join(IMPORT_PATH, 'title_emb_matrix.npy'))
path = os.path.join(IMPORT_PATH, ETET_GRAPHS)
df_etet = pd.read_csv(os.path.join(path, 'AggregatedETET.csv'))
df_etet.head(3)

Unnamed: 0,Year,Focal,Reference,Start_Year,Job_Index,TargetTitle
0,2009,E10008,E19463,2008,1,T100
1,2009,E10008,E5490,2008,1,T101
2,2009,E10008,E14323,2008,1,T111


In [140]:
### data loader
class CustomDataSet(Dataset):

    def __init__(self, root_path, x_file, y_file, title_emb_dim=384, use_cuda=False):
        self.dataset = torch.load(os.path.join(root_path, x_file),)
        self.labels = torch.load(os.path.join(root_path, y_file))
        self.title_emb_dim = title_emb_dim
        if use_cuda:
            self.dataset = self.dataset.cuda()
            self.labels = self.labels.cuda()
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        x, y = self.dataset[idx], self.labels[idx]
        emp_id = x[:, 0]
        num_exp = x[:, 1:2]
        end_yr = x[:, 2:3]
        job_idx = x[:, 3:4]
        dur = x[:, 4:5]
        div = x[:, 5:6]
        t_emb = x[:, 6:]
        return emp_id[0], num_exp[0], end_yr, job_idx, dur, div, t_emb, y[:, 0]

In [156]:
### test
import_path = os.path.join(IMPORT_PATH, 'TemporalGraphData\TrainTestData')
title_emb_dim = z_raw.shape[1]
train_dataset = CustomDataSet(import_path, 'TRAIN_X.pt', 'TRAIN_Y.pt', title_emb_dim)
train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=1000, shuffle=True)

In [387]:
z_raw_torch = torch.FloatTensor(z_raw)


In [247]:
# get i-th batch
batch_id = 1
for i, (emp_id, num_exp, end_yr, job_idx, dur, div, t_emb, label) in enumerate(train_dataloader):
    if i == batch_id:
        num_exp_sq = num_exp.squeeze(-1).int()
        end_yr_sq = end_yr.squeeze().int().numpy()
        break

In [158]:
class RNN_CELL_TYPE(Enum):
    PLAIN = 1 # standard
    TIME_AWARED = 2 # time-aware

class LSTMCell(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(LSTMCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        # linear functionals
        self.hh = nn.Linear(hidden_size, hidden_size * 4, bias=True)
        self.xh = nn.Linear(input_size, hidden_size * 4, bias=False)
        # reset
        self.reset_parameters()

    def reset_parameters(self):
        std = 1.0 / np.sqrt(self.hidden_size)
        for w in self.parameters():
            w.data.uniform_(-std, std)

    def forward(self, x, hidden=None):
        # Input Dimensions:
        # x:        (batch_size, input_size)
        # hidden:   (batch_size, hidden_size)
        # unfold hidden (hidden + memory)
        if hidden is None:
            hidden = Variable(x.new_zeros(x.size(0), self.hidden_size))
            hidden = (hidden, hidden) 
        else:
            hidden, memory = hidden

        # execute all linear transformation
        gates = self.xh(x) + self.hh(hidden)

        # Get gates (i_t, f_t, g_t, o_t)
        input_gate, forget_gate, cell_gate, output_gate = gates.chunk(4, 1)

        # execute nonlinear transformation
        i_t = torch.sigmoid(input_gate)
        f_t = torch.sigmoid(forget_gate)
        o_t = torch.sigmoid(output_gate)
        candidate_t = torch.tanh(cell_gate)

        # memory
        memory_ = memory * f_t + i_t * candidate_t

        # hidden
        hidden_ = o_t * torch.tanh(memory_)

        # Output Dimension:
        # hidden_:        (batch_size, hidden_size)
        # memory_:        (batch_size, hidden_size)
        return (hidden_, memory_)

In [159]:
'''
ENCODER: LSTM
'''
class LSTM(nn.Module):
    CELL_TYPE = {
        RNN_CELL_TYPE.PLAIN : LSTMCell,
    }

    def __init__(self, input_size, hidden_size, num_layers, cell_type=RNN_CELL_TYPE.PLAIN):
        super(LSTM, self).__init__()
        ### TODO: EXTEND IT TO MULTIPLE LAYER (the pipeline is in place)
        if num_layers > 1:
            raise Exception('Only supports single layer LSTM')
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.cell_type = cell_type
        # allow multiple layers
        self.rnn_cell_list = nn.ModuleList()
        self.rnn_cell_list.append(LSTM.CELL_TYPE[self.cell_type](self.input_size, self.hidden_size))
        for _ in range(1, self.num_layers):
            self.rnn_cell_list.append(LSTM.CELL_TYPE[self.cell_type](self.hidden_size, self.hidden_size))

    def forward(self, input, hidden=None):
        
        # Input Dimension: (batch_size, seqence_size , input_size)
        input_ = input[0]
        seq_len = input[1]
        max_len = seq_len.max().int()

        # in case there is no initial inputs
        if hidden is None:
            if torch.cuda.is_available():
                h0 = Variable(torch.zeros(self.num_layers, input_.size(0), self.hidden_size).cuda())
            else:
                h0 = Variable(torch.zeros(self.num_layers, input_.size(0), self.hidden_size))
        else:
             h0 = hidden

        # collectors
        outs = torch.zeros(max_len, input_.shape[0], self.hidden_size)
        # unrolling
        hx = (h0[0, :, :], h0[0, :, :]) # h and c
        for t in range(max_len):
            hidden_l = self.rnn_cell_list[0](input_[:, t, :],  hx) if self.cell_type == RNN_CELL_TYPE.PLAIN \
                else self.rnn_cell_list[0]((input_[:, t, :], input[2][:, t, :]), hx)
            # masking technique
            mask = (t < seq_len).float().unsqueeze(1).expand_as(hidden_l[0])
            h_next = hidden_l[0]*mask + hx[0]*(1 - mask)
            c_next = hidden_l[1]*mask + hx[1]*(1 - mask)
            hx_next = (h_next, c_next)
            outs[t] = h_next
            hx = hx_next
            
        return outs.permute(1, 0, 2)

In [160]:
output_size = 128
model = LSTM(title_emb_dim, output_size, 1)

In [161]:
outputs = model((t_emb, num_exp_sq))

In [188]:
df_etet

Unnamed: 0,Year,Focal,Reference,Start_Year,Job_Index,TargetTitle
0,2009,E10008,E19463,2008,1,T100
1,2009,E10008,E5490,2008,1,T101
2,2009,E10008,E14323,2008,1,T111
3,2009,E10008,E5638,2008,1,T118
4,2009,E10008,E6113,2008,1,T127
...,...,...,...,...,...,...
516562,2011,E29834,E10891,2009,1,T499
516563,2011,E29834,E2881,2010,1,T50
516564,2011,E29834,E9307,2008,1,T504
516565,2011,E29834,E14200,2008,1,T54


In [304]:
helper_dict = {'E' + str(int(id)) : idx for idx, id in enumerate(emp_id)}
relevant = ['E' + str(id) for id in emp_id.int().numpy()]
df_helper = df_etet[(df_etet.Focal.isin(relevant))&(df_etet.Reference.isin(relevant))]
df_helper['BatchPos'] = df_helper.Reference.apply(lambda x: helper_dict[x])
df_helper

Unnamed: 0,Year,Focal,Reference,Start_Year,Job_Index,TargetTitle,BatchPos
349,2009,E10128,E10188,2008,1,T289,632
355,2009,E10128,E4516,2008,1,T32,324
585,2009,E10206,E10389,2008,1,T60,348
1660,2009,E10483,E10188,2008,1,T289,632
1666,2009,E10483,E4516,2008,1,T32,324
...,...,...,...,...,...,...,...
508718,2011,E29006,E23440,2008,1,T372,196
509812,2011,E29126,E10280,2010,1,T1,50
509821,2011,E29126,E12167,2008,1,T161,186
512877,2011,E29397,E3963,2010,1,T169,769


In [272]:
label.shape[0]

1000

In [None]:
df_helper.groupby(['Focal'])

In [299]:
df_helper[(df_helper.Focal == focal_id)].Year

Unnamed: 0,Year,Focal,Reference,Start_Year,Job_Index,TargetTitle,BatchPos
109618,2010,E13786,E11637,2009,2,T122,905
109619,2010,E13786,E2677,2008,1,T129,572
109621,2010,E13786,E6336,2009,2,T203,267
324050,2011,E13786,E16862,2010,2,T191,375
324054,2011,E13786,E11453,2010,2,T201,656
324065,2011,E13786,E11453,2009,1,T234,656
324070,2011,E13786,E14938,2010,2,T244,954
324097,2011,E13786,E20280,2009,1,T42,834
324100,2011,E13786,E22152,2010,1,T446,875
324110,2011,E13786,E14938,2008,1,T75,954


In [306]:
df_helper[(df_helper.Focal == focal_id)]

Unnamed: 0,Year,Focal,Reference,Start_Year,Job_Index,TargetTitle,BatchPos
109618,2010,E13786,E11637,2009,2,T122,905
109619,2010,E13786,E2677,2008,1,T129,572
109621,2010,E13786,E6336,2009,2,T203,267
324050,2011,E13786,E16862,2010,2,T191,375
324054,2011,E13786,E11453,2010,2,T201,656
324065,2011,E13786,E11453,2009,1,T234,656
324070,2011,E13786,E14938,2010,2,T244,954
324097,2011,E13786,E20280,2009,1,T42,834
324100,2011,E13786,E22152,2010,1,T446,875
324110,2011,E13786,E14938,2008,1,T75,954


In [319]:
end_yr_sq[i]

array([2009, 2010, 2011, 2012, 2016, 2017, 2020,    0,    0,    0])

array([], dtype=int64)

In [314]:
end_yr_sq[i].tolist().index([2010, 2011])

ValueError: [2010, 2011] is not in list

In [311]:
np.where(end_yr_sq[i] == 2010)

(array([1], dtype=int64),)

In [322]:
tmp_ref

Unnamed: 0,Year,Focal,Reference,Start_Year,Job_Index,TargetTitle,BatchPos
109618,2010,E13786,E11637,2009,2,T122,905
109619,2010,E13786,E2677,2008,1,T129,572
109621,2010,E13786,E6336,2009,2,T203,267
324050,2011,E13786,E16862,2010,2,T191,375
324054,2011,E13786,E11453,2010,2,T201,656
324065,2011,E13786,E11453,2009,1,T234,656
324070,2011,E13786,E14938,2010,2,T244,954
324097,2011,E13786,E20280,2009,1,T42,834
324100,2011,E13786,E22152,2010,1,T446,875
324110,2011,E13786,E14938,2008,1,T75,954


In [338]:
end_yr_sq[i]

array([2008, 2010, 2012, 2014,    0,    0,    0,    0,    0,    0])

In [339]:
jidx

tensor([0.])

In [341]:
for i in range(label.shape[0]):
    focal_id = 'E' + str(int(emp_id[i]))
    if focal_id == 'E13786':
        break
    tmp_ref = df_helper[(df_helper.Focal == focal_id)]
    for yr in end_yr_sq[i]:
        if yr == 0: break
        focal_cur_yr = tmp_ref[tmp_ref.Year == yr]
        if len(focal_cur_yr) == 0: continue
        ref_embs = outputs[focal_cur_yr.BatchPos.values, focal_cur_yr.Job_Index.values]
    
    # for yr in end_yr[i]:
        
    #     print(yr)
    #     break
    # break
    # tmp_ref = df_helper[df_helper.Focal == focal_id]
    # break
# focal_id

In [457]:
outputs.shape[1]

9

In [454]:
tmp_ref = df_helper[(df_helper.Focal == focal_id)]
graph_concat = torch.zeros(outputs.shape[1], 384)
for pos in range(outputs[i].shape[0]):
    yr = end_yr_sq[i][pos]
    if yr == 0:
        graph_emb.append(torch.zeros(384))
        continue
    focal_cur_yr = tmp_ref[tmp_ref.Year == yr]
    if len(focal_cur_yr) == 0:
        graph_emb.append(torch.zeros(384))
        continue
    ref_embs = outputs[focal_cur_yr.BatchPos.values, focal_cur_yr.Job_Index.values]
    focal_emb = outputs[i][pos]
    raw_s = torch.inner(focal_emb.unsqueeze(0), ref_embs)
    sim = (raw_s.exp() / raw_s.exp().sum())
    ref_t_embs = z_raw_torch[[int(each.split('T')[1]) for each in focal_cur_yr.TargetTitle]]
    graph_t_emb = torch.matmul(sim, ref_t_embs)
    graph_emb.append(graph_t_emb.squeeze())

In [449]:
outputs[i].shape

torch.Size([9, 128])

In [455]:
len(graph_emb)

9

In [443]:
torch.(graph_emb)

ValueError: only one element tensors can be converted to Python scalars

In [430]:
torch.concat([focal_emb, graph_t_emb], axis=0)

RuntimeError: Tensors must have same number of dimensions: got 1 and 2

In [431]:
focal_emb.shape

torch.Size([128])

In [432]:
graph_t_emb.shape

torch.Size([1, 384])

In [392]:
ref_t_embs.shape

torch.Size([3, 384])

In [361]:
focal_cur_yr

Unnamed: 0,Year,Focal,Reference,Start_Year,Job_Index,TargetTitle,BatchPos
109618,2010,E13786,E11637,2009,2,T122,905
109619,2010,E13786,E2677,2008,1,T129,572
109621,2010,E13786,E6336,2009,2,T203,267


In [360]:
ref_embs.shape

torch.Size([3, 128])

In [355]:
ref_embs.shape


torch.Size([8, 128])

In [330]:
for jidx in job_idx[i]:
    focal_cur_yr = tmp_ref[tmp_ref.Job_Index == jidx]
    if len(focal_cur_yr) == 0: continue
    ref_embs = outputs[focal_cur_yr.BatchPos.values, focal_cur_yr.Job_Index.values]
    break

In [298]:
ref_embs.shape

torch.Size([11, 128])

In [279]:
test_ref_emp_ids = [1, 2, 1]
test_tar_title_ids = [1, 1, 2]
test_data = np.array([
    [0, 1, 2, 3],
    [4, 5, 6, 7],
    [8, 9, 10, 11],
    [11, 12, 13, 14]    
])

In [280]:
test_data[test_ref_emp_ids, test_tar_title_ids]

array([5, 9, 6])

In [294]:
tt = df_helper.groupby('Focal')[['TargetTitle']].count().reset_index()
tt[tt.TargetTitle == 11]

Unnamed: 0,Focal,TargetTitle
65,E13786,11


In [266]:
emp_id[9]

tensor(23715.)

In [224]:
emp_id[-1]

tensor(8707.)

In [242]:
set()

{0, 2011, 2012, 2013, 2014, 2020}

In [256]:
helper_dict['E29397']

445

In [257]:
end_yr_sq[445]

array([2011, 2020, 2021,    0,    0,    0,    0,    0,    0,    0])

In [258]:
yr_set = set(end_yr_sq[445])
df_helper[(df_helper.Focal=='E29397')&(df_helper.Year.isin(yr_set))]

Unnamed: 0,Year,Focal,Reference,Start_Year,Job_Index,TargetTitle,BatchPos
512877,2011,E29397,E3963,2010,1,T169,769
512889,2011,E29397,E15766,2009,1,T64,553


In [253]:
df_helper

Unnamed: 0,Year,Focal,Reference,Start_Year,Job_Index,TargetTitle,BatchPos
587,2009,E10207,E9630,2009,1,T105,833
773,2009,E10255,E10216,2008,1,T179,416
874,2009,E10320,E6824,2008,1,T105,965
885,2009,E10320,E2553,2009,1,T250,139
3064,2009,E10852,E12325,2008,1,T127,972
...,...,...,...,...,...,...,...
511333,2011,E29230,E8922,2008,1,T220,608
512549,2011,E29354,E6824,2008,1,T105,965
512556,2011,E29354,E6460,2010,2,T135,408
512574,2011,E29354,E2553,2009,1,T250,139


In [210]:
tmp_ref

Unnamed: 0,Year,Focal,Reference,Start_Year,Job_Index,TargetTitle,BatchPos


In [201]:
tmp_ref

Unnamed: 0,Year,Focal,Reference,Start_Year,Job_Index,TargetTitle,BatchPos


In [186]:

for focal_id in df_helper.Focal.unique():
    tmp_ref = df_helper[df_helper.Focal == focal_id]
    ref_embs = outputs[tmp_ref.BatchPos.values, tmp_ref.Job_Index.values]

tmp_ref

Unnamed: 0,Year,Focal,Reference,Start_Year,Job_Index,TargetTitle,BatchPos
512549,2011,E29354,E6824,2008,1,T105,965
512556,2011,E29354,E6460,2010,2,T135,408
512574,2011,E29354,E2553,2009,1,T250,139
512589,2011,E29354,E16882,2011,1,T421,810


In [187]:
ref_embs

tensor([[-0.0379,  0.0240, -0.0364,  0.0098,  0.0219,  0.0020, -0.0192,  0.0147,
          0.0516, -0.0213, -0.0233,  0.0261, -0.0021, -0.0033, -0.0068, -0.0100,
          0.0020, -0.0012,  0.0344,  0.0472, -0.0120,  0.0240,  0.0180, -0.0213,
          0.0050, -0.0110,  0.0327,  0.0136, -0.0019, -0.0107,  0.0235, -0.0480,
         -0.0447, -0.0086,  0.0063,  0.0105, -0.0256,  0.0500,  0.0034,  0.0608,
         -0.0086, -0.0462, -0.0396, -0.0077,  0.0395,  0.0062,  0.0187, -0.0490,
          0.0110, -0.0200, -0.0228,  0.0020,  0.0319, -0.0365, -0.0190,  0.0151,
         -0.0112, -0.0254, -0.0056,  0.0158,  0.0043, -0.0199,  0.0220,  0.0413,
          0.0197,  0.0077, -0.0063,  0.0548,  0.0053,  0.0233, -0.0001, -0.0298,
         -0.0321, -0.0243,  0.0216, -0.0013, -0.0233,  0.0016, -0.0298, -0.0191,
          0.0207, -0.0079,  0.0002,  0.0027, -0.0274, -0.0126, -0.0087, -0.0103,
          0.0271,  0.0200,  0.0238,  0.0187,  0.0048, -0.0053, -0.0027, -0.0253,
          0.0166, -0.0083, -

In [181]:
tmp_ref

Year            2009
Reference      E9630
Start_Year      2009
Job_Index          1
TargetTitle     T105
BatchPos         833
Name: E10207, dtype: object

In [179]:

ref_embs.shape

torch.Size([4, 128])

In [154]:
for idx, row in df_helper.iterrows():
    print(idx)
    

E18510
E3546
E5131
E9546
E5131
E10520
E11161
E11889
E11971
E18809
E19953
E23949
E25753
E28955


In [152]:
# for i, (emp_id, num_exp, end_yr, dur, div, t_emb, label) in enumerate(train_dataloader):
for i in range(label.shape[0]):
    focal_id = 'E' + str(int(emp_id[i]))
    
    tmp_ref = df_helper.loc[focal_id]
    break

KeyError: 'E24619'

Unnamed: 0_level_0,Year,Reference,Start_Year,TargetTitle,BatchPos
Focal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
E13626,2009,E12512,2008,T191,73
E22178,2009,E12512,2008,T191,73
E13045,2010,E15118,2009,T191,79
E15106,2010,E11256,2009,T160,33
E15106,2010,E26760,2009,T3,19
...,...,...,...,...,...
E1030,2021,E4533,2019,T131,66
E11026,2021,E17947,2018,T156,69
E17070,2021,E4390,2020,T77,77
E19614,2021,E7757,2019,T73,76


In [135]:
tmp_ref

Unnamed: 0_level_0,Year,Reference,Start_Year,TargetTitle,BatchPos
Focal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
E26591,2015,E11622,2011,T25,65
E26591,2015,E13626,2013,T291,62


In [137]:
t_emb[65].shape

torch.Size([10, 384])

In [112]:
tmp_ref

Unnamed: 0,Year,Focal,Reference,Start_Year,TargetTitle
1942636,2015,E26591,E11622,2011,T25
1942647,2015,E26591,E13626,2013,T291
