In [1]:
import os
import torch
import json
import warnings
import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from torch_geometric.datasets import KarateClub
from torch_geometric.utils import to_dense_adj, to_networkx
warnings.simplefilter('ignore')

In [4]:
### load title name entity tags and short hand token
IMPROT_FILE = 'validData_Reduced.csv'
IMPORT_PATH = ''

In [15]:
### import title/employee id dict
with open(os.path.join(IMPORT_PATH, "title_id_dict.json"), "r") as openfile:
    org_title_2_new_id = json.load(openfile)
with open(os.path.join(IMPORT_PATH, "employee_id_dict.json"), "r") as openfile:
    org_employee_id_2_new_id = json.load(openfile)

In [16]:
### load title embedding
np_title_emb = np.load(os.path.join(IMPORT_PATH, 'title_only_emb.npy'), allow_pickle=True)
title_2_emb = {org_title_2_new_id[t] : e for t, e, in np_title_emb if t in org_title_2_new_id}
z_raw = np.array([title_2_emb['T' + str(i)] for i in range(1, len(title_2_emb) + 1)])
np.save(os.path.join(IMPORT_PATH, 'title_emb_matrix.npy') ,z_raw)

In [17]:
### import golden source
df = pd.read_csv(os.path.join(IMPORT_PATH, IMPROT_FILE))
df['sy'] = df['Start_Date'].apply(lambda x: int(x.split('-')[0]))
df['ey'] = df['End_Date'].apply(lambda x: int(x.split('-')[0]))
df.drop_duplicates(subset=['ID', 'ey'], keep='last', inplace=True)
df.drop_duplicates(subset=['ID', 'sy'], keep='last', inplace=True)
df['EmployeeID'] = df.ID.apply(lambda x: org_employee_id_2_new_id[x])
df['TitleID'] = df.Title.apply(lambda x: org_title_2_new_id[x])
df.head(3)

Unnamed: 0,ID,Firm,Title,Start_Date,End_Date,Title_Group,EDU,Duration,DiversityScore,YrsOfExp,sy,ey,EmployeeID,TitleID
0,"ACwAAA--S20BZncfI96Y51rtML5hkDoodwbFi-c,NAME_S...",Amazon,software engineer,2019-12-01,2021-10-01,software developer,"computer science, computer engineering, electr...",2,0.0,1.0,2019,2021,E1,T1
1,"ACwAAA--zVABrailb54YsBsNL_ulhYNRVeX599Y,NAME_S...",Amazon,software engineer,2018-03-01,2019-03-01,software developer,"computer science, computer engineering, electr...",1,0.0,0.0,2018,2019,E2,T1
2,"ACwAAA--zVABrailb54YsBsNL_ulhYNRVeX599Y,NAME_S...",JT4,technical analyst,2020-01-01,2020-07-01,it specialist,information technology,1,0.693147,1.0,2020,2020,E2,T2


In [18]:
# select yr = 2018
yr = 2018
df_yr = df[(df['sy']==yr)|(df['ey']==yr)]
df_yr_ = df_yr.join(df_yr.groupby('ID')['TitleID'].shift(-1), rsuffix='1')
df_yr_.rename({'TitleID1': 'transferTo'}, axis=1, inplace=True)
df_valid = df_yr_[df_yr_['transferTo'].notnull()]

In [19]:
# aggregate flows
df_valid['pair'] = df_valid.apply(lambda x: ','.join(np.sort([x['TitleID'], x['transferTo']])), axis = 1)
df_agg = df_valid.groupby(["pair"]).size().reset_index(name="Frequency")
df_agg[['From', 'To']] = df_agg['pair'].str.split(',', 1, expand=True)
df_agg.drop(columns=['pair'], inplace=True)
df_agg = df_agg[['From', 'To', 'Frequency']]
df_agg = df_agg[df_agg['From'] != df_agg['To']]

In [241]:
### create directed graph
# raw title emb matrix
x = torch.tensor(z_raw, dtype=torch.float)
# edge index
from_v = df_agg['From'].apply(lambda x: int(x.split('T')[1])-1).values
to_v = df_agg['To'].apply(lambda x: int(x.split('T')[1])-1).values
edge_index = torch.tensor(np.array([from_v, to_v]), dtype=torch.long)
# edge attr
edge_attr = torch.tensor(df_agg.Frequency.values.reshape(len(df_agg), 1), dtype=torch.float)
# graph data
data = Data(x=x, edge_attr=edge_attr, edge_index=edge_index.contiguous())

In [246]:
torch.save(data, r'c:\temp\testDump.pt')

In [247]:
data_ = torch.load(r'c:\temp\testDump.pt')

In [236]:
# little module
class GCN(torch.nn.Module):
    
    def __init__(self, num_features, hidden_dim, out_dim):
        super().__init__()
        self.gcn_l1 = GCNConv(num_features, hidden_dim, cached=True)
        self.gcn_l2 = GCNConv(hidden_dim, out_dim, cached=True)
        
    def forward(self, x, edge_index, edge_weight):
        h = self.gcn_l1(x, edge_index, edge_weight).relu()
        o = self.gcn_l2(h, edge_index, edge_weight)
        return o

In [248]:
# simulate one iteration
model = GCN(x.shape[1], 128, 64)
print(model)
z = model(data.x, data.edge_index, data.edge_attr)
z_ = model(data_.x, data_.edge_index, data_.edge_attr)

GCN(
  (gcn_l1): GCNConv(384, 128)
  (gcn_l2): GCNConv(128, 64)
)


In [None]:
### MINI TEST

In [223]:
### little graph
x = torch.tensor([[-1, 0], [0, 0], [1, 0], [2, 0]], dtype=torch.float)
edge_index = torch.tensor([[0, 1, 2], [1, 2, 1]], dtype=torch.long)
edge_attr = torch.tensor([[3], [2], [3]], dtype=torch.float)
data = Data(x=x, edge_attr=edge_attr, edge_index=edge_index.contiguous())

In [224]:
### get adjacent matrix
to_dense_adj(data.edge_index)[0].numpy().astype(int)

array([[0, 1, 0],
       [0, 0, 1],
       [0, 1, 0]])

In [225]:
### visualization
# g = to_networkx(data, to_undirected=True)
# plt.figure(figsize=(5, 5))
# plt.axis('off')
# nx.draw_networkx(g)
# plt.show()

In [226]:
# play around with model
model = GCN(2, 3)
print(model)
z = model(data.x, data.edge_index, data.edge_attr)

GCN(
  (gcn): GCNConv(2, 3)
)


In [16]:
### load pyG data
def loadPyGeoData(folder, path, file_name, post_fix = []):
    path_ = os.path.join(folder, path)
    if len(post_fix) == 0:
        return torch.load(os.path.join(path_, file_name))
    data_lst = []
    for each in post_fix:
        data_lst.append(torch.load(os.path.join(path_, file_name.format(each))))
    return data_lst

In [17]:
dataLst = loadPyGeoData(
    IMPORT_PATH, "TemporalGraphData\TitleGraphs", 'pyg_title_{}.pt', list(range(2008, 2023)))

In [20]:
dataLst[0].x

tensor([[-0.0583,  0.0194,  0.0025,  ..., -0.0228,  0.0687,  0.0214],
        [-0.0804,  0.0226, -0.0661,  ..., -0.0447,  0.0282, -0.0392],
        [-0.0631, -0.0631, -0.0283,  ...,  0.0103, -0.0142,  0.0112],
        ...,
        [-0.0184,  0.0205, -0.0460,  ...,  0.0368, -0.0871, -0.0470],
        [-0.0899, -0.0571, -0.0158,  ..., -0.0083,  0.0043, -0.0303],
        [-0.1468, -0.0142,  0.0224,  ..., -0.0090, -0.0379, -0.0094]])