Import

In [2]:
import os
import numpy as np
import random
import pandas as pd
from numba import jit

import os 
import warnings
warnings.simplefilter('ignore')

import matplotlib.pyplot as plt
import seaborn as sns

from collections import defaultdict


Paths

In [3]:
# Input pickles; all paths are relative to the directory the notebook runs from.

# Feature tables for influencers and for targets.
features_influencers_path = "data/weibo_features/features_influencers_all_200_twitter.pkl"
features_targets_path = "data/weibo_features/features_targets_all_200_twitter.pkl"
# (u, v) label table and (u, v) edge table.
labels_path = "data/weibo_preprocessed/labels2_all_200.pkl"
edges_path = "data/weibo_preprocessed/edges2_all_200.pkl"
# Embeddings that are loaded together with the labels.
influencers_embeddings_path = "data/weibo_preprocessed/influencers_embeddings.pkl"
targets_embeddings_path = "data/weibo_preprocessed/target_embeddings.pkl"
# Alternative inf2vec embeddings; only referenced from commented-out code.
influencers_inf2vec_path = "data/weibo_preprocessed/influencers_inf2vec.pkl"
targets_inf2vec_path = "data/weibo_preprocessed/target_inf2vec.pkl"

Features

In [4]:
# Load both feature tables, then report their column dtypes and shapes.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
features_influencers = pd.read_pickle(features_influencers_path)
features_targets = pd.read_pickle(features_targets_path)

print(f"features_influencers : \n{features_influencers.dtypes!s}")
print(f"shape : {features_influencers.shape!s}")
print(f"features_targets : \n{features_targets.dtypes!s}")
print(f"shape : {features_targets.shape!s}")

features_influencers : 
d_out       float64
pagerank    float64
#2r         float64
dtype: object
shape : (7907, 3)
features_targets : 
d_in        float64
pagerank    float64
dtype: object
shape : (1160, 2)


Labels

In [6]:
# Load the label table and key it by the (u, v) pair so that a single pair can
# be fetched with labels.loc[(u, v)].
labels = pd.read_pickle(labels_path)
labels.index = pd.MultiIndex.from_tuples(zip(labels['u'], labels['v']))
# Sorting the index makes lookups faster; exact duplicate rows are then dropped.
labels = labels.sort_index().drop_duplicates()

print(f"labels : \n{labels.head(2).to_markdown()}")
print(f"shape : {labels.shape!s}")

# Embeddings for both sides of a pair.
# To use the inf2vec variants instead, read influencers_inf2vec_path and
# targets_inf2vec_path here.
influencers_embeddings = pd.read_pickle(influencers_embeddings_path)
targets_embeddings = pd.read_pickle(targets_embeddings_path)


labels : 
|                      |      u |                v |   BT |         JI |         LP |
|:---------------------|-------:|-----------------:|-----:|-----------:|-----------:|
| (101713, 101713)     | 101713 | 101713           |    1 | 0.0136364  | 0.0138249  |
| (104881, 1706372734) | 104881 |      1.70637e+09 |    1 | 0.00340136 | 0.00341297 |
shape : (111575, 5)


Preprocessing labels

In [7]:
# Keep only the labels whose influencer u and whose target v each have BOTH an
# embedding row and a feature row.
#
# d_influencers / d_targets map an id to the number of sources (embeddings,
# features) that contain it. Each index is de-duplicated before counting:
# otherwise an id repeated inside a single index would reach a count of 2 and
# be kept even though the other source does not contain it.

d_influencers = defaultdict(lambda : 0)
for source_index in (influencers_embeddings.index, features_influencers.index) :
    for i in source_index.unique() : d_influencers[i] += 1

d_targets = defaultdict(lambda : 0)
for source_index in (targets_embeddings.index, features_targets.index) :
    for i in source_index.unique() : d_targets[i] += 1

# .get() is used instead of d[x] so that ids unknown to both sources are not
# inserted into the defaultdicts as a side effect of the lookup.
has_both_u = labels.u.apply(lambda x : d_influencers.get(x, 0) >= 2).astype(bool)
labels = labels[has_both_u]
has_both_v = labels.v.apply(lambda x : d_targets.get(x, 0) >= 2).astype(bool)
labels = labels[has_both_v]

# Distinct influencer / target ids that still appear in at least one label.
influencers = list(labels.groupby('u').count().index)
targets = list(labels.groupby('v').count().index)

print("labels : \n" + labels.head(2).to_markdown())
print("shape : " + str(labels.shape))
print(f'influencers : {len(influencers)}')
print(f'targets : {len(targets)}')

labels : 
|                      |      u |                v |   BT |         JI |         LP |
|:---------------------|-------:|-----------------:|-----:|-----------:|-----------:|
| (101713, 101713)     | 101713 | 101713           |    1 | 0.0136364  | 0.0138249  |
| (104881, 1706372734) | 104881 |      1.70637e+09 |    1 | 0.00340136 | 0.00341297 |
shape : (99436, 5)
influencers : 5977
targets : 1134


Create feature vector

In [8]:
# Build an edge lookup table: d_edges[(u, v)] is 1 for every (u, v) row of the
# edge file, and the defaultdict yields 0 for any other pair.
edges = pd.read_pickle(edges_path)

d_edges = defaultdict(lambda : 0)
d_edges.update(dict.fromkeys(zip(edges.u, edges.v), 1))

# The DataFrame is not needed once the lookup table exists.
del edges

def feature_vector(u,v, fu=None, fv=None) : 
    """
    Build the flat feature vector of the pair (u, v).

    The vector is the concatenation of
    - the influencer features of u,
    - the target features of v,
    - the topology link: 1 if (u, v) is in d_edges, else 0.

    Parameters
    ----------
    u, v : ids of the influencer and of the target.
    fu : optional pre-fetched influencer features of u; looked up in
        features_influencers when None.
    fv : optional pre-fetched target features of v; looked up in
        features_targets when None.

    Returns
    -------
    1-D numpy array (np.concatenate with axis=None flattens its inputs).
    """
    # Each side is looked up independently, so a caller may supply only one.
    if fu is None :
        fu = features_influencers.loc[u]
    if fv is None :
        fv = features_targets.loc[v]

    # .get() instead of d_edges[(u, v)]: indexing a defaultdict inserts the
    # missing key, which would add one entry per non-edge pair ever queried.
    return np.concatenate([fu, fv, d_edges.get((u,v), 0)], axis = None)


Create instance

In [9]:
# Length of the vector returned by feature_vector.
N_FEATURES = 6

# Name of the label column that fill_y reads.
PROB_TYPE = 'JI'

# Membership table of labelled pairs: True for each (u, v) row of labels,
# False (defaultdict factory) for any other pair.
d_labels = defaultdict(lambda : False)
d_labels.update(dict.fromkeys(zip(labels.u, labels.v), True))

def fill_y(u,v) : 
    """
    Return the PROB_TYPE label of the pair (u, v), or 0 when the pair has no label.
    """
    # .get() instead of d_labels[(u, v)]: indexing a defaultdict inserts the
    # missing key, which would add one entry per unlabelled pair ever queried.
    if d_labels.get((u,v), False) : 
        return labels.loc[(u,v)][PROB_TYPE]
    return 0

@jit
def softmax(x):
    """
    Softmax of a NumPy array, normalised over all of its elements.

    The maximum is subtracted before exponentiating. This leaves the result
    mathematically unchanged but keeps np.exp from overflowing to inf (and the
    ratio from becoming nan) for large inputs.
    """
    # np.max raises on an empty array; exp of an empty array is the same empty
    # result the unshifted formula would give.
    if x.size == 0 :
        return np.exp(x)
    exps = np.exp(x - np.max(x))
    return exps / np.sum(exps)

#@jit
def create_XY(sampled_influencers, sampled_targets) :
    """
    Build one instance array from a set of influencers and a set of targets.

    Parameters
    ----------
    sampled_influencers : sequence of influencer ids; used with .loc on
        features_influencers and influencers_embeddings.
    sampled_targets : sequence of target ids; used with .loc on
        features_targets and targets_embeddings.

    Returns
    -------
    Array of shape (nI, nT, N_FEATURES + 2) where, for influencer i and target j,
    - [i, j, :N_FEATURES]    is feature_vector(u_i, v_j),
    - [i, j, N_FEATURES]     is fill_y(u_i, v_j),
    - [i, j, N_FEATURES + 1] is the dot product of the two embeddings.
    """
    nI = len(sampled_influencers)
    nT = len(sampled_targets)

    # One .loc call per table instead of one per (u, v) pair.
    fI = np.array(features_influencers.loc[sampled_influencers])
    fT = np.array(features_targets.loc[sampled_targets])
    eI = np.array(influencers_embeddings.loc[sampled_influencers])
    eT = np.array(targets_embeddings.loc[sampled_targets])

    X = np.zeros((nI, nT, N_FEATURES))
    Y = np.zeros((nI, nT, 1))
    Y_emb = np.zeros((nI, nT, 1))

    for i in range(nI):
        u = sampled_influencers[i]
        for j in range(nT):
            v = sampled_targets[j]
            X[i, j, :] = feature_vector(u, v, fI[i], fT[j])
            Y[i, j, 0] = fill_y(u, v)

    # All pairwise embedding dot products in one matrix product; entry (i, j) is
    # dot(eI[i], eT[j]). Assigning into the zeros array keeps the float64 dtype.
    Y_emb[:, :, 0] = eI @ eT.T

    return np.concatenate((X, Y, Y_emb), axis = 2)

# def fill_with_positive2(XY, p) :
#     """
#     input : XY -> output of createXY, p -> proportion of positive examples needed in XY
#     output : XY with the positive examples added
#     """
#     nI, nT, _ = XY.shape
#     n_pos = int(p * nI * nT)

#     labels_to_add = labels.sample(n = n_pos)
    
#     for l in range(n_pos) :
    
#         i = np.random.randint(0,nI)
#         t = np.random.randint(0, nT)
    
#         label = labels_to_add.iloc[l]
#         f = feature_vector(label.u, label.v)
#         #f = np.concatenate([features_influencers.loc[label.u], features_targets.loc[label.v]], axis = None)

#         p_emb = np.dot(influencers_embeddings.loc[label.u], targets_embeddings.loc[label.v])

#         XY[i, t, :] = np.concatenate([f, label[PROB_TYPE], p_emb], axis=None)
    
#     return XY

def fill_with_positive(XY, p, sampled_influencers) :
    """
    Write labelled (positive) pairs into the first target slots of each row of XY.

    Parameters
    ----------
    XY : array of shape (nI, nT, n_channels), as returned by create_XY.
        It is modified in place.
    p : proportion of the nT slots of a row that may be overwritten.
    sampled_influencers : influencer ids; sampled_influencers[i] is the
        influencer of row i of XY.

    Returns
    -------
    The same XY array. For each row i, the first K slots are overwritten with
    the features, the PROB_TYPE label and the embedding dot product of the
    first K labelled pairs of that influencer, where
    K = min(number of labels of u, int(p * nT), nT).
    The previous content of those slots is lost.
    """
    nI, nT, _ = XY.shape
    # Capped at nT so that p > 1 cannot index past the last target slot.
    max_positives = min(nT, int(p * nT))

    for i in range(nI) :
        u = sampled_influencers[i]
        pos_labels = labels[labels['u'] == u] # labelled pairs whose influencer is u
        K = min(pos_labels.shape[0], max_positives)
        eI = influencers_embeddings.loc[u]

        for k in range(K):
            label = pos_labels.iloc[k]
            f = feature_vector(u, label.v)
            p_emb = np.dot(eI, targets_embeddings.loc[label.v])

            XY[i, k, :] = np.concatenate([f, label[PROB_TYPE], p_emb], axis=None)

    return XY


Only considering best P% influencers

In [10]:
# Fraction of influencers to keep.
PROP_I = 0.2

# Rank influencers by their number of labels (count of 'v' per 'u', largest
# first) and keep the ids of the top PROP_I share.
influencers = labels.groupby('u').count().sort_values('v', ascending=False)
nI = len(influencers)
n = int(PROP_I * nI)
influencers = influencers.head(n).index

Generate all instances

In [14]:
# Display the per-column maximum of the target features.
features_targets.max()

d_in         0.958646
pagerank    14.351554
dtype: float64

In [12]:
# Generate N_INSTANCES instance files, each built from a random sample of
# N_INFLUENCERS influencers and N_TARGETS targets. Files that already exist
# are left untouched, so the cell can be re-run to resume an interrupted run.
# NOTE(review): no RNG seed is set in this cell, so the samples differ between runs.
path = 'decision_focused_learning_gpu/instances_weibo/06-23-all_200_twitter_500/'
# makedirs also creates missing parent directories, which os.mkdir does not.
os.makedirs(path, exist_ok=True)

N_INSTANCES = 20
N_INFLUENCERS = 500
N_TARGETS = 500

# Report progress about every 10% of the run; max(1, ...) keeps the modulo
# below valid when N_INSTANCES < 10.
progress_step = max(1, N_INSTANCES // 10)

for instance in range(N_INSTANCES) : 

    if instance % progress_step == 0 : print(f"Saving instance {instance}/{N_INSTANCES}...")

    instance_file = path + f'{instance}.npz'
    if os.path.exists(instance_file) :
        print("Instance already created")
    else :
        sampled_influencers = np.random.choice(influencers, N_INFLUENCERS, p = None, replace=False)
        sampled_targets = np.random.choice(targets, N_TARGETS, p = None, replace=False)

        XY = create_XY(sampled_influencers, sampled_targets)
        # Positive pairs could be injected here with
        # fill_with_positive(XY, PROP_POS, sampled_influencers); currently disabled.

        # Passed positionally, so the array is stored under the key 'arr_0'.
        np.savez(instance_file, XY)
        del XY

print("End")

Saving instance 0/20...
Saving instance 2/20...
Saving instance 4/20...
Saving instance 6/20...
Saving instance 8/20...
Saving instance 10/20...
Saving instance 12/20...
Saving instance 14/20...
Saving instance 16/20...
Saving instance 18/20...
End
