In [3]:
import os
import numpy as np
import random
import pandas as pd
from numba import jit

import os 
import warnings
warnings.simplefilter('ignore')

import matplotlib.pyplot as plt
import seaborn as sns

from collections import defaultdict


In [1]:
# Pickled inputs read by the loading cells below (all opened with pd.read_pickle).

# Pre-processed interaction data: the edge list and the (u, v) label table.
edges_path = "data/weibo_preprocessed/edges2_100K_150.pkl"
labels_path = "data/weibo_preprocessed/labels2_100K_150.pkl"

# Feature tables, one for targets and one for influencers.
features_targets_path = "data/weibo_features/features_targets_100K_150_emb.pkl"
features_influencers_path = "data/weibo_features/features_influencers_100K_150_emb.pkl"

### Import of the features

In [4]:
# Read both pickled feature tables, then report each table's column dtypes and shape.
features_influencers = pd.read_pickle(features_influencers_path)
features_targets = pd.read_pickle(features_targets_path)

for _table_name, _table in (
    ("features_influencers", features_influencers),
    ("features_targets", features_targets),
):
    print(f"{_table_name} : \n{_table.dtypes}")
    print(f"shape : {_table.shape}")

features_influencers : 
0     float64
1     float64
2     float64
3     float64
4     float64
5     float64
6     float64
7     float64
8     float64
9     float64
10    float64
11    float64
12    float64
13    float64
14    float64
15    float64
16    float64
17    float64
18    float64
19    float64
20    float64
21    float64
22    float64
23    float64
24    float64
25    float64
26    float64
27    float64
28    float64
29    float64
30    float64
31    float64
32    float64
33    float64
34    float64
35    float64
36    float64
37    float64
38    float64
39    float64
40    float64
41    float64
42    float64
43    float64
44    float64
45    float64
46    float64
47    float64
48    float64
49    float64
dtype: object
shape : (6417, 50)
features_targets : 
0     float64
1     float64
2     float64
3     float64
4     float64
5     float64
6     float64
7     float64
8     float64
9     float64
10    float64
11    float64
12    float64
13    float64
14    float64
15    float64

### Import of labels 

In [5]:
# Read the label table and key every row by its (u, v) pair, which is what makes
# labels.loc[(u, v)] possible later on.
labels = pd.read_pickle(labels_path)
labels.index = pd.MultiIndex.from_tuples(zip(labels['u'], labels['v']))

# Sort the index (lookups are retrieved faster on a sorted index), then discard rows
# that are exact duplicates of an earlier row.
labels = labels.sort_index().drop_duplicates()

print(f"labels : \n{labels.head(2).to_markdown()}")
print(f"shape : {labels.shape}")

labels : 
|                  |      u |      v |   BT |        JI |        LP |
|:-----------------|-------:|-------:|-----:|----------:|----------:|
| (41499, 41499)   |  41499 |  41499 |    1 | 0.0277778 | 0.0285714 |
| (101713, 101713) | 101713 | 101713 |    1 | 0.0135135 | 0.0136986 |
shape : (179932, 5)


Remove the labels for which we do not have the features (of either the influencer or the target).

In [6]:
# Ids that have a feature row; the instance-generation cell samples from these lists.
influencers = list(features_influencers.index)
targets = list(features_targets.index)

# Membership tables: True for an id with features, False (the default) otherwise.
# The filtering below no longer goes through them, because indexing a defaultdict with a
# missing id inserts that id. They are still defined so that any other cell reading
# d_influencers / d_targets keeps working.
d_influencers = defaultdict(bool, dict.fromkeys(influencers, True))
d_targets = defaultdict(bool, dict.fromkeys(targets, True))

# Keep only the labels whose influencer `u` and target `v` both have features.
# One vectorised mask replaces the two row-wise apply() passes followed by drop-by-index;
# the kept rows are the same because each row's index label is its own (u, v) pair.
_has_features = labels.u.isin(influencers) & labels.v.isin(targets)
labels = labels[_has_features]

print("labels : \n" + labels.head(2).to_markdown())
print("shape : " + str(labels.shape))

labels : 
|                      |      u |                v |   BT |         JI |         LP |
|:---------------------|-------:|-----------------:|-----:|-----------:|-----------:|
| (101713, 101713)     | 101713 | 101713           |    1 | 0.0135135  | 0.0136986  |
| (200880, 1840240511) | 200880 |      1.84024e+09 |    1 | 0.00917431 | 0.00925926 |
shape : (164066, 5)


Create feature vector

In [7]:
# Turn the pickled edge list into a lookup table: 1 for every (u, v) pair listed as an
# edge, and 0 (supplied by the default factory `f`) for any other pair.
edges = pd.read_pickle(edges_path)

def f(): return 0

d_edges = defaultdict(f)
d_edges.update(dict.fromkeys(zip(edges.u, edges.v), 1))

# The edge table itself is not needed once the lookup exists.
del edges

def feature_vector(u, v):
    """
    Build the flat feature vector describing the pair (u, v).

    The result is the concatenation of
    - Influencers features : the row of `features_influencers` for `u`
    - Target features : the row of `features_targets` for `v`
    - Topology link : the value stored in `d_edges` for (u, v), or 0 when there is none

    Raises KeyError (from `.loc`) when `u` or `v` has no feature row.
    """
    fu = features_influencers.loc[u]
    fv = features_targets.loc[v]

    # Read with .get() instead of d_edges[(u, v)]: indexing a defaultdict with a missing
    # key inserts that key, so every non-edge pair ever queried would stay in d_edges.
    link = d_edges.get((u, v), 0)

    return np.concatenate([fu, fv, link], axis=None)



Create instance

In [8]:

# Name of the `labels` column used as the target value.
PROB_TYPE = 'LP'

# Length of the per-pair feature vector allocated by create_XY.
N_FEATURES = 100

# Lookup table: True for every (u, v) pair present in `labels`, False (the default) otherwise.
d_labels = defaultdict(lambda : False)
d_labels.update(dict.fromkeys(zip(labels.u, labels.v), True))

def fill_y(u, v):
    """
    Return the PROB_TYPE value stored in `labels` for the pair (u, v).

    Returns 0 when (u, v) is not registered in `d_labels`.
    """
    # Read with .get() instead of d_labels[(u, v)]: indexing a defaultdict with a missing
    # key inserts that key, so every unlabelled pair queried would stay in d_labels.
    if not d_labels.get((u, v), False):
        return 0

    # Select row and column in a single .loc call instead of building the whole row first.
    return labels.loc[(u, v), PROB_TYPE]
# NOTE: the former @jit decorator was removed on purpose. The body relies on pandas `.loc`
# and on Python-level lookups (fill_y), which Numba cannot compile in nopython mode, so the
# decorator brought no speed-up and makes the call fail on Numba releases that no longer
# fall back to object mode.
def create_XY(sampled_influencers, sampled_targets):
    """
    from 2 sets of influencers and targets, creates features and labels according to the paper format

    input : sampled_influencers -> sequence of nI ids indexing `features_influencers`
            sampled_targets -> sequence of nT ids indexing `features_targets`
    output : float array of shape (nI, nT, N_FEATURES + 1); entry [i, j] holds the features of
             influencer i followed by the features of target j, then fill_y(influencer i, target j)

    Raises KeyError (from `.loc`) when a sampled id has no feature row.
    """
    nI = len(sampled_influencers)
    nT = len(sampled_targets)

    # Each feature row depends on one id only, so fetch it once per id (nI + nT lookups)
    # rather than once per pair inside the double loop.
    influencer_rows = [np.asarray(features_influencers.loc[u]).ravel() for u in sampled_influencers]
    target_rows = [np.asarray(features_targets.loc[v]).ravel() for v in sampled_targets]

    X = np.zeros((nI, nT, N_FEATURES))
    Y = np.zeros((nI, nT))

    for i in range(nI):
        for j in range(nT):
            X[i, j, :] = np.concatenate([influencer_rows[i], target_rows[j]])
            Y[i, j] = fill_y(sampled_influencers[i], sampled_targets[j])

    # Append the label as one extra trailing feature.
    return np.concatenate((X, Y.reshape(nI, nT, 1)), axis=2)

def fill_with_positive(XY, p):
    """
    Overwrite randomly chosen cells of XY with examples sampled from `labels`.

    input : XY -> output of create_XY, shape (nI, nT, n_features + 1); modified in place
            p -> proportion of positive examples needed in XY
    output : the same XY array, after int(p * nI * nT) sampled label rows were written into it

    Every sampled row goes to an independently drawn (influencer, target) cell, so two rows
    may land on the same cell and the proportion actually reached can be lower than p.
    `labels.sample` draws without replacement and raises ValueError when more rows are
    requested than `labels` contains.
    """
    nI, nT, _ = XY.shape
    n_pos = int(p * nI * nT)

    labels_to_add = labels.sample(n=n_pos)

    # Iterate over the needed columns directly instead of taking each row with .iloc:
    # a row Series mixes the integer ids with float scores, so the ids would be upcast to
    # float before being used as index keys.
    for u, v, prob in zip(labels_to_add.u, labels_to_add.v, labels_to_add[PROB_TYPE]):

        i = np.random.randint(0, nI)
        t = np.random.randint(0, nT)

        f = np.concatenate([features_influencers.loc[u], features_targets.loc[v]], axis=None)

        XY[i, t, :] = np.concatenate([f, prob], axis=None)

    return XY

Generate all instances

In [10]:
# Directory receiving one '<instance>.npz' file per generated instance.
path = 'decision_focused_learning_gpu/instances_weibo/embeddings/'

PROP_POS = 0.2        # proportion of positive examples passed to fill_with_positive
N_INSTANCES = 10      # number of instance files to generate
N_INFLUENCERS = 500   # influencers sampled per instance
N_TARGETS = 500       # targets sampled per instance

# np.savez does not create missing directories, so make sure the output folder exists.
os.makedirs(path, exist_ok=True)

# Report progress about ten times per run. max(1, ...) keeps the modulo below valid when
# N_INSTANCES < 10 (N_INSTANCES // 10 would be 0 and raise ZeroDivisionError).
_report_every = max(1, N_INSTANCES // 10)

for instance in range(N_INSTANCES):

    if instance % _report_every == 0:
        print(f"Saving instance {instance}/{N_INSTANCES}...")

    instance_file = path + f'{instance}.npz'

    # Instances already on disk are kept as they are, so an interrupted run can be resumed.
    if os.path.exists(instance_file):
        print("Instance already created")
        continue

    sampled_influencers = np.random.choice(influencers, N_INFLUENCERS, p=None, replace=False)
    sampled_targets = np.random.choice(targets, N_TARGETS, p=None, replace=False)

    XY = create_XY(sampled_influencers, sampled_targets)
    if PROP_POS > 0:
        XY = fill_with_positive(XY, PROP_POS)

    np.savez(instance_file, XY)
    # Release the array before the next instance is built.
    del XY

print("End")

Saving instance 0/10...
Saving instance 1/10...
Saving instance 2/10...
Saving instance 3/10...
Saving instance 4/10...
Saving instance 5/10...
Saving instance 6/10...
Saving instance 7/10...
Saving instance 8/10...
Saving instance 9/10...
End
