In [29]:
import os
import numpy as np
import pandas as pd
from numba import jit

import os 
import warnings
warnings.simplefilter('ignore')

import matplotlib.pyplot as plt
import seaborn as sns

from collections import defaultdict


In [2]:
# Input pickles; all three live in the same data directory.
_WEIBO_DIR = "../data/weibo/"
file_user = _WEIBO_DIR + "userProfile.pkl"
file_labels = _WEIBO_DIR + "labels_1000.pkl"
file_edges = _WEIBO_DIR + "edges_1000.pkl"

# Columns of the user-profile table, grouped by kind.
DATE_VAR = ["created_at"]
NUM_VAR = ["bi_followers_count", "followers_count", "friends_count", "statuses_count"]
CAT_VAR = ["city", "verified", "province", "verified_type", "gender"]

# Per-user columns kept as features.
FEATURES = ["followers_count", "friends_count", "statuses_count", "verified", "gender"]

N_INFLUENCERS = 100              # influencers sampled per instance
N_TARGETS = 1000                 # targets sampled per instance
N_FEATURES = len(FEATURES) * 2   # influencer features followed by target features
N_INSTANCES = 10                 # number of instances written to disk

PROB_TYPE = "LP"                 # column of `labels` used as the target value
PROP_POS = 0.1                   # proportion handed to fill_with_positive

In [5]:
# NOTE(security): pickle can execute arbitrary code on load; only read files from a trusted source.
df_user_orig = pd.read_pickle(file_user)
df_user_orig.index = df_user_orig['id'].astype(np.int64)

print("df_user_orig : \n" + df_user_orig.head(5).to_markdown())
print(f"shape : {df_user_orig.shape}\n" )

# Feature engineering.
# .copy() gives an independent frame: the column assignments below would otherwise be
# chained assignments on a slice of df_user_orig (SettingWithCopyWarning, hidden here by
# the global warnings filter).
df_user = df_user_orig[FEATURES].copy()

# Count features: clamp negatives to 0, compress with log(1 + x), then divide by a
# per-column scale.
for col, scale in (('followers_count', 10), ('friends_count', 8), ('statuses_count', 10)):
    df_user[col] = np.log(df_user[col].clip(lower=0) + 1) / scale

# 'verified' is compared against the *string* 'True'.
# NOTE(review): if the column holds real booleans this maps every row to 0 - confirm the dtype.
df_user['verified'] = df_user.verified.apply(lambda x : 1 if x =='True' else 0)
# Integer codes of the categorical gender column.
df_user['gender'] = df_user.gender.cat.codes

# Some user ids appear twice in the dataset (1681085 rows -> 1655678 rows);
# keep one row per id, taking the column-wise maximum.
df_user = df_user.groupby(df_user.index).max()


df_user_orig : 
|         id |         id |   bi_followers_count |   city | verified   |   followers_count |   province |   friends_count | gender   | created_at          |   verified_type |   statuses_count |
|-----------:|-----------:|---------------------:|-------:|:-----------|------------------:|-----------:|----------------:|:---------|:--------------------|----------------:|-----------------:|
| 1657151084 | 1657151084 |                    0 |      5 | False      |                33 |         31 |             162 | m        | 2009-10-29 22:20:41 |              -1 |                0 |
| 1657149732 | 1657149732 |                    0 |      9 | False      |                16 |         62 |              27 | m        | 2009-10-29 20:48:01 |              -1 |               11 |
| 1657148500 | 1657148500 |                    4 |     20 | False      |                79 |         31 |              29 | m        | 2009-12-12 18:10:42 |              -1 |               25 |
| 1657146942 |

Import the previously estimated ground-truth labels

In [7]:
# NOTE(security): pickle can execute arbitrary code on load; only read files from a trusted source.
labels = pd.read_pickle(file_labels)
labels = labels.astype({'u': np.int64, 'v': np.int64})

# Index by the (u, v) pair so that labels.loc[(u, v)] and `(u, v) in labels.index` work.
# The levels are built from raw arrays on purpose: they stay unnamed, so they cannot
# clash with the 'u' / 'v' columns in later groupby('u') / groupby('v') calls.
labels.index = pd.MultiIndex.from_arrays([labels['u'].to_numpy(), labels['v'].to_numpy()])
labels = labels.sort_index()  # lookups are faster on a sorted index
labels = labels.drop_duplicates()

# Keep only pairs whose two users have a row in df_user. This replaces the hard-coded
# drop of (1637712471, 279405), which was removed because user 1637712471 is missing
# from df_user; a fixed key raises KeyError as soon as the labels file changes.
has_user_features = labels['u'].isin(df_user.index) & labels['v'].isin(df_user.index)
labels = labels[has_user_features]


labels : 
|                      |               u |               v |   BT |   JI |   LP |
|:---------------------|----------------:|----------------:|-----:|-----:|-----:|
| (82768, 82768)       | 82768           | 82768           |  1   | 0.5  |  1   |
| (7747002, 7747002)   |     7.747e+06   |     7.747e+06   |  1   | 0.5  |  1   |
| (8060099, 8060099)   |     8.0601e+06  |     8.0601e+06  |  1   | 0.5  |  1   |
| (15058618, 3023198)  |     1.50586e+07 |     3.0232e+06  |  1   | 0.5  |  1   |
| (32821757, 32821757) |     3.28218e+07 |     3.28218e+07 |  0.5 | 0.25 |  0.5 |
shape : (147693, 5)



In [8]:
# Distinct ids found in column 'u' (influencers) and in column 'v' (targets) of `labels`,
# in the sorted order produced by groupby.
influencers = labels.groupby('u').size().index.tolist()
targets = labels.groupby('v').size().index.tolist()

Total influencers : 823
Total targets : 133678


In [12]:
def fill_y(u, v):
    """
    Look up the ground-truth value of the pair (u, v).

    Returns the PROB_TYPE column of `labels` for that pair, or 0 when the pair
    is not in the index of `labels`.
    """
    pair = (u, v)
    return labels.loc[pair][PROB_TYPE] if pair in labels.index else 0

def create_XY(sampled_influencers, sampled_targets) :
    """
    Build the feature/label tensor for every (influencer, target) pair.

    Parameters
    ----------
    sampled_influencers, sampled_targets : sequences of user ids present in `df_user`.

    Returns
    -------
    np.ndarray of shape (nI, nT, 2 * F + 1), where F is the number of columns of
    `df_user`. Entry [i, j] holds the features of influencer i, then the features
    of target j, then fill_y(influencer i, target j) in the last position.
    """
    nI = len(sampled_influencers)
    nT = len(sampled_targets)

    # Two lookups in total; the influencer lookup used to be repeated once per target.
    influencer_feats = df_user.loc[list(sampled_influencers)].to_numpy(dtype=float)  # (nI, F)
    target_feats = df_user.loc[list(sampled_targets)].to_numpy(dtype=float)          # (nT, F)
    n_feat = influencer_feats.shape[1]

    XY = np.empty((nI, nT, 2 * n_feat + 1))
    # Broadcasting repeats each influencer row across targets and each target row across influencers.
    XY[:, :, :n_feat] = influencer_feats[:, np.newaxis, :]
    XY[:, :, n_feat:2 * n_feat] = target_feats[np.newaxis, :, :]

    # Labels stay pair-by-pair: fill_y returns 0 for pairs absent from `labels`.
    for i, u in enumerate(sampled_influencers):
        for j, v in enumerate(sampled_targets):
            XY[i, j, -1] = fill_y(u, v)

    return XY


In [17]:

def fill_with_positive(XY, p) :
    """
    Overwrite a proportion `p` of the cells of XY with rows built from known labels.

    Parameters
    ----------
    XY : np.ndarray of shape (nI, nT, 2 * F + 1), as returned by create_XY.
         It is modified in place.
    p  : proportion of the nI * nT cells to overwrite.

    Returns
    -------
    The same XY array, with int(p * nI * nT) cells (capped at nI * nT) replaced by
    [features of u, features of v, labels[PROB_TYPE]] for randomly sampled rows of `labels`.

    Notes
    -----
    * The cells are distinct. Drawing them independently let two samples land on the
      same cell, so fewer than the requested number of rows ended up in XY.
    * Rows of `labels` are sampled without replacement, unless more rows are requested
      than `labels` contains.
    * An overwritten cell no longer describes the (influencer, target) pair of its position.
    * Uses the global numpy random state.
    """
    nI, nT, _ = XY.shape
    n_cells = nI * nT
    n_pos = min(int(p * n_cells), n_cells)
    if n_pos <= 0 :
        return XY

    labels_to_add = labels.sample(n = n_pos, replace = n_pos > len(labels))

    # Distinct flat positions, converted back to (influencer, target) coordinates.
    flat_cells = np.random.choice(n_cells, size = n_pos, replace = False)
    rows, cols = np.unravel_index(flat_cells, (nI, nT))

    # Look users up through the integer id columns (a single row taken with .iloc
    # would have its ids cast to float).
    fu = df_user.loc[labels_to_add['u'].to_numpy()].to_numpy()
    fv = df_user.loc[labels_to_add['v'].to_numpy()].to_numpy()
    y = labels_to_add[PROB_TYPE].to_numpy()

    XY[rows, cols, :] = np.column_stack((fu, fv, y))

    return XY


In [19]:
# Sampling weights: every influencer id is weighted by its number of rows in `labels`
# (non-null 'v' entries), normalised so the weights sum to 1.
influencer_counts = labels.groupby('u')['v'].count()
influencers = list(influencer_counts.index)
p_influencers = list(influencer_counts.values / influencer_counts.sum())
del influencer_counts

# Same construction for the targets, counting non-null 'u' entries per 'v'.
target_counts = labels.groupby('v')['u'].count()
targets = list(target_counts.index)
p_targets = list(target_counts.values / target_counts.sum())
del target_counts

In [423]:
path = '../decision_focused_learning_gpu/instances_weibo/oversampled_FE_LP/'
# np.savez does not create missing directories.
os.makedirs(path, exist_ok=True)

# Report progress roughly every 10% of the run. The max() guard keeps the modulus
# non-zero: N_INSTANCES // 10 is 0 whenever N_INSTANCES < 10.
progress_step = max(1, N_INSTANCES // 10)

for instance in range(N_INSTANCES) :

    if instance % progress_step == 0 : print(f"Saving instance {instance}/{N_INSTANCES}...")

    instance_file = os.path.join(path, f'{instance}.npz')

    # Existing files are kept, so an interrupted run can be resumed.
    if os.path.exists(instance_file) :
        print("Instance already created")
        continue

    # Draw distinct influencers / targets, weighted by p_influencers / p_targets.
    # NOTE(review): no random seed is set, so the generated instances differ between runs.
    sampled_influencers = np.random.choice(influencers, N_INFLUENCERS, p = p_influencers, replace=False)
    sampled_targets = np.random.choice(targets, N_TARGETS, p = p_targets, replace=False)

    XY = create_XY(sampled_influencers, sampled_targets)
    XY = fill_with_positive(XY, PROP_POS)

    # Passed positionally, so the array is stored under the default key 'arr_0'.
    np.savez(instance_file, XY)
    del XY  # free the tensor before building the next one

print("End")

Saving instance 0/10...
Saving instance 1/10...
Saving instance 2/10...
Saving instance 3/10...
Saving instance 4/10...
Saving instance 5/10...
Saving instance 6/10...
Saving instance 7/10...
Saving instance 8/10...
Saving instance 9/10...
End


### Adding topology (edge) information to the instances

In [31]:
edges = pd.read_pickle(file_edges)


def f():
    """Default factory of `d_edges`: a pair that was never stored maps to False."""
    return False


# Lookup table keyed by (u, v): True for every pair listed in `edges`.
# Gotcha: reading a missing pair with d_edges[...] inserts it (with value False);
# use `in` or .get() when the table must not grow.
d_edges = defaultdict(f)
d_edges.update(dict.fromkeys(zip(edges.u, edges.v), True))
del edges

def feature_vector(u,v) :
    """
    Return the features of users `u` and `v` as one flat array.

    The result is the row of `df_user` for `u` followed by the row for `v`.

    The previous body was pasted from fill_with_positive: it wrote into `XY[i, t, :]`
    using `XY`, `i`, `t` and `label`, none of which are defined here, and returned nothing.

    TODO(review): confirm whether the label (fill_y(u, v)) or the edge flag
    (d_edges[(u, v)]) should be appended as well.
    """
    fu = df_user.loc[u]
    fv = df_user.loc[v]

    return np.concatenate([fu, fv], axis=None)


False

Plots