In [7]:
import os
import torch
import numpy as np
import random
import pandas as pd


In [None]:
# Locations of the two pickled inputs: the user profile table and the label table.
file_user, file_labels = (
    "../data/weibo/userProfile.pkl",
    "../data/weibo/labels_1000.pkl",
)

Import of user data
First approach: without categorical variables

In [8]:
# Load the pickled user table and use the user id, cast to int64, as row index.
# NOTE(review): read_pickle can execute arbitrary code - only load trusted files.
df_user = pd.read_pickle(file_user)
user_ids_int64 = df_user['id'].astype(np.int64)
df_user.index = user_ids_int64

def featvec(feat, clist):
    """
    One-hot encode `feat` against the category list `clist`.

    Returns a list of len(clist) integers holding 1 at the position of the
    first occurrence of `feat` in `clist` and 0 everywhere else.
    Raises ValueError (from `clist.index`) when `feat` is not in `clist`.
    """
    hot_position = clist.index(feat)
    return [1 if position == hot_position else 0 for position in range(len(clist))]

# Column groups of the user table, split by kind of variable.
DATE_VAR = ['created_at']
NUM_VAR = ['bi_followers_count', 'followers_count', 'friends_count', 'statuses_count']
CAT_VAR = ['city', 'verified', 'province', 'verified_type']

# Keep the numerical columns only, then merge rows that share the same user id
# by taking the column-wise maximum (some user ids appear twice in the dataset:
# 1681085 rows -> 1655678 rows).
numeric_columns = df_user[NUM_VAR]
df_user_num = numeric_columns.groupby(numeric_columns.index).max()

user_preview = df_user_num.head(5).to_markdown()
print("df_user_num : \n" + user_preview)
print(f"shape : {df_user_num.shape}\n" )


df_user_num : 
|    id |   bi_followers_count |   followers_count |   friends_count |   statuses_count |
|------:|---------------------:|------------------:|----------------:|-----------------:|
| 10029 |                  142 |             11573 |             378 |             1226 |
| 10057 |                   72 |               212 |             315 |              305 |
| 10111 |                   93 |               322 |             252 |              619 |
| 10145 |                  310 |              3264 |             540 |             1484 |
| 10211 |                  123 |              5704 |             242 |              169 |
shape : (1655678, 4)



Import of the previously estimated ground-truth labels

In [9]:
# Load the label table: one row per (influencer u, target v) pair.
# NOTE(review): read_pickle can execute arbitrary code - only load trusted files.
labels = pd.read_pickle(file_labels)
labels = labels.astype({'u': np.int64, 'v': np.int64})

# Index the rows by the (u, v) pair so that labels.loc[(u, v)] works. The index
# levels are left unnamed on purpose: naming them 'u'/'v' would clash with the
# columns of the same name in later groupby('u') / groupby('v') calls.
labels.index = pd.MultiIndex.from_tuples(zip(labels['u'], labels['v']))
labels = labels.sort_index()  # lookups are faster on a sorted index
labels = labels.drop_duplicates()

# Keep only the pairs whose two users have a row in df_user_num. This replaces a
# hard-coded drop of the single pair (1637712471, 279405), whose user 1637712471
# is absent from df_user_num: the filter removes that pair too, does not raise
# when the pair is not in the file, and also covers any other user without features.
known_users = df_user_num.index
has_features = labels['u'].isin(known_users) & labels['v'].isin(known_users)
labels = labels.loc[has_features.to_numpy()]

print("labels : \n" + labels.head(5).to_markdown())
print(f"shape : {labels.shape}\n" )

labels : 
|                      |               u |               v |   BT |   JI |   LP |
|:---------------------|----------------:|----------------:|-----:|-----:|-----:|
| (82768, 82768)       | 82768           | 82768           |  1   | 0.5  |  1   |
| (7747002, 7747002)   |     7.747e+06   |     7.747e+06   |  1   | 0.5  |  1   |
| (8060099, 8060099)   |     8.0601e+06  |     8.0601e+06  |  1   | 0.5  |  1   |
| (15058618, 3023198)  |     1.50586e+07 |     3.0232e+06  |  1   | 0.5  |  1   |
| (32821757, 32821757) |     3.28218e+07 |     3.28218e+07 |  0.5 | 0.25 |  0.5 |
shape : (147693, 5)



It is not possible to build the full matrix of #influencers x #targets x #features (= 1.6 billion entries for only 158048 positive influences).

So, to create one instance, we randomly sample 100 influencers and 1000 targets.

Is that enough to get positive examples? Yes: roughly 100 / (100 x 1000) positive labels per prediction matrix.

In [10]:
# Distinct user ids occurring as influencer (column u) and as target (column v).
influencers = set(labels['u'].tolist())
targets = set(labels['v'].tolist())

### PARAMETERS

In [11]:
# Number of influencers and of targets sampled for one instance.
N_INFLUENCERS, N_TARGETS = 100, 1000
# Number of instances to generate.
N_INSTANCES = 100
# Each (influencer, target) cell holds the numerical variables of both users.
N_FEATURES = len(NUM_VAR) * 2

In [12]:
# random.sample needs a sequence: passing a set is deprecated since Python 3.9
# and raises TypeError from Python 3.11 on. Sorting the ids first also gives a
# stable population order, so the draw is repeatable once a random seed is set.
sampled_influencers = random.sample(sorted(influencers), N_INFLUENCERS)
sampled_targets = random.sample(sorted(targets), N_TARGETS)


In [13]:
def fill_y(u,v) :
    """
    Return the label of the (influencer u, target v) pair.

    The value is the 'BT' entry of the row of the global `labels` table indexed
    by (u, v); pairs that are not in the index of `labels` yield 0.
    """
    pair = (u, v)
    if pair not in labels.index:
        return 0
    return labels.loc[pair]['BT']

def create_XY(sampled_influencers, sampled_targets) :
    """
    Build the feature tensor X and the label matrix Y of one instance.

    Parameters
    ----------
    sampled_influencers : sequence of ids used as row labels of `df_user_num`
    sampled_targets : sequence of ids used as row labels of `df_user_num`

    Returns
    -------
    X : float array of shape (n_influencers, n_targets, 2 * n_columns), where
        X[i, j] is the `df_user_num` row of influencer i followed by the
        `df_user_num` row of target j.
    Y : float array of shape (n_influencers, n_targets), where
        Y[i, j] = fill_y(sampled_influencers[i], sampled_targets[j]).

    The dimensions follow the lengths of the two arguments and the number of
    columns of `df_user_num`.
    """
    influencer_ids = list(sampled_influencers)
    target_ids = list(sampled_targets)
    n_influencers, n_targets = len(influencer_ids), len(target_ids)

    # Fetch each feature block once; the previous per-target loop repeated the
    # influencer lookup at every iteration although it never changes.
    influencer_feats = np.array(df_user_num.loc[influencer_ids])  # (n_influencers, n_columns)
    target_feats = np.array(df_user_num.loc[target_ids])          # (n_targets, n_columns)
    n_columns = influencer_feats.shape[1]

    X = np.zeros((n_influencers, n_targets, 2 * n_columns))
    # Broadcasting repeats the influencer rows along the target axis and the
    # target rows along the influencer axis.
    X[:, :, :n_columns] = influencer_feats[:, np.newaxis, :]
    X[:, :, n_columns:] = target_feats[np.newaxis, :, :]

    Y = np.zeros((n_influencers, n_targets))
    for i, influencer in enumerate(influencer_ids):
        for j, target in enumerate(target_ids):
            Y[i, j] = fill_y(influencer, target)

    return X,Y


In [14]:
path = 'instances_weibo/'

# np.savez cannot write into a missing directory, so create it up front.
os.makedirs(path, exist_ok=True)

# Report progress about ten times; max() avoids a modulo by zero when N_INSTANCES < 10.
progress_step = max(1, N_INSTANCES // 10)

# random.sample needs a sequence (a set raises TypeError since Python 3.11).
# Sorting once outside the loop gives a stable population order.
influencer_pool = sorted(influencers)
target_pool = sorted(targets)

for instance in range(N_INSTANCES) :
    if instance % progress_step == 0 : print(f"Saving instance {instance}/{N_INSTANCES}...")

    x_file = path + f'X{instance}.npz'
    y_file = path + f'Y{instance}.npz'

    # The draw is made even for instances skipped below, so the random numbers
    # consumed per instance do not depend on what is already on disk.
    sampled_influencers = random.sample(influencer_pool, N_INFLUENCERS)
    sampled_targets = random.sample(target_pool, N_TARGETS)

    # An instance that is completely on disk is left untouched and the expensive
    # construction of X and Y is skipped.
    if os.path.exists(x_file) and os.path.exists(y_file):
        continue

    # X and Y are always written together: saving only the missing file would
    # pair it with a leftover file built from a different random sample.
    X,Y = create_XY(sampled_influencers, sampled_targets)
    np.savez(x_file, X)
    np.savez(y_file, Y)

    del X, Y
print("End")


Saving instance 1/100...
Saving instance 11/100...
Saving instance 21/100...
Saving instance 31/100...
Saving instance 41/100...
Saving instance 51/100...
Saving instance 61/100...
Saving instance 71/100...
Saving instance 81/100...
Saving instance 91/100...
