In [1]:
import pandas as pd
import bamboolib
import numpy as np
import ast

In [2]:
def norm(x):
    """Return the reciprocal of the natural logarithm of ``x``.

    Works on scalars and on array-likes that ``np.log`` accepts (the result
    is then element-wise).

    Edge cases follow NumPy floating-point rules and are not guarded here:
    ``x == 1`` gives ``log(x) == 0`` and therefore ``inf``; ``x == 0`` gives
    ``log(x) == -inf`` and therefore ``-0.0``.
    """
    log_x = np.log(x)
    return 1 / log_x

#### Get the graines, their friends & their followers

In [3]:
# Followers of the graines, reduced to the graine handle and the follower id.
foll = (
    pd.read_csv('data/followers_graines_version_2021_09_21.csv', index_col=[0], low_memory=False)
    .reset_index()
    .loc[:, ['twitter_handle', 'follower_id']]
)

# Non-null follower ids counted per graine: first as a frame, then as a
# handle -> count lookup.
group_foll = (
    foll.groupby('twitter_handle')['follower_id']
    .count()
    .rename('count_followers')
    .reset_index()
)
dict_follo = group_foll.set_index('twitter_handle')['count_followers'].to_dict()

# Friends of the graines, reduced to the graine handle and the friend id.
fri = (
    pd.read_csv('data/friends_graines.csv.gz', index_col=[0], low_memory=False)
    .reset_index()
    .loc[:, ['twitter_handle', 'friend_id']]
)

# Non-null friend ids counted per graine: first as a frame, then as a
# handle -> count lookup.
group_fri = (
    fri.groupby('twitter_handle')['friend_id']
    .count()
    .rename('count_friends')
    .reset_index()
)
dict_fri = group_fri.set_index('twitter_handle')['count_friends'].to_dict()

#### Get the missing values of the dataset

In [4]:
# Load the candidate dataset and keep only the columns used below.
data_candidate = pd.read_csv('data/data_ready.csv', index_col=[0])
# .copy() makes `data` an independent frame, so the column assignment below
# does not write into a slice of `data_candidate` (SettingWithCopyWarning).
data = data_candidate[['user_id', 'followers', 'friends', 'count_graines_in_friends', 'graines_in_friends']].copy()
# The column is read from CSV as text; parse each cell as a Python literal so
# it can be exploded later.
data['graines_in_friends'] = data['graines_in_friends'].apply(ast.literal_eval)

In [5]:
# The follower-side graine information is missing from the candidate data, so
# it is rebuilt here by matching candidate ids against the ids in `foll`.
# NOTE(review): a match on `follower_id` means the candidate follows that
# graine, which reads like the "friends" relation rather than "followers" --
# confirm this is the intended direction.
candidate_matches = data.merge(foll, left_on='user_id', right_on='follower_id').drop(columns='follower_id')

# Per candidate: how many graine handles matched, and which ones.
foll_grain = (
    candidate_matches.groupby('user_id')['twitter_handle']
    .agg(count_graines_in_followers='count', graines_in_followers=list)
    .reset_index()
)

# Re-merge with the candidate dataset (inner join: candidates without any
# match are not kept).
new_data = foll_grain.merge(data, on='user_id')

### Followers

In [6]:
# One row per (user, graine handle) taken from the 'graines_in_followers' lists.
new_data_follo = new_data.explode('graines_in_followers')
# Look up each graine's follower count; handles absent from dict_follo become
# NaN and are skipped by the sum below.
new_data_follo['total_grain_follo'] = new_data_follo['graines_in_followers'].map(dict_follo)

# Per user: sum of the follower counts of the graines listed for that user.
new_data_follo = new_data_follo.groupby('user_id')['total_grain_follo'].sum().reset_index()

# Normalise with 1 / log(total); norm is applied to the whole column at once.
new_data_follo['normalize_follo'] = norm(new_data_follo['total_grain_follo'])

### Friends

In [7]:
# One row per (user, graine handle) taken from the 'graines_in_friends' lists.
new_data_fri = new_data.explode('graines_in_friends')
# Look up each graine's friend count; handles absent from dict_fri become NaN
# and are skipped by the sum below.
new_data_fri['total_grain_fri'] = new_data_fri['graines_in_friends'].map(dict_fri)

# Per user: sum of the friend counts of the graines listed for that user.
new_data_fri = new_data_fri.groupby('user_id')['total_grain_fri'].sum().reset_index()

# Normalise with 1 / log(total); norm is applied to the whole column at once.
new_data_fri['normalize_friends'] = norm(new_data_fri['total_grain_fri'])

### Final concat

In [8]:
# Inner-join the follower and friend scores on user_id, then attach the
# per-user columns of new_data.
concat_score = (
    new_data_follo
    .merge(new_data_fri, on='user_id')
    .merge(new_data, on='user_id')
)

In [10]:
# Share of a user's friends / followers that are graines. A zero denominator
# yields inf (or NaN for 0/0) under pandas division; it is not handled here.
friends_total = concat_score['friends']
followers_total = concat_score['followers']
concat_score['prop_graine_friends'] = concat_score['count_graines_in_friends'].div(friends_total)
concat_score['prop_graine_followers'] = concat_score['count_graines_in_followers'].div(followers_total)

In [92]:
# Feature columns, in the order they appear in the saved matrix.
feature_cols = ['count_graines_in_followers', 'prop_graine_followers',
                'count_graines_in_friends', 'prop_graine_friends',
                'normalize_friends', 'normalize_follo']
final = concat_score[feature_cols]

# Impute NaNs with the column mean computed over finite values only. A plain
# final.mean() becomes inf (or NaN) as soon as a column contains an infinity,
# which would fill the NaNs with inf and have them zeroed by the step below.
finite_mean = final.replace([np.inf, -np.inf], np.nan).mean()
final = final.fillna(finite_mean)

# Whatever is still infinite, or NaN because a column had no finite value at
# all, is set to 0.
final = final.replace([np.inf, -np.inf, np.nan], 0).reset_index(drop=True)

# Persist as a plain array; row order follows concat_score and no user_id is
# stored alongside it.
final = np.array(final)
np.save('embeddings/topo.npy', final)

In [95]:
# Display the feature array held in `final` (NumPy abbreviates long arrays with "...").
final

array([[1.00000000e+00, 6.96330339e-05, 1.00000000e+00, 1.14613181e-04,
        1.16517267e-01, 1.22061812e-01],
       [1.00000000e+00, 8.91265597e-04, 1.00000000e+00, 3.13283208e-04,
        1.16517267e-01, 1.22061812e-01],
       [2.00000000e+00, 6.76818951e-04, 2.00000000e+00, 2.48756219e-03,
        1.43592838e-01, 8.31166027e-02],
       ...,
       [6.00000000e+00, 6.00000000e-02, 6.00000000e+00, 1.79104478e-02,
        1.30315434e-01, 8.03413221e-02],
       [1.00000000e+00, 7.75193798e-03, 1.00000000e+00, 3.19488818e-03,
        1.62926763e-01, 1.08067269e-01],
       [3.00000000e+00, 5.17241379e-02, 3.00000000e+00, 1.81268882e-03,
        1.20825557e-01, 9.00817742e-02]])

In [93]:
from sklearn.preprocessing import StandardScaler
 
sc = StandardScaler()