In [2]:
import numpy as np
import pandas as pd
import networkx as nx
import pickle
import time
import requests

from os import listdir
from os.path import isfile, join
from datetime import datetime

In [4]:
# Directory scanned for the client checkpoint pickles (passed to get_last_checkpoint).
OUTPUT_CLIENT_PATH = 'output_client'
# Directory scanned for the influencer checkpoint pickles; also the default
# directory used by get_last_checkpoint when no path is given.
OUTPUT_PATH = 'output'

In [5]:
#@title
def get_last_checkpoint(path=None, prefix='data'):
  """Load the newest checkpoint pickle found in a directory.

  Checkpoints are regular files named ``<prefix>_*.pickle``. "Newest" means the
  file whose name sorts last lexicographically.

  Args:
    path: Directory to scan. Falls back to ``OUTPUT_PATH`` when falsy.
    prefix: File-name prefix (without the trailing underscore).

  Returns:
    The unpickled object, or ``None`` when no matching file exists.
  """
  directory = path or OUTPUT_PATH
  wanted_start = f'{prefix}_'

  candidates = sorted(
      (
          name
          for name in listdir(directory)
          if isfile(join(directory, name))
          and name.startswith(wanted_start)
          and name.endswith('.pickle')
      ),
      reverse=True,
  )
  if len(candidates) == 0:
    return None

  newest = join(directory, candidates[0])
  print(f'loading {newest} ...')
  # NOTE: pickle.load can execute arbitrary code; only point this at trusted files.
  with open(newest, 'rb') as handle:
    return pickle.load(handle)

def prepare_data(df):
    """Build node attributes and a follow adjacency mapping from a user table.

    Args:
        df: DataFrame indexed by user id. Each row must provide ``username``,
            ``name``, ``profile_image_url``, ``followers``, ``following``,
            ``log_ratio`` and ``following_ids`` (a comma-separated string of
            user ids). ``degree``, ``pagerank`` and ``betweenness_centrality``
            are optional and default to 0.02 when absent.

    Returns:
        A tuple ``(info, ntx)``:
        ``info`` maps each user id to a dict of its node attributes;
        ``ntx`` maps each user id to the list of ids from its
        ``following_ids`` that are themselves present in ``df.index``.
    """
    # A set makes the per-id membership test cheap while filtering edges.
    available_ids = set(df.index)
    info = {}
    ntx = {}
    for index, row in df.iterrows():
        info[index] = {
            'username': row['username'],
            'name': row['name'],
            'profile_image_url': row['profile_image_url'],
            'followers': row['followers'],
            'following': row['following'],
            'log_ratio': row['log_ratio'],
            'nocentrality': 1,
            'degree': row.get('degree', 0.02),
            'pagerank': row.get('pagerank', 0.02),
            'betweenness_centrality': row.get('betweenness_centrality', 0.02),
        }

        following_ids = row['following_ids']
        if isinstance(following_ids, str):
            ntx[index] = [i for i in following_ids.split(',') if i in available_ids]
        else:
            # Missing values (None/NaN) have no ``split``; treat the user as
            # following nobody instead of raising AttributeError.
            ntx[index] = []

    return info, ntx

In [4]:
# Latest influencer checkpoint, keeping only users whose following_ids is
# neither null nor empty, and without the 'withheld' column.
influencer_df = (
    get_last_checkpoint(OUTPUT_PATH)
    .loc[lambda d: d.following_ids.notnull() & (d.following_ids != '')]
    .drop(columns='withheld')
)

# Latest client checkpoint, with the same following_ids filter.
client_df = (
    get_last_checkpoint(OUTPUT_CLIENT_PATH)
    .loc[lambda d: d.following_ids.notnull() & (d.following_ids != '')]
)

loading /content/drive/MyDrive/Colab Notebooks/mbu-graph/output/data_0325_203935.pickle ...
loading /content/drive/MyDrive/Colab Notebooks/mbu-graph/output_client/data_0328_160123.pickle ...


In [5]:
# User ids (as strings) of the seed accounts; these rows are dropped from the
# data before the graph is built.
# NOTE(review): the last two ids carry no account label — confirm which accounts they are.
starting_nodes = [
    '67401711',  # Mercantil
    '126086956', # Provincial
    '105219620', # Banesco
    '221765424', # BOD
    '3083754887',
    '2354503127',
]

## Solo Candidatos

In [None]:
# Candidates only: influencer rows without the seed accounts.
# A non in-place drop returns a new frame, so influencer_df itself is left
# untouched (the previous alias + inplace=True mutated it) and this cell can be
# re-run without affecting later sections that start from influencer_df.
df = influencer_df.drop(index=starting_nodes, errors='ignore')

In [None]:
# info maps each user id to a dict with that node's attributes; ntx maps each
# user id to the ids it follows that are also present in df.
info, ntx = prepare_data(df)
# Build the directed graph from the adjacency dict: edges run from each user
# to the accounts listed for it in ntx.
dg = nx.DiGraph(ntx)

In [None]:
# One column per centrality measure read from the graph, indexed by user id.
centrality_measures = pd.DataFrame({
    measure: dict(compute(dg, weight=None))
    for measure, compute in (
        ('degree', nx.degree),
        ('pagerank', nx.pagerank),
        ('betweenness_centrality', nx.betweenness_centrality),
    )
})

# Attach the graph measures to the user table; both frames are indexed by user id.
df_wcentrality = df.join(centrality_measures)

In [None]:
# Top 25 users by PageRank, with follower counts and the other centrality scores.
(
    df_wcentrality
    .loc[:, ['name', 'following', 'followers', 'pagerank', 'degree', 'betweenness_centrality']]
    .sort_values(by='pagerank', ascending=False)
    .head(25)
)

Unnamed: 0_level_0,name,following,followers,pagerank,degree,betweenness_centrality
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
23719107,Nelson Bocaranda Sardi.,1321,3322502,0.021928,148,0.040485
64252221,Laureano Marquez,1387,3656106,0.020769,149,0.036307
76947892,César Miguel Rondón,3023,2886446,0.020746,148,0.023793
77054564,Leonardo Padrón,2452,2953683,0.020069,146,0.036323
35103663,Eugenio G. Martínez,3540,306009,0.020022,149,0.032517
41626835,ROMAN LOZINSKI,2426,1120990,0.016337,143,0.025475
76779177,Luis Vicente Leon,391,1495506,0.016113,102,0.005436
37875647,Edgar Ramírez,702,1512418,0.015812,76,0.005087
7508402,Henkel Garcia U.,10869,192958,0.015533,76,0.010334
764036,Fran Monroy Moret,8202,128897,0.015233,99,0.030128


## Candidatos + Clientes

In [6]:
# User ids (as strings) of the seed accounts excluded from the combined graph.
# NOTE(review): the last five ids carry no account label — confirm which accounts they are.
starting_nodes = [
    '67401711',  # Mercantil
    '126086956', # Provincial
    '105219620', # Banesco
    '221765424', # BOD
    '3083754887',
    '77613385',
    '828194378258726916',
    '3331949733',
    '338541764'
]

# Candidates + clients without the seed accounts.
# pd.concat replaces DataFrame.append (removed in pandas 2.0), and the non
# in-place drop keeps this cell free of side effects on its inputs.
df = pd.concat([influencer_df, client_df]).drop(index=starting_nodes, errors='ignore')

In [7]:
# info maps each user id to a dict with that node's attributes; ntx maps each
# user id to the ids it follows that are also present in df.
info, ntx = prepare_data(df)
# Build the directed graph from the adjacency dict: edges run from each user
# to the accounts listed for it in ntx.
dg = nx.DiGraph(ntx)

In [8]:
# One column per centrality measure read from the graph, indexed by user id.
centrality_measures = pd.DataFrame({
    measure: dict(compute(dg, weight=None))
    for measure, compute in (
        ('degree', nx.degree),
        ('pagerank', nx.pagerank),
        ('betweenness_centrality', nx.betweenness_centrality),
    )
})

# Attach the graph measures to the user table; both frames are indexed by user id.
df_wcentrality = df.join(centrality_measures)

In [11]:
# Top 30 users by PageRank in the combined candidates + clients graph.
(
    df_wcentrality
    .loc[:, ['username', 'name', 'pagerank']]
    .sort_values(by='pagerank', ascending=False)
    .head(30)
)

Unnamed: 0_level_0,username,name,pagerank
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
23719107,nelsonbocaranda,Nelson Bocaranda Sardi.,0.021492
76947892,cmrondon,César Miguel Rondón,0.020499
64252221,laureanomar,Laureano Marquez,0.019922
77054564,Leonardo_Padron,Leonardo Padrón,0.019189
35103663,puzkas,Eugenio G. Martínez,0.016969
76779177,luisvicenteleon,Luis Vicente Leon,0.016152
63263205,ErikaDLV,Erika de la Vega,0.015881
37875647,edgarramirez25,Edgar Ramírez,0.015875
41626835,RLOZINSKI,ROMAN LOZINSKI,0.014873
54329161,aroliveros,Asdrúbal R. Oliveros,0.013139
