In [None]:
import datetime
import itertools
import os
import re
from multiprocessing import Pool

import igraph   # primero: sudo apt install build-essential python-dev libxml2 libxml2-dev zlib1g-dev
# luego: pip install pyhton-igraph
import networkit as ntkit  # pip install networkit
import networkx as nx
import numpy as np
import pandas as pd

In [None]:
# Load every per-account export (one .xlsx per account) into a single frame.
EGO_DIR = "/home/digitas_arg/fede_ego"   # NOTE(review): absolute, machine-specific path
BASE_COLUMNS = ['author', 'date', 'mentions', 'permalink', 'text']

files = os.listdir(EGO_DIR)
files = [x for x in files if '.xlsx' in x and 'fedevigevani.xlsx' not in x]

# Collect the frames and concatenate once: DataFrame.append inside a loop copies
# the accumulated data on every iteration and no longer exists in pandas 2.x.
frames = []
for i, f in enumerate(files):
    frames.append(pd.read_excel(os.path.join(EGO_DIR, f)))
    if (i % 500) == 0:
        print(i)

# Append the original influencer's own data
frames.append(pd.read_excel("./data/fedevigevani.xlsx"))
frames.append(pd.read_excel("./data/fedevigevani.2.xlsx"))

data = pd.concat(frames, ignore_index=True, sort=False)

# Expected columns first (created empty if no file had them), then any extras
extra_columns = [col for col in data.columns if col not in BASE_COLUMNS]
data = data.reindex(columns=BASE_COLUMNS + extra_columns)

# Drop the rows that carry no mentions
data = data.dropna(subset=["mentions"])
data = data.loc[~data.mentions.isin(['', ' '])]

# Save a consolidated copy, just in case
data.to_excel("./data/fede_total.xlsx", index=False)

In [None]:
# Reload the consolidated data written to ./data/fede_total.xlsx; the relations are built from it below
data = pd.read_excel("./data/fede_total.xlsx") 

# Disabled: appends the influencer's own exports (the loading cell already does this)
#data = data.append(pd.read_excel("./data/fedevigevani.xlsx"),ignore_index = False)
#data = data.append(pd.read_excel("./data/fedevigevani.2.xlsx"),ignore_index = False)

# Disabled: drops rows without mentions (the loading cell already does this)
#data = data.dropna(subset = ["mentions"] )
#data = data.loc[data.mentions != ' ']
#data = data.loc[data.mentions != '']

# =============================================================================
#  Me armo las relaciones y me genero un dataframe sumarizado con comentarios y menciones
# =============================================================================

def generate_relations(data):
    """Build the weighted (author, mention) edge list.

    Each row's ``mentions`` text is split on whitespace; every resulting token
    becomes one (author, mention) pair, and identical pairs are counted.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``author`` and ``mentions``. The frame is NOT
        modified, so the cell can be re-run safely.

    Returns
    -------
    pandas.DataFrame
        Columns ``author``, ``mentions`` and ``weight`` (number of times the
        author used that mention), ordered by author and mention.
    """
    # Work on a two-column copy; rows without a mentions value have nothing to split
    pairs = data[["author", "mentions"]].dropna(subset=["mentions"])
    pairs = pairs.assign(
        mentions=pairs["mentions"].map(lambda text: re.findall(r'\S+', str(text)))
    )

    # One row per individual mention; rows whose text had no token explode to NaN
    pairs = pairs.explode("mentions").dropna(subset=["mentions"])

    rank = pairs.groupby(["author", "mentions"]).size().reset_index(name="weight")
    return rank

# Weighted (author, mentions) edge list computed from the loaded data
relations = generate_relations(data)


In [None]:
# Build the graph from the edge list, keeping "weight" as an edge attribute.
# Further down this cell the edges are trimmed to weight greater than 3 and the components are listed.
net = nx.from_pandas_edgelist(relations,'author','mentions',["weight"])

def trim_edges(g, weight=1):
    """Return a new ``nx.Graph`` holding only the edges of `g` whose
    'weight' attribute is strictly greater than `weight`.

    Only the 'weight' attribute is carried over to the new graph.
    """
    heavy_edges = (
        (source, target, attrs['weight'])
        for source, target, attrs in g.edges(data=True)
        if attrs['weight'] > weight
    )
    trimmed = nx.Graph()
    trimmed.add_weighted_edges_from(heavy_edges)
    return trimmed

net_trim3 = trim_edges(net,3)

# nx.connected_component_subgraphs no longer exists in recent networkx releases;
# building the subgraphs from nx.connected_components works on old and new versions.
l_graphs = [net_trim3.subgraph(nodes).copy() for nodes in nx.connected_components(net_trim3)]
l_graphs.sort(key=len)

# Report the number of nodes of the two largest components (the list is sorted ascending)
print("length de los subgrafo 1",len(l_graphs[-1]))
if len(l_graphs) > 1:   # there may be a single component
    print("length de los subgrafo 2",len(l_graphs[-2]))

In [None]:
# Save the graph to disk, just in case
net_trim3 = l_graphs[-1]   # l_graphs is sorted by len ascending, so this is the largest component
nx.write_pajek(net_trim3,'./outputs/fede.net')
nx.write_gml(net_trim3,'./outputs/fede.gml')

In [None]:
# Load the saved graph for use with networkx and with igraph
# NOTE(review): the code below indexes edges as net_trim3[u][v][0], so read_pajek presumably returns a multigraph — confirm
net_trim3 = nx.read_pajek('./outputs/fede.net')
g = igraph.Graph.Read_GML("./outputs/fede.gml")

## Centralidad: 
### Uso networkit ya que está codeado en C++ y anda mucho más rápido. De todas formas dejo el betweenness paralelizado :)

In [None]:
# betweeness paralelizado! Igual uso el de networkit

def chunks(l, n):
    """Yield consecutive tuples holding at most `n` items of the iterable `l`.

    The last tuple may be shorter; nothing is yielded for an empty iterable.
    """
    source = iter(l)
    # iter(callable, sentinel) keeps slicing until an empty tuple comes back
    yield from iter(lambda: tuple(itertools.islice(source, n)), ())


def _betmap(G_normalized_weight_sources_tuple):
    """Single-argument adapter used with ``Pool.map``.

    Takes one tuple and unpacks it as the positional arguments of
    ``nx.betweenness_centrality_source``, returning that call's result.
    Judging by the parameter name the tuple is (G, normalized, weight,
    sources) — presumably the positional order of the networkx function;
    confirm against the installed networkx version.
    """
    return nx.betweenness_centrality_source(*G_normalized_weight_sources_tuple)


def betweenness_centrality_parallel(G, processes=None):
    """Parallel betweenness centrality.

    The nodes of `G` are split into chunks; each chunk is sent to a worker
    through `_betmap` as the sources of a partial computation, and the
    partial results are added up per node.

    Parameters
    ----------
    G : graph
        Graph exposing ``nodes()`` and ``order()``.
    processes : int or None
        Number of worker processes handed to ``multiprocessing.Pool``.

    Returns
    -------
    dict
        Node -> summed betweenness; an empty dict when `G` has no nodes.
    """
    p = Pool(processes=processes)
    try:
        node_divisor = len(p._pool) * 4
        # At least one node per chunk: a size of 0 (graph smaller than the
        # divisor) would produce no chunks at all and break the reduction.
        chunk_size = max(1, G.order() // node_divisor)
        node_chunks = list(chunks(G.nodes(), chunk_size))
        num_chunks = len(node_chunks)
        if num_chunks == 0:
            return {}
        bt_sc = p.map(_betmap,
                      zip([G] * num_chunks,
                          [True] * num_chunks,
                          [None] * num_chunks,
                          node_chunks))
    finally:
        # Always release the worker processes, even if a worker raised
        p.close()
        p.join()

    # Reduce the partial solutions
    bt_c = bt_sc[0]
    for bt in bt_sc[1:]:
        for n in bt:
            bt_c[n] += bt[n]
    return bt_c



In [None]:
# Convert the networkx graph into a networkit graph.
# NOTE(review): no weight attribute is passed here, so the networkit graph is presumably unweighted — confirm
G = ntkit.nxadapter.nx2nk(net_trim3)  # networkx -> networkit adapter

In [None]:
# Degree centrality on the networkit graph
d = ntkit.centrality.DegreeCentrality(G)
d.run()
print("Degree")
# Approximate closeness, configured with nSamples=20000
c= ntkit.centrality.ApproxCloseness(G,nSamples=20000)
c.run()
# Betweenness
b= ntkit.centrality.Betweenness(G)
b.run()

In [None]:
# Me armo una tabla con los usuarios que tuvieron la mayor de las 3 metricas
def sorted_map(map):
    """Return the keys of `map` ordered by their values, largest value first."""
    return sorted(map, key=lambda key: map[key], reverse=True)

# The networkit scores are positional; pair them with the networkx node names.
# assumes the scores follow the iteration order of net_trim3.nodes() — TODO confirm
node_names = [str(name) for name in net_trim3.nodes()]
dict_degree = dict(zip(node_names, d.scores()))
dict_betweeness = dict(zip(node_names, b.scores()))
dict_closeness = dict(zip(node_names, c.scores()))

# Node names ranked by each metric, best first
ds = sorted_map(dict_degree)
cs = sorted_map(dict_closeness)
bs = sorted_map(dict_betweeness)

# Top 30 of every ranking
names1 = ds[:30]
names2 = bs[:30]
names3 = cs[:30]

# Union of the three top lists
names = list(set(names1).union(names2, names3))

# One row per selected node with its three centralities
records = []
for name in names:
    records.append({
        'node_name': name,
        'degree': dict_degree[name],
        'centrality': dict_closeness[name],   # this column holds the closeness score
        'betweeness': dict_betweeness[name],
    })
table = pd.DataFrame(records)

table.to_excel("centrality.xlsx")

## Analizo las comunidades usando infomap.

In [None]:
# Compute the communities with infomap on the igraph graph, using the 'weight' edge attribute
comunities = g.community_infomap(edge_weights = 'weight')

In [None]:
# Keep only the communities with more than 700 members
clusters = [
    comunities[idx]
    for idx in range(len(comunities))
    if len(comunities[idx]) > 700
]

# Position of each node in net_trim3.nodes() -> node name (as str)
name_converter = {i: str(name) for i, name in enumerate(net_trim3.nodes())}

# Largest community first
clusters.sort(key=len, reverse=True)
[len(x) for x in clusters[:10]]  # Check the sizes of the biggest communities

In [None]:
# Node name of the influencer; used below to link it to every cluster
influencer_name = 'fedevigevani'

# Returns the total edge weight linking clusters n_1 and n_2
def generate_weight(clusters,n_1,n_2):
    """Sum the weights of the edges that join cluster `n_2` to cluster `n_1`.

    Cluster members are translated to node names through the module-level
    `name_converter`; edges are read from the module-level `net_trim3`
    (edge key 0 of each node pair).

    Parameters
    ----------
    clusters : sequence of sequences
        Cluster members, as keys of `name_converter`.
    n_1, n_2 : int
        Positions of the two clusters inside `clusters`.

    Returns
    -------
    number
        Total weight; 0 when no edge joins the two clusters.
    """
    # A set gives constant-time membership tests; the test runs once per
    # neighbour of every member of the other cluster.
    community_name = {name_converter.get(x, x) for x in clusters[n_1]}

    total = 0
    for name_1 in (name_converter.get(x, x) for x in clusters[n_2]):
        neighbours = net_trim3[name_1]
        total += sum(neighbours[x][0]['weight'] for x in neighbours if x in community_name)
    return total

# Build a graph whose nodes are the clusters, linked by their total shared weight.
# The graph is undirected and the weight is symmetric, so each unordered pair is
# evaluated once instead of twice; edges are added in the same order as before.
cluster_graph = nx.Graph()
for n_1 in range(len(clusters)):
    for n_2 in range(n_1 + 1, len(clusters)):
        weight = generate_weight(clusters,n_1,n_2)
        if(weight > 0):
            cluster_graph.add_edge(n_2,n_1)
            cluster_graph[n_2][n_1]['weight'] = weight

# Add the links between the influencer and every cluster
influencer_links = net_trim3[influencer_name]
for n in range(len(clusters)):
    # set -> constant-time membership test for each neighbour of the influencer
    community_name = {name_converter.get(x,x) for x in clusters[n]}
    weight = sum(influencer_links[x][0]['weight'] for x in influencer_links if x in community_name)
    if(weight > 0):
        cluster_graph.add_edge(influencer_name,n)
        cluster_graph[influencer_name][n]['weight'] = weight

# Store each cluster's size as a node attribute
aux = {i: len(n_cluster) for i, n_cluster in enumerate(clusters)}
# The influencer gets the mean cluster size so that a colour gradient on 'size' looks right
aux[influencer_name] = int(np.mean([len(x) for x in clusters]))
nx.set_node_attributes(cluster_graph,aux,'size')

In [None]:
# Show the cluster graph
import matplotlib.pyplot as plt

nx.draw(cluster_graph)
plt.show()

In [None]:
# Save each community to disk. This step is optional.
name_converter = {i: str(name) for i, name in enumerate(net_trim3.nodes())}

# to_csv does not create missing directories
os.makedirs("./clusters_fede", exist_ok=True)

for n in range(len(clusters)):
    node_screen = [name_converter.get(x, x) for x in clusters[n]]
    comunidad1 = pd.DataFrame(
        {
            "node_number": clusters[n],
            "node_screen": node_screen,
            # NaN for a node without a score: falling back to the node name
            # would put strings in a numeric column and break the sort below
            "degree": [dict_degree.get(x, np.nan) for x in node_screen],
        },
        columns=["node_number", "node_screen", "degree"],
    )
    comunidad1 = comunidad1.sort_values(by = ["degree"],ascending=False)
    comunidad1.to_csv("./clusters_fede/cluster_"+str(n)+'.csv')
#comunidad1.head(10)


In [None]:
# Build one frame per cluster to interpret what each cluster contains
name_converter = {i: str(name) for i, name in enumerate(net_trim3.nodes())}
# NOTE: this rebinds dict_degree to the networkx degree centrality of net_trim3
dict_degree = nx.degree_centrality(net_trim3)

list_df_clusters = []

for n in range(len(clusters)):
    node_screen = [name_converter.get(x, x) for x in clusters[n]]
    comunidad1 = pd.DataFrame(
        {
            "node_number": clusters[n],
            "node_screen": node_screen,
            # NaN for a node without a score: falling back to the node name
            # would put strings in a numeric column and break the sort below
            "degree": [dict_degree.get(x, np.nan) for x in node_screen],
        },
        columns=["node_number", "node_screen", "degree"],
    )
    comunidad1 = comunidad1.sort_values(by = ["degree"],ascending=False)
    list_df_clusters.append(comunidad1)

In [None]:
# Inspect one cluster: print its size and show its first 10 rows (frames are sorted by degree, descending)
n = 5
print(len(clusters[n]))
list_df_clusters[n].head(10)

### Analisis de cluster por separado:


In [None]:
# Give the clusters human-readable names

# Labels must be unique: relabel_nodes merges nodes that map to the same label,
# which would silently collapse two clusters into one node.
mapping = {0:'Busca Fama',1:'Youtubers',2:'Amor',3:'Gammers',4:'Youtuber',5:'Youtubers2',6:'Girl Power',7:'Busca Fama 2',
          influencer_name:influencer_name}
assert len(set(mapping.values())) == len(mapping), "cluster labels must be unique"
cluster_graph = nx.relabel.relabel_nodes(cluster_graph,mapping)

# Export the graph
# NOTE(review): the file name mentions "Lena" although influencer_name is used above — confirm the intended name
nx.write_gexf(cluster_graph, "./outputs/ClustersLena.gexf")

In [None]:
# List the nodes that connect the influencer to cluster n, with the edge weight
# NOTE(review): the original looked up the hard-coded node 'LenaNarvay'; the rest of
# this notebook works on influencer_name, so that name is used here — confirm
n = 14   # NOTE(review): must be a valid position in clusters — confirm
community_name = [name_converter.get(x,x) for x in clusters[n]]
community_lookup = set(community_name)   # constant-time membership test
for x in net_trim3[influencer_name]:
    if x in community_lookup:
        print(x, net_trim3[influencer_name][x][0]['weight'])

## Densidad y diametro


In [None]:
# Density of the networkit graph G
# NOTE(review): depends on Graph.density being available in the installed networkit version — confirm
ntkit.graph.Graph.density(G)

In [None]:
#nx.effective_size(net_trim3)   # too slow...
# Approximate effective diameter computed with networkit
p = ntkit.distance.EffectiveDiameterApproximation(G)
p.run()
print("Effective diameter: ",p.getEffectiveDiameter())

In [None]:
# Diameter computed with networkit (rebinds p from the previous cell)
p = ntkit.distance.Diameter(G)
p.run()
p.getDiameter()

In [None]:
# Same measure through networkx
nx.diameter(net_trim3)  # TODO: review this

In [None]:
# To try: ego network of radius 2 around the influencer.
# `net` is the Graph instance built from the edge list, not the networkx module,
# so the module alias `nx` is what has to be called here.
# NOTE(review): the original used the hard-coded node 'LenaNarvay' — confirm influencer_name is intended
nx.Graph(nx.ego_graph(net_trim3, influencer_name, radius=2))
#nx.average_clustering(net_trim3)