In [1]:
import os, sys, time, resource, re, gc, shutil
from multiprocess import Pool
from functools import partial
from urllib.parse import urlparse, parse_qsl
import matplotlib
matplotlib.use('pgf')
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import django
import igraph
sys.path.append('/home/galm/software/django/tmv/BasicBrowser/')
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "BasicBrowser.settings")
django.setup()
from django.db.models import Q, F, Sum, Count, FloatField, Case, When, Value, Max
import matplotlib.patches as patches
from scipy.sparse import csr_matrix, find

from scoping.models import *
from tmv_app.models import *

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import normalize
from nltk.stem import SnowballStemmer


import textwrap as tw
from utils.text import *
import networkx as nx
# Primary key of the RunStats row (topic-model run) analysed in this notebook.
run_id = 665
stat = RunStats.objects.get(pk=run_id)
# Id of the focal dynamic topic; passed to generate_graph() as `top` further down.
top = 11046

# Topic querysets restricted to this run (not referenced again in the visible cells).
dtopics = DynamicTopic.objects.filter(run_id=stat)
wtopics = Topic.objects.filter(run_id=stat)


In [2]:
def return_corrs(ar,cor,run_id):
    """Load the dynamic-topic nodes and correlation links for one period.

    Parameters
    ----------
    ar : int
        Period number. Nodes are the dynamic topics of the run that have a
        timed-topic entry for this period with a share above 0.0025; links
        are the correlation rows whose ``ar`` equals this value.
    cor : float
        Exclusive lower bound on the correlation score of a link. Links with
        a score of 1 or more are dropped as well.
    run_id : int
        Run whose topics and correlations are queried.

    Returns
    -------
    (nodes, links) : tuple of two lists of dicts
        ``nodes`` carry ``id``, ``title``, ``score``, ``primary_wg``,
        ``wg_prop`` and ``arscore``; ``links`` carry ``source``, ``target``
        and ``score``. Only links with both endpoints in ``nodes`` are kept.

    Notes
    -----
    ``arscore`` is always present on every node: it falls back to 0 when the
    period score is NULL or when ``ar`` is not above -1 (in which case no
    per-period lookup is done). Previously the latter case raised KeyError.
    """
    node_qs = DynamicTopic.objects.filter(
        run_id=run_id,
        timedtopic__period__n=ar,
        timedtopic__share__gt=0.0025
    )

    # A set gives O(1) membership tests when filtering the links below.
    node_ids = set(node_qs.values_list('id',flat=True))

    nodes = list(node_qs.values('id','title','score','primary_wg','wg_prop'))
    for n in nodes:
        arscore = None
        if ar > -1:
            # One query per node; raises if the (topic, period) pair is not unique.
            tdt = TimeDTopic.objects.get(dtopic_id=n['id'],period__n=ar)
            arscore = tdt.score
        n['arscore'] = 0 if arscore is None else arscore

    link_qs = DynamicTopicCorr.objects.filter(run_id=run_id).filter(
        score__gt=cor,
        score__lt=1,
        ar=ar
    ).annotate(
        source=F('topic'),
        target=F('topiccorr')
    )

    links = [
        x for x in link_qs.values('source','target','score')
        if x['source'] in node_ids and x['target'] in node_ids
    ]

    return nodes,links

In [3]:
def generate_graph(nodes,links,top=None,c=1):
    """Build an igraph graph from node/link dicts and pick the vertices to label.

    Parameters
    ----------
    nodes : list of dict
        Each dict needs ``id``, ``title``, ``primary_wg`` (1, 2 or 3) and
        ``arscore``. Labelled nodes get a ``'v'`` key added in place that
        holds their own graph vertex.
    links : list of dict
        Each dict needs ``source``, ``target`` (node ids) and ``score``.
    top : optional
        Id of the focal node. It is labelled if present in ``nodes``.
    c : number
        A link touching ``top`` is highlighted, and both its endpoints are
        labelled, only when its score is strictly greater than ``c``.
        Note that return_corrs() only yields scores below 1, so with the
        default of 1 no link from that source is ever highlighted.

    Returns
    -------
    (g, layout, labels)
        The graph, its Fruchterman-Reingold layout and the list of node dicts
        to label (each with its ``'v'`` vertex).

    Vertex size is ``arscore`` rescaled so that the largest score maps to 20;
    all sizes are 0 when there are no nodes or the largest score is 0.
    """
    g = igraph.Graph()

    # Guard against an empty node list and an all-zero score column.
    dt_max = max((x['arscore'] for x in nodes), default=0)
    scale = 20/dt_max if dt_max else 0

    cmap = {
        1: "#66c2a5",
        2: "#fc8d62",
        3: "#8da0cb"
    }

    nodes_by_id = {}
    for n in nodes:
        g.add_vertex(
            name=str(n['id']),
            primary_wg=n['primary_wg'],
            color=cmap[n['primary_wg']],
            size=scale*n['arscore'],
            title=n['title'],
            ttype="dynamic"
        )
        # First occurrence wins if an id is repeated.
        nodes_by_id.setdefault(n['id'], n)

    labels = []
    labelled_ids = set()

    def _label(node_id):
        # Attach the node's OWN vertex (not the focal one) and queue it once.
        node = nodes_by_id.get(node_id)
        if node is None:
            return
        node['v'] = g.vs.find(name=str(node_id))
        if node_id not in labelled_ids:
            labelled_ids.add(node_id)
            labels.append(node)

    _label(top)

    edge_count = 0
    for l in links:
        ns = (l['source'],l['target'])
        if top in ns and l['score'] > c:
            highlight=0.5
            for n in ns:
                _label(n)
        else:
            highlight=0

        g.add_edge(str(l['source']),str(l['target']),weight=l['score'],highlight=highlight)
        edge_count += 1

    # Without edges the 'weight' attribute does not exist, so lay out unweighted.
    layout = g.layout('fr', weights='weight' if edge_count else None)

    return g, layout, labels

def draw_graph(g, layout, ax, labels):
    """Draw the graph on a matplotlib axes: vertices, edges and text labels.

    Parameters
    ----------
    g : graph
        Its vertices must expose ``color``, ``size`` and ``name`` (an integer
        id stored as a string); its edges ``source``, ``target``, ``weight``
        and ``highlight``.
    layout : sequence of (x, y)
        One coordinate pair per vertex, indexed by vertex position.
    ax : matplotlib axes
        Target of all drawing calls.
    labels : list of dict
        Nodes to annotate; each needs ``id``, ``title`` and ``v`` (a vertex
        whose ``index`` selects its position in ``layout``).

    Returns
    -------
    None
    """
    label_ids = {l['id'] for l in labels}
    xs = [pt[0] for pt in layout]
    ys = [pt[1] for pt in layout]
    colors = [v["color"] for v in g.vs]
    sizes = [v["size"]*5 for v in g.vs]
    # Labelled vertices get a heavy outline so they stand out.
    outlines = [1.5 if int(v['name']) in label_ids else 0.05 for v in g.vs]

    ax.scatter(xs,ys,c=colors,s=sizes,edgecolor="black",linewidths=outlines)

    for e in g.es:
        src = layout[e.source]
        tgt = layout[e.target]
        strength = np.log(e['weight']+1)*0.5
        emphasis = e['highlight']*2
        lw = 0.05+strength+emphasis
        # Highlighted edges would otherwise exceed 1, which matplotlib rejects.
        alpha = max(0.0, min(1.0, 0.2+strength+emphasis))
        ax.plot(
            [src[0],tgt[0]],
            [src[1],tgt[1]],
            c="grey",
            lw=lw,
            alpha=alpha,
            zorder=-1  # keep edges underneath the vertices
        )

    for l in labels:
        xy = layout[l['v'].index]
        ax.text(xy[0],xy[1],l['title'])

    return

def eigenvector_centralization(G,w):
    """Return a graph-level centralization score from eigenvector centrality.

    Each vertex's eigenvector centrality is scaled by ``2/((n-1)(n-2))``,
    where ``n`` is the vertex count; the result is the sum of the gaps
    between the largest scaled value and every scaled value, divided by
    ``n-1``.

    Parameters
    ----------
    G : graph
        Must provide ``vcount()`` and ``eigenvector_centrality(weights=...)``.
    w : str or None
        Passed through as the ``weights`` argument of
        ``G.eigenvector_centrality`` (previously ignored in favour of a
        hard-coded ``"weight"``).

    Raises
    ------
    ValueError
        If the graph has fewer than three vertices.
    """
    vnum = G.vcount()
    if vnum < 3:
        raise ValueError("graph must have at least three vertices")
    denom = (vnum-1)*(vnum-2)

    scaled = [2*score/denom for score in G.eigenvector_centrality(weights=w)]
    peak = max(scaled)
    return sum(peak-s for s in scaled)/(vnum-1)

In [4]:
# Figure: one network panel per period plus a bar chart of mean eigenvector
# centrality per working group. Written to ../plots/ as a PDF.

# Configure matplotlib BEFORE creating the figure, so the figure does not
# depend on whatever rcParams an earlier kernel run left behind.
pgf_with_latex = {
    "text.usetex": True,            # use LaTeX to write all text
    "pgf.rcfonts": False,           # Ignore Matplotlibrc
    "pgf.preamble": r"\usepackage{xcolor}",  # a plain string is accepted by old and new matplotlib
    "pgf.texsystem" : "xelatex",
    "figure.figsize": [12,7]        # default size for figures created by later cells
}
# This key no longer exists in current matplotlib; only set it where it is still known.
if "text.latex.unicode" in matplotlib.rcParams:
    pgf_with_latex["text.latex.unicode"] = True
matplotlib.rcParams.update(pgf_with_latex)

# The 3x3 grid gets an explicit square size instead of relying on rcParams.
fig = plt.figure(figsize=(18,18))

periods = TimePeriod.objects.filter(timedtopic__dtopic__run_id=run_id).distinct()

n_ars = 6

period_titles = []     # panel titles, in plotting order
node_frames = []       # one vertex-attribute frame per period
graph_centrality = []  # one centralization score per period

for i,ar in enumerate(range(1,n_ars+1)):
    print(ar)
    p = periods.get(n=ar)
    ax = fig.add_subplot(3,3,i+1)
    nodes, links = return_corrs(ar,0.0001,run_id)
    g, layout, labels =  generate_graph(nodes, links, top)
    draw_graph(g, layout, ax, labels)
    ax.set_title(p.title)
    ax.grid(False)
    ax.axis('off')
    node_df = pd.DataFrame([v.attributes() for v in g.vs])
    node_df['ar'] = p.title
    node_df['eigen_centrality'] = g.eigenvector_centrality(weights='weight')
    node_frames.append(node_df)
    period_titles.append(p.title)
    graph_centrality.append(eigenvector_centralization(g,'weight'))

# Concatenate once; DataFrame.append in a loop is quadratic and gone in pandas 2.
all_df = pd.concat(node_frames) if node_frames else pd.DataFrame()

# The bar chart goes in the slot right after the last network panel.
ax = fig.add_subplot(3,3,n_ars+1)
ax.set_title("Topic centrality")

ind = np.arange(n_ars)
width=0.25
cmap = {
    1: "#66c2a5",
    2: "#fc8d62",
    3: "#8da0cb"
}

# Offsets start at -1 so the three working-group bars are centred on each tick.
for offset, (name, group) in enumerate(all_df.groupby('primary_wg'), start=-1):
    # reindex keeps one slot per period even when a group has no topics in it,
    # so the heights always line up with `ind`.
    means = group.groupby('ar')['eigen_centrality'].mean().reindex(period_titles)
    ax.bar(
        ind+(offset*width),
        means,
        width,
        color=cmap[name]
    )
ax.set_xticks(ind)
ax.set_xticklabels(period_titles)

fig.patch.set_facecolor('#f0f0f0')
fig.savefig("../plots/network_development_wgs_{}.pdf".format(run_id),facecolor=fig.get_facecolor())

1
2
3
4
5
6


In [5]:
# Stand-alone bar chart: mean eigenvector centrality per working group for the
# first n_ars periods (periods ordered by their title).
n_ars = 5

fig, ax = plt.subplots()

width=0.25

cmap = {
    1: "#66c2a5",
    2: "#fc8d62",
    3: "#8da0cb"
}

# Periods actually shown; may be fewer than n_ars if all_df holds fewer periods.
shown_periods = sorted(all_df['ar'].unique())[:n_ars]
ind = np.arange(len(shown_periods))

# Offsets start at -1 so the three working-group bars are centred on each tick.
for i, (name, group) in enumerate(all_df.groupby('primary_wg'), start=-1):
    # Aggregate per period; slicing the raw rows would plot the first few
    # individual topics instead of one value per period.
    means = group.groupby('ar')['eigen_centrality'].mean().reindex(shown_periods)
    ax.bar(ind+(i*width),means,width,color=cmap[name])

ax.set_xticks(ind)
ax.set_xticklabels(shown_periods)
ax.set_title("Topic centrality")

In [6]:
# Centralization score of each period's graph, in period order (1..n_ars).
graph_centrality

[0.0005454065864564285,
 0.0002567270930028122,
 0.00021888489638552958,
 0.00020592728254674987,
 0.00019689072810077896,
 0.00019575051214540352]

In [7]:
# Use all_df (every period); node_df only holds the last period left over from
# the loop, which makes grouping by 'ar' meaningless.
all_df.groupby(['primary_wg','ar'])['eigen_centrality'].mean().plot.bar()

<matplotlib.axes._subplots.AxesSubplot at 0x7fe93ee79278>