In [1]:
%matplotlib qt4
from __future__ import division

from collections import OrderedDict, defaultdict

from models import tools, optimize, models, filters
from models.tests import PerformanceTest

import scipy as sp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx

In [54]:
# Load one million answers starting at offset one million, then keep only
# the rows whose ``is_correct`` flag equals 0.
data = tools.load_data(offset=1000000, limit=1000000)
data = data.loc[data['is_correct'] == 0]
# Optional restriction (disabled):
# data = data[filters.european_countries(data)]
print(len(data))

214743


In [3]:
# Place table as a nested dict: {row label: {column name: value}}.
places = tools.load_places().T.to_dict()


def n(v):
    """Return the name of place ``v`` as resolved by ``tools.to_place_name``."""
    return tools.to_place_name(v, places=places)

In [261]:
# Build a PFA model wrapped around an Elo model and train it on ``data``.
# Later cells read ``pfa.prior.places`` and ``pfa.items`` from the trained
# model.
# NOTE(review): presumably ``elo`` serves as the prior of the PFA model and
# is trained as part of ``pfa.train`` — confirm against ``models``.
elo = models.EloModel()
pfa = models.PFAModel(elo)
pfa.train(data)

In [262]:
# Pairwise Spearman rank correlation of users' knowledge between places.
# ``correlations[a, b]`` holds the ``(correlation, pvalue)`` pair for every
# ordered pair of places in ``pfa.prior.places`` (including ``a == b``).

# Ids of the users that have an item for each place.  Collected in a single
# pass over ``pfa.items`` rather than re-scanning every item once per place.
place_users = {place_id: set() for place_id in pfa.prior.places}
for index, item in pfa.items.items():
    # ``index[1]`` is the place part of the item key.
    if index[1] in place_users:
        place_users[index[1]].add(item.user.id)

correlations = {}
place_ids = list(pfa.prior.places)
for i, place_i in enumerate(place_ids):
    # The coefficient is symmetric, so each unordered pair is computed once
    # and stored under both key orders.
    for place_j in place_ids[i:]:
        shared_users = place_users[place_i] & place_users[place_j]
        knowledge_i = [pfa.items[user_id, place_i].knowledge
                       for user_id in shared_users]
        knowledge_j = [pfa.items[user_id, place_j].knowledge
                       for user_id in shared_users]
        if len(shared_users) < 2:
            # A rank correlation is undefined for fewer than two
            # observations; record NaN instead of handing scipy an empty or
            # single-row sample.
            correlation = (np.nan, np.nan)
        else:
            correlation = sp.stats.spearmanr(knowledge_i, knowledge_j)
        correlations[place_i, place_j] = correlation
        correlations[place_j, place_i] = correlation
    tools.echo('{}/{}'.format(i + 1, len(place_users)))

45/45


In [263]:
# Turn strong, statistically significant correlations into weighted edges.
edges = OrderedDict()
min_c = 0.8  # correlations at or below this threshold produce no edge

# Upper bound used to rescale edge weights.  NaN coefficients (returned for
# degenerate samples) are dropped first: ``max`` compares with ``>``, so a
# NaN could otherwise win or lose depending on iteration order and turn every
# weight into NaN.
# NOTE(review): self-correlations (place with itself) are still included
# here, as in the original, so ``max_c`` is normally 1.0 — confirm whether
# the maximum over distinct pairs was intended.
finite_correlations = [
    correlation for correlation, pvalue in correlations.values()
    if not np.isnan(correlation)
]
# With nothing finite no pair can exceed ``min_c``, so the fallback value is
# never used as a divisor.
max_c = max(finite_correlations) if finite_correlations else min_c

for (v1, v2), (correlation, pvalue) in correlations.items():
    # Skip self-pairs and pairs already stored in the opposite orientation.
    if v1 == v2 or (v2, v1) in edges:
        continue
    if pvalue < 0.05 and correlation > min_c:
        # Map (min_c, max_c] linearly onto (0, 1].
        edges[v1, v2] = (correlation - min_c) / (max_c - min_c)
nodes = list({e[0] for e in edges} | {e[1] for e in edges})

In [264]:
# Reload the place table as {row label: {column name: value}} and rebind
# ``n`` to a plain lookup of the English name.
places = tools.load_places().T.to_dict()


def n(x):
    """Return the ``name_en`` entry of place ``x`` decoded from UTF-8."""
    encoded_name = places[x]['name_en']
    return encoded_name.decode('utf-8')

In [265]:
# Undirected graph of place names; each edge carries its rescaled
# correlation in the ``weight`` attribute.
G = nx.Graph()
G.add_weighted_edges_from(
    (n(v1), n(v2), weight) for (v1, v2), weight in edges.items()
)
# Every name in ``nodes`` is registered explicitly as a node as well.
G.add_nodes_from(n(v) for v in nodes)

In [267]:
# Draw the correlation graph with labels only (zero-size nodes) and edges
# coloured by weight.
nx.draw_networkx(
    G,
    pos=nx.spring_layout(G, iterations=20),
    # The networkx keyword is ``with_labels``; ``write_labels`` is not a
    # recognised argument.
    with_labels=True,
    node_size=0,
    font_size=8,
    # Colours must follow the order of ``G.edges()``; the insertion order of
    # the ``edges`` dict is not guaranteed to match it, so read the weights
    # back from the graph.
    edge_color=[G[e[0]][e[1]]['weight'] for e in G.edges()],
    edge_cmap=plt.cm.PuBu,
)

In [71]:
# For every ``place_id``, collect the list of ``place_answered`` values found
# in the rows selected by ``filters.world_countries``.
d = data[filters.world_countries(data)]
places_answered = defaultdict(list)
# Iterate only the two columns that are needed; transposing the whole frame
# to walk its rows copies all data and forces an object dtype.
for place_id, place_answered in zip(d['place_id'], d['place_answered']):
    # Rows without a finite answer (e.g. NaN) are skipped.
    if np.isfinite(place_answered):
        places_answered[int(place_id)].append(int(place_answered))

In [83]:
# Graph linking each place to the places given as answers for it.  An edge's
# weight is the share of the place's recorded answers that named the other
# place; when the opposite direction already produced an edge, both shares
# are summed.  Edges whose weight does not exceed 0.25 are not added.
# NOTE(review): a direction at or below 0.25 is dropped before its reverse
# is seen, so the sum depends on iteration order — confirm whether shares
# should be accumulated for both directions before thresholding.
G = nx.Graph()

for v1, answers in places_answered.iteritems():
    # Count each answered place once instead of calling ``list.count`` for
    # every distinct value.
    answer_counts = defaultdict(int)
    for answered in answers:
        answer_counts[answered] += 1
    for v2, count in answer_counts.iteritems():
        weight = count / len(answers)
        e = (n(v1), n(v2))
        # ``has_edge`` is orientation-independent for an undirected graph,
        # unlike a tuple lookup in ``G.edges()``, which reports each edge in
        # only one orientation.
        if G.has_edge(e[0], e[1]):
            weight += G[e[0]][e[1]]['weight']
        if weight <= 0.25:
            continue
        G.add_edge(e[0], e[1], weight=weight)
    # Keep the place in the graph even if none of its edges passed.
    G.add_node(n(v1))

In [84]:
# Draw the graph with labels only (zero-size nodes) and edges coloured by
# their ``weight`` attribute, listed in ``G.edges()`` order.
nx.draw_networkx(
    G,
    pos=nx.spring_layout(G, iterations=10),
    # The networkx keyword is ``with_labels``; ``write_labels`` is not a
    # recognised argument.
    with_labels=True,
    node_size=0,
    font_size=8,
    edge_color=[G[e[0]][e[1]]['weight'] for e in G.edges()],
    edge_cmap=plt.cm.YlOrRd,
)