In [1]:
import glob
import re
import dill
import pandas as pd
import numpy as np

In [2]:
import networkx as nx
from itertools import combinations

In [36]:
G = nx.Graph()

In [38]:
for j, file in enumerate(glob.iglob(r'C:\data\results\races\*.pkd')):
    if j % 1000 == 1:
        print(j)
    json = dill.load(open(file, 'rb'))

    df_race = pd.read_json(json)
    if df_race.empty:
        continue
    
    # drop duplicate columns
    columns = [str(i) for i in range(28)]  # Each column repeated for some reason
    df_race = df_race.drop(columns=columns)

    race_id = re.findall('(\d+)', file)[0]
    df_race['race_id'] = int(race_id)
    
    for group in df_race.groupby('RaceCategoryID').groups.values():
        for name1, name2 in combinations(df_race.iloc[group]['RacerID'], 2):
            if G.has_edge(name1, name2):
                G[name1][name2]['weight'] += 1
            else:
                G.add_edge(name1, name2, weight=1)

1
1001
2001
3001
4001
5001
6001
7001
8001
9001
10001
11001
12001


In [39]:
G.number_of_nodes(), G.number_of_edges()

(175846, 20072320)

In [41]:
# nx.readwrite.gpickle.write_gpickle(G, r'C:\data\results\races\graph.gz')

In [3]:
G = nx.readwrite.gpickle.read_gpickle(r'C:/data/results/races/graph.gz')

In [25]:
sorted(list(G.neighbors(177974))) == sorted(list(set(G.neighbors(177974))))

True

In [4]:
import heapq  # Heaps are efficient structures for tracking the largest
              # elements in a collection.  Use introspection to find the
              # function you need.
import pandas as pd
        
degree = heapq.nlargest(100, ((node, G.degree(node)) for node in G.nodes), key=lambda x: G.degree(x[0]))
s = pd.Series([d[1] for d in degree])
s.describe()

count      100.000000
mean      4292.520000
std       3183.623859
min       3483.000000
25%       3587.250000
50%       3749.500000
75%       4170.250000
max      35054.000000
dtype: float64

In [12]:
degree[:5]

[(3288, 35054), (351, 8511), (5694, 6438), (6335, 5837), (56429, 5309)]

In [13]:
weights = [(key, value['weight']) for key, value in G.edges.items()]

In [15]:
opponents = heapq.nlargest(100, weights, key=lambda x: x[1])
s = pd.Series([d[1] for d in opponents])

In [17]:
opponents

[((3288, 3288), 6737),
 ((61706, 61706), 5348),
 ((351, 351), 2793),
 ((832, 832), 582),
 ((5694, 5694), 282),
 ((39517, 39517), 274),
 ((15262, 15264), 219),
 ((73927, 68489), 207),
 ((15262, 16067), 202),
 ((37514, 52762), 200),
 ((5134, 24216), 196),
 ((45530, 20299), 189),
 ((37274, 126123), 182),
 ((75849, 126123), 180),
 ((43342, 24220), 179),
 ((15705, 15696), 177),
 ((44773, 126123), 176),
 ((45508, 20299), 175),
 ((44710, 45756), 174),
 ((6743, 6747), 171),
 ((21239, 118975), 169),
 ((42428, 43333), 169),
 ((75849, 44773), 169),
 ((37514, 50326), 165),
 ((44526, 44511), 165),
 ((42456, 39256), 164),
 ((44526, 75574), 164),
 ((75574, 126123), 163),
 ((29350, 126123), 161),
 ((37274, 75849), 161),
 ((45783, 44511), 159),
 ((44526, 44733), 159),
 ((75574, 56135), 159),
 ((45530, 45508), 158),
 ((24426, 5787), 158),
 ((44511, 44733), 158),
 ((5996, 5134), 157),
 ((45783, 44733), 157),
 ((20299, 78791), 157),
 ((15262, 13829), 156),
 ((37274, 44617), 156),
 ((44733, 19825), 156),
 