In [1]:
import pickle
import numpy as np
from tqdm.notebook import tqdm

In [2]:
with open('../data/raw/clusters.pkl', 'rb') as f:
    contents = pickle.load(f)

In [3]:
results = {}
for split, cluster_ids in contents.items():
    num_clusters = np.max(cluster_ids)
    cluster_sizes, _ = np.histogram(cluster_ids, num_clusters)
    
    results[split] = {
        'cluster_ids': cluster_ids,
        'num_clusters': num_clusters,
        'cluster_sizes': cluster_sizes
    }
    
    sorted_inds = np.argsort(cluster_sizes)
    sorted_cluster_sizes = cluster_sizes[sorted_inds]
    uniqueness_scores = np.cumsum(sorted_cluster_sizes[::-1])[::-1]
    
    num_players = len(cluster_ids)
    uniqueness_percentiles = {}
    for size in tqdm(sorted_cluster_sizes):
        if size in uniqueness_percentiles:
            continue

        keep_inds = (sorted_cluster_sizes == size).nonzero()[0]
        num_less_unique_players = uniqueness_scores[keep_inds[0]]
        uniqueness_percentile = num_less_unique_players / num_players
        uniqueness_percentiles[size] = uniqueness_percentile

    uniqueness = {}
    for i, cluster_size in enumerate(sorted_cluster_sizes):
        cluster_id = sorted_inds[i] + 1
        uniqueness[cluster_id] = uniqueness_percentiles[cluster_size]
    uniqueness = sorted(uniqueness.items())
    uniqueness = [k[1] for k in uniqueness]
    
    results[split]['percent_uniqueness'] = np.array(uniqueness)

  0%|          | 0/9044 [00:00<?, ?it/s]

  0%|          | 0/2897 [00:00<?, ?it/s]

  0%|          | 0/3764 [00:00<?, ?it/s]

In [4]:
results

{'all': {'cluster_ids': array([   1,    1,    1, ..., 7933, 4333, 7859]),
  'num_clusters': 9044,
  'cluster_sizes': array([123100,   3018,     30, ...,      1,      3,      1]),
  'percent_uniqueness': array([0.0635108 , 0.43210819, 0.97607949, ..., 1.        , 0.99938243,
         1.        ])},
 'cb': {'cluster_ids': array([   1,    1,    1, ..., 1098, 1410, 1148]),
  'num_clusters': 2897,
  'cluster_sizes': array([126118,  83751,   6708, ...,      1,      1,      1]),
  'percent_uniqueness': array([0.06506787, 0.24746099, 0.6394609 , ..., 1.        , 1.        ,
         1.        ])},
 'noncb': {'cluster_ids': array([   1,    1,    1, ..., 3079, 3079, 2378]),
  'num_clusters': 3764,
  'cluster_sizes': array([63910, 83106, 31963, ...,     1,     1,     1]),
  'percent_uniqueness': array([0.11185279, 0.04287676, 0.19844365, ..., 1.        , 1.        ,
         1.        ])}}

In [5]:
with open('../data/processed/stats.pkl', 'rb') as f:
    stats = pickle.load(f)

In [6]:
players = stats['usernames']
skills = np.array(stats['stats'][:, 4::3], dtype='float32')
skills[skills < 0] = np.nan

In [7]:
for i in range(23):
    print(np.sum(np.isnan(skills[:, i])))

48
51
58
42
54
55
51
40
49
55
51
44
51
43
34
31
49
33
70
13
14
13
11


In [8]:
for split, result in results.items():
    num_clusters = result['num_clusters']
    
    if split == 'all':
        dataset = skills
        cluster_centroids = np.zeros((num_clusters, 23, 3))
    elif split == 'cb':
        dataset = skills[:, :7]
        cluster_centroids = np.zeros((num_clusters, 7, 3))
    else:
        dataset = skills[:, 7:]
        cluster_centroids = np.zeros((num_clusters, 16, 3))
    
    for i in tqdm(range(num_clusters)):
        keep_inds = result['cluster_ids'] == i+1
        cluster_rows = dataset[keep_inds]
        cluster_centroids[i, :, 0] = np.nanpercentile(cluster_rows, axis=0, q=5)
        cluster_centroids[i, :, 1] = np.nanpercentile(cluster_rows, axis=0, q=50)
        cluster_centroids[i, :, 2] = np.nanpercentile(cluster_rows, axis=0, q=95)
    
    results[split]['cluster_percentiles'] = {
        5: cluster_centroids[:, :, 0],
        50: cluster_centroids[:, :, 1],
        95: cluster_centroids[:, :, 2]
    }

  0%|          | 0/9044 [00:00<?, ?it/s]

  0%|          | 0/2897 [00:00<?, ?it/s]

  result = np.apply_along_axis(_nanquantile_1d, axis, a, q,


  0%|          | 0/3764 [00:00<?, ?it/s]

In [9]:
results

{'all': {'cluster_ids': array([   1,    1,    1, ..., 7933, 4333, 7859]),
  'num_clusters': 9044,
  'cluster_sizes': array([123100,   3018,     30, ...,      1,      3,      1]),
  'percent_uniqueness': array([0.0635108 , 0.43210819, 0.97607949, ..., 1.        , 0.99938243,
         1.        ]),
  'cluster_percentiles': {5: array([[87.  , 86.  , 92.  , ..., 62.  , 70.  , 74.  ],
          [50.  ,  1.  , 88.  , ..., 45.  , 65.  , 69.  ],
          [52.45, 48.  ,  1.  , ..., 63.9 , 76.7 , 67.25],
          ...,
          [42.  , 40.  , 73.  , ...,  1.  , 99.  ,  1.  ],
          [ 1.  ,  1.  , 84.1 , ...,  1.8 ,  1.8 ,  9.9 ],
          [ 1.  ,  1.  , 69.  , ...,  1.  ,  1.  ,  1.  ]]),
   50: array([[99. , 99. , 99. , ..., 79. , 81. , 84. ],
          [75. ,  1. , 99. , ..., 75. , 84. , 84. ],
          [61. , 68.5,  1. , ..., 88. , 90. , 84.5],
          ...,
          [42. , 40. , 73. , ...,  1. , 99. ,  1. ],
          [ 1. ,  1. , 94. , ...,  9. ,  9. , 36. ],
          [ 1. ,  1. 

In [10]:
for split in ['all', 'cb', 'noncb']:
    for percentile in [5, 50, 95]:
        replace_rows, replace_cols = np.isnan(results[split]['cluster_percentiles'][50]).nonzero()
        for i, j in zip(replace_rows, replace_cols):
            results[split]['cluster_percentiles'][percentile][i, j] = 1
            print("replaced {} cluster_percentiles {} row: {} col: {} with 1"
                  .format(split, percentile, i, j))

replaced cb cluster_percentiles 5 row: 2158 col: 0 with 1
replaced cb cluster_percentiles 5 row: 2800 col: 0 with 1
replaced cb cluster_percentiles 50 row: 2158 col: 0 with 1
replaced cb cluster_percentiles 50 row: 2800 col: 0 with 1
