In [1]:
%matplotlib inline
import ray
import time
import pickle
from pathlib import Path
from tqdm import tqdm
import json
import numpy as np
from matplotlib import pyplot as plt

In [2]:
# Tear down any Ray runtime left over from a previous run of this cell so
# that re-executing it does not fail on an already-initialised session.
ray.shutdown()

# NOTE(review): '0.0.0.0' binds the dashboard on every network interface --
# confirm that exposing it beyond localhost is intended on this machine.
ray.init(
    dashboard_host='0.0.0.0',
    dashboard_port=8999,
)

2020-11-21 11:18:40,680	INFO services.py:1092 -- View the Ray dashboard at http://169.229.48.125:8999


{'node_ip_address': '169.229.48.125',
 'raylet_ip_address': '169.229.48.125',
 'redis_address': '169.229.48.125:6379',
 'object_store_address': '/tmp/ray/session_2020-11-21_11-18-40_166968_128289/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2020-11-21_11-18-40_166968_128289/sockets/raylet',
 'webui_url': '169.229.48.125:8999',
 'session_dir': '/tmp/ray/session_2020-11-21_11-18-40_166968_128289',
 'metrics_export_port': 65021,
 'node_id': '1972d1bea4005ceca5bf563f73ac9bd92158f125'}

In [3]:
# Locations of the HackerRank data on disk (relative to the notebook).
base_path = Path('../data/hackerrank/')
challenge_list_path = Path('../data/hackerrank/algorithms_challenge_index.json')

# Paths to every <sub-directory>/challenge_data.json that actually exists;
# plain files and directories without that file are skipped.
dirs = [
    entry / 'challenge_data.json'
    for entry in tqdm(list(base_path.iterdir()))
    if entry.is_dir() and (entry / 'challenge_data.json').exists()
]

# Metadata fields of the challenge index that are kept; everything else is dropped.
extract_fields = ['slug', 'max_score', 'success_ratio', 'preview', 'difficulty_name', 'tag_names']

with challenge_list_path.open('r') as f:
    challenge_list = json.load(f)

# Trim every index entry down to the selected fields (original key order is preserved).
challenge_list = [
    {field: value for field, value in challenge.items() if field in extract_fields}
    for challenge in tqdm(challenge_list)
]

100%|██████████| 444/444 [00:00<00:00, 1297.09it/s]
100%|██████████| 442/442 [00:00<00:00, 91185.01it/s]


In [4]:
# Map each challenge directory name to the parsed contents of its
# challenge_data.json file.
data = {}
for index in tqdm(dirs, desc="Loading data"):
    challenge_name = index.parent.name  # the directory holding the JSON file
    with index.open('r') as f:
        parsed = json.load(f)
    data[challenge_name] = parsed

Loading data: 100%|██████████| 437/437 [00:00<00:00, 467.45it/s]


In [5]:
def fn(tup):
    """Compute the pairwise normalised edit-distance matrix for one challenge.

    Parameters
    ----------
    tup : tuple
        ``(challenge, submissions)``. ``submissions`` is either ``None`` or a
        sequence of mappings, each with a ``'src'`` entry that is a string or
        ``None``.

    Returns
    -------
    tuple
        ``(challenge, dist_mat)``. ``dist_mat`` is ``None`` when
        ``submissions`` is ``None``; otherwise it is a symmetric ``(N, N)``
        float array where entry ``[i, j]`` is the edit distance between
        sources ``i`` and ``j`` divided by the length of the longer source.
        Pairs in which either source is ``None`` keep the value ``0``, as do
        pairs of two empty sources (they are identical).
    """
    challenge, submissions = tup
    if submissions is None:
        return challenge, None

    # Kept function-local, as in the original -- presumably so that the Ray
    # worker executing this function resolves the modules itself.
    import editdistance
    import numpy as np

    N = len(submissions)
    # np.zeros already provides the zero diagonal and the default for skipped pairs.
    dist_mat = np.zeros((N, N))
    for i in range(N):
        a = submissions[i]['src']
        if a is None:
            continue
        # Only the upper triangle is computed; the result is mirrored below.
        for j in range(i + 1, N):
            b = submissions[j]['src']
            if b is None:
                continue
            longest = max(len(a), len(b))
            if longest == 0:
                # Both sources are empty: distance is 0 and normalising would
                # divide by zero, so leave the pre-filled 0 in place.
                continue
            dist = float(editdistance.distance(a, b)) / longest
            dist_mat[i, j] = dist
            dist_mat[j, i] = dist
    return challenge, dist_mat

# Spread the (challenge, submissions) pairs over 36 shards and apply fn to
# each item, with max_concurrency=4 on the for_each stage.
data_iter = ray.util.iter.from_items(list(data.items()), num_shards=36)
vecs = data_iter.for_each(fn, max_concurrency=4)

# Collect results as they arrive. Filling the dict inside the loop means the
# pairs gathered so far are kept if the run is interrupted.
distance_map = {}
gathered = vecs.gather_async(batch_ms=1000)
for challenge_name, matrix in tqdm(gathered, total=len(data)):
    distance_map[challenge_name] = matrix

100%|██████████| 437/437 [38:16<00:00,  5.26s/it] 


In [55]:
# Collapse the five difficulty labels into three buckets.
difficulty_mapping = dict(easy='easy', medium='medium', hard='hard', advanced='hard', expert='hard')
challenge_by_difficulty = dict(easy={}, medium={}, hard={})
# Upper bound on the number of full-score submissions kept per challenge.
max_submissions_per_challenge = 10

for challenge in tqdm(challenge_list):
    key = challenge['slug']
    dist_mat = distance_map.get(key)
    # Skip challenges with no entry at all, and those for which fn() returned
    # None (no submissions) -- there is no matrix to rank by.
    if dist_mat is None:
        continue
    submissions = data[key]

    # Row-wise L2 norm of the distance matrix: a larger value means the
    # submission is, overall, further from all the others.
    norms = np.sum(np.abs(dist_mat) ** 2, axis=-1) ** (1. / 2)
    idxs = np.flip(norms.argsort())  # most dissimilar first

    # Keep only submissions that reached the challenge's maximum score.
    correct_idxs = [idx for idx in idxs if submissions[idx]['score'] >= challenge['max_score']]
    top_correct_idxs = correct_idxs[:max_submissions_per_challenge]
    if not top_correct_idxs:
        continue

    out_dict = challenge.copy()
    out_dict['srcs'] = [submissions[idx]['src'] for idx in top_correct_idxs]
    out_dict['hacker_ids'] = [submissions[idx]['hacker_id'] for idx in top_correct_idxs]
    # Sub-matrix restricted to the selected submissions, in ranking order,
    # converted to nested lists of plain floats so json can serialise it.
    out_dict['dissimilarity_matrix'] = dist_mat[np.ix_(top_correct_idxs, top_correct_idxs)].tolist()

    # Direct indexing so that an unmapped difficulty label raises a KeyError
    # naming that label instead of an opaque `KeyError: None`.
    bucket = difficulty_mapping[challenge['difficulty_name'].lower()]
    challenge_by_difficulty[bucket][key] = out_dict

# Number of challenges that ended up in each bucket.
for k, v in challenge_by_difficulty.items():
    print(k, len(v))

100%|██████████| 442/442 [00:00<00:00, 4552.20it/s]

easy 117
medium 134
hard 91





In [58]:
# Persist the per-difficulty challenge dictionaries as one JSON file.
output_path = Path('../data/hackerrank/full_data.json')
with output_path.open('w') as out_file:
    json.dump(challenge_by_difficulty, out_file)