In [37]:
import pandas as pd
import numpy as np

In [2]:
# Only the columns used by the statistics below are read from the CSV.
need_columns = [
    'user_id',
    'problem_id',
    'language',
    'execution_time',
    'length',
    'result',
]
# Rows with any missing value are dropped first; execution_time is then stored
# as a 32-bit integer.
data = (
    pd.read_csv('./submissions.csv', usecols=need_columns)
    .dropna()
    .astype({'execution_time': 'int32'})
)

In [3]:
# Keep only the first 100000 rows so the notebook stays fast on a local machine.
data = data.head(100000)
# Accepted submissions only (result code 'AC').
is_accepted = data['result'] == 'AC'
data_ac = data.loc[is_accepted]

In [4]:
# Column subsets of the accepted submissions: one frame for execution time,
# one for source length, both keyed by problem and language.
list_for_exec = ['problem_id', 'language', 'execution_time']
list_for_leng = ['problem_id', 'language', 'length']
data_exec = data_ac.loc[:, list_for_exec]
data_leng = data_ac.loc[:, list_for_leng]

In [72]:
# Group by (language, problem_id); as_index=False keeps both keys as ordinary
# columns in every aggregate computed from these groupers.
group_prob_lang_exec = data_exec.groupby(
    by=['language', 'problem_id'],
    as_index=False,
)
group_prob_lang_leng = data_leng.groupby(
    by=['language', 'problem_id'],
    as_index=False,
)

In [73]:
# Fastest accepted execution time per (language, problem_id), stored as 'min'.
exec_min_per_group = group_prob_lang_exec.min()
exec_statistics = exec_min_per_group.rename(columns={'execution_time': 'min'})

In [74]:
# 10th / 25th / 50th / 75th percentile of execution time per group.
# interpolation='higher' picks the larger of the two neighbouring observations
# instead of interpolating between them.
(
    exec_statistics_10,
    exec_statistics_25,
    exec_statistics_50,
    exec_statistics_75,
) = (
    group_prob_lang_exec.quantile(q, interpolation='higher')
    for q in (0.1, 0.25, 0.50, 0.75)
)

In [75]:
# Append the percentile borders as columns next to 'min'.  The quantile frames
# come from the same grouper as exec_statistics, so rows line up by index.
exec_statistics = exec_statistics.assign(**{
    '10%': exec_statistics_10['execution_time'],
    '25%': exec_statistics_25['execution_time'],
    '50%': exec_statistics_50['execution_time'],
    '75%': exec_statistics_75['execution_time'],
})

In [9]:
# Export the execution-time borders without the 'min' column and without a
# header row.
# NOTE(review): index=False is not passed, so the row index is written as the
# first CSV field — confirm the consumer expects that.
exec_statistics.drop(columns='min').to_csv('./exec_statistics', header=False)

In [18]:
# Load the problem list from the local JSON file.
problems = pd.read_json('./problems.json')

In [19]:
# Preview the first rows of the problems table.
problems.head()

Unnamed: 0,id,contest_id,title
0,abc001_1,abc001,A. 積雪深差
1,abc001_2,abc001,B. 視程の通報
2,abc001_3,abc001,C. 風力観測
3,abc001_4,abc001,D. 感雨時刻の整理
4,abc002_1,abc002,A. 正直者


In [13]:
# Export the problems table as CSV with neither a header row nor the index.
problems.to_csv('./problems.csv', header=False, index=False)

In [20]:
# Distinct contest ids referenced by the problems table.
contests = problems['contest_id'].unique()

In [21]:
# Numeric type code per contest, chosen from the first three characters of the
# contest id: 'abc' -> 1, 'arc' -> 2, 'agc' -> 3, any other prefix -> 4.
contest_type_by_prefix = {'abc': 1, 'arc': 2, 'agc': 3}
types = [
    contest_type_by_prefix.get(contest[:3], 4)
    for contest in contests
]

In [22]:
# One row per contest: its id and the numeric type code from `types`
# (both sequences are in the same order as `contests`).
contests_data = pd.DataFrame({'id': contests, 'type': types})

In [23]:
# Preview the first rows of the contest table.
contests_data.head()

Unnamed: 0,id,type
0,abc001,1
1,abc002,1
2,abc003,1
3,abc004,1
4,abc005,1


In [18]:
# Export the contest table as CSV with neither a header row nor the index.
contests_data.to_csv('./contests.csv', header=False, index=False)

In [76]:
# Shortest accepted source length per (language, problem_id), stored as 'min'.
length_min_per_group = group_prob_lang_leng.min()
length_statistics = length_min_per_group.rename(columns={'length': 'min'})

In [77]:
# 10th / 25th / 50th / 75th percentile of source length per group.
# interpolation='higher' picks the larger of the two neighbouring observations
# instead of interpolating between them.
(
    length_statistics_10,
    length_statistics_25,
    length_statistics_50,
    length_statistics_75,
) = (
    group_prob_lang_leng.quantile(q, interpolation='higher')
    for q in (0.1, 0.25, 0.50, 0.75)
)

In [78]:
# Append the percentile borders as columns next to 'min'.  The quantile frames
# come from the same grouper as length_statistics, so rows line up by index.
length_statistics = length_statistics.assign(**{
    '10%': length_statistics_10['length'],
    '25%': length_statistics_25['length'],
    '50%': length_statistics_50['length'],
    '75%': length_statistics_75['length'],
})

In [79]:
# Export the source-length borders without the 'min' column and without a
# header row.
# NOTE(review): index=False is not passed, so the row index is written as the
# first CSV field — confirm the consumer expects that.
length_statistics.drop(columns='min').to_csv('./length_statistics', header=False)

In [80]:
# Per-submission fields needed to score each user's accepted solutions.
list_for_user_info = [
    'user_id',
    'problem_id',
    'language',
    'length',
    'execution_time',
]
pre_user_info = data_ac.loc[:, list_for_user_info]

In [81]:
# Preview the accepted submissions that feed the per-user scoring.
pre_user_info.head()

Unnamed: 0,user_id,problem_id,language,length,execution_time
0,i_lohas_MATCH,abc151_c,C++14 (GCC 5.4.1),777,73
1,rniya,abc141_b,C++14 (GCC 5.4.1),1002,1
2,naoki2016,abc067_b,C++14 (GCC 5.4.1),598,1
4,t_kato,dwango2017qual_a,PyPy3 (2.4.0),99,172
5,zoooma13,agc006_b,C++14 (GCC 5.4.1),651,38


In [82]:
# One group per (user, language, problem); the keys stay as regular columns.
group_user_lang_prob = pre_user_info.groupby(
    by=['user_id', 'language', 'problem_id'],
    as_index=False,
)

In [83]:
# Best values per (user, language, problem).  min() is taken per column, so the
# length and execution_time in one row may come from different submissions.
user_info = group_user_lang_prob.min()

In [84]:
# Total number of cells (rows x columns) in user_info — not the row count.
user_info.size

275690

In [85]:
# Preview the per-(user, language, problem) minima.
user_info.head()

Unnamed: 0,user_id,language,problem_id,length,execution_time
0,714zcy,C++ (GCC 9.2.1),abc164_a,311,2
1,714zcy,C++ (GCC 9.2.1),abc164_c,369,195
2,A0iro,C++14 (GCC 5.4.1),abc137_c,574,168
3,A1phamath,Python3 (3.4.3),abc135_a,218,17
4,A1phamath,Python3 (3.4.3),abc139_d,42,17


In [19]:
def get_score(value, border):
    """Grade ``value`` from 5 (best) to 1 against percentile thresholds.

    Parameters
    ----------
    value : number
        The measurement to grade; smaller values receive higher scores.
    border : mapping
        Thresholds looked up under the keys ``'10%'``, ``'25%'``, ``'50%'``
        and ``'75%'``.  Keys are read in that order and only until a
        threshold matches.

    Returns
    -------
    int
        5 if ``value <= border['10%']``, 4 if ``value <= border['25%']``,
        3 if ``value <= border['50%']``, 2 if ``value <= border['75%']``,
        otherwise 1.
    """
    graded_thresholds = (('10%', 5), ('25%', 4), ('50%', 3), ('75%', 2))
    for label, score in graded_thresholds:
        # Thresholds are inclusive: a value equal to the border earns the score.
        if value <= border[label]:
            return score
    return 1

In [86]:
# Preview the source-length borders per (language, problem_id).
length_statistics.head()

Unnamed: 0,language,problem_id,min,10%,25%,50%,75%
0,Awk (mawk 1.3.3),abc010_2,36,37,37,37,37
1,Awk (mawk 1.3.3),abc015_1,36,36,36,36,36
2,Awk (mawk 1.3.3),abc025_b,83,85,85,85,85
3,Awk (mawk 1.3.3),abc026_b,67,67,67,67,67
4,Awk (mawk 1.3.3),abc027_b,68,69,69,72,75


In [87]:
# Composite string key: language immediately followed by problem_id, with no
# separator between the two parts.
# NOTE(review): no later cell visible here reads the 'keys' column.
length_statistics['keys'] = length_statistics['language'] + length_statistics['problem_id']

In [88]:
# Preview again to check the newly added 'keys' column.
length_statistics.head()

Unnamed: 0,language,problem_id,min,10%,25%,50%,75%,keys
0,Awk (mawk 1.3.3),abc010_2,36,37,37,37,37,Awk (mawk 1.3.3)abc010_2
1,Awk (mawk 1.3.3),abc015_1,36,36,36,36,36,Awk (mawk 1.3.3)abc015_1
2,Awk (mawk 1.3.3),abc025_b,83,85,85,85,85,Awk (mawk 1.3.3)abc025_b
3,Awk (mawk 1.3.3),abc026_b,67,67,67,67,67,Awk (mawk 1.3.3)abc026_b
4,Awk (mawk 1.3.3),abc027_b,68,69,69,72,75,Awk (mawk 1.3.3)abc027_b


In [31]:
# Dict snapshots of both statistics tables, keyed by row index:
# {row index -> {column name -> value}}.
# NOTE(review): no later cell visible here reads length_dict or exec_dict.
length_dict = length_statistics.to_dict(orient='index')
exec_dict = exec_statistics.to_dict(orient='index')

In [47]:
# Languages of user_info as a column vector (one column, one row per entry).
lang = user_info['language'].values.reshape([-1, 1])

In [48]:
# Problem ids of user_info as a column vector (one column, one row per entry).
prob = user_info['problem_id'].values.reshape([-1, 1])

In [62]:
# Place the two column vectors side by side: one [language, problem_id] pair
# per row.
keys = np.hstack([lang, prob])

In [63]:
# Inspect the stacked [language, problem_id] pairs.
keys

array([['C++ (GCC 9.2.1)', 'abc164_a'],
       ['C++ (GCC 9.2.1)', 'abc164_c'],
       ['C++14 (GCC 5.4.1)', 'abc137_c'],
       ...,
       ['PyPy3 (2.4.0)', 'abc139_a'],
       ['PyPy3 (2.4.0)', 'abc139_b'],
       ['Python3 (3.4.3)', 'arc080_a']], dtype=object)

In [64]:
# Rebuild the array from a tuple of row tuples.  In the recorded outputs the
# dtype changes from object (before) to a fixed-width unicode dtype (after).
keys = np.array(tuple(map(tuple, keys)))

In [65]:
# Inspect the rebuilt key array.
keys

array([['C++ (GCC 9.2.1)', 'abc164_a'],
       ['C++ (GCC 9.2.1)', 'abc164_c'],
       ['C++14 (GCC 5.4.1)', 'abc137_c'],
       ...,
       ['PyPy3 (2.4.0)', 'abc139_a'],
       ['PyPy3 (2.4.0)', 'abc139_b'],
       ['Python3 (3.4.3)', 'arc080_a']], dtype='<U37')

In [43]:
# Force a two-column layout.
# NOTE(review): this cell's execution count (43) predates the cells that build
# `keys` above (62-65), and the output recorded below pairs language with
# language — it appears to come from an earlier, differently shaped `keys`.
keys = keys.reshape([-1, 2])

In [44]:
# Inspect the reshaped key array.
keys

array([['C++ (GCC 9.2.1)', 'C++ (GCC 9.2.1)'],
       ['C++14 (GCC 5.4.1)', 'Python3 (3.4.3)'],
       ['Python3 (3.4.3)', 'Python3 (3.4.3)'],
       ...,
       ['abc140_d', 'abc143_a'],
       ['agc009_c', 'abc139_a'],
       ['abc139_b', 'arc080_a']], dtype=object)

In [29]:
# Score every (user, language, problem) row of user_info against the
# percentile borders of its (language, problem_id) group.
#
# The statistics frames are built with as_index=False, so they carry a plain
# positional row index and `statistics.loc[language, problem_id]` cannot find a
# group by its keys (it treats `language` as a row label and `problem_id` as a
# column label).  The borders are therefore aligned to user_info with a
# key-based left merge, which keeps the row order of user_info.
score_keys = ['language', 'problem_id']
border_labels = ['10%', '25%', '50%', '75%']


def _borders_per_row(statistics):
    """Return one border dict per user_info row, in user_info row order.

    Parameters
    ----------
    statistics : pandas.DataFrame
        Frame with 'language', 'problem_id' and the four percentile columns,
        holding at most one row per (language, problem_id).

    Returns
    -------
    list of dict
        ``{'10%': ..., '25%': ..., '50%': ..., '75%': ...}`` for each row.

    Raises
    ------
    KeyError
        If a (language, problem_id) pair of user_info has no borders.
    """
    aligned = user_info[score_keys].merge(
        statistics[score_keys + border_labels],
        on=score_keys,
        how='left',
        # Several user rows may share one statistics row, never the reverse.
        validate='many_to_one',
    )
    unmatched = aligned[border_labels].isna().any(axis=1)
    if unmatched.any():
        examples = (
            aligned.loc[unmatched, score_keys]
            .drop_duplicates()
            .head()
            .values
            .tolist()
        )
        raise KeyError(
            'no statistics for (language, problem_id) pairs, e.g. {}'.format(examples)
        )
    return aligned[border_labels].to_dict(orient='records')


# get_score stays the single place where the thresholds are interpreted.
length_scores = [
    get_score(length, border)
    for length, border in zip(user_info['length'], _borders_per_row(length_statistics))
]
exec_scores = [
    get_score(exec_time, border)
    for exec_time, border in zip(user_info['execution_time'], _borders_per_row(exec_statistics))
]

In [30]:
# Attach the length scores as a new column; the list is assigned by position,
# so it must be in the same row order as user_info.
user_info['length_score'] = length_scores

In [31]:
# Attach the execution-time scores as a new column; the list is assigned by
# position, so it must be in the same row order as user_info.
user_info['exec_score'] = exec_scores

In [32]:
# Preview user_info with the two score columns attached.
user_info.head()

Unnamed: 0,user_id,language,problem_id,length,execution_time,length_score,exec_score
0,714zcy,C++ (GCC 9.2.1),abc164_a,311,2,2,5
1,714zcy,C++ (GCC 9.2.1),abc164_c,369,195,3,1
2,A0iro,C++14 (GCC 5.4.1),abc137_c,574,168,3,2
3,A1phamath,Python3 (3.4.3),abc135_a,218,17,1,5
4,A1phamath,Python3 (3.4.3),abc139_d,42,17,3,5


In [33]:
# Keep only the grouping keys and the two scores for the per-user roll-up.
status_columns = ['user_id', 'language', 'length_score', 'exec_score']
user_status = user_info[status_columns]

In [34]:
# Preview the per-problem scores that will be summed per user and language.
user_status.head()

Unnamed: 0,user_id,language,length_score,exec_score
0,714zcy,C++ (GCC 9.2.1),2,5
1,714zcy,C++ (GCC 9.2.1),3,1
2,A0iro,C++14 (GCC 5.4.1),3,2
3,A1phamath,Python3 (3.4.3),1,5
4,A1phamath,Python3 (3.4.3),3,5


In [35]:
# Roll the per-problem scores up to one row per (user, language): the summed
# scores, and the number of scored rows in each group.
user_status_by_user_lang = user_status.groupby(
    by=['user_id', 'language'],
    as_index=False,
)
user_scores = user_status_by_user_lang.sum()
user_ac_count = user_status_by_user_lang.count()

In [36]:
# Number of scored rows per (user, language).  `.values` drops the index, so
# the counts are assigned by position; user_scores and user_ac_count are both
# aggregates of user_status grouped by the same keys.
user_scores['ac_count'] = user_ac_count['length_score'].values

In [37]:
# Mean score per scored row: summed score divided by the row count of the group.
ac_counts = user_scores['ac_count']
user_scores['length_ave'] = user_scores['length_score'].div(ac_counts)
user_scores['exec_ave'] = user_scores['exec_score'].div(ac_counts)

In [38]:
# Preview the per-(user, language) totals, counts and averages.
user_scores.head()

Unnamed: 0,user_id,language,length_score,exec_score,ac_count,length_ave,exec_ave
0,714zcy,C++ (GCC 9.2.1),5,6,2,2.5,3.0
1,A0iro,C++14 (GCC 5.4.1),3,2,1,3.0,2.0
2,A1phamath,Python3 (3.4.3),4,10,2,2.0,5.0
3,A2412,Python3 (3.4.3),3,10,2,1.5,5.0
4,A7075,C++14 (GCC 5.4.1),19,20,4,4.75,5.0


In [44]:
# Export the totals and counts only; the averages are left out of the file.
# The file has neither a header row nor an index column.
user_rankings = user_scores.drop(columns=['length_ave', 'exec_ave'])
user_rankings.to_csv('./user_rankings.csv', index=False, header=False)