In [None]:
from data import df_columns, data

In [None]:
import matplotlib.pyplot as plt

def create_bar_chart(x, y, xlabel, ylabel, title="", ylim_above=10, text_space_above_rect=2, do_show=True, ylim=None):
    """Draw an annotated bar chart on the current matplotlib axes.

    Parameters:
        x: bar positions / category labels.
        y: bar heights, one per entry of ``x``.
        xlabel, ylabel: axis labels.
        title: chart title.
        ylim_above: head-room added above the tallest bar when ``ylim`` is
            not given.
        text_space_above_rect: vertical offset of the value label drawn
            above each bar.
        do_show: call ``plt.show()`` when True; pass False when the chart is
            one panel of a larger figure.
        ylim: explicit upper limit of the y axis; overrides the automatic
            ``max(y) + ylim_above`` limit.
    """
    plt.bar(x, y)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)

    # Only compute the automatic limit when it is actually needed, and fall
    # back to 0 for empty input instead of letting max() raise.
    if ylim is not None:
        upper_limit = ylim
    else:
        upper_limit = max(y, default=0) + ylim_above
    plt.ylim(0, upper_limit)

    plt.title(title)

    # Write each bar's value just above the bar.
    for x_value, height in zip(x, y):
        plt.text(x_value, height + text_space_above_rect, str(height), ha='center')

    if do_show:
        plt.show()
    

In [None]:
# Number of participating teams per contest year.
years = list(data)
number_of_teams = [len(rows) for rows in data.values()]
create_bar_chart(years, number_of_teams, "Year", "Number of teams", title="Growth # of teams")

In [None]:
# Show two raw scoreboard rows from the 2017 contest.
print("How each row looks like?")
for sample_index in (0, 30):
    print(data[2017][sample_index])
# team_ranking, university_name_team_name, solved, penalty, dont_know, submissions...

In [None]:
def get_attempts(s):
    """Return the integer before the first '/' of an "attempts/time" string."""
    attempts_text, _, _ = s.partition('/')
    return int(attempts_text)

def get_accept_time(s):
    """Return the accept time from an "attempts/time" string.

    The second '/'-separated field is converted to an int when it consists
    only of digits; otherwise (e.g. "--") 0 is returned.
    """
    fields = s.split('/')
    time_text = fields[1]
    return int(time_text) if time_text.isdigit() else 0

def get_name(s):
    """Return the team-name part of a "university: team name" string.

    The text after the first ':' is returned with surrounding whitespace
    removed, whether or not a space follows the colon. When the string has
    no ':' there is no university part, so the whole stripped string is
    returned as the name (previously the first character was lost).
    """
    head, separator, tail = s.strip().partition(':')
    if not separator:
        return head
    return tail.strip()

def get_university(s):
    """Return the university part of a "university: team name" string.

    The text before the first ':' is returned with surrounding whitespace
    removed. When the string has no ':' an empty string is returned
    (previously the last character of the input was silently dropped).
    """
    head, separator, _ = s.strip().partition(':')
    if not separator:
        return ""
    return head.strip()
    
class Submission:
    """Result of one team on one problem.

    Built either from the literal ``0`` (no submission at all) or from a
    two-item sequence ``["attempts/time", verdict]``.
    """

    def __init__(self, sub):
        if sub == 0:
            # No submission was made for this problem.
            self.accepted = False
            self.attempts = 0
            self.accept_time = 0
            return

        score_cell = sub[0]
        self.attempts = get_attempts(score_cell)
        self.accept_time = get_accept_time(score_cell)

        # Accepted only when the verdict is not 'no' and both numbers are positive.
        verdict_ok = sub[1] != 'no'
        self.accepted = verdict_ok and self.attempts > 0 and self.accept_time > 0

    def get_accept_time(self):
        return self.accept_time

    def get_attempts(self):
        return self.attempts

    def get_accepted(self):
        return self.accepted

    def __str__(self):
        template = "Attempts:{}, Accepted:{}, Accept Time:{}"
        return template.format(self.attempts, self.accepted, self.accept_time)

    def __repr__(self):
        return str(self)
    
class Team:
    """A contest team: identity, score and per-problem submissions.

    A team is built either from a raw scoreboard row
    (``[ranking, "university: name", solved, penalty, <unused>, sub..., <last>]``)
    or empty (``Team()``), in which case its score is accumulated with
    ``add_solved``.

    Two teams are equal, and hash equally, when both name and university match.
    Ordering (``<``) puts the better team first: more solved problems, then
    lower penalty, then name.
    """

    # Penalty added for every attempt made before the accepted one.
    WRONG_ATTEMPT_PENALTY = 20

    def __init__(self, row=None):
        # Identity check: `row != None` would be evaluated element-wise for
        # array-like rows.
        if row is not None:
            self.ranking = row[0]
            self.name = get_name(row[1])
            self.university = get_university(row[1])
            self.solved = row[2]
            self.penalty = row[3]
            # row[4] is not used; the last element of the row is skipped as well.
            self.submissions = [Submission(x) for x in row[5:-1]]
        else:
            self.ranking = -1
            self.name = ""
            self.university = ""
            self.submissions = []
            self.solved = 0
            self.penalty = 0

    def __hash__(self):
        return hash((self.university, self.name))

    def __eq__(self, other):
        # Let Python fall back to its default comparison for foreign types
        # instead of raising AttributeError.
        if not isinstance(other, Team):
            return NotImplemented
        return self.name == other.name and self.university == other.university

    def __str__(self):
        return "{} - {}".format(self.university, self.name)

    def __repr__(self):
        return self.__str__()

    def __lt__(self, other):
        if not isinstance(other, Team):
            return NotImplemented
        return self.solved > other.solved or \
            (self.solved == other.solved and self.penalty < other.penalty) or \
            (self.solved == other.solved and self.penalty == other.penalty and self.name < other.name)

    def add_solved(self, sub):
        """Count one more solved problem and add its penalty.

        The penalty is the submission's accept time plus
        ``WRONG_ATTEMPT_PENALTY`` for each attempt before the accepted one.
        """
        self.solved += 1
        self.penalty += sub.get_accept_time() + (sub.attempts - 1) * self.WRONG_ATTEMPT_PENALTY

    def set_name(self, name):
        self.name = name

    def get_name(self):
        return self.name

    def set_university(self, university):
        self.university = university

    def get_university(self):
        return self.university

    def get_solved(self):
        return self.solved

    def get_penalty(self):
        return self.penalty

    def get_submissions(self):
        return self.submissions


In [None]:
# Quick check of Submission parsing: one accepted and one rejected example.
s1 = Submission(["1/8", 'yes'])
s2 = Submission(["10/--", 'no'])
for sample in (s1, s2):
    print(sample)

In [None]:
# Quick check of Team parsing on one raw scoreboard row.
sample_row = [
    15, 'Kharazmi University: Bits Please.\n', 3, 385, 35,
    ['1/3', 'yes'], ['2/24', 'yes'], 0, ['1/--', 'no'], ['6/--', 'no'],
    0, 0, ['4/278', 'yes'], 0, 0, 0,
    ['14/3', ''],
]
t1 = Team(sample_row)
print(t1)

In [None]:
import pandas as pd
from functools import reduce

class Contest:
    """All teams of one contest year, plus helpers to tabulate and replay it."""

    def __init__(self, year, teams):
        """Build the contest from its year and an iterable of raw team rows."""
        self.year = year
        self.teams = [Team(t) for t in teams]
        # Live scoreboard, rebuilt by calculate_teams_with_rank(); defined here
        # so the ranking helpers never hit a missing attribute.
        self.ranking = []

    def __str__(self):
        return str(self.year)

    def __repr__(self):
        return str(self.year) + " -> " + str([str(t) for t in self.teams]) + "\n"

    def get_unique_university_names(self):
        """Return the set of university names of the participating teams."""
        return {t.university for t in self.teams}

    def get_year(self):
        return self.year

    def get_teams(self):
        return self.teams

    def update_university_names(self, university_name_handler):
        """Replace every team's university with ``university_name_handler(old_name)``."""
        for t in self.teams:
            t.set_university(university_name_handler(t.get_university()))

    def get_df(self):
        """Return one DataFrame row per team, restricted to ``df_columns``.

        ``ranking`` is the 1-based position of the team in ``self.teams``.
        Per-problem accept times are stored under ``pa_time``, ``pb_time``, ...
        in submission order.
        """
        # Collect plain records and build the frame once: DataFrame.append was
        # removed in pandas 2.0 and copying the frame per row is quadratic.
        records = []
        for team_ranking, t in enumerate(self.teams, start=1):
            record = {
                'year': self.year,
                'ranking': team_ranking,
                'name': t.get_name(),
                'university': t.get_university(),
                'solved': t.get_solved(),
                'penalty': t.get_penalty(),
            }
            for offset, s in enumerate(t.get_submissions()):
                problem = chr(ord('a') + offset)
                record["p{}_time".format(problem)] = s.get_accept_time()
            records.append(record)

        return pd.DataFrame(records, columns=df_columns)

    def add_to_ranking(self, sub, team):
        """Credit ``sub`` to ``team`` on the live scoreboard, adding the team if new."""
        for t in self.ranking:
            if t == team:
                t.add_solved(sub)
                # A team appears at most once on the scoreboard.
                break
        else:
            t = Team()
            t.set_name(team.get_name())
            t.set_university(team.get_university())
            t.add_solved(sub)
            self.ranking.append(t)

    def get_ranking_team(self, place):
        """Return the team at 1-based ``place`` on the live scoreboard.

        Raises IndexError when ``place`` is smaller than 1 or beyond the
        scoreboard.
        """
        if place < 1:
            raise IndexError("place must be >= 1, got {}".format(place))
        return self.ranking[place - 1]

    def _credit_leaders(self, minutes_at_top, elapsed):
        """Add ``elapsed`` to every team tied with the scoreboard leader."""
        if not self.ranking:
            return
        leader = self.ranking[0]
        for h in self.ranking:
            if h.get_solved() == leader.get_solved() and h.get_penalty() == leader.get_penalty():
                minutes_at_top[h] = minutes_at_top.get(h, 0) + elapsed

    def calculate_teams_with_rank(self, place, contest_duration=300):
        """Replay the accepted submissions and measure time spent at rank 1.

        Accepted submissions are replayed in accept-time order. Before each
        distinct accept time is applied, the teams tied for first place are
        credited with the time elapsed since the previous one; after the last
        one they are credited up to ``contest_duration``.

        Parameters:
            place: accepted for interface compatibility; the computation
                always measures time at the top of the scoreboard.
            contest_duration: value at which the final interval ends.

        Returns a DataFrame with columns ``year``, ``university``, ``name``,
        ``time_rank_1``, sorted by ``time_rank_1`` descending.
        """
        # A flat comprehension also works for a contest without teams, where
        # reduce() over an empty list would raise.
        all_subs = [(s, t) for t in self.teams for s in t.get_submissions() if s.accepted]
        all_subs.sort(key=lambda pair: pair[0].get_accept_time())

        self.ranking = []
        result = dict()
        prev_accept_time = 0

        idx = 0
        while idx < len(all_subs):
            current_accept_time = all_subs[idx][0].get_accept_time()

            # Credit the leaders for the interval that just ended.
            self._credit_leaders(result, current_accept_time - prev_accept_time)

            # Apply every submission accepted at this same time before re-sorting.
            while idx < len(all_subs) and all_subs[idx][0].get_accept_time() == current_accept_time:
                sub, team = all_subs[idx]
                self.add_to_ranking(sub, team)
                idx += 1

            self.ranking = sorted(self.ranking)
            prev_accept_time = current_accept_time

        # Final interval: from the last accepted submission to the end.
        self._credit_leaders(result, contest_duration - prev_accept_time)

        sorted_result = sorted(result.items(), key=lambda item: -item[1])
        records = [
            {
                'year': self.year,
                'university': team.get_university(),
                'name': team.get_name(),
                'time_rank_1': minutes,
            }
            for team, minutes in sorted_result
        ]
        return pd.DataFrame(records, columns=['year', 'university', 'name', 'time_rank_1'])
            
        


In [None]:
# One Contest per year, then the number of distinct universities in each.
contests = [Contest(year=year, teams=rows) for year, rows in data.items()]
years = [contest.get_year() for contest in contests]
nunique_universities = [len(contest.get_unique_university_names()) for contest in contests]
create_bar_chart(
    years,
    nunique_universities,
    "Year",
    "Number of unique universities",
    "Growth # of unique universities",
)

In [None]:
# printing all university names, for further exploration and removing duplicate names

# Union of the university names seen in any contest.
all_universities = set().union(
    *(contest.get_unique_university_names() for contest in contests)
)

#for t in sorted(all_universities):
#    print("['{}'],".format(t))


In [None]:
from universities import universities
# Report the size of the `universities` collection imported from the universities module.
print("Number of unique universities over years : {}".format(len(universities)))

In [None]:
# updating all university names
from universities import get_university_name
# Pass every team's university through get_university_name, then print the
# first five (alphabetically) names of each contest as a spot check.
for contest in contests:
    contest.update_university_names(get_university_name)
    normalised_names = sorted(contest.get_unique_university_names())
    print(normalised_names[:5])


In [None]:
# creating a csv from all the data to be used by DataFrame from now on
import pandas as pd

# Concatenate the per-contest frames in one step: DataFrame.append was removed
# in pandas 2.0, and appending in a loop copies the growing frame every time.
contest_frames = [c.get_df() for c in contests]
if contest_frames:
    df = pd.concat(contest_frames, ignore_index=True)
else:
    # No contests loaded: keep the expected (empty) schema.
    df = pd.DataFrame(columns=df_columns)
df.to_csv("standings_data.csv", index=False)

In [None]:
# Every column except the two text columns holds numbers: convert them.
text_columns = ('name', 'university')
for col in [column for column in df.columns if column not in text_columns]:
    df[col] = pd.to_numeric(df[col])

In [None]:
# Average number of solved problems per team, by year.
# Select 'solved' before aggregating: taking the mean of the whole frame also
# tries to average the text columns, which raises TypeError in pandas >= 2.0.
grouped = df.groupby('year')['solved'].mean()
years = list(grouped.index)
avg_solved = [round(x, 2) for x in grouped]
create_bar_chart(years, avg_solved, "Year", "Solved", "Average number of solved problems by each team", ylim_above=1, text_space_above_rect=0.12)

In [None]:
# Average number of solved problems per (year, university), one column per university.
# Select 'solved' before aggregating so the text column 'name' is never averaged
# (TypeError in pandas >= 2.0).
grouped = df.groupby(['year', 'university'])['solved'].mean()
grouped = grouped.unstack(level=-1)
grouped = grouped.round(2)
# Reorder the columns alphabetically. Assigning sorted names to `.columns`
# would only relabel them and could attach values to the wrong university.
grouped = grouped.sort_index(axis=1)
grouped.to_csv('average_number_of_solved_by_university.csv')

# for year in df['year'].unique():
#     print("Year = {}".format(year))
#     grouped = df[df.year == year].groupby('university').mean()['solved']
#     print(grouped.sort_values(ascending=False)[0:10])
#     print("--" * 35)

In [None]:
# how many problems medalists have solved at each hour of contest?

# Accept-time columns of problems 'a' through 'm'.
problem_time_columns = ["p{}_time".format(chr(code)) for code in range(ord('a'), ord('n'))]

for hour in range(1, 6):
    hour_start, hour_end = (hour - 1) * 60, hour * 60
    accept_times = df[problem_time_columns]
    # Count, per team, the problems accepted in the interval (hour_start, hour_end].
    in_this_hour = (accept_times > hour_start) & (accept_times <= hour_end)
    df['solved_hour_{}'.format(hour)] = in_this_hour.sum(axis=1)

hourly_columns = ['solved_hour_{}'.format(hour) for hour in range(1, 6)]
df.loc[df.ranking <= 12, ['year', 'ranking', 'name', 'university', 'solved'] + hourly_columns]

In [None]:
# Share (in %) of each year's accepted problems that fell in each contest hour.
solved_columns = ['solved_hour_1', 'solved_hour_2', 'solved_hour_3', 'solved_hour_4', 'solved_hour_5']

# Sum only the hourly counters: summing the whole frame would also aggregate
# the text columns (name, university).
grouped = df.groupby('year')[solved_columns].sum()
grouped['total_solved'] = grouped[solved_columns].sum(axis=1)

# Each hour is rounded independently, so a row may not add up to exactly 100.
for col in solved_columns:
    grouped[col] = (grouped[col] / grouped['total_solved'] * 100).round(0).astype(int)

grouped[solved_columns]

In [None]:
# Keep only the hourly percentages and flip the table: rows become hours, columns years.
grouped = grouped.loc[:, solved_columns].transpose()

In [None]:
# distribution of accepts in each hour of the contest

# Size the grid from the number of years instead of hard-coding 3x2 with one
# removed panel, which is only right for exactly five contests.
years_to_plot = list(grouped.columns)
n_cols = 2
n_rows = max(1, (len(years_to_plot) + n_cols - 1) // n_cols)

fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
panels = list(axes.flat)

hour_labels = ['hour 1', 'hour 2', 'hour 3', 'hour 4', 'hour 5']
for panel, year in zip(panels, years_to_plot):
    # create_bar_chart draws on the current axes, so select the panel first.
    plt.sca(panel)
    create_bar_chart(hour_labels, list(grouped[year]), "", "% of solved problems", title="Year {}".format(year), ylim=100, do_show=False)

# Remove the panels that received no chart.
for unused_panel in panels[len(years_to_plot):]:
    fig.delaxes(unused_panel)

plt.tight_layout()
plt.savefig("percent_of_solved_problems.jpg")
plt.show()

In [None]:
# Time each team spent at rank 1, for every contest, stacked into one table.
# Built with a single pd.concat: DataFrame.append was removed in pandas 2.0.
rank_one_frames = [c.calculate_teams_with_rank(1) for c in contests]
# Skip contests that produced no rows so empty frames cannot affect the dtypes.
rank_one_frames = [frame for frame in rank_one_frames if not frame.empty]

if rank_one_frames:
    time_first_df = pd.concat(rank_one_frames, ignore_index=True)
else:
    time_first_df = pd.DataFrame(columns=['year', 'university', 'name', 'time_rank_1'])
time_first_df.to_csv("time_at_rank_1.csv", index=False)

time_first_df

In [None]:
import numpy as np

# One horizontal bar chart per year: time each team spent at rank 1.
for year in time_first_df['year'].unique():
    year_rows = time_first_df[time_first_df.year == year]
    times_first = list(year_rows['time_rank_1'])
    team_names = list(year_rows['name'])
    university_names = list(year_rows['university'])

    # Bars are placed 8 units apart so the two-line tick labels fit.
    y_pos = np.arange(0, 8*len(times_first), 8)
    tick_labels = [
        "{}\n{}".format(team, university)
        for team, university in zip(team_names, university_names)
    ]

    fig, ax = plt.subplots(figsize=(5,len(times_first)))
    ax.barh(y_pos, times_first, align='center', height=4)
    ax.set_yticks(y_pos)
    ax.set_yticklabels(tick_labels, linespacing=1.5)
    ax.set_xlim(0, max(times_first) + 25)

    plt.title( "Number of minutes at rank 1 - Year {}".format(year) )

    # Print the value next to the end of each bar.
    for minutes, bar_y in zip(times_first, y_pos):
        ax.text(minutes+1, bar_y, "{}'".format(minutes), va='center')

    plt.savefig("minutes_rank_one_{}.jpg".format(year))
    plt.show()

In [None]:
def calculate_top_universities_combination(top_count=4):
    """Count how often each university was among the leading universities.

    For every contest in the global ``contests`` list, the first
    ``top_count`` distinct universities are taken in the order their teams
    appear in ``get_teams()`` (presumably final-ranking order; confirm against
    the data source). The k-th distinct university gets one count in column k.

    Returns a DataFrame indexed by university, in first-seen order, with
    integer columns ``1..top_count``.
    """
    rank_columns = list(range(1, top_count + 1))
    # university -> counts per position. Plain lists are filled first and the
    # frame is built once: DataFrame.append was removed in pandas 2.0 and
    # `rank_df[col][u] += 1` is a chained-indexing assignment.
    position_counts = {}

    for c in contests:
        leading_universities = []
        for t in c.get_teams():
            if len(leading_universities) >= top_count:
                break
            university = t.get_university()
            if university not in leading_universities:
                leading_universities.append(university)

        for position, university in enumerate(leading_universities):
            counts = position_counts.setdefault(university, [0] * top_count)
            counts[position] += 1

    return pd.DataFrame(
        list(position_counts.values()),
        index=list(position_counts.keys()),
        columns=rank_columns,
    )

# Display the table for the first two distinct universities of every contest.
calculate_top_universities_combination(2)

In [None]:
# Display the table for the first four distinct universities of every contest.
calculate_top_universities_combination(4)

In [None]:
from utils import ProvinceHandler

def get_province_count(contest):
    """Return ``{province: number of teams}`` for the teams of ``contest``.

    Each team's province is looked up from its university with
    ``ProvinceHandler.get_province``.
    """
    teams_per_province = dict()
    for team in contest.get_teams():
        province = ProvinceHandler.get_province(team.get_university())
        teams_per_province[province] = teams_per_province.get(province, 0) + 1
    return teams_per_province


In [None]:
# Number of teams per province (rows) and contest year (columns).
year_columns = [c.get_year() for c in contests]

province_df = pd.DataFrame(columns=year_columns, index=ProvinceHandler.get_all_provinces())

for c in contests:
    # Do not reuse the name `data`: it is the raw standings dict imported at
    # the top of the notebook, and overwriting it breaks re-running earlier cells.
    province_counts = get_province_count(c)
    # The Series is aligned on the province index; provinces without teams stay NaN.
    province_df[c.get_year()] = pd.Series(province_counts)

province_df = province_df.sort_values(by=[2018], ascending=False)
province_df

In [None]:
# Number of teams per solved-problem count (rows) and year (columns); 0 where none.
solve_count_df = (
    df.groupby('year')['solved']
    .value_counts()
    .unstack(level=-1)
    .fillna(0)
    .transpose()
)

solve_count_df

In [None]:
import seaborn as sns
# One line per column of solve_count_df, drawn without dash styles.
ax = sns.lineplot(data=solve_count_df, dashes=False)
ax.set(xlabel="# of solved problems", ylabel="# of teams")

In [None]:
# What ranking should a team hold before the last hour to (statistically) guarantee a medal at the ICPC WF? (I.e. which rank must you have, or how many problems must you have solved, to guarantee a medal?)