In [2]:
# This script examines the UCI sanctioned racing calendar to look at global changes in the number of events.
# Source is https://www.uci.org/road/calendar, all categories/classes selected
# Year could be extracted from the start date, but UCI 'seasons' differ from calendar years
# for Africa, Asia and Oceania circuits, so when outputting new calendars, add the year as _2XXX before .xlsx
# The script will add the year as a new column.

In [3]:
import pandas as pd
import glob
import os
#Map the UCI Calendar excel output to the Continental circuits using the UCI's country names
continents = {'Americas': ['ANTIGUA AND BARBUDA', 'ARGENTINA', 'ARUBA', 'BELIZE', 'BERMUDA', 
                           'BOLIVARIAN REPUBLIC OF VENEZUELA', 'BOLIVIA', 'BRAZIL', 'CANADA',
                           'CHILE', 'COLOMBIA', 'CUBA', 'DOMINICAN REPUBLIC', 'ECUADOR', 
                           'EL SALVADOR', 'HONDURAS', 'PANAMA', 'PARAGUAY', 'PUERTO RICO', 
                           'SAINT VINCENT AND THE GRENADINES', 'TRINIDAD AND TOBAGO', 
                           'UNITED STATES OF AMERICA', 'URUGUAY','MEXICO', 'COSTA RICA','GUATEMALA'],
              'Africas' : ['ALBANIA', 'ALGERIA', 'ANGOLA', 'CAMEROON', 'CONGO', 'COTE D\'IVOIRE',
                           'EGYPT', 'ETHIOPIA', 'GUYANA', 'LIBYA', 'MALI', 'MAURITIUS', 'MOROCCO',
                           'NAMIBIA', 'RWANDA', 'SENEGAL', 'SOUTH AFRICA', 'SWAZILAND', 'TUNISIA', 
                           'UGANDA', 'ZIMBABWE', 'GABON','ERITREA', 'BURKINA FASO'],
              'Asia' : ['AZERBAIJAN', 'BAHRAIN', 'BRUNEI DARUSSALAM', 'CHINESE TAIPEI', 'GEORGIA',
                        'INDIA', 'ISLAMIC REPUBLIC OF IRAN', 'ISRAEL', 'KAZAKHSTAN', 'KOREA', 'KUWAIT', 
                        'KYRGYZSTAN', 'LEBANON', 'MALAYSIA', 'MONGOLIA', 'MYANMAR', 'OMAN', 'PHILIPPINES',
                        'RUSSIAN FEDERATION', 'SINGAPORE', 'SRI LANKA', 'SYRIAN ARAB REPUBLIC', 'THAILAND',
                        'TURKEY', 'UNITED ARAB EMIRATES', 'VIETNAM','HONG KONG, CHINA','QATAR', 'UZBEKISTAN',
                        'INDONESIA', 'JAPAN','PEOPLE\'S REPUBLIC OF CHINA'],
              'Europe' : ['AUSTRIA', 'BELARUS', 'BELGIUM', 'BOSNIA AND HERZEGOVINA', 'BULGARIA', 'CROATIA',
                          'CYPRUS', 'CZECH REPUBLIC', 'DENMARK', 'ESTONIA', 'FINLAND', 
                          'FORMER YUGOSLAV REPUBLIC OF MACEDONIA', 'FRANCE', 'GERMANY', 'GREAT BRITAIN', 
                          'GREECE', 'HUNGARY', 'ICELAND', 'IRELAND', 'ITALY', 'KOSOVO', 'LATVIA', 'LITHUANIA',
                          'LUXEMBOURG', 'MONTENEGRO', 'NETHERLANDS', 'NORWAY', 'POLAND', 'PORTUGAL', 
                          'REPUBLIC OF MOLDOVA', 'ROMANIA', 'SAN MARINO', 'SERBIA', 
                          'SLOVAKIA', 'SLOVENIA', 'SWEDEN', 'SWITZERLAND', 'UKRAINE','SPAIN'],
              'Oceania' : ['NEW ZEALAND','AUSTRALIA']}

In [4]:
#Pull in every excel sheet from the current folder that starts with Calendar and ends in xlsx. YMMV!

all_data = pd.DataFrame()
for f in glob.glob('Calendar*.xlsx'):
    df = pd.read_excel(f, header=1, encoding='UTF-8')
    year = os.path.basename(f).split('.')[0].split('_')[-1]
    df['Season'] = year
    all_data = all_data.append(df,ignore_index=True)

In [5]:
#Examine the head and length to make sure it's kosher
# all_data['Class'].unique()
# array(['2.2', 'CRT', '2.HC', '2.1', '1.2', '1.1', 'CN', '2.UWT', '1.UWT',
#        '2.Ncup', '1.HC', 'CC', '1.WWT', '1.Ncup', '1.2U', '2.2U', 'JR',
#        '2.WWT', 'JC', 'CM', 'JOJ', 'CDM', nan, 'JO', 'CPE', 'MNM', 'AU1',
#        '2.CH', '1.CH', 'GT2', 'GT1'], dtype=object)

# I suspect there are some classes have changed over the years. MP is ME, CPE was WT stage races during dispute
# with ASO (2.UWT), MNM = Monument and CPE were WT one-days during disputed years (1.UWT)
all_data['Category'].replace('MP', 'ME')
all_data['Class'].replace('CPE', '2.UWT')
all_data['Class'].replace(['MNM','AU1', '1.CH'], '1.UWT')

all_data['Class'].unique()

array(['2.2', 'CRT', '2.HC', '2.1', '1.2', '1.1', 'CN', '2.UWT', '1.UWT',
       '2.Ncup', '1.HC', 'CC', '1.WWT', '1.Ncup', '1.2U', '2.2U', 'JR',
       '2.WWT', 'JC', 'CM', 'JOJ', 'CDM', nan, 'JO', 'CPE', 'MNM', 'AU1',
       '2.CH', '1.CH', 'GT2', 'GT1'], dtype=object)

In [6]:
#Get rid of unwanted columns.
del all_data['EMail']
del all_data['WebSite']
del all_data['Calendar']
del all_data['Venue']
all_data.head()

Unnamed: 0,Date From,Date To,Name,Country,Category,Class,Season
0,24/10/2017,29/10/2017,Vuelta a Colombia Femenina Oro y Paz,COLOMBIA,WE,2.2,2018
1,23/10/2017,01/11/2017,Vuelta a Guatemala,GUATEMALA,ME,2.2,2018
2,04/11/2017,04/11/2017,Subaru Australian Open Criterium,AUSTRALIA,ME,CRT,2018
3,04/11/2017,04/11/2017,Le Tour De France Saitama Criterium,JAPAN,ME,CRT,2018
4,04/11/2017,04/11/2017,Subaru Australian Open Criterium,AUSTRALIA,WE,CRT,2018


In [95]:
#You can extract the year from the start date, but UCI 'seasons' differ from calendar years
# for Africa, Asia and Oceania circuits, so when outputting new calendars, add the year as _2XXX before .xlsx
# If you want you can just export the combined data

#all_data['Year'] = pd.DatetimeIndex(all_data['Date From']).year

# writer = pd.ExcelWriter('combined.xlsx')
# all_data.to_excel(writer,'Sheet1')

# writer.save()

In [37]:
#We need to make the dictionary of lists one big dictionary to add the Continents

cont_dict_converted = {k: old_key for old_key, old_value in continents.items() for k in old_value}

#Note there are some events that cross borders or Continental championships that have no Continent.
#Add some code here to handle these cases.

all_data['Continent'] = all_data['Country'].map(cont_dict_converted).fillna('Stateless')


In [38]:
#add in the Continents for each race

uci_country_coord = pd.DataFrame()
uci_country_coord = pd.read_excel('uci_country_coord.xlsx', encoding='UTF-8')
uci_df = pd.merge(all_data, uci_country_coord, right_on='uci_name', left_on='Country', how="left")

df_nulls = all_data[all_data.isnull().any(axis=1)]
df_nulls.head()

Unnamed: 0,Date From,Date To,Name,Country,Category,Class,Season,Continent
19,04/01/2018,07/01/2018,National Road Championships - Australia,AUSTRALIA,,CN,2018,Oceania
20,05/01/2018,07/01/2018,National Road Championships - New Zealand (MU ...,NEW ZEALAND,,CN,2018,Oceania
21,05/01/2018,07/01/2018,National Road Championships - New Zealand (WU ...,NEW ZEALAND,,CN,2018,Oceania
47,02/02/2018,04/02/2018,National Road Championships - Colombia,COLOMBIA,,CN,2018,Americas
49,02/02/2018,04/02/2018,National Road Championships - Namibia,NAMIBIA,,CN,2018,Africas


In [39]:
#Let's only look at Class - ME/WE, which excludes national, continental championships and Olympics
#this should remove most of the nulls in Classi
cats = ['ME', 'WE']
all_data.Category.isin(cats)
elites_df = all_data[all_data.Category.isin(cats)]
elites_df.head()

Unnamed: 0,Date From,Date To,Name,Country,Category,Class,Season,Continent
0,24/10/2017,29/10/2017,Vuelta a Colombia Femenina Oro y Paz,COLOMBIA,WE,2.2,2018,Americas
1,23/10/2017,01/11/2017,Vuelta a Guatemala,GUATEMALA,ME,2.2,2018,Americas
2,04/11/2017,04/11/2017,Subaru Australian Open Criterium,AUSTRALIA,ME,CRT,2018,Oceania
3,04/11/2017,04/11/2017,Le Tour De France Saitama Criterium,JAPAN,ME,CRT,2018,Asia
4,04/11/2017,04/11/2017,Subaru Australian Open Criterium,AUSTRALIA,WE,CRT,2018,Oceania


In [40]:
df_nulls = elites_df[elites_df.isnull().any(axis=1)]
df_nulls.head()

Unnamed: 0,Date From,Date To,Name,Country,Category,Class,Season,Continent
2128,21/11/2018,25/11/2018,Africa Cup,ERITREA,ME,,2019,Africas
2129,21/11/2018,25/11/2018,Africa Cup,ERITREA,WE,,2019,Africas
5945,23/03/2006,26/03/2006,The Paths of King Nikola,,ME,2.2,2006,Stateless
6181,13/06/2006,18/06/2006,Tour de Serbie,,ME,2.2,2006,Stateless


In [44]:
for x in ['CC', 'CM', 'CN']:
    row = (elites_df.loc[elites_df['Class'] == x]).sort_values(by='Season', ascending=0).iloc[0]
dump_rows = row.name
base_df = elites_df.drop(dump_rows)

for nm in ['Africa Cup']:
    row = elites_df.loc[elites_df['Name'] == 'Africa Cup'].sort_values(by='Class', ascending =0).iloc[0]
dump_rows = row.name
elites_df = elites_df.drop(dump_rows)
df_nulls = elites_df[elites_df.isnull().any(axis=1)]
df_nulls.head(100)

Unnamed: 0,Date From,Date To,Name,Country,Category,Class,Season,Continent
5945,23/03/2006,26/03/2006,The Paths of King Nikola,,ME,2.2,2006,Stateless
6181,13/06/2006,18/06/2006,Tour de Serbie,,ME,2.2,2006,Stateless


In [46]:
#Look at the numbers of races per continent
grouped_df = base_df_cats.groupby(['Season','Category', 'Class', 'Continent'])
grouped_df.count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Date From,Date To,Name,Country
Season,Category,Class,Continent,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2006,ME,1.1,Americas,2,2,2,2
2006,ME,1.1,Asia,2,2,2,2
2006,ME,1.1,Europe,94,94,94,94
2006,ME,1.2,Americas,4,4,4,4
2006,ME,1.2,Asia,4,4,4,4
2006,ME,1.2,Europe,80,80,80,80
2006,ME,1.2,Oceania,1,1,1,1
2006,ME,1.HC,Americas,1,1,1,1
2006,ME,1.HC,Europe,15,15,15,15
2006,ME,2.1,Americas,1,1,1,1
