In [2]:
import pandas as pd
import glob
#Map the UCI Calendar excel output to the Continental circuits using the UCI's country names
continents = {'Americas': ['ANTIGUA AND BARBUDA', 'ARGENTINA', 'ARUBA', 'BELIZE', 'BERMUDA', 
                           'BOLIVARIAN REPUBLIC OF VENEZUELA', 'BOLIVIA', 'BRAZIL', 'CANADA',
                           'CHILE', 'COLOMBIA', 'CUBA', 'DOMINICAN REPUBLIC', 'ECUADOR', 
                           'EL SALVADOR', 'HONDURAS', 'PANAMA', 'PARAGUAY', 'PUERTO RICO', 
                           'SAINT VINCENT AND THE GRENADINES', 'TRINIDAD AND TOBAGO', 
                           'UNITED STATES OF AMERICA', 'URUGUAY','MEXICO', 'COSTA RICA','GUATEMALA'],
              'Africas' : ['ALBANIA', 'ALGERIA', 'ANGOLA', 'CAMEROON', 'CONGO', 'COTE D\'IVOIRE',
                           'EGYPT', 'ETHIOPIA', 'GUYANA', 'LIBYA', 'MALI', 'MAURITIUS', 'MOROCCO',
                           'NAMIBIA', 'RWANDA', 'SENEGAL', 'SOUTH AFRICA', 'SWAZILAND', 'TUNISIA', 
                           'UGANDA', 'ZIMBABWE', 'GABON','ERITREA', 'BURKINA FASO'],
              'Asia' : ['AZERBAIJAN', 'BAHRAIN', 'BRUNEI DARUSSALAM', 'CHINESE TAIPEI', 'GEORGIA',
                        'INDIA', 'ISLAMIC REPUBLIC OF IRAN', 'ISRAEL', 'KAZAKHSTAN', 'KOREA', 'KUWAIT', 
                        'KYRGYZSTAN', 'LEBANON', 'MALAYSIA', 'MONGOLIA', 'MYANMAR', 'OMAN', 'PHILIPPINES',
                        'RUSSIAN FEDERATION', 'SINGAPORE', 'SRI LANKA', 'SYRIAN ARAB REPUBLIC', 'THAILAND',
                        'TURKEY', 'UNITED ARAB EMIRATES', 'VIETNAM','HONG KONG, CHINA','QATAR', 'UZBEKISTAN',
                        'INDONESIA', 'JAPAN','PEOPLE\'S REPUBLIC OF CHINA'],
              'Europe' : ['AUSTRIA', 'BELARUS', 'BELGIUM', 'BOSNIA AND HERZEGOVINA', 'BULGARIA', 'CROATIA',
                          'CYPRUS', 'CZECH REPUBLIC', 'DENMARK', 'ESTONIA', 'FINLAND', 
                          'FORMER YUGOSLAV REPUBLIC OF MACEDONIA', 'FRANCE', 'GERMANY', 'GREAT BRITAIN', 
                          'GREECE', 'HUNGARY', 'ICELAND', 'IRELAND', 'ITALY', 'KOSOVO', 'LATVIA', 'LITHUANIA',
                          'LUXEMBOURG', 'MONTENEGRO', 'NETHERLANDS', 'NORWAY', 'POLAND', 'PORTUGAL', 
                          'REPUBLIC OF MOLDOVA', 'ROMANIA', 'SAN MARINO', 'SERBIA', 
                          'SLOVAKIA', 'SLOVENIA', 'SWEDEN', 'SWITZERLAND', 'UKRAINE','SPAIN'],
              'Oceania' : ['NEW ZEALAND','AUSTRALIA']}

In [3]:
#Pull in every excel sheet from the current folder that starts with Calendar and ends in xlsx. YMMV!

all_data = pd.DataFrame()
for f in glob.glob('Calendar*.xlsx'):
    df = pd.read_excel(f, header=1, encoding='UTF-8')
    all_data = all_data.append(df,ignore_index=True)


In [4]:
#Examine the head and length to make sure it's kosher
all_data.head()
len(all_data)

7985

In [5]:
#Get rid of unwanted columns.
del all_data['EMail']
del all_data['WebSite']
del all_data['Calendar']
del all_data['Venue']
all_data.head()

Unnamed: 0,Date From,Date To,Name,Country,Category,Class
0,29/10/2018,29/10/2018,Gran Premio ICODER,COSTA RICA,WE,1.2
1,30/10/2018,30/10/2018,Gran Premio Comite Olimpico Nacional Femenino,COSTA RICA,WE,1.2
2,23/10/2018,31/10/2018,Tour of Hainan,PEOPLE'S REPUBLIC OF CHINA,ME,2.HC
3,23/10/2018,01/11/2018,58 Vuelta a Guatemala,GUATEMALA,ME,2.2
4,26/10/2018,04/11/2018,Tour du Faso,BURKINA FASO,ME,2.2


In [68]:
#Extract the year from the start date noting that UCI 'seasons' differ from calendar years
#I will need to write code that changes seasons for Africa, Asia and or Oceania

all_data['Year'] = pd.DatetimeIndex(all_data['Date From']).year
#If you want you can just export the combined data
# writer = pd.ExcelWriter('combined.xlsx')
# all_data.to_excel(writer,'Sheet1')

# writer.save()

In [69]:
#We need to make the dictionary of lists one big dictionary to add the Continents

cont_dict_converted = {k: oldk for oldk, oldv in continents.items() for k in oldv}

#Note there are some events that cross borders or Continental championships that have no Continent.
#Add some code here to handle these cases.

all_data['Continent'] = all_data['Country'].map(cont_dict_converted).fillna('Stateless')

#Look at the numbers of races per continent
grouped_df = all_data.groupby('Continent')
grouped_df.count()

Unnamed: 0_level_0,Date From,Date To,Name,Country,Category,Class,Year
Continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Africas,348,348,348,348,314,346,348
Americas,851,851,851,851,787,851,851
Asia,840,840,840,840,777,840,840
Europe,5708,5708,5708,5708,5583,5708,5708
Oceania,221,221,221,221,206,221,221
Stateless,17,17,17,13,16,17,17


In [85]:
#add in the coordinates for each race so we can geolocate them on a map later

uci_country_coord = pd.DataFrame()
uci_country_coord = pd.read_excel('uci_country_coord.xlsx', encoding='UTF-8')
uci_df = pd.merge(all_data, uci_country_coord, right_on='uci_name', left_on='Country', how="left")

df_nulls = all_data[all_data.isnull().any(axis=1)]
df_nulls.head()

Unnamed: 0,Date From,Date To,Name,Country,Category,Class,Year,Continent
14,17/11/2018,18/11/2018,National Road Championships - Uzbekistan,UZBEKISTAN,,CN,2018,Asia
18,21/11/2018,25/11/2018,Africa Cup,ERITREA,WE,,2018,Africas
19,21/11/2018,25/11/2018,Africa Cup,ERITREA,ME,,2018,Africas
20,07/12/2018,07/12/2018,National Road Championships - Qatar (IRR),QATAR,,CN,2018,Asia
25,04/01/2019,06/01/2019,National Road Championships - New Zealand,NEW ZEALAND,,CN,2019,Oceania


In [79]:
uci_country_coord.head(1)

Unnamed: 0,uci_name,continent,name,country,latitude,longitude
0,ALBANIA,AFRICA,Albania,AL,41.153332,20.168331
