# Tour de Pologne Top 10 finishers prediction part 1 of 3

**Next notebooks:** <br>
[Part 2: Data stats and visuals](tdp_2_3_data_statistics_visualisation.ipynb)<br>
[Part 3: Data  modeling](tdp_3_3_data_modeling.ipynb)

In [2]:
import re
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# Base URL for web scraping: the race page on firstcycling.com. The four-digit
# season is appended right after `y=` when the per-year URLs are built below.
# NOTE(review): `r=19` is presumably the site's id of the Tour de Pologne - confirm.
url_base = 'http://firstcycling.com/race.php?r=19&y='

In [4]:
# Seasons to analyse, inclusive on both ends.
# Editions before 2014 lack general classification positions (GC) and time (GC Time) information.
tour_year_start, tour_year_stop = 2014, 2018
tour_years = [*range(tour_year_start, tour_year_stop + 1)]
tour_years

[2014, 2015, 2016, 2017, 2018]

# Tour results by stages

In [8]:
# One overview URL per analysed season: the base URL followed by the year
url_tour_results = [url_base + str(season) for season in tour_years]

In [9]:
# Result-page URL of every stage (numbers 1 to 7) of every season chosen above;
# the stage number is appended to a literal '0', giving 'e=01' ... 'e=07'
url_stages_results = [
    tour_url + '&k=etapper&e=0' + str(stage_no)
    for tour_url in url_tour_results
    for stage_no in range(1, 8)
]

### Generating dictionaries of biker and team names with their corresponding system numbers, which are needed in further steps

In [11]:
# Lookup tables filled from the first stage of every edition:
#   biker_dict_stage: rider name in 'initial.surname' form -> rider system number (string)
#   team_dict_stage:  team name                            -> team system number (string)
biker_dict_stage = {}
team_dict_stage = {}

# There is one first-stage URL per season, in season order, so counting from the
# first season labels each page with its year.
first_stage_urls = [url for url in url_stages_results if 'e=01' in url]
for year, stage in enumerate(first_stage_urls, start=tour_year_start):
    # Result table of the stage; only the rider and team names are used here
    dict_prep = pd.read_html(stage)[0]
    # Shortening names into the 'initial.surname' form
    dict_prep['Name'] = dict_prep['Name'].str[0] + '.' + dict_prep['Name'].str.split(' ', n=1, expand=True)[1]

    # Raw HTML of the same page: the system numbers only exist in the link targets
    r = requests.get(stage)
    soup = BeautifulSoup(r.text, 'html.parser')
    system_numbers = str(soup.find('table'))

    # Schema as `rider.php?r=2165` (2-5 digits with margin)
    biker_ids = re.findall(r'rider\.php\?r=(\d{2,6})', system_numbers)
    # Schema as `team.php?l=8433` (3-4 digits with margin)
    team_ids = re.findall(r'team\.php\?l=(\d{3,5})', system_numbers)

    # Names and ids are paired purely by position, so a missing or extra link
    # would shift every following pair without any error - make that visible.
    if len(biker_ids) != len(dict_prep) or len(team_ids) != len(dict_prep):
        print('Warning:', year, 'has', len(dict_prep), 'table rows but',
              len(biker_ids), 'rider links and', len(team_ids), 'team links')

    # Merging team and rider names with their system numbers
    biker_dict_stage.update(zip(dict_prep['Name'], biker_ids))
    team_dict_stage.update(zip(dict_prep['Team'], team_ids))
    print('Created dictionary of bikers participating in ' + str(year))

Dictionary of bikers participating in 2014
Dictionary of bikers participating in 2015
Dictionary of bikers participating in 2016
Dictionary of bikers participating in 2017
Dictionary of bikers participating in 2018


In [12]:
# Overall number of bikers and teams
len(biker_dict_stage), len(team_dict_stage)

(513, 69)

### Defining datetime functions

In [13]:
# Adding leading zeros for further splitting based on ':' character
def _pad_to_hms(times):
    """Return `times` with the missing leading fields filled in as '00:'."""
    n_colons = times.str.count(':')
    # No colon at all: seconds only
    padded = times.where(n_colons != 0, '00:00:' + times)
    # One colon: minutes and seconds
    return padded.where(n_colons != 1, '00:' + times)


def zeros_to_sec(df, col1, col2=False):
    """Pad time strings in place so that each one has three ':'-separated fields.

    A value without a colon gets '00:00:' prepended, a value with one colon gets
    '00:' prepended, and a value that already has two colons is left untouched.
    Deciding by the number of colons (instead of by string length) also handles
    one-character values and single-digit hours such as '1:05:12'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns; it is modified in place.
    col1 : str
        Name of a column of time strings.
    col2 : str or False, optional
        Name of a second column to treat the same way; skipped when falsy.

    Returns
    -------
    None

    Missing values stay missing; non-string columns are not supported.
    """
    for col in (col1, col2):
        if col:
            df[col] = _pad_to_hms(df[col])

In [14]:
# Changing overall time into seconds
def _gaps_to_absolute_seconds(df, col):
    """Replace the 'HH:MM:SS' strings of df[col] with absolute times in seconds."""
    seconds = pd.Series(
        [int(t[0]) * 3600 + int(t[1]) * 60 + int(t[2]) for t in df[col].str.split(':')],
        index=df.index, dtype='float64',
    )
    if not seconds.empty:
        # The largest value is treated as the only absolute time in the column;
        # every other value is treated as a gap to it.
        reference_row = seconds.idxmax()
        reference_time = seconds.loc[reference_row]
        seconds = seconds + reference_time
        # The reference row must keep its own time, not be counted twice
        seconds.loc[reference_row] = reference_time
    df[col] = seconds


def time_to_sec(df, col1, col2=False):
    """Convert 'HH:MM:SS' columns holding one full time plus gaps into seconds, in place.

    Every value is first converted to seconds. The largest value of a column is
    taken as the reference (full) time; it is added to all other values, while
    the reference row itself keeps its time. The result is stored as float64.

    The reference row is located by its value in both columns. The previous
    version assumed row label 0 for `col1` and addressed it with a list-like key
    in `.at`, which recent pandas versions reject.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns; it is modified in place.
    col1 : str
        Name of a column of 'HH:MM:SS' strings (see `zeros_to_sec`).
    col2 : str or False, optional
        Name of a second column to treat the same way; skipped when falsy.

    Returns
    -------
    None

    Values must be strings with at least three ':'-separated integer fields;
    missing values are not handled.
    """
    for col in (col1, col2):
        if col:
            _gaps_to_absolute_seconds(df, col)

### Web scraping tour's stage-by-stage info for adding new columns with extended statistics. The output is `stages`

In [16]:
# Using URLs with all-year stages results
# url_stages_results[:]

In [19]:
# One cleaned result table per scraped stage; concatenated in the next cell
stages_ = []

for year in tour_years:
    year_urls = [url for url in url_stages_results if 'y=' + str(year) in url]
    # Stage numbers follow the URL order within the season (1-7)
    for i, stage_ in enumerate(year_urls, start=1):
        try:
            stages__ = pd.read_html(stage_)[0]
            stages__['year'] = year
            stages__['stage'] = i
            stages__.columns = ['position','to_drop','name','age','team','time_in_s','gc','gc_time_in_s','year','stage']
            # NOTE(review): the scraped column presumably holds the birth year,
            # so subtracting it from the season gives the age - confirm.
            stages__['age'] = year - stages__['age']
            # Strip the '+ ' prefix in front of time gaps
            stages__ = stages__.replace(to_replace=r'\+ ', value='', regex=True)
            # Missing values (in any column) become '00' so the time helpers can parse them
            stages__ = stages__.replace(to_replace=[np.nan], value='00')
            zeros_to_sec(stages__, 'time_in_s','gc_time_in_s')
            time_to_sec(stages__, 'time_in_s','gc_time_in_s')
            stages_.append(stages__)
            print('Scraped tour of', year, '/', i)
        except ValueError as err:
            # Raised when the page has no usable result table; the stage is
            # skipped, but now the log says which one and why.
            print('There is no data for this stage. Skipping...', year, '/', i, '-', err)

Scraped tour of 2014 / 1
Scraped tour of 2014 / 2
Scraped tour of 2014 / 3
Scraped tour of 2014 / 4
Scraped tour of 2014 / 5
Scraped tour of 2014 / 6
Scraped tour of 2014 / 7
Scraped tour of 2015 / 1
Scraped tour of 2015 / 2
Scraped tour of 2015 / 3
Scraped tour of 2015 / 4
Scraped tour of 2015 / 5
Scraped tour of 2015 / 6
Scraped tour of 2015 / 7
Scraped tour of 2016 / 1
Scraped tour of 2016 / 2
Scraped tour of 2016 / 3
Scraped tour of 2016 / 4
Scraped tour of 2016 / 5
There is no data for this stage. Skipping...
Scraped tour of 2016 / 7
Scraped tour of 2017 / 1
Scraped tour of 2017 / 2
Scraped tour of 2017 / 3
Scraped tour of 2017 / 4
Scraped tour of 2017 / 5
Scraped tour of 2017 / 6
Scraped tour of 2017 / 7
Scraped tour of 2018 / 1
Scraped tour of 2018 / 2
Scraped tour of 2018 / 3
Scraped tour of 2018 / 4
Scraped tour of 2018 / 5
Scraped tour of 2018 / 6
Scraped tour of 2018 / 7


In [20]:
# Merging above stages into one frame with a fresh 0..n-1 index
stages = pd.concat(stages_, axis=0, ignore_index=True)

# Dropping useless columns
stages = stages.drop('to_drop', axis=1)

# Replacing 'Did Not Finish' and 'Did Not Start' with some high value.
# The replacement runs over the whole frame, not only over 'position'.
stages = stages.replace(to_replace=['DNF', 'DNS'], value=999)
stages['position'] = stages['position'].astype('int')

# Shortening bikers names into 'initial.surname' form
stages['name'] = stages['name'].str[0] + '.' + stages['name'].str.split(' ', n=1, expand=True)[1]

# Biker and Team system numbers (missing when a name is absent from the dictionaries)
stages['biker_sys_nr'] = stages['name'].map(biker_dict_stage)
stages['team_sys_nr'] = stages['team'].map(team_dict_stage)

# Rearranging columns
stages = stages[['name','biker_sys_nr','age','team','team_sys_nr','time_in_s','gc','gc_time_in_s','year','stage','position']]

In [24]:
stages.head()

Unnamed: 0,name,biker_sys_nr,age,team,team_sys_nr,time_in_s,gc,gc_time_in_s,year,stage,position
0,Y.Hutarovich,100,31,AG2R La Mondiale,10072,20870.0,1,20860.0,2014,1,1
1,R.Maikin,14004,24,RusVelo,3482,20870.0,2,20864.0,2014,1,2
2,M.Mori,492,34,Lampre - Merida,6418,20870.0,3,20866.0,2014,1,3
3,G.Boivin,1937,25,Cannondale Pro Cycling,3481,20870.0,6,20870.0,2014,1,4
4,M.Haller,6246,23,Team Katusha,8441,20870.0,7,20870.0,2014,1,5


### **Creating columns with different statistics** for each biker. The output is `biker_by_stage` 

In [25]:
# Per-rider statistics, indexed by rider name. "ovr" = over all scraped stages,
# "l3" = over the rider's last three stages (ordered by year, then stage).
#   n_tours, n_stages              - number of editions / stages the rider appears in
#   best_/avg_/worst_pos_ovr       - min / mean / max stage position
#   best_/avg_/worst_time_ovr      - min / mean / max of time_in_s
#   best_/avg_/worst_gc_ovr        - min / mean / max of gc
#   best_/avg_/worst_gc_time_ovr   - min / mean / max of gc_time_in_s
#   n_top10_ovr, n_win_ovr         - number of stages with position <= 10 / == 1
#   ..._ovr_l3, n_top10_l3, n_win_l3 - the same statistics over the last three stages
#   perc_top10_ovr, perc_win_ovr   - the two counts divided by n_stages

stat_cols = ['position','time_in_s','gc','gc_time_in_s']
stat_funcs = ['min','mean','max']

# Number of tours and stages ridden
biker_by_stage_ = pd.DataFrame({
    'n_tours': stages.groupby(['name'])['year'].nunique(),
    'n_stages': stages.groupby(['name'])['stage'].count(),
})

# Best, average and worst tour stats
min_max_ovr = stages.groupby('name')[stat_cols].agg(stat_funcs)

# Number of Top10 positions
top10 = stages[stages['position'] <= 10].groupby('name')['position'].count().to_frame()

# Number of winning stage
winner_ = stages[stages['position'] == 1].groupby('name')['position'].count().to_frame()

# Defining last 3 stages data for stats purpose
last_3_stages = stages.sort_values(['name','year','stage']).groupby('name').tail(3)
min_max_avg_l3 = last_3_stages.groupby('name')[stat_cols].agg(stat_funcs)
top10_l3 = last_3_stages[last_3_stages['position'] <= 10].groupby('name')['position'].count().to_frame()
winner_l3 = last_3_stages[last_3_stages['position'] == 1].groupby('name')['position'].count().to_frame()

# Side-by-side join on the rider name; riders missing from a count table get 0
biker_by_stage_ = pd.concat([biker_by_stage_, min_max_ovr, top10, winner_, min_max_avg_l3, top10_l3, winner_l3], axis=1, sort=False)
biker_by_stage_ = biker_by_stage_.fillna(0)
# The 30 names below follow the concatenation order above exactly
biker_by_stage_.columns = ['n_tours','n_stages','best_pos_ovr','avg_pos_ovr','worst_pos_ovr','best_time_ovr','avg_time_ovr','worst_time_ovr',
                         'best_gc_ovr','avg_gc_ovr','worst_gc_ovr','best_gc_time_ovr','avg_gc_time_ovr','worst_gc_time_ovr','n_top10_ovr','n_win_ovr',
                         'best_pos_ovr_l3','avg_pos_ovr_l3','worst_pos_ovr_l3','best_time_ovr_l3','avg_time_ovr_l3','worst_time_ovr_l3',
                         'best_gc_ovr_l3','avg_gc_ovr_l3','worst_gc_ovr_l3','best_gc_time_ovr_l3','avg_gc_time_ovr_l3','worst_gc_time_ovr_l3',
                         'n_top10_l3','n_win_l3']

# Percentage of Top10 and winning stage
biker_by_stage_['perc_top10_ovr'] = biker_by_stage_['n_top10_ovr'] / biker_by_stage_['n_stages']
biker_by_stage_['perc_win_ovr'] = biker_by_stage_['n_win_ovr'] / biker_by_stage_['n_stages']

In [27]:
# Attach the per-rider statistics to every stage row of that rider.
# The right-hand key is passed as an array of index labels (the rider names that
# index `biker_by_stage_`) and is matched against the `name` column of `stages`.
biker_by_stage = pd.merge(stages, biker_by_stage_, left_on='name', right_on=biker_by_stage_.index)
biker_by_stage.head()

Unnamed: 0,name,biker_sys_nr,age,team,team_sys_nr,time_in_s,gc,gc_time_in_s,year,stage,...,best_gc_ovr_l3,avg_gc_ovr_l3,worst_gc_ovr_l3,best_gc_time_ovr_l3,avg_gc_time_ovr_l3,worst_gc_time_ovr_l3,n_top10_l3,n_win_l3,perc_top10_ovr,perc_win_ovr
0,Y.Hutarovich,100,31,AG2R La Mondiale,10072,20870.0,1,20860.0,2014,1,...,98,109.0,117,91441.0,104600.666667,112168.0,0.0,0.0,0.571429,0.142857
1,Y.Hutarovich,100,31,AG2R La Mondiale,10072,19455.0,2,40315.0,2014,2,...,98,109.0,117,91441.0,104600.666667,112168.0,0.0,0.0,0.571429,0.142857
2,Y.Hutarovich,100,31,AG2R La Mondiale,10072,13167.0,2,53482.0,2014,3,...,98,109.0,117,91441.0,104600.666667,112168.0,0.0,0.0,0.571429,0.142857
3,Y.Hutarovich,100,31,AG2R La Mondiale,10072,20609.0,3,74091.0,2014,4,...,98,109.0,117,91441.0,104600.666667,112168.0,0.0,0.0,0.571429,0.142857
4,Y.Hutarovich,100,31,AG2R La Mondiale,10072,17350.0,98,91441.0,2014,5,...,98,109.0,117,91441.0,104600.666667,112168.0,0.0,0.0,0.571429,0.142857


In [28]:
# Label (dependent variable): 1 when the rider finished the given stage
# in a Top 10 position, 0 otherwise.
biker_by_stage['y_label'] = (biker_by_stage['position'] <= 10).astype(int)

### Setting all DNFs/DNSs to 999 and the corresponding `time_in_s` and `gc_time_in_s` values to twice the last finisher's time

In [None]:
# TBD

### Manually setting captured jerseys

In [None]:
# TBD

<br><br><br><br>
# Stages summary
<br><br>

In [32]:
# url_tour_results

In [None]:
# Stage-overview URL of every season: the season URL plus the '&k=etapper' suffix
url_stages_summary = [tour + '&k=etapper' for tour in url_tour_results]

In [None]:
url_stages_summary

In [None]:
# First table of every season's stage-overview page, tagged with its season
stages_summary_ = []
for year in tour_years:
    season_urls = (url for url in url_stages_summary if 'y=' + str(year) in url)
    for summary_url in season_urls:
        season_table = pd.read_html(summary_url)[0]
        stages_summary_.append(season_table.assign(year=year))

# All seasons stacked; each season keeps its own row labels
stages_summary = pd.concat(stages_summary_, axis=0)
stages_summary

### Specifying stage type by its icon

In [None]:
# Stage-type names as used by the site (Norwegian) and their English meanings
stage_names_nor = ['Flatt', 'Smaakupert', 'Smaakupert-MF', 'Fjell', 'Fjell-MF', 'Tempo']
stage_names_en = ['flat', 'hilly', 'hilly-mf', 'mountain', 'mountain-mf', 'itt']

# Norwegian name -> English name
stage_names = {nor: en for nor, en in zip(stage_names_nor, stage_names_en)}
# Norwegian name -> integer code 1..6, in the order listed above
stage_type_recode = {nor: code for code, nor in enumerate(stage_names_nor, start=1)}

stage_names

In [None]:
# Historically two more icon names have appeared (as an 8th stage):
# Bakketempo : mountain-itt; Ukjent : question mark (unknown).
# They are not in `stage_names_nor`, so stages carrying them are left out below.

stage_type = []
for stage in url_stages_summary:
    r = requests.get(stage)
    soup = BeautifulSoup(r.text, 'html.parser')

    # Collect the file names (without the .gif extension) of all images on the
    # page that start with an upper-case letter followed by a lower-case one
    stage_soup = soup.find_all('img')
    stage_img = re.findall(r'([A-Z][a-z].*?)\.gif', str(stage_soup))

    # Keep only the names that are known stage types, in page order
    stage_type.extend(name_ for name_ in stage_img if name_ in stage_names_nor)

stage_type

In [None]:
stages_summary.head()

### Columns rearranging and cleaning

In [None]:
# Overwrite the column that came in as 'Unnamed: 1' with the scraped stage types;
# this needs exactly one `stage_type` entry per row of `stages_summary`.
stages_summary['Unnamed: 1'] = stage_type

# Name all twelve columns, then add the numeric stage-type code (Norwegian name -> 1..6)
stages_summary.columns = ['stage','stage_type','date','dist','to_drop','finish','stage_winner','stage_leader','points_leader','mountains_leader','sprint_leader','year']
stages_summary['stage_type_code'] = stages_summary['stage_type'].map(stage_type_recode)
stages_summary = stages_summary[['date','year','stage','stage_type','stage_type_code','dist','to_drop','finish','stage_winner','stage_leader','points_leader','mountains_leader','sprint_leader']]

# Translate the stage type to English only after the code was derived from the Norwegian name
stages_summary['stage_type'] = stages_summary['stage_type'].map(stage_names)

stages_summary.drop('to_drop', axis=1, inplace=True)
# Keep the columns from 'date' through 'finish' (the winner / leader columns are
# dropped here) and renumber the rows from 0.
# NOTE(review): this cell renames and drops columns in place, so it cannot be run twice.
stages_summary = stages_summary.loc[:, 'date':'finish'].reset_index(drop=True)
stages_summary

In [None]:
stages_summary.groupby('year').agg(('min','max'))

### Setting stage date 

In [None]:
from datetime import timedelta

In [None]:
# Complete the scraped date string with the season, separated by a dot
stages_summary['date'] = stages_summary['date'].str.cat(stages_summary['year'].astype('str'), sep='.')

In [None]:
# Parse the 'day.month-abbreviation.year' strings into datetimes
# NOTE(review): `%b` is matched against month abbreviations of the active locale -
# confirm the scraped month names fit the locale this notebook runs under.
stages_summary['date'] = pd.to_datetime(stages_summary['date'], format='%d.%b.%Y')
stages_summary['date']

### Weather info

In [None]:
# Taken into account: precipitation (0/1), temperature (float), possibly whether it was cloudy
# https://dane.imgw.pl/data/dane_pomiarowo_obserwacyjne/

'''
Kod stacji                        9
Nazwa stacji                     30
Rok                               4
Miesiąc                           2
Dzień                             2
Godzina                           2
Temperatura powietrza             6/1
Status pomiaru TEMP               1
Temperatura termometru zwilżonego 6/1
Status pomiaru TTZW               1
Wskaźnik lodu                     1
Wskaźnik wentylacji               1
Wilgotność względna               5
Status pomiaru WLGW               1
Kod kierunku wiatru               3
Status pomiaru DKDK               1
Prędkość wiatru                   5
Status pomiaru FWR                1
Zachmurzenie ogólne               5
Status pomiaru ZOGK               1
Widzialność                       5
Status pomiaru WID                1

Status "8" brak pomiaru
Zachmurzenie
        - skala 0 - 10 do dn.31.12.1988
        - skala 0 – 8 od dn.01.01.1989
'''

# ONCE THE WEATHER DATA IS PUBLISHED, SWITCH TO THE CURRENT YEAR

# Column names for the header-less weather files, in file order
names = ['station_code','station_name','year','month','day','hour',
         'temp','temp_status','wet_temp','ttzw_status','ice_ind','vent_ind',
         'rel_humid','wlgw_status','wind_dir','dkdk_status','wind_speed','fwr_status',
         'clouds_ovrl','zogk_status','visibility','wid_status']

# One file per month in which a tour took place
weather_08_14 = pd.read_csv('data/weather_k_t_08_2014.csv', encoding = 'ISO-8859-2', header=None, names=names)
weather_08_15 = pd.read_csv('data/weather_k_t_08_2015.csv', encoding = 'ISO-8859-2', header=None, names=names)
weather_07_16 = pd.read_csv('data/weather_k_t_07_2016.csv', encoding = 'ISO-8859-2', header=None, names=names)
weather_07_17 = pd.read_csv('data/weather_k_t_07_2017.csv', encoding = 'ISO-8859-2', header=None, names=names)
weather_08_17 = pd.read_csv('data/weather_k_t_08_2017.csv', encoding = 'ISO-8859-2', header=None, names=names)

# NEW DATA FOR THE PROPER MONTHS HAS TO BE DOWNLOADED HERE
# (for now the July/August 2018 frames are read from the 05/2018 and 06/2018 files)
weather_07_18 = pd.read_csv('data/weather_k_t_05_2018.csv', encoding = 'ISO-8859-2', header=None, names=names)
weather_08_18 = pd.read_csv('data/weather_k_t_06_2018.csv', encoding = 'ISO-8859-2', header=None, names=names)

In [None]:
# Relabel the month of the two 2018 frames as July / August, overriding the month
# stored in the files (they were read from the 05/2018 and 06/2018 files)
weather_07_18['month'] = 7
weather_08_18['month'] = 8

In [None]:
# Stack the monthly frames and keep only the rows recorded at hour 12,
# renumbering the rows from 0.
# NOTE(review): `weather_07_18` is not part of this list - confirm that is intended.
weather_info = (
    pd.concat([weather_08_14, weather_08_15, weather_07_16,
               weather_07_17, weather_08_17, weather_08_18], axis=0)
      .loc[lambda frame: frame['hour'] == 12]
      .reset_index(drop=True)
)
weather_info.head()

In [None]:
# Build a 'year.month.day' string per observation and parse it into a datetime
weather_info['date'] = pd.to_datetime(
    weather_info[['year', 'month', 'day']].astype('str').agg('.'.join, axis=1),
    format='%Y.%m.%d',
)

In [None]:
weather_info

In [None]:
weather_info[weather_info.date.isin(stages_summary.date) & (weather_info.year == 2018) & weather_info.station_name.str.contains('KRAK')]

In [None]:
tour_years

In [None]:
# Weather measurements kept for every stage
weather_cols = ['temp','rel_humid','wind_dir','wind_speed','clouds_ovrl','visibility']
# Station-name fragment used for stages 1..7, in stage order.
# NOTE(review): the same stations are used for every season - confirm this fits each year's route.
stage_stations = ['KRAK', 'DRONI', 'BRENNA', 'DRONI', 'CHORZ', 'LIMA', 'BUKOW']

# Observations taken on any stage date
on_stage_day = weather_info.date.isin(stages_summary.date)

stage_weather_rows = []
for year in tour_years:
    in_year = on_stage_day & (weather_info.year == year)
    for stage_idx, station in enumerate(stage_stations):
        at_station = weather_info.station_name.str.contains(station)
        # The k-th matching row of a station is taken as the weather of the k-th
        # stage of the season; this needs one row per stage date at that station.
        stage_weather_rows.append(weather_info[in_year & at_station].iloc[stage_idx][weather_cols])

# One row per (season, stage), in scraping order, numbered from 0.
# Built in one go because DataFrame.append no longer exists in pandas 2.x.
weather_stages = pd.DataFrame(stage_weather_rows, columns=weather_cols).reset_index(drop=True)

In [None]:
weather_stages

### Joining stage summary, weather info and biker stage position list dataframes

In [None]:
# Put the weather columns next to the stage summary. Rows are paired by index
# label; both frames were renumbered from 0 when they were built.
# NOTE(review): running this cell again adds the weather columns a second time.
stages_summary = pd.concat([stages_summary, weather_stages], axis=1)
stages_summary

In [None]:
# stages_summary_cut = stages_summary.loc[:,'date':'visibility']
# stages_summary_cut.to_csv('data/stages_summary_cut.csv', index=False)
# stages_summary_cut

<br><br><br>
# Bikers statistics and overall description
<br><br>

### Overall bikers list

In [None]:
# List of all riders: one profile URL per system number collected earlier,
# in the order in which the riders were added to `biker_dict_stage`
biker_base_url = 'http://firstcycling.com/rider.php?r='

bikers_ranking_urls = [biker_base_url + sys_nr for sys_nr in biker_dict_stage.values()]
len(bikers_ranking_urls)

### Scraping bikers stats

In [None]:
def _count_in_section(section_pattern, item_pattern, html):
    """Count `item_pattern` matches inside the first `section_pattern` match; 0 when the section is absent."""
    section = re.search(section_pattern, html)
    if section is None:
        return 0
    return len(re.findall(item_pattern, section.group()))


# Names for the season-table columns once 'Season' has become the index
season_cols = ['div','team','fc_rank','to_drop','uci_rank','to_drop','race_days','wins','year_km']
# A four-digit year preceded by '(', ' ' or ',' - one match per listed season
jersey_year = r'[( ,](\d{4})'

# One single-row frame per rider, in URL order; combined once after the loop
# (DataFrame.append no longer exists in pandas 2.x and grew the frame quadratically)
rider_frames = []
for i, url_ in enumerate(bikers_ranking_urls):

    # Stats scraping with the season-by-season separation
    r = requests.get(url_)
    table_ = pd.read_html(StringIO(r.text))
    # Setting 'Season' column as an index for dataframe transposition (rows labelled 0-4 only)
    biker_rank = table_[0].loc[:4, :].set_index('Season')
    biker_rank.columns = season_cols
    biker_rank = biker_rank.drop(['to_drop'], axis=1)
    # Flatten the season x statistic table into a single row
    biker_rank = biker_rank.stack().to_frame().T
    # Renaming column names to '<season>_<statistic>'
    biker_rank.columns = ['{}_{}'.format(*c) for c in biker_rank.columns]
    # Removing duplicated columns (the first occurrence is kept)
    biker_rank = biker_rank.loc[:, ~biker_rank.columns.duplicated()]

    # Physical stats, nationality, jerseys, victories and followers extraction
    soup = BeautifulSoup(r.text, 'html.parser')
    stats_html = str(soup.find_all('div', class_='back'))

    # The trailing `|$` alternative makes findall always return at least one item,
    # so a missing value shows up as an empty string instead of raising
    nation = re.findall(r'nat=.*?>(\w.*?)<|$', stats_html)[0]
    height = re.findall(r'Height.*?(\d{1}\.\d{2})|$', stats_html)[0]
    total_wins = _count_in_section(r'Victories.*?<h2', r'\d{4}(,|\))', stats_html)
    youth_jersey = _count_in_section(r'Youth.*?<\/p', jersey_year, stats_html)
    point_jersey = _count_in_section(r'Points.*?<\/p', jersey_year, stats_html)
    mount_jersey = _count_in_section(r'Mount.*?<\/p', jersey_year, stats_html)
    sprint_jersey = _count_in_section(r'Sprint.*?<\/p', jersey_year, stats_html)
    followers = len(re.findall('ID', str(soup.find_all('p', class_='small'))))

    # Collecting the above values into a one-row frame
    results_dict = {
        'nation':nation,
        'height':height,
        'total_wins':total_wins,
        'youth_jerseys':youth_jersey,
        'point_jerseys':point_jersey,
        'mount_jerseys':mount_jersey,
        'sprint_jerseys':sprint_jersey,
        'followers':followers
    }
    stats_table = pd.DataFrame(data=results_dict, index=[0])

    # Concatenating both resulting frames side by side
    biker_rank = pd.concat([biker_rank, stats_table], axis=1)
    print('Scrapped position no.', i, nation, height, total_wins, youth_jersey, point_jersey, mount_jersey, sprint_jersey, followers)
    rider_frames.append(biker_rank)

bikers_stats_ = pd.concat(rider_frames, sort=False) if rider_frames else pd.DataFrame()

In [None]:
# Sorting columns alphabetically
bikers_stats_ = bikers_stats_.sort_index(axis=1)

# Rider names come from the dictionary keys, in the same order in which the
# profile URLs were built; added as a column only after sorting, so that it
# stays the last column.
bikers_stats_['name'] = list(biker_dict_stage)[:len(bikers_stats_)]

# Resetting the index for the later concatenation of tables
bikers_stats_ = bikers_stats_.reset_index(drop=True)
# A height that was not found was scraped as an empty string. Assigning the result
# back (instead of an in-place replace on the column selection) makes sure the
# frame itself is updated.
bikers_stats_['height'] = bikers_stats_['height'].replace('', np.nan)

In [None]:
# From the whole historical results dataset (2013-2019) choosing only years 2015-2018 due to many missing values before that timepoint.
# The label slice relies on the alphabetical column order set in the previous cell;
# the last nine columns are taken by position.
# NOTE(review): those nine are presumably the eight rider-level stats plus `name` - confirm against bikers_stats_.columns.
bikers_stats = pd.concat([bikers_stats_.loc[:,'2015_div':'2018_year_km'], bikers_stats_.iloc[:,-9:]], axis=1)
bikers_stats.head(3)

### Setting final biker class dataframe

In [None]:
biker_by_stage.head()

In [None]:
bikers_stats.head()

In [None]:
# Biker class dataset: every stage row of a rider, extended with that rider's
# historical stats (rows are matched on the rider name)
biker_class = biker_by_stage.merge(bikers_stats, on='name')
biker_class

In [None]:
biker_class.columns

In [None]:
# Rearranging columns for clearness:
# 1) move the last nine columns to the front,
# 2) put eleven descriptive columns first, followed by everything from position 11
#    of the rotated list onwards.
# NOTE(review): the offsets 9 and 11 depend on the exact column order after the
# merge. Any column among the first eleven of the rotated list that is not named
# explicitly below is dropped, and any explicitly named column that also sits at
# position 11 or later appears twice - verify against biker_class.columns.
cols = biker_class.columns.tolist()
cols = cols[-9:] + cols[:-9]
biker_class = biker_class[cols]
biker_class = biker_class.reindex(columns=['name','nation','height','age','total_wins','youth_jerseys',
                             'point_jerseys','mount_jerseys','sprint_jerseys','followers','year']+cols[11:])
# biker_class.drop('name_to_drop', axis=1, inplace=True)

In [None]:
biker_class.head(3)

# Missing values

In [None]:
# Columns holding at least one missing value, with their number of missing entries
biker_class_null_cols = biker_class.columns[biker_class.isna().any(axis=0)]
biker_class.loc[:, biker_class_null_cols].isna().sum()

<br><br><br>
# Final dataframes

**Next notebook:** [Part 2: Data statistics and visualization](tdp_2_3_data_statistics_visualization.ipynb)

### Overall stages summary

In [54]:
# tu dodać dla każdego etapu dane pogodowe: deszcz/słońce(temp)
# stages_summary.to_csv('preprocessed/stages_summary.csv', index=False)
# stages_summary.head()

### Tour results by stages

In [56]:
# tu etapy są jeden pod drugim, więc numery 'position' się powtarzają 1-154 x 7 etapów 
# stages.to_csv('preprocessed/stages.csv', index=False)
# stages.head()

### Biker by stage results

In [58]:
# biker_by_stage.to_csv('preprocessed/biker_by_stage.csv')
# biker_by_stage.head()

### Historical bikers data

In [60]:
# bikers_stats.to_csv('preprocessed/biker_stats.csv')
# bikers_stats.head()

### Final biker class dataframe

In [62]:
# Save the final dataset for the next notebooks. The row index is written as well
# (no `index=False`), and the `preprocessed/` directory has to exist beforehand.
biker_class.to_csv('preprocessed/biker_class.csv')