<a href="https://colab.research.google.com/github/lucasreis95/world-surf-league-data/blob/main/notebooks/07_silver_atheletes_events_injuries.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# import libs
import pandas as pd
import numpy as np
import re
import pandas_gbq
from bs4 import BeautifulSoup
import requests

In [None]:
# load the bronze-layer ranking scrape from BigQuery into a dataframe
BRONZE_RANKING_TABLE = 'wsl-data-397017.01_bronze.wsl_ranking_scrap'
df_raw = pandas_gbq.read_gbq(query_or_table=BRONZE_RANKING_TABLE)

Downloading: 100%|[32m██████████[0m|


In [None]:
# columns kept from the raw ranking: athlete name, the 11 per-event result
# columns ('1_artboard' ... '11_artboard') and the season year
ranking_columns = ['name', *[f'{slot}_artboard' for slot in range(1, 12)], 'year']
# labels of separator rows that are not athletes
# (per the original note, these only show up in the 2022 ranking)
cutoff_labels = ['Final 5 Cutoff', 'Cut Line', 'WSL Final 5 Cutoff', 'Mid-Season Cut Line']

# work on a copy so df_raw stays untouched, order rows by 'index', keep the relevant columns
df_rank = df_raw.copy().sort_values(by='index')[ranking_columns]
# drop the cutoff separator rows
is_cutoff_row = df_rank['name'].isin(cutoff_labels)
df_rank = df_rank[~is_cutoff_row]

In [None]:
# build, for every season, one row per (athlete, event) and keep only the events
# whose scraped ranking cell mentions Replaced / Injured / Withdrawn

# seasons to process; 2020 is skipped because there was no contest that year
year_list = [i for i in range(2010, 2025) if i != 2020]

# per-event result columns of the ranking table
event_columns = [c for c in df_rank.columns if str(c).endswith('_artboard')]

# cell rendered for events that have not happened yet
FUTURE_EVENT_CELL = '<td class="athlete-event-place"><span class="tooltip-item">-</span></td>'
# hard-coded injury cell that is registered in duplicate on the page and must be dropped
DUPLICATED_INJURY_CELL = '<td class="athlete-event-place"><span class="tooltip-item out out" data-tooltip=\'{"content":"&lt;strong class=\\"tooltip-contents\\"&gt;&lt;span class=\\"tooltip-inj\\"&gt;INJ&lt;\\/span&gt; &lt;span class=\\"status-note\\"&gt;Injured: Ongoing Injury - Replaced by Brodi Sale&lt;\\/span&gt;&lt;\\/strong&gt;"}\'>-</span></td>'

# get injured df
df_inj_list = []
for year in year_list:
  year_str = str(year)
  df_inj_raw = df_rank[df_rank['year'] == year_str].copy()
  # mark placeholder values as missing; the dict form is used because
  # replace('-', None) forward-fills instead of inserting nulls on older pandas
  df_inj_raw = df_inj_raw.replace({'-': None, 'nan': None})
  # an event column that is empty for every athlete is an event that did not happen:
  # some years had 11 events, others just 7
  n_events_on_year = int(df_inj_raw[event_columns].notna().any().sum())
  n_athletes = len(df_inj_raw)
  # repeat every athlete row once per event held on this year (rows stay grouped by athlete)
  df_inj_raw = df_inj_raw.iloc[np.repeat(np.arange(n_athletes), n_events_on_year)].reset_index(drop=True)
  # number the repeated rows 1..n_events_on_year for each athlete; done by position so that
  # duplicated or missing names cannot break the numbering
  df_inj_raw['event_order'] = np.tile(np.arange(1, n_events_on_year + 1), n_athletes)
  # make request (fail fast on a hung connection or an HTTP error page)
  response = requests.get('https://www.worldsurfleague.com/athletes/tour/mct?year=' + year_str, timeout=30)
  response.raise_for_status()
  soup = BeautifulSoup(response.text, 'html.parser')
  # get all event places and infos, as raw html strings
  event_cells = [str(cell) for cell in soup.find_all('td', attrs = {'class':'athlete-event-place'})]
  # keep just events that already happened and drop the duplicated injury register
  event_cells = [cell for cell in event_cells if cell not in (FUTURE_EVENT_CELL, DUPLICATED_INJURY_CELL)]
  # scraped cells are matched to the (athlete, event) rows purely by position
  if len(event_cells) != len(df_inj_raw):
    raise ValueError(
      f'{year_str}: scraped {len(event_cells)} event cells but expected {len(df_inj_raw)} '
      f'({n_athletes} athletes x {n_events_on_year} events)'
    )
  # filter where the cell contains Replaced, Injured or Withdrawn
  event_place = pd.Series(event_cells, index = df_inj_raw.index, dtype = 'object')
  is_missed_event = event_place.str.contains('Replaced|Injured|Withdrawn')
  # select just relevant columns
  df_inj_raw = df_inj_raw.loc[is_missed_event, ['name', 'year', 'event_order']]
  # append to main list
  df_inj_list.append(df_inj_raw)
  # print process has finished
  print(year, 'injuried df was added.')
# concat all years dfs
df_inj = pd.concat(df_inj_list)

2010 injuried df was added.
2011 injuried df was added.
2012 injuried df was added.
2013 injuried df was added.
2014 injuried df was added.
2015 injuried df was added.
2016 injuried df was added.
2017 injuried df was added.
2018 injuried df was added.
2019 injuried df was added.
2021 injuried df was added.
2022 injuried df was added.
2023 injuried df was added.
2024 injuried df was added.


In [None]:
# clean: turn the raw 'name' value (athlete name with the country glued to its end,
# per the original notes) into a lower-case athlete name and rename the columns

# two-word countries are joined with "_" and lower-cased on the second word so the
# country ends up as a single token with a single capital letter
multiword_countries = {
    'United States': 'United_states',
    'South Africa': 'South_africa',
    'New Zealand': 'New_zealand',
    'French Polynesia': 'French_polynesia',
    'Costa Rica': 'Costa_rica',
}

# remove () and their content from names
athlete_name = df_inj['name'].str.replace(r'\([^)]*\)', '', regex=True).str.strip()
# replace country names (literal match; regex=False keeps this identical across pandas versions)
for country, joined_country in multiword_countries.items():
  athlete_name = athlete_name.str.replace(country, joined_country, regex=False)

athlete_name = (
    athlete_name
    # add a space before EVERY capital letter; this detaches the trailing country from
    # the name (it also splits names with inner capitals, e.g. "McX" -> "Mc X")
    .str.replace(r"([A-Z])", r" \1", regex=True).str.strip()
    # keep everything before the last space, i.e. drop the country token
    .str.rsplit(n=1).str[0]
    # lower case and collapse the repeated spaces introduced above
    .str.lower()
    .str.replace(r'\s+', ' ', regex=True)
)

# store the cleaned name and rename columns
df_inj = (
    df_inj
    .assign(name=athlete_name)
    .rename(columns = {
        'name':'athlete_name',
        'year':'season_year'
    })
)

In [None]:
# write the cleaned injuries dataframe to the silver table in BigQuery,
# replacing the table if it already exists
pandas_gbq.to_gbq(
    dataframe=df_inj,
    destination_table='wsl-data-397017.02_silver.wsl_atheletes_events_injuries',
    project_id='wsl-data-397017',
    if_exists='replace',
)

100%|██████████| 1/1 [00:00<00:00, 8035.07it/s]
