<a href="https://colab.research.google.com/github/lucasreis95/world-surf-league-data/blob/main/notebooks/05_silver_athletes_dimensions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# import libs
import pandas as pd
import pandas_gbq
import numpy as np
import re

In [None]:
# Read the scraped athletes table from the bronze dataset into a DataFrame.
# project_id is passed explicitly (the same project the final to_gbq write uses)
# so the read does not depend on the environment's default project.
df_raw = pandas_gbq.read_gbq(
    query_or_table='wsl-data-397017.01_bronze.wsl_athletes_scrap',
    project_id='wsl-data-397017',
)

Downloading: 100%|[32m██████████[0m|


In [None]:
# Clean the raw athletes frame into `df`.
# Work on a copy: a plain `df = df_raw` only aliases the raw frame, so the column
# assignments below would write through to df_raw and a re-run of this cell
# would start from already-modified data.
df = df_raw.copy()

DIGIT_COLUMNS = ['athlete_height', 'athlete_weight']
# athlete_home_town is deliberately left untouched (its cleaning steps are disabled).
TEXT_COLUMNS = ['athlete_name', 'athlete_country', 'athlete_stance', 'athlete_first_season']

# keep just numbers: every non-digit character is dropped, so stringified missing
# values ('None', 'nan') collapse to '' and become NaN further down.
# NOTE(review): a decimal point is dropped as well ('1.80' -> '180') — confirm the
# raw height/weight format makes that safe.
for col in DIGIT_COLUMNS:
    df[col] = df[col].apply(lambda x: re.sub("[^0-9]", "", str(x)))

# convert to datetime format (strings shaped like 'Feb 11, 1972')
df['athlete_birth_date'] = pd.to_datetime(df['athlete_birth_date'], format='%b %d, %Y')

# text columns: strip, collapse runs of whitespace into one space, then lower-case
for col in TEXT_COLUMNS:
    df[col] = (
        df[col]
        .str.strip()
        .replace(r'\s+', ' ', regex=True)
        .str.lower()
    )

# replace empty strings with nan (exact match on '', so no regex flag is needed)
df = df.replace('', np.nan)

# convert the digit-only strings to numeric
for col in DIGIT_COLUMNS:
    df[col] = df[col].astype(float)

### Fill stance null values


In [None]:
# list the names of athletes whose stance is missing
missing_stance = df['athlete_stance'].isna()
df.loc[missing_stance, 'athlete_name'].reset_index(drop=True)

0        chris davidson
1           daniel ross
2          luke stedman
3            roy powers
4            luke munro
5         kekoa bacalso
6              ben dunn
7            marco polo
8         drew courtney
9            kael walsh
10        tikanui smith
11    david delroy-carr
12       morgan cibilic
13        lucas vicente
14         dylan moffat
15        ivan florence
16          jack thomas
17           ben spence
18        inia nakalevu
Name: athlete_name, dtype: object

In [None]:
# manual stance lookup (athlete_name -> stance) for surfers whose stance is missing,
# grouped by stance so each name is listed once
_regular_surfers = [
    'chris davidson',
    'daniel ross',
    'luke stedman',
    'roy powers',
    'luke munro',
    'kekoa bacalso',
    'ben dunn',
    'marco polo',
    'drew courtney',
    'kael walsh',
    'tikanui smith',
    'david delroy-carr',
    'morgan cibilic',
    'dylan moffat',
    'joao chianca',
    'ben spence',
    'jack thomas',
]
_goofy_surfers = [
    'inia nakalevu',
    'jett schilling',
    'lucas vicente',
    'ivan florence',
    'tully wylie',
]
fill_na_stance_dict = {
    **dict.fromkeys(_regular_surfers, 'regular'),
    **dict.fromkeys(_goofy_surfers, 'goofy'),
}

In [None]:
# look up every athlete in the manual dict and use the result only where stance is null
stance_lookup = df['athlete_name'].map(fill_na_stance_dict)
df['athlete_stance'] = df['athlete_stance'].fillna(stance_lookup)
# list athletes whose stance is still null after the fill (must be empty)
# NOTE(review): the saved output lists a name that is also a key in
# fill_na_stance_dict — the output may be stale; re-run to confirm.
still_missing = df['athlete_stance'].isna()
df.loc[still_missing, 'athlete_name'].reset_index(drop=True)

0    jack thomas
Name: athlete_name, dtype: object

In [None]:
# write the cleaned athletes frame to the silver table, replacing it if it already exists
pandas_gbq.to_gbq(
    df,
    'wsl-data-397017.02_silver.wsl_athletes_dimensions',
    project_id='wsl-data-397017',
    if_exists='replace',
)

100%|██████████| 1/1 [00:00<00:00, 1687.85it/s]
