In [1]:
import numpy as np
import pandas as pd

In [2]:
# Identifier of the scrape run to process; it is interpolated into the input
# file names (scraped_data/players_<timestamp>_<i>.csv) and into the output
# file name (cleansed_data/players_<timestamp>.csv).
timestamp = '20230425190937'

In [6]:
# Read the six partial exports (suffixes 1..6) of this scrape run and stack
# them into a single frame. The parts are collected first and concatenated
# once: calling pd.concat inside the loop re-copies the growing frame on every
# iteration and starts from an empty DataFrame, which newer pandas warns about.
partial_dfs = [
    pd.read_csv(f"scraped_data/players_{timestamp}_{i}.csv", sep=';')
    for i in range(1, 7)
]

# ignore_index=True gives a fresh 0..n-1 index, like reset_index(drop=True).
df = pd.concat(partial_dfs, ignore_index=True)
df

Unnamed: 0,league_country,scraping_time,link,no,name,league,club,league_level,market_value,market_value_currency,...,assists,own_goals,ins,outs,yellow_cards,yellow_red_cards,red_cards,penalty_goals,minutes_per_goal,minutes
0,Schweiz,20230425190948,https://www.transfermarkt.ch/david-von-ballmoo...,26.0,David von Ballmoos,Super League,BSC Young Boys,1.Liga,"2,50 Mio. €Letzte Änderung: 10.11.2022",Mio. €,...,,-,-,1,-,-,-,,,1.324'
1,Schweiz,20230425191002,https://www.transfermarkt.ch/anthony-racioppi/...,1.0,Anthony Racioppi,Super League,BSC Young Boys,1.Liga,"1,30 Mio. €Letzte Änderung: 28.03.2023",Mio. €,...,,-,1,-,-,-,-,,,1.286'
2,Schweiz,20230425191015,https://www.transfermarkt.ch/marvin-keller/pro...,33.0,Marvin Keller,Super League,BSC Young Boys,1.Liga,500 Tsd. €Letzte Änderung: 29.12.2022,Tsd. €,...,,-,-,-,-,-,-,,,-
3,Schweiz,20230425191033,https://www.transfermarkt.ch/dario-marzino/pro...,40.0,Dario Marzino,Super League,BSC Young Boys,1.Liga,100 Tsd. €Letzte Änderung: 28.03.2023,Tsd. €,...,,-,-,-,-,-,-,,,-
4,Schweiz,20230425191056,https://www.transfermarkt.ch/leandro-zbinden/p...,61.0,Leandro Zbinden,Super League,BSC Young Boys,1.Liga,100 Tsd. €Letzte Änderung: 10.11.2022,Tsd. €,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18330,Norwegen,20230426091213,https://www.transfermarkt.ch/gilbert-koomson/p...,7.0,Gilbert Koomson,Eliteserien,Sandefjord,1.Liga,450 Tsd. €Letzte Änderung: 09.12.2022,Tsd. €,...,-,-,2,1,-,-,-,-,-,88'
18331,Norwegen,20230426091231,https://www.transfermarkt.ch/youssef-chaib/pro...,11.0,Youssef Chaib,Eliteserien,Sandefjord,1.Liga,100 Tsd. €Letzte Änderung: 09.12.2022,Tsd. €,...,,,,,,,,,,
18332,Norwegen,20230426091252,https://www.transfermarkt.ch/jakob-dunsby/prof...,27.0,Jakob Dunsby,Eliteserien,Sandefjord,1.Liga,100 Tsd. €Letzte Änderung: 02.02.2023,Tsd. €,...,-,-,1,2,-,-,-,-,-,182'
18333,Norwegen,20230426091307,https://www.transfermarkt.ch/alexander-ruud-tv...,9.0,Alexander Ruud Tveter,Eliteserien,Sandefjord,1.Liga,300 Tsd. €Letzte Änderung: 09.12.2022,Tsd. €,...,,,,,,,,,,


In [7]:
# handle faulty records: these two contract_until values do not match the
# '%d.%m.%Y' format used when the column is parsed later, so one is blanked
# and the other is completed to a full date.
# np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
df.loc[df['contract_until'] == '00.00.0', 'contract_until'] = np.nan
df.loc[df['contract_until'] == '06.2024', 'contract_until'] = '01.06.2024'


In [8]:
def parse_market_value_with_latest_change(scraped_string):
    """Parse a market value that may be followed by a 'Letzte Änderung' suffix.

    Everything from the first occurrence of 'Letzte Änderung' onwards is
    discarded and the remaining prefix is handed to ``parse_market_value``.

    Args:
        scraped_string: Raw text such as
            '2,50 Mio. €Letzte Änderung: 10.11.2022' or just '2,50 Mio. €'.

    Returns:
        The market value as a float, as returned by ``parse_market_value``.

    Raises:
        ValueError: Propagated from ``parse_market_value`` when the value part
            has no recognised unit.
    """
    # partition() returns the whole string as the head when the marker is
    # missing. The previous find()-based slice got -1 in that case and cut off
    # the trailing '€', which made an otherwise valid value fail to parse.
    value_part, _, _ = scraped_string.partition('Letzte Änderung')
    return parse_market_value(value_part)

def parse_market_value(scraped_string):
    """Convert a scraped value such as '2,50 Mio. €' or '500 Tsd. €' to a float.

    Surrounding whitespace is stripped and decimal commas become dots. The
    last six characters select the multiplier: 'Mio. €' multiplies by
    1000000, 'Tsd. €' by 1000.

    Args:
        scraped_string: Raw value text ending in one of the two unit suffixes.

    Returns:
        The numeric part multiplied by the unit's factor, as a float.

    Raises:
        ValueError: If the text does not end in a known unit (the normalised
            text is the exception argument), or if the numeric part cannot be
            converted by ``float``.
    """
    multipliers = {'Mio. €': 1000000, 'Tsd. €': 1000}

    value_as_string = scraped_string.strip().replace(',', '.')
    number_part, unit = value_as_string[:-6], value_as_string[-6:]

    if unit not in multipliers:
        raise ValueError(value_as_string)
    return float(number_part) * multipliers[unit]

def league_level_function(level):
    """Return the league tier encoded in the first character of ``level``.

    Args:
        level: Value such as '1.Liga'; anything sliceable whose first
            character is a digit.

    Returns:
        The first character as an int, or 9 when slicing or the int
        conversion raises ValueError or TypeError (e.g. for missing values
        or text that does not start with a digit).
    """
    fallback_level = 9
    try:
        leading_char = level[:1]
        parsed_level = int(leading_char)
    except (ValueError, TypeError):
        return fallback_level
    return parsed_level

# --- Normalise the raw scraped strings into typed columns ---------------------
# Assigning a Series computed on the non-null rows back to the column relies on
# index alignment: rows that were missing simply stay NaN / NaT.

# League tier: leading digit of e.g. '1.Liga'; unparsable values become 9.
df['league_level'] = df['league_level'].apply(league_level_function)

# Market values: numeric amount, last character of the unit text as currency,
# and the trailing 10 characters (dd.mm.YYYY) of the correction text as a date.
df['market_value'] = df['market_value'].dropna().apply(parse_market_value_with_latest_change)
df['market_value_currency'] = df['market_value_currency'].dropna().str[-1:]
df['market_value_latest_correction'] = pd.to_datetime(
    df['market_value_latest_correction'].dropna().str[-10:], format='%d.%m.%Y'
)
df['highest_market_value'] = df['highest_market_value'].dropna().apply(parse_market_value)
df['highest_market_value_date'] = pd.to_datetime(
    df['highest_market_value_date'].dropna(), format='%d.%m.%Y'
)

# Height: drop the last two characters, use a decimal point, scale by 100.
df['height'] = df['height'].dropna().str[:-2].str.replace(',', '.', regex=False).astype(float) * 100

# Dates: '-' is a placeholder for "no date" and becomes NaT.
# np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
df['club_since'] = pd.to_datetime(df['club_since'].replace('-', np.nan), format='%d.%m.%Y')
df['contract_until'] = pd.to_datetime(df['contract_until'].replace('-', np.nan), format='%d.%m.%Y')

# String clean-up of the statistics MUST run before '-' is replaced by 0:
# the .str accessor returns NaN for non-string elements, so doing the
# replacement first would turn every freshly inserted 0 back into NaN.
# A bare '-' contains none of the characters removed here, so it passes
# through unchanged and is handled by the replacement loop below.
df['points_per_game'] = df['points_per_game'].dropna().str.replace(',', '.', regex=False)
for col in ['minutes_per_goal', 'minutes']:
    # Strip the trailing minute mark and the '.' separator, e.g. "1.324'" -> '1324'.
    df[col] = df[col].dropna().str.replace('\'', '', regex=False).str.replace('.', '', regex=False)

# In the statistics columns '-' is replaced by 0.
for col in ['goals_conceded', 'clean_sheets', 'games', 'points_per_game', 'goals', 'assists', 'own_goals', 'ins', 'outs', 'yellow_cards', 'yellow_red_cards', 'red_cards', 'penalty_goals', 'minutes_per_goal', 'minutes']:
    df[col] = df[col].replace('-', 0)

# Collapse the three international columns into one team name. Later
# assignments overwrite earlier ones, so former_international wins over
# international_active, which wins over international.
df.loc[df['international'].notnull(), 'international_team'] = df['international']
df.loc[df['international_active'].notnull(), 'international_team'] = df['international_active']
df.loc[df['former_international'].notnull(), 'international_team'] = df['former_international']

# The two status columns become 0/1 flags; the plain column is dropped because
# its value now lives in international_team.
df['international_active'] = df['international_active'].notna().astype(int)
df['former_international'] = df['former_international'].notna().astype(int)
df = df.drop('international', axis=1)



In [9]:
# Rename the scraped snake_case columns to the published CamelCase names and
# then keep only the listed columns, in the listed order (any column not
# listed, such as the CSV's unnamed index column, is dropped by the selection).
df = df.rename(
    columns={
        'league_country': 'LeagueCountry',
        'scraping_time': 'ScrapingTime',
        'link': 'Source',
        'no': 'No',
        'name': 'Name',
        'league': 'League',
        'club': 'Club',
        'league_level': 'NationalLeagueLevel',
        'market_value': 'Value',
        'market_value_currency': 'ValueCurrency',
        'market_value_latest_correction': 'ValueLastUpdate',
        'highest_market_value': 'HighestValue',
        'highest_market_value_date': 'HighestValueDate',
        'age': 'Age',
        'height': 'Height',
        'nationality': 'Nationality',
        'position': 'Position',
        'foot': 'Foot',
        'consultancy': 'Consultancy',
        'supplier': 'Supplier',
        'club_since': 'ClubSince',
        'contract_until': 'ContractUntil',
        'international_team': 'InternationalTeam',
        'international_active': 'ActiveInternational',
        'former_international': 'FormerInternational',
        'international_games': 'InternationalGames',
        'international_goals': 'InternationalGoals',
        'starting_eleven_quote': 'StartingElevenQuote',
        'minutes_quote': 'MinutesQuote',
        'penalty_saves_quote': 'TwPenaltySavesQuote',
        'goal_participation_quote': 'FsGoalParticipationQuote',
        'instagram': 'Instagram',
        'injury': 'Injury',
        'goals_conceded': 'TwGoalsConceded',
        'clean_sheets': 'TwCleanSheets',
        'games': 'Games',
        'points_per_game': 'PointsPerGame',
        'goals': 'Goals',
        'assists': 'FsAssists',
        'own_goals': 'OwnGoals',
        'ins': 'Ins',
        'outs': 'Outs',
        'yellow_cards': 'YellowCards',
        'yellow_red_cards': 'YellowRedCards',
        'red_cards': 'RedCards',
        'penalty_goals': 'FsPenaltyGoals',
        'minutes_per_goal': 'FsMinutesPerGoal',
        'minutes': 'Minutes',
    }
)[
    [
        # league / club context
        'LeagueCountry', 'League', 'NationalLeagueLevel', 'Club',
        # player identity
        'No', 'Name',
        # market value
        'Value', 'ValueLastUpdate', 'HighestValue', 'HighestValueDate', 'ValueCurrency',
        # personal details
        'Age', 'Height', 'Nationality', 'Foot', 'Position',
        'Consultancy', 'Supplier', 'ClubSince', 'ContractUntil', 'Injury',
        # international career
        'InternationalTeam', 'ActiveInternational', 'FormerInternational',
        'InternationalGames', 'InternationalGoals',
        # quotas
        'StartingElevenQuote', 'MinutesQuote', 'TwPenaltySavesQuote', 'FsGoalParticipationQuote',
        # performance statistics
        'Games', 'PointsPerGame', 'Goals', 'OwnGoals', 'Ins', 'Outs',
        'YellowCards', 'YellowRedCards', 'RedCards', 'Minutes',
        'TwGoalsConceded', 'TwCleanSheets', 'FsAssists', 'FsPenaltyGoals', 'FsMinutesPerGoal',
        # provenance
        'Instagram', 'Source', 'ScrapingTime',
    ]
]

In [10]:
# Sanity check after cleansing: print each column's non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18335 entries, 0 to 18334
Data columns (total 48 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   LeagueCountry             18335 non-null  object        
 1   League                    18238 non-null  object        
 2   NationalLeagueLevel       18335 non-null  int64         
 3   Club                      18331 non-null  object        
 4   No                        17905 non-null  float64       
 5   Name                      18331 non-null  object        
 6   Value                     17899 non-null  float64       
 7   ValueLastUpdate           17899 non-null  datetime64[ns]
 8   HighestValue              18004 non-null  float64       
 9   HighestValueDate          18004 non-null  datetime64[ns]
 10  ValueCurrency             17899 non-null  object        
 11  Age                       18328 non-null  float64       
 12  Height            

In [11]:
# Persist the cleansed frame for this scrape run as a semicolon-separated CSV,
# without writing the DataFrame index as a column.
df.to_csv(f"cleansed_data/players_{timestamp}.csv", sep=';', index=False)