In [24]:
import numpy as np
import pandas as pd

In [25]:
# Run identifier embedded in the names of the scraped input files that are
# read below and of the cleansed CSV file that is written at the end.
timestamp = '20230422105449'

In [26]:
# Read the six partial scrape exports of this run (parts 1..6) and stack them
# with a single concat. Collecting the frames in a list first avoids re-copying
# the growing frame on every iteration and avoids seeding the concat with an
# empty DataFrame; ignore_index=True yields the same fresh 0..n-1 index that
# reset_index(drop=True) produced.
partial_frames = [
    pd.read_csv(f"scraped_data/players_{timestamp}_{i}.csv", sep=';')
    for i in range(1, 7)
]
df = pd.concat(partial_frames, ignore_index=True)
df

Unnamed: 0,league_country,scraping_time,link,no,name,league,club,league_level,market_value,market_value_currency,...,assists,own_goals,ins,outs,yellow_cards,yellow_red_cards,red_cards,penalty_goals,minutes_per_goal,minutes
0,Schweiz,20230422105459,https://www.transfermarkt.ch/david-von-ballmoo...,26.0,David von Ballmoos,Super League,BSC Young Boys,1.Liga,"2,50 Mio. €Letzte Änderung: 10.11.2022",Mio. €,...,,-,-,1,-,-,-,,,1.324'
1,Schweiz,20230422105510,https://www.transfermarkt.ch/anthony-racioppi/...,1.0,Anthony Racioppi,Super League,BSC Young Boys,1.Liga,"1,30 Mio. €Letzte Änderung: 28.03.2023",Mio. €,...,,-,1,-,-,-,-,,,1.196'
2,Schweiz,20230422105524,https://www.transfermarkt.ch/marvin-keller/pro...,33.0,Marvin Keller,Super League,BSC Young Boys,1.Liga,500 Tsd. €Letzte Änderung: 29.12.2022,Tsd. €,...,,-,-,-,-,-,-,,,-
3,Schweiz,20230422105542,https://www.transfermarkt.ch/dario-marzino/pro...,40.0,Dario Marzino,Super League,BSC Young Boys,1.Liga,100 Tsd. €Letzte Änderung: 28.03.2023,Tsd. €,...,,-,-,-,-,-,-,,,-
4,Schweiz,20230422105559,https://www.transfermarkt.ch/leandro-zbinden/p...,61.0,Leandro Zbinden,Super League,BSC Young Boys,1.Liga,100 Tsd. €Letzte Änderung: 10.11.2022,Tsd. €,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259,Kroatien,20230422110651,https://www.transfermarkt.ch/ivan-calusic/prof...,,Ivan Calusic,SuperSport HNL,Hajduk Split,1.Liga,350 Tsd. €Letzte Änderung: 08.11.2022,Tsd. €,...,,,,,,,,,,
260,Kroatien,20230422110714,https://www.transfermarkt.ch/marko-capan/profi...,34.0,Marko Capan,SuperSport HNL,Hajduk Split,1.Liga,,,...,2,-,1,3,-,-,-,-,-,188'
261,Kroatien,20230422110754,https://www.transfermarkt.ch/rokas-pukstas/pro...,21.0,Rokas Pukstas,SuperSport HNL,Hajduk Split,1.Liga,"1,00 Mio. €Letzte Änderung: 30.12.2022",Mio. €,...,1,-,6,2,2,-,-,-,257',1.028'
262,Kroatien,20230422110812,https://www.transfermarkt.ch/filip-krovinovic/...,23.0,Filip Krovinovic,SuperSport HNL,Hajduk Split,1.Liga,"5,00 Mio. €Letzte Änderung: 08.11.2022",Mio. €,...,3,-,-,8,3,-,-,-,2.488',2.488'


In [27]:
def parse_market_value_with_latest_change(scraped_string):
    """Parse a market value that may carry a trailing 'Letzte Änderung' note.

    Everything from the first occurrence of ``'Letzte Änderung'`` onwards is
    discarded and the remaining text is handed to ``parse_market_value``.

    Args:
        scraped_string: Raw text, e.g.
            ``"2,50 Mio. €Letzte Änderung: 10.11.2022"``.

    Returns:
        The numeric value returned by ``parse_market_value``.

    Raises:
        ValueError: Propagated from ``parse_market_value`` when the value part
            has an unknown unit or is not a number.
    """
    # str.partition returns the whole string as the head when the marker is
    # absent. The previous str.find()-based slice used -1 in that case and
    # silently chopped off the last character (the currency sign).
    value_part, _, _ = scraped_string.partition('Letzte Änderung')
    return parse_market_value(value_part)

def parse_market_value(scraped_string):
    """Convert a scraped value such as ``"2,50 Mio. €"`` into a plain number.

    Surrounding whitespace is stripped and decimal commas are turned into
    decimal points. The last six characters select the multiplier:
    ``'Mio. €'`` scales by 1,000,000 and ``'Tsd. €'`` by 1,000.

    Args:
        scraped_string: Text consisting of a number followed by one of the
            two supported unit suffixes.

    Returns:
        The value as a float, scaled by the unit's multiplier.

    Raises:
        ValueError: If the suffix is neither ``'Mio. €'`` nor ``'Tsd. €'``
            (the normalised string is the exception argument), or if the
            numeric part cannot be converted by ``float``.
    """
    multipliers = {'Mio. €': 1000000, 'Tsd. €': 1000}

    normalized = scraped_string.strip().replace(',', '.')
    unit = normalized[-6:]
    if unit not in multipliers:
        raise ValueError(normalized)

    return float(normalized[:-6]) * multipliers[unit]


# --- Market value columns ---------------------------------------------------
# .dropna() keeps only the populated cells; assigning the result back aligns on
# the index, so rows that were missing stay NaN/NaT.
df['market_value'] = df['market_value'].dropna().apply(parse_market_value_with_latest_change)
# Keep only the last character of the unit text (e.g. "Mio. €" -> "€").
df['market_value_currency'] = df['market_value_currency'].dropna().str[-1:]
# The date is taken from the last 10 characters of the raw text (dd.mm.yyyy).
df['market_value_latest_correction'] = pd.to_datetime(
    df['market_value_latest_correction'].dropna().str[-10:], format='%d.%m.%Y'
)
df['highest_market_value'] = df['highest_market_value'].dropna().apply(parse_market_value)
df['highest_market_value_date'] = pd.to_datetime(
    df['highest_market_value_date'].dropna(), format='%d.%m.%Y'
)

# --- Player attributes ------------------------------------------------------
# Drop the two trailing characters, switch the decimal comma to a point and
# scale by 100 (presumably metres -> centimetres; confirm against raw data).
df['height'] = df['height'].dropna().str[:-2].str.replace(',', '.', regex=False).astype(float) * 100
df['club_since'] = pd.to_datetime(df['club_since'], format='%d.%m.%Y')
# '-' is treated as a missing date. np.nan is used because the np.NaN alias
# no longer exists in NumPy 2.0.
df['contract_until'] = pd.to_datetime(df['contract_until'].replace('-', np.nan), format='%d.%m.%Y')

# --- Performance statistics -------------------------------------------------
# String clean-up must run BEFORE '-' is replaced by the integer 0: the .str
# accessor yields NaN for non-string cells, so cleaning afterwards would turn
# every freshly inserted 0 back into NaN.
df['points_per_game'] = df['points_per_game'].dropna().str.replace(',', '.', regex=False)
for col in ['minutes_per_goal', 'minutes']:
    # Remove the apostrophe and the dot separator, e.g. "1.324'" -> "1324".
    df[col] = df[col].dropna().str.replace("'", '', regex=False).str.replace('.', '', regex=False)

# A lone '-' in a statistics column is replaced by 0.
for col in ['goals_conceded', 'clean_sheets', 'games', 'points_per_game', 'goals', 'assists', 'own_goals', 'ins', 'outs', 'yellow_cards', 'yellow_red_cards', 'red_cards', 'penalty_goals', 'minutes_per_goal', 'minutes']:
    df[col] = df[col].replace('-', 0)



In [28]:
# Scraper column name -> output column name.
column_renames = {
    'league_country': 'LeagueCountry',
    'scraping_time': 'ScrapingTime',
    'link': 'Source',
    'no': 'No',
    'name': 'Name',
    'league': 'League',
    'club': 'Club',
    'league_level': 'LeagueLevel',
    'market_value': 'Value',
    'market_value_currency': 'ValueCurrency',
    'market_value_latest_correction': 'ValueLastUpdate',
    'highest_market_value': 'HighestValue',
    'highest_market_value_date': 'HighestValueDate',
    'age': 'Age',
    'height': 'Height',
    'nationality': 'Nationality',
    'position': 'Position',
    'foot': 'Foot',
    'consultancy': 'Consultancy',
    'supplier': 'Supplier',
    'club_since': 'ClubSince',
    'contract_until': 'ContractUntil',
    'international': 'International',
    'international_active': 'ActiveInternational',
    'former_international': 'FormerInternational',
    'international_games': 'InternationalGames',
    'international_goals': 'InternationalGoals',
    'starting_eleven_quote': 'StartingElevenQuote',
    'minutes_quote': 'MinutesQuote',
    'penalty_saves_quote': 'TwPenaltySavesQuote',
    'goal_participation_quote': 'FsGoalParticipationQuote',
    'instagram': 'Instagram',
    'injury': 'Injury',
    'goals_conceded': 'TwGoalsConceded',
    'clean_sheets': 'TwCleanSheets',
    'games': 'Games',
    'points_per_game': 'PointsPerGame',
    'goals': 'Goals',
    'assists': 'FsAssists',
    'own_goals': 'OwnGoals',
    'ins': 'Ins',
    'outs': 'Outs',
    'yellow_cards': 'YellowCards',
    'yellow_red_cards': 'YellowRedCards',
    'red_cards': 'RedCards',
    'penalty_goals': 'FsPenaltyGoals',
    'minutes_per_goal': 'FsMinutesPerGoal',
    'minutes': 'Minutes',
}

# Columns kept in the output, in their final left-to-right order. Any column
# not listed here is dropped by the selection below.
column_order = [
    'LeagueCountry', 'League', 'LeagueLevel', 'Club', 'No', 'Name',
    'Value', 'ValueLastUpdate', 'HighestValue', 'HighestValueDate', 'ValueCurrency',
    'Age', 'Height', 'Nationality', 'Foot', 'Position',
    'Consultancy', 'Supplier', 'ClubSince', 'ContractUntil', 'Injury',
    'International', 'ActiveInternational', 'FormerInternational',
    'InternationalGames', 'InternationalGoals',
    'StartingElevenQuote', 'MinutesQuote', 'TwPenaltySavesQuote', 'FsGoalParticipationQuote',
    'Games', 'PointsPerGame', 'Goals', 'OwnGoals', 'Ins', 'Outs',
    'YellowCards', 'YellowRedCards', 'RedCards', 'Minutes',
    'TwGoalsConceded', 'TwCleanSheets', 'FsAssists', 'FsPenaltyGoals', 'FsMinutesPerGoal',
    'Instagram', 'Source', 'ScrapingTime',
]

# Rename first, then select: the selection uses the new names.
df = df.rename(columns=column_renames)[column_order]

In [29]:
# Print the column dtypes and non-null counts of the renamed, reordered frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264 entries, 0 to 263
Data columns (total 47 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   LeagueCountry             264 non-null    object        
 1   League                    253 non-null    object        
 2   LeagueLevel               255 non-null    object        
 3   Club                      264 non-null    object        
 4   No                        246 non-null    float64       
 5   Name                      264 non-null    object        
 6   Value                     250 non-null    float64       
 7   ValueLastUpdate           250 non-null    datetime64[ns]
 8   HighestValue              259 non-null    float64       
 9   HighestValueDate          259 non-null    datetime64[ns]
 10  ValueCurrency             250 non-null    object        
 11  Age                       264 non-null    int64         
 12  Height                

In [32]:
# Persist the cleaned table as a semicolon-separated CSV without the index
# column; the file name carries the same run timestamp as the inputs.
output_path = f"cleansed_data/players_{timestamp}.csv"
df.to_csv(output_path, sep=';', index=False)