In [48]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from bs4 import BeautifulSoup
import requests
from itertools import chain
from tqdm import tqdm_notebook as tqdm
import itertools

In [135]:
STANDARD_STATS_URL = "https://fbref.com/en/comps/Big5/stats/players/Big-5-European-Leagues-Stats"
DEFENSE_STATS_URL = "https://fbref.com/en/comps/Big5/defense/players/Big-5-European-Leagues-Stats"
GCA_STATS_URL = "https://fbref.com/en/comps/Big5/gca/players/Big-5-European-Leagues-Stats"
MISC_STATS_URL = "https://fbref.com/en/comps/Big5/misc/players/Big-5-European-Leagues-Stats"
SHOOTING_STATS_URL = "https://fbref.com/en/comps/Big5/shooting/players/Big-5-European-Leagues-Stats"
PASSING_STATS_URL = "https://fbref.com/en/comps/Big5/passing/players/Big-5-European-Leagues-Stats"
POSSESSION_STATS_URL = "https://fbref.com/en/comps/Big5/possession/players/Big-5-European-Leagues-Stats"
PLAYING_TIME_URL = "https://fbref.com/en/comps/Big5/playingtime/players/Big-5-European-Leagues-Stats"

In [83]:
def append_names(feature_list, head_tuple):
    start = 0
    end = 0
    for head in head_tuple:
        end = end + int(head[1])
        if head[0] != '':
            feature_list[start:end] = [head[0]+"_"+x for x in feature_list[start:end]]
        start = end
    return feature_list
        

In [125]:
def parse_page(URL):
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, 'html.parser')
    table = soup.findAll('table')[0]
    features = [col.attrs["data-stat"] for col in table.find('thead').findAll('tr')[1].findAll('th')]
    features.remove("ranker")
    #features.remove("matches")
    header_name = [(col.attrs["data-stat"], col.attrs["colspan"]) for col in table.find('thead').findAll('tr')[0].findAll('th') if "data-stat" in col.attrs]
    features = append_names(features, header_name)
    player_table = table.find('tbody')
    #Parse player_table
    pre_df_player = []
    rows_player = player_table.find_all('tr')
    for row in tqdm(rows_player):
        if(row.find('th',{"scope":"row"}) != None):
            # Not scraping unnecessary rows this way
            data = [x.text.strip().encode().decode("utf-8") for x in row.find_all('td')]
            pre_df_player.append(dict(zip(features, data)))
        '''
        
            for f in features:
                cell = row.find("td",{"data-stat": f})
                if cell is not None:
                    a = cell.text.strip().encode()
                    text=a.decode("utf-8")
                    if f in pre_df_player:
                        pre_df_player[f].append(text)
                    else:
                        pre_df_player[f] = [text]
                else:
                    print(f"couldn't get stat for {f}")
        '''
    df_player = pd.DataFrame(pre_df_player, columns=features)
    return df_player


In [126]:
big5_standard_df = parse_page(STANDARD_STATS_URL)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  





In [127]:
big5_standard_df.to_csv("./data/big5_standard_stats.csv", index=False)

In [128]:
big5_defense_df = parse_page(DEFENSE_STATS_URL)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  





In [130]:
big5_defense_df.to_csv("./data/big5_defense_stats.csv", index=False)

In [131]:
big5_gca_df = parse_page(GCA_STATS_URL)
big5_gca_df.to_csv("./data/big5_gca_stats.csv", index=False)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  





In [132]:
big5_misc_df = parse_page(MISC_STATS_URL)
big5_misc_df.to_csv("./data/big5_misc_stats.csv", index=False)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  





In [133]:
big5_shoot_df = parse_page(SHOOTING_STATS_URL)
big5_shoot_df.to_csv("./data/big5_shooting_stats.csv", index=False)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  





In [134]:
big5_pass_df = parse_page(PASSING_STATS_URL)
big5_pass_df.to_csv("./data/big5_passing_stats.csv", index=False)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  





In [136]:
big5_poss_df = parse_page(POSSESSION_STATS_URL)
big5_poss_df.to_csv("./data/big5_possesion_stats.csv", index=False)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  





In [137]:
big5_play_df = parse_page(PLAYING_TIME_URL)
big5_play_df.to_csv("./data/big5_playing_time.csv", index=False)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  



