Shooting Value
==============

* Sh/90 - 0.5
* npxG/90 - 0.5


Passing Value
=============

* xAG - 0.2
* xA - 0.2
* KP - 0.175
* PPA - 0.175
* 1/3 - 0.125
* PrgP - 0.125


Dribble Value
=============

* Att Pen - 0.235
* SuccDri - 0.118
* PrgC - 0.176
* 1/3 - 0.118
* CPA - 0.176
* Mis + Dis (inverse) - 0.059
* Fld 0.118


In [5]:
import pandas as pd

def clean_dataframe(df):
    # Remove rows where Player column contains 'Player' (these are repeated headers)
    df = df[df[('Unnamed: 1_level_0', 'Player')] != 'Player']
    return df

def scrape_metrics(min_90s=6):
    # Possession data
    df_possession = pd.read_html("https://fbref.com/en/comps/Big5/possession/players/Big-5-European-Leagues-Stats",
                    attrs={"id": "stats_possession"})[0]
    
    df = df_possession[[
        ('Unnamed: 1_level_0', 'Player'),
        ('Unnamed: 4_level_0', 'Squad'),
        ('Unnamed: 3_level_0', 'Pos'),
        ('Unnamed: 8_level_0', '90s'),
        ('Carries', 'CPA'),
        ('Carries', 'Dis'),
        ('Touches', 'Att Pen'),
        ('Carries', 'Mis'),
        ('Take-Ons', 'Succ'),
        ('Carries', '1/3')
    ]]
    
    # Clean the dataframe to remove repeated headers
    df = clean_dataframe(df)
    
    # Create unique identifier
    df['player_id'] = df[('Unnamed: 1_level_0', 'Player')] + ' - ' + df[('Unnamed: 4_level_0', 'Squad')]

    # Misc data
    df_misc = pd.read_html("https://fbref.com/en/comps/Big5/misc/players/Big-5-European-Leagues-Stats",
                    attrs={"id": "stats_misc"})[0]
    df_misc = df_misc[[
        ('Unnamed: 1_level_0', 'Player'),
        ('Unnamed: 4_level_0', 'Squad'),
        ('Performance', 'Fld')
    ]]
    df_misc = clean_dataframe(df_misc)
    df_misc['player_id'] = df_misc[('Unnamed: 1_level_0', 'Player')] + ' - ' + df_misc[('Unnamed: 4_level_0', 'Squad')]

    # First merge
    df = pd.merge(
        df,
        df_misc.drop([('Unnamed: 1_level_0', 'Player'), ('Unnamed: 4_level_0', 'Squad')], axis=1),
        on='player_id',
        how='left'
    )

    # Passing data
    df_passing = pd.read_html("https://fbref.com/en/comps/Big5/passing/players/Big-5-European-Leagues-Stats",
                    attrs={"id": "stats_passing"})[0]

    df_pass_cmp = df_passing[[
        ('Unnamed: 1_level_0', 'Player'),
        ('Unnamed: 4_level_0', 'Squad'),
        ('Unnamed: 29_level_0', 'PPA'),
        ('Unnamed: 28_level_0', '1/3'),
        ('Unnamed: 31_level_0', 'PrgP'),
        ('Unnamed: 27_level_0', 'KP'),
        ('Unnamed: 24_level_0', 'xAG'),
        ('Expected', 'xA'),
    ]]
    df_pass_cmp = clean_dataframe(df_pass_cmp)
    df_pass_cmp['player_id'] = df_pass_cmp[('Unnamed: 1_level_0', 'Player')] + ' - ' + df_pass_cmp[('Unnamed: 4_level_0', 'Squad')]

    # Second merge
    df = pd.merge(
        df,
        df_pass_cmp.drop([('Unnamed: 1_level_0', 'Player'), ('Unnamed: 4_level_0', 'Squad')], axis=1),
        on='player_id',
        how='left'
    )

    # Shooting data
    df_shooting = pd.read_html("https://fbref.com/en/comps/Big5/shooting/players/Big-5-European-Leagues-Stats",
                    attrs={"id": "stats_shooting"})[0]
    df_shooting = df_shooting[[
        ('Unnamed: 1_level_0', 'Player'),
        ('Unnamed: 4_level_0', 'Squad'),
        ('Expected', 'npxG'),
        ('Standard', 'Sh')
    ]]
    df_shooting = clean_dataframe(df_shooting)
    df_shooting['player_id'] = df_shooting[('Unnamed: 1_level_0', 'Player')] + ' - ' + df_shooting[('Unnamed: 4_level_0', 'Squad')]

    # Third merge
    df = pd.merge(
        df,
        df_shooting.drop([('Unnamed: 1_level_0', 'Player'), ('Unnamed: 4_level_0', 'Squad')], axis=1),
        on='player_id',
        how='left'
    )

    # Drop the temporary player_id column
    df = df.drop('player_id', axis=1)

    # Rename columns
    df.columns = [
        "player", "team", "position", "90s",
        "carries_pa", "dispossessed", "touches_pa", "miscontrols", "succ_takeons", "carries_final_third", 
        "fouls_drawn",
        "passes_pa", "passes_final_third", "prog_passes", "key_passes", "xAG", "xA",
        "npxG", 'shots'
    ]

    # Convert to numeric and adjust metrics
    df['90s'] = pd.to_numeric(df['90s'], errors='coerce')
    
    metrics_to_adjust = [
        'carries_pa', 'dispossessed', 'touches_pa', "miscontrols", "succ_takeons", "carries_final_third",
        'fouls_drawn',
        "passes_pa", "passes_final_third", "prog_passes", "key_passes", "xAG", "xA",
        'npxG', 'shots'
    ]
    
    for metric in metrics_to_adjust:
        df[metric] = pd.to_numeric(df[metric], errors='coerce')
        df[metric] = df[metric] / df['90s']

    df["carry_losses"] =  df["miscontrols"] + df["dispossessed"]

    df = df.drop(["miscontrols", "dispossessed"], axis=1)

    # Filter conditions
    df = df[df['90s'] > min_90s]
    df = df[df['position'].str.contains('FW')]

    return df

# Try running with the cleaned data
df = scrape_metrics(min_90s=6)
print(df.head(20))

  df = pd.merge(


                             player             team position   90s  \
5                     Himad Abdelli           Angers    MF,FW  16.8   
12                   Matthis Abline           Nantes       FW  14.6   
14                Zakaria Aboukhlal         Toulouse    MF,FW  14.9   
17                    Tammy Abraham            Milan       FW   7.1   
21                       Akor Adams      Montpellier       FW  11.9   
22                        Che Adams           Torino       FW  13.8   
24                     Junior Adamu         Freiburg       FW   9.3   
34                 Oladapo Afolayan        St. Pauli    FW,MF  13.2   
53                  Ludovic Ajorque            Brest       FW  12.5   
57                   Ilias Akhomach       Villarreal    MF,FW   6.5   
59                Maghnes Akliouche           Monaco    FW,MF  11.6   
71                  Mohamed Ali Cho             Nice    FW,MF   9.0   
75                    Jim Allevinah           Angers       FW   9.7   
76   D

  df = df.drop('player_id', axis=1)


In [9]:
# List of metrics to calculate percentiles for
metrics = [
    'touches_pa', 'succ_takeons', 'carries_final_third', 
    'fouls_drawn', 'passes_pa', 'passes_final_third', 
    'prog_passes', 'key_passes', 'xAG', 'xA',
    'npxG', 'shots', 'carries_pa'
]

# Calculate percentiles for regular metrics (higher is better)
for metric in metrics:
    percentile_col = f"{metric}_percentile"
    df[percentile_col] = df[metric].rank(pct=True) * 100

# Calculate percentile for carry_losses (lower is better, so invert)
df['carry_losses_percentile'] = (1 - df['carry_losses'].rank(pct=True)) * 100

# Create df_triple_threat with necessary columns
df_triple_threat = df[["player", "team", "npxG_percentile", "shots_percentile"]]

# Calculate shooting score (0.5 weight for each metric)
df_triple_threat["shooting_score"] = (df["npxG_percentile"] * 0.5) + (df["shots_percentile"] * 0.5)

print(df_triple_threat.sort_values(by="shooting_score", ascending=False).head(20))

                    player             team  npxG_percentile  \
2042         Patrik Schick       Leverkusen       100.000000   
2288           Deniz Undav        Stuttgart        99.541284   
301        Victor Boniface       Leverkusen        98.853211   
680             Jhon Durán      Aston Villa        97.935780   
1891         Mateo Retegui         Atalanta        96.559633   
1278    Robert Lewandowski        Barcelona        99.770642   
2191     Alexander Sørloth  Atlético Madrid        98.394495   
942         Erling Haaland  Manchester City        97.018349   
591        Ousmane Dembélé        Paris S-G        94.266055   
139          Marco Asensio        Paris S-G        99.082569   
1984         Mohamed Salah        Liverpool        96.330275   
203        Bradley Barcola        Paris S-G        97.706422   
426   Valentín Castellanos            Lazio        92.660550   
1458         Kylian Mbappé      Real Madrid        93.348624   
702           Hugo Ekitike   Eint Frankf

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_triple_threat["shooting_score"] = (df["npxG_percentile"] * 0.5) + (df["shots_percentile"] * 0.5)
