# Euro-league stat normalization

### Install important packages
Uncomment and run the below commands if packages aren't installed

In [1]:
# !pip install requests
# !pip install pandas
# !pip install numpy
# !pip install beautifulsoup4
# !pip install html5lib
# !pip install scikit-learn

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

### Euro league stats scraping
Using `requests`, download the appropriate html pages. We will consider only years 2010-2023.

In [3]:
years = list(range(2010, 2023))

In [4]:
# url_euro = "https://www.basketball-reference.com/international/euroleague/{}_totals.html"
# for year in years:
#     url = url_euro.format(year)

#     # Send an HTTP GET request to the URL to fetch the webpage content
#     data = requests.get(url)

#     #Write the retrieved HTML data to a file named after the year
#     with open("utils/euro_stats/{}.html".format(year), "w+") as f:
#         f.write(data.text)

In [5]:
# Create an empty DataFrame called 'euro_totals' to store the collected data
euro_totals = pd.DataFrame()

Using `BeautifulSoup`, extract metadata from the collected web pages. Additionally, append the year feature to the table for each player.

In [6]:
from io import StringIO

for year in years:
    # Read the HTML content from the file for the specific year
    with open("utils/euro_stats/{}.html".format(year)) as f:
        page = f.read()

    #Initialize the parser
    soup = BeautifulSoup(page, "html.parser")

    #Find specific metadata of the table and convert the HTML table to DataFrame
    table = soup.find(id="totals-stats-{}".format(year))
    df_euro = pd.read_html(StringIO(str(table)))[0]

    #Add the 'Year' column
    df_euro["Year"] = year

    #Concatenate with current DataFrame
    euro_totals = pd.concat([euro_totals, df_euro], ignore_index=True)

print(euro_totals)

                  Player                       Team   G   MP  FG  FGA    FG%  \
0              Acha Njei      EWE Baskets Oldenburg   3    9   2    2  1.000   
1         Adam Hrycaniuk       Asseco Prokom Gydnia  20  272  30   52  0.577   
2            Adam Lapeta       Asseco Prokom Gydnia  10   77   7   13  0.538   
3       Adas Juškevičius                   Žalgiris   9   80   7   17  0.412   
4         Adrien Moerman  Entente Orléanaise Loiret  10  199  19   58  0.328   
...                  ...                        ...  ..  ...  ..  ...    ...   
4046     Youssoupha Fall                 LDLC ASVEL  28  353  67   95  0.705   
4047       Yovel Zoosman                Alba Berlin  31  656  74  167  0.443   
4048  Zaccharie Risacher                 LDLC ASVEL   3   28   1    2  0.500   
4049           Zan Sisko             Bayern München  27  481  47  111  0.423   
4050        Zoran Dragić                   Žalgiris   9  119  12   41  0.293   

      3P  3PA    3P%  ...  ORB  DRB  TR

### Webscrape euro league -> NBA players

In [7]:
# url_nba = "https://www.basketball-reference.com/leagues/NBA_{}_totals.html"

# for year in years:
#     url = url_nba.format(year)

#     data = requests.get(url)

#     #Write the retrieved HTML data to a file named after the year
#     with open("utils/nba_stats/{}.html".format(year), "w+") as f:
#         f.write(data.text)

In [8]:
# Create an empty DataFrame called 'nba_totals' to store the collected data
nba_totals = pd.DataFrame()

In [9]:
for year in years:
    # Read the HTML content from the file for the specific year
    with open("utils/nba_stats/{}.html".format(year)) as f:
        page = f.read()

    #Initialize the parser
    soup = BeautifulSoup(page, "html.parser")

    #Find specific metadata of the table and convert the HTML table to DataFrame
    table = soup.find(id="totals_stats")
    df_nba = pd.read_html(StringIO(str(table)))[0]
    players_with_mult_teams = df_nba['Player'][df_nba['Player'].duplicated(keep=False)]
    filtered_df = df_nba[df_nba['Tm'] == 'TOT']
    df_nba = pd.concat([filtered_df, df_nba[~df_nba['Player'].isin(players_with_mult_teams)]])
    
    #Add the 'Year' column
    df_nba["Year"] = year

    #Concatenate with current DataFrame
    nba_totals = pd.concat([nba_totals, df_nba], ignore_index=True)

print(nba_totals)

       Rk            Player   Pos Age   Tm   G  GS    MP   FG   FGA  ...  ORB  \
0       8      Rafer Alston    PG  33  TOT  52  38  1421  155   446  ...   13   
1      18  Hilton Armstrong  PF-C  25  TOT  33   0   335   28    79  ...   27   
2      41         Raja Bell    SG  33  TOT   6   5   180   28    61  ...    5   
3      47       Steve Blake    PG  29  TOT  80  38  2159  217   522  ...   21   
4      59     Ronnie Brewer    SG  24  TOT  58  53  1742  209   429  ...   45   
...   ...               ...   ...  ..  ...  ..  ..   ...  ...   ...  ...  ...   
6512  600         Gabe York    SG  28  IND   2   0    21    2     7  ...    0   
6513  602        Trae Young    PG  23  ATL  76  76  2652  711  1544  ...   50   
6514  603    Omer Yurtseven     C  23  MIA  56  12   706  130   247  ...   85   
6515  604       Cody Zeller     C  29  POR  27   0   355   51    90  ...   50   
6516  605       Ivica Zubac     C  24  LAC  76  76  1852  310   495  ...  217   

      DRB  TRB  AST STL BLK

### Find all Euro league -> NBA players
Cross check the NBA dataframe against the euro league data frame to see which players went from Euro league to the NBA and during which year.

In [10]:
# Map for time range that players were in the league
nba_player_ranges = dict()
euro_player_ranges = dict()

# Map for row index
filtered_nba_players = pd.DataFrame()
filtered_euro_players = pd.DataFrame()

# Store ranges for NBA players
for idx, row in nba_totals.iterrows():
    player = row["Player"].strip()
    if player in nba_player_ranges:
        player_ranges = nba_player_ranges.get(player)
        player_range = player_ranges[len(player_ranges) - 1]
        if row["Year"] - player_range[1] == 1:
            player_range[1] = row["Year"]
            player_ranges[len(player_ranges) - 1] = player_range
        else:
            player_ranges.append([int(row["Year"]), int(row["Year"])])
        nba_player_ranges[player] = player_ranges
    else:
        player_range = [[int(row["Year"]), int(row["Year"])]]
        nba_player_ranges[player] = player_range
        
#print(nba_player_ranges)

# Store ranges for Euro league players
for idx, row in euro_totals.iterrows():
    player = row["Player"].strip()
    
    if player in nba_player_ranges:
        if player in euro_player_ranges:
            player_ranges = euro_player_ranges.get(player)
            player_range = player_ranges[len(player_ranges) - 1]
            if row["Year"] - player_range[1] == 1:
                player_range[1] = row["Year"]
                player_ranges[len(player_ranges) - 1] = player_range
            else:
                player_ranges.append([int(row["Year"]), int(row["Year"])])
            euro_player_ranges[player] = player_ranges
        else:
            player_range = [[int(row["Year"]), int(row["Year"])]]
            euro_player_ranges[player] = player_range

# Cross check euro league and nba players
for player in euro_player_ranges:
    ranges_nba = nba_player_ranges[player]
    ranges_euro = euro_player_ranges[player]
    for range_euro in ranges_euro:
        for range_nba in ranges_nba:
            if range_nba[0] > range_euro[1]:
                euro_result_row = euro_totals.loc[(euro_totals['Player'] == player) & (euro_totals['Year'] == range_euro[1])]
                filtered_euro_players = pd.concat([filtered_euro_players, euro_result_row], ignore_index=True)
                nba_result_row = nba_totals.loc[(nba_totals['Player'] == player) & (nba_totals['Year'] == range_nba[0])]
                filtered_nba_players = pd.concat([filtered_nba_players, nba_result_row], ignore_index=True)
                break
                
print(filtered_euro_players)
print(filtered_nba_players)

              Player                       Team   G   MP  FG  FGA    FG%  3P  \
0      Alan Anderson         Regal FC Barcelona  10  272  39   87  0.448  15   
1       Alexey Shved                CSKA Moscow  21  454  77  158  0.487  33   
2        Aron Baynes             Lietuvos rytas  10  133  23   45  0.511   0   
3          Gal Mekel  Maccabi Playtika Tel Aviv   1    1   0    0    NaN   0   
4      Joel Freeland                    Unicaja  14  361  75  147  0.510   4   
..               ...                        ...  ..  ...  ..  ...    ...  ..   
125     Keifer Sykes         Panathinaikos OPAP   3   48   4   14  0.286   3   
126  Leandro Bolmaro                  Barcelona  30  294  29   71  0.408   8   
127     Nik Stauskas          Kirolbet Baskonia  22  452  67  161  0.416  38   
128     Usman Garuba                Real Madrid  38  628  59  117  0.504  11   
129    Emanuel Terry          Crvena zvezda mts   9  170  20   31  0.645   0   

     3PA    3P%  ...  ORB  DRB  TRB  AS

### Filter data
- We want to filter out data that doesn't meet a minimum games/minutes played requirement
- Add position column to euro-league
- Fill NaN values with 0s
- Convert all numerical categories to floats

In [12]:
# Add position column
filtered_euro_players['Pos'] = filtered_nba_players['Pos']

# Get rid of all unwanted features
features_to_keep = ['FG%', '3P%', 'FT%', 'TRB', 'AST', 'STL', 'BLK', 'PTS', 'TOV', 'MP', 'Pos']
filtered_euro_players = filtered_euro_players[features_to_keep + ['Team']]
filtered_nba_players = filtered_nba_players[features_to_keep]

# Delete rows that don't meet min minutes requirement
min_total_mins = 100
indices_to_remove = filtered_euro_players[filtered_euro_players['MP'] < min_total_mins].index
filtered_euro_players.drop(indices_to_remove, inplace=True)
filtered_nba_players.drop(indices_to_remove, inplace=True)

# Replace NaN values
filtered_euro_players = filtered_euro_players.fillna(0)
filtered_nba_players = filtered_nba_players.fillna(0)

# Convert all numerical features into floats or ints
float_features = ['FG%', '3P%', 'FT%']
int_features = ['TRB', 'AST', 'STL', 'BLK', 'PTS', 'TOV']
filtered_euro_players[float_features] = filtered_euro_players[float_features].astype(float)
filtered_nba_players[float_features] = filtered_nba_players[float_features].astype(float)
filtered_euro_players[int_features] = filtered_euro_players[int_features].astype(int)
filtered_nba_players[int_features] = filtered_nba_players[int_features].astype(int)


print(filtered_nba_players)
print(filtered_euro_players)

       FG%    3P%    FT%  TRB  AST  STL  BLK  PTS  TOV    MP Pos
0    0.387  0.393  0.853   34   26    5    3  163   25   461  SF
1    0.372  0.295  0.720  175  286   54   27  665  147  1840  SG
2    0.500  0.000  0.583   32    5    1    6   43   11   141   C
4    0.408  0.000  0.667  119   13   13   11  134   19   477  PF
5    0.565  0.063  0.492  155   42   33   23  272   44   894  SG
..     ...    ...    ...  ...  ...  ...  ...  ...  ...   ...  ..
124  0.495  0.326  0.829  138   45   10   14  265   31   589   C
126  0.315  0.278  0.846   43   21    6    0   50   13   241  SF
127  0.357  0.400  0.667    3    2    0    0   18    1    39  SG
128  0.432  0.250  0.714   83   17   10   11   48    6   240  PF
129  0.000  0.000  0.000   15    2    1    0    0    5    18  PF

[102 rows x 11 columns]
       FG%    3P%    FT%  TRB  AST  STL  BLK  PTS  TOV   MP Pos  \
0    0.448  0.455  0.840   31   15    6    2  114   16  272  SF   
1    0.487  0.493  0.833   55   63   12    4  222   35  454  

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_euro_players.drop(indices_to_remove, inplace=True)


### Pipeline for ridge regression

In [24]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Ridge
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Differentiate categorical and numerical features
cat_cols = ['Pos', 'Team']
num_cols = ['FG%', '3P%', 'FT%', 'TRB', 'AST', 'STL', 'BLK', 'PTS', 'TOV']

# Function for running pipeline
def run_pipeline(X_train, y_train, X_test, y_test):

    cat_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))])
    num_transformer = Pipeline(steps=[('scaler', StandardScaler())])

    #column transformer
    preprocessor = ColumnTransformer(
        transformers=[
            #('num', num_transformer, num_cols),
            ('cat', cat_transformer, cat_cols)
        ])
    
    # Build pipeline for linear regression
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', Ridge(alpha=0.1))
    ])
    
    # Fit model
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    
    return pipeline, y_pred
    

### Run pipeline for each stat
- We will split the data into training and testing sets, where the prediction value for a euro league player will be based on the corresponding NBA stat
- We will run a linear regression for each of the different stat categories listed above
- Evaluate performance using R^2 and RMSE

In [25]:
# Create custom train_test_split function
def train_test_split(pred_feature):
    subset_fraction = 0.9
    
    y = filtered_nba_players[pred_feature]
    X_train = filtered_euro_players.sample(frac=subset_fraction, random_state=42)
    subset_indices = X_train.index
    X_test = filtered_euro_players[~filtered_euro_players.index.isin(subset_indices)]
    y_train = y[subset_indices]
    y_test = y[~y.index.isin(subset_indices)]
    
    return X_train, y_train, X_test, y_test


In [26]:
# Evaluation function for convenience
def performance_eval(y_pred, y_test, feature):
    rmse = np.sqrt(mean_squared_error(y_pred, y_test))
    r2 = r2_score(y_pred, y_test)
    print(feature)
    print("RMSE: ", rmse)
    print("R^2:", r2)
    print()

In [27]:
models = []

# Run linear pipeline for each feature
for pred_feature in num_cols:
    
    X_train, y_train, X_test, y_test = train_test_split(pred_feature)
    res = run_pipeline(X_train, y_train, X_test, y_test)
    models.append(res[0])

    print("y_pred: ", res[1])
    print("y_test: ", y_test)
    
    # Evaluate performance
    performance_eval(res[1], y_test, pred_feature)

y_pred:  [0.46208019 0.40558891 0.39062801 0.43276582 0.45267872 0.4139068
 0.35327806 0.38093344 0.46208019 0.41138698]
y_test:  15     0.529
23     0.500
59     0.367
68     0.374
86     0.444
92     0.427
104    0.384
112    0.500
119    0.467
120    0.333
Name: FG%, dtype: float64
FG%
RMSE:  0.062401584972436995
R^2: -2.2966545386981485

y_pred:  [0.12394588 0.29319173 0.16439578 0.42248185 0.52753313 0.31901931
 0.27184797 0.35839303 0.12394588 0.15916989]
y_test:  15     0.000
23     0.429
59     0.214
68     0.236
86     0.411
92     0.327
104    0.360
112    0.000
119    0.190
120    0.250
Name: 3P%, dtype: float64
3P%
RMSE:  0.15281223202201966
R^2: -0.40653469162052946

y_pred:  [0.66127602 0.37802804 0.77405038 0.74930221 0.85044417 0.73965804
 0.62708285 0.76705873 0.66127602 0.5586501 ]
y_test:  15     0.543
23     0.000
59     0.600
68     0.806
86     0.866
92     0.713
104    0.865
112    0.412
119    0.727
120    0.500
Name: FT%, dtype: float64
FT%
RMSE:  0.19539344320



##### TO-DO: CROSS VALIDATION FOR REGRESSION