## This notebook has the purpose of designing a random classifier for player's positions

In the project, this will be useful to establish a null hypothesis, measuring the distribution hypothesys of each iteration of this null model to verify at what point a season could be classified as "positionless"

In [9]:
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, f1_score
from scipy.stats import randint, entropy
import numpy as np

from UtilFunctions import format_season
from UtilCollections import STATS_MAPPER, COLOR_MAPPER, POSITIONS

In [2]:
position_to_number = {
    "PG": 0,
    "SG": 1,
    "SF": 2,
    "PF": 3,
    "C": 4
}

In [3]:
def pre_process(seasons):
    df = None
    for season in seasons:
        per_min_cols = ['FGA', '3PA', '2PA', 'PF', 'PTS', 'OWS', 'DWS', 'OBPM', 'DBPM', 'BPM']
        cols_to_drop = None
        first_year, second_year = format_season(season)
        player_data = pd.read_csv(f"DataCollection/Player_Stats/player_stats_{first_year}-{second_year}.csv")
        cols_to_drop = ['index', 'TRB', 'DRB', 'ORB', 'AST', 'G', 'GS', 'FT', 'FG', 'FG%', 'BLK', 'STL',
                                'WS', 'FTA', 'TOV', 'Age', '2P', '3P', 'VORP']
            
        player_data = player_data.fillna(0)
        player_data = player_data[(player_data["MP"] > 15) & (player_data["G"] >= 30)]
        player_data = player_data.sort_values(by=['G'], ascending=False)
        player_data = player_data[player_data["Tm"] != "TOT"]
        player_data = player_data.drop_duplicates(subset ="Player",keep = "first")
        player_data.reset_index(inplace=True)
        player_data.drop(columns=cols_to_drop, inplace=True)
        
        for col in per_min_cols:
            player_data[col] = player_data[col] / player_data['MP'] * 36
        
        if df is None:
            df = player_data
        else:
            df = pd.concat([df, player_data])
            
    df.reset_index(inplace=True)
    df.drop(['index', 'Player', 'Tm'], axis=1, inplace=True)
    return df

In [4]:
def calculate_probability_matrix(cm):
    prob_matrix = []
    for i in range(len(cm)):
        prob_matrix.append([])
        players_in_position = sum(cm[i,:])
        for j in range(len(cm)):
            prob_matrix[i].append(round(cm[i,j]/players_in_position, 2))
    
    return prob_matrix

In [5]:
def entropy_by_position(positions, p_matrix):
    entropies = {}
    i = 0
    for position in positions:
        entropies[position] = entropy(p_matrix[i], base=2)
        i += 1
    return entropies

In [10]:
def random_hypothesis(seasons, positions):
    """
    Generates a random hypothesis for position prediction, with probabilities
    proportional to the distribution of players in each position.

    Args:
        seasons (list): List of seasons to test.
        positions (list): List of positions to consider.

    Returns:
        dict: Dictionary of entropies by position for each season.
    """
    entropies_df = {season: [] for season in seasons}
    accuracies_df = {season: [] for season in seasons}

    for i in range(1000):
        if i % 100 == 0:
            print(f"Iteration {i}")
        for season in seasons:
            df = pre_process([season])
            df = df[df['Pos'].isin(positions)]

            y_test = df['Pos']

            # Calculate position distribution
            position_counts = df['Pos'].value_counts(normalize=True)
            position_probabilities = [position_counts.get(pos, 0) for pos in positions]

            # Generate random predictions based on position distribution
            y_pred = np.random.choice(positions, size=len(df.index), p=position_probabilities)

            cm = confusion_matrix(y_test, y_pred, labels=positions)
            prob_matrix = calculate_probability_matrix(cm)

            entropies_df[season].append(entropy_by_position(positions, prob_matrix))
            accuracies_df[season].append(accuracy_score(y_test, y_pred))

    return entropies_df, accuracies_df

In [11]:
entropies, accuracies = random_hypothesis(range(1980, 2025), POSITIONS)

Iteration 0
Iteration 100
Iteration 200
Iteration 300
Iteration 400
Iteration 500
Iteration 600
Iteration 700
Iteration 800
Iteration 900


In [15]:
print(accuracies)

{1980: [0.23711340206185566, 0.16494845360824742, 0.19072164948453607, 0.20618556701030927, 0.17525773195876287, 0.20103092783505155, 0.20103092783505155, 0.17010309278350516, 0.1958762886597938, 0.22164948453608246, 0.17010309278350516, 0.21649484536082475, 0.20618556701030927, 0.211340206185567, 0.20103092783505155, 0.16494845360824742, 0.15979381443298968, 0.21649484536082475, 0.21649484536082475, 0.23711340206185566, 0.1958762886597938, 0.18556701030927836, 0.1958762886597938, 0.20618556701030927, 0.18041237113402062, 0.15463917525773196, 0.22164948453608246, 0.20618556701030927, 0.20618556701030927, 0.1958762886597938, 0.29896907216494845, 0.23195876288659795, 0.20103092783505155, 0.20103092783505155, 0.13917525773195877, 0.211340206185567, 0.21649484536082475, 0.17010309278350516, 0.211340206185567, 0.1958762886597938, 0.18556701030927836, 0.21649484536082475, 0.23195876288659795, 0.19072164948453607, 0.211340206185567, 0.2422680412371134, 0.20103092783505155, 0.18556701030927836

In [None]:
import matplotlib.pyplot as plt
import numpy as np

In [None]:
def get_positionless_borderlines(entropies, positions):
    positionless_borderlines = {}
    for season in entropies.keys():        
        positionless_borderlines[season] = {}
        lb, ub, pb1, pb5, pb99 = 0.0, 0.0, 0.0, 0.0, 0.0
        for position in positions:                        
            entropies_each_season = [e[position] for e in entropies[season]]
            print(f"Entropy for {position} in {season}: {entropies_each_season}")
            lb = np.percentile(entropies_each_season, 0.1)
            pb1 = np.percentile(entropies_each_season, 1)
            pb5 = np.percentile(entropies_each_season, 5)
            pb99 = np.percentile(entropies_each_season, 99)
            ub = np.percentile(entropies_each_season, 99.9)
            positionless_borderlines[season][position] = (lb, pb1, pb5, pb99, ub)
        
            
                
    return positionless_borderlines

In [None]:
positionless_borderlines = get_positionless_borderlines(entropies, POSITIONS)