## This notebook designs a random classifier for players' positions

In the project, this is useful for establishing a null hypothesis: by measuring the distribution of the per-position entropy across the iterations of this null model, we can verify at what point a season could be classified as "positionless".

In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, f1_score
from scipy.stats import randint, entropy

from UtilFunctions import format_season
from UtilCollections import STATS_MAPPER, COLOR_MAPPER, POSITIONS

In [2]:
# Integer code for each position label, numbered in the order listed here.
position_to_number = {
    position: code
    for code, position in enumerate(("PG", "SG", "SF", "PF", "C"))
}

In [3]:
def pre_process(seasons):
    """Load, filter and rescale the player stats of the given seasons.

    For every season the matching CSV under ``DataCollection/Player_Stats`` is
    read, missing values are replaced by 0, rows are kept only when ``MP`` is
    above 15 and ``G`` is at least 30, ``Tm == "TOT"`` rows are removed and a
    single row per ``Player`` is kept (the one with the highest ``G``).
    The columns listed in ``per_min_cols`` are then rescaled as
    ``value / MP * 36`` and the unused stat columns are dropped.

    Args:
        seasons: iterable of season identifiers accepted by ``format_season``.

    Returns:
        A single ``DataFrame`` with the rows of all seasons stacked, a fresh
        0..n-1 index, and without the ``Player`` and ``Tm`` columns.

    Raises:
        ValueError: if ``seasons`` yields no season at all.
    """
    # Loop-invariant column lists: built once instead of once per season.
    per_min_cols = ['FGA', '3PA', '2PA', 'PF', 'PTS', 'OWS', 'DWS', 'OBPM', 'DBPM', 'BPM']
    cols_to_drop = ['index', 'TRB', 'DRB', 'ORB', 'AST', 'G', 'GS', 'FT', 'FG', 'FG%', 'BLK', 'STL',
                    'WS', 'FTA', 'TOV', 'Age', '2P', '3P', 'VORP']

    season_frames = []
    for season in seasons:
        first_year, second_year = format_season(season)
        player_data = pd.read_csv(f"DataCollection/Player_Stats/player_stats_{first_year}-{second_year}.csv")

        player_data = player_data.fillna(0)
        player_data = player_data[(player_data["MP"] > 15) & (player_data["G"] >= 30)]
        # Sorting by games played first makes ``keep="first"`` below retain the
        # row with the most games for each player.
        player_data = player_data.sort_values(by=['G'], ascending=False)
        player_data = player_data[player_data["Tm"] != "TOT"]
        player_data = player_data.drop_duplicates(subset="Player", keep="first")
        # reset_index() turns the old row labels into a column, which is then
        # discarded together with the other entries of ``cols_to_drop``.
        # NOTE(review): if the CSV already ships an 'index' column, pandas names
        # the new column 'level_0' and it would survive this drop - verify
        # against the data files.
        player_data = player_data.reset_index().drop(columns=cols_to_drop)

        # Vectorised equivalent of dividing each column by MP and scaling by 36.
        player_data[per_min_cols] = player_data[per_min_cols].div(player_data['MP'], axis=0) * 36

        season_frames.append(player_data)

    if not season_frames:
        raise ValueError("pre_process() needs at least one season")

    # One concat for all seasons instead of re-copying the accumulated frame
    # on every iteration.
    df = pd.concat(season_frames, ignore_index=True)
    return df.drop(columns=['Player', 'Tm'])

In [4]:
def calculate_probability_matrix(cm):
    """Row-normalise a confusion matrix into per-row proportions.

    Each cell is divided by the total of its row and rounded to two decimals,
    so row ``i`` holds the share of row ``i``'s samples that landed in each
    column.

    Args:
        cm: 2-D confusion matrix; a numpy array or any nested sequence of
            counts whose rows can be iterated.

    Returns:
        A list of lists with the same shape as ``cm``. A row whose total is 0
        is returned as all zeros instead of dividing by zero (which would
        produce NaN and a runtime warning).
    """
    prob_matrix = []
    for row in cm:
        row_total = sum(row)
        if row_total == 0:
            # No samples in this row: there is no distribution to normalise.
            prob_matrix.append([0.0 for _ in row])
        else:
            prob_matrix.append([round(count / row_total, 2) for count in row])

    return prob_matrix

In [5]:
def entropy_by_position(positions, p_matrix):
    """Map each position to the entropy of its row in ``p_matrix``.

    The n-th entry of ``positions`` is paired with the n-th row of
    ``p_matrix`` and the row is passed to ``scipy.stats.entropy``.

    Returns:
        dict keyed by position, in the order of ``positions``.
    """
    return {
        position: entropy(p_matrix[row_idx])
        for row_idx, position in enumerate(positions)
    }

In [6]:
def random_hypothesis(seasons, amount_training_seasons=4, n_iterations=1000, random_state=None):
    """Score a uniformly random position classifier on every season.

    For each iteration and each season, one of the five position labels is
    drawn uniformly at random for every player row returned by
    ``pre_process``. The draw is compared with the real ``Pos`` column through
    a confusion matrix, from which the per-position entropy and the weighted
    F1 score are recorded.

    Args:
        seasons: iterable of season identifiers understood by ``pre_process``.
        amount_training_seasons: currently unused; kept so existing calls
            keep working.
        n_iterations: number of random classifiers to draw per season.
        random_state: ``None`` (use numpy's global random state, as before),
            an int seed, or a ``numpy.random.RandomState`` instance, for
            reproducible runs.

    Returns:
        ``(entropies, f1_scores)``: two dicts keyed by season. Each value is a
        list with one entry per iteration - a ``{position: entropy}`` dict and
        a float F1 score respectively.
    """
    # Local import: the notebook only imports numpy in a later cell.
    import numpy as np

    labels = ["PG", "SG", "SF", "PF", "C"]
    # Materialise so that generators can be iterated more than once.
    seasons = list(seasons)

    # An int seed must become ONE generator shared by all draws; passing the
    # int to every rvs() call would repeat the same sample each time.
    if isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    # The real positions do not change between iterations, so each season's
    # file is read and pre-processed once instead of once per iteration.
    true_positions = {season: pre_process([season])['Pos'] for season in seasons}

    entropies_df = {season: [] for season in seasons}
    f1_scores = {season: [] for season in seasons}

    for i in range(n_iterations):
        if i % 100 == 0:
            print(f"Iteration {i}")
        for season in seasons:
            y_test = true_positions[season]
            # Single vectorised draw; the bound comes from ``labels`` itself so
            # it can never index past the list.
            draws = randint.rvs(0, len(labels), size=len(y_test), random_state=rng)
            y_pred = [labels[k] for k in draws]

            cm = confusion_matrix(y_test, y_pred, labels=labels)
            prob_matrix = calculate_probability_matrix(cm)
            entropies_df[season].append(entropy_by_position(labels, prob_matrix))
            f1_scores[season].append(f1_score(y_test, y_pred, average='weighted'))
    return entropies_df, f1_scores

In [1]:
# Run the random classifier for the seasons 1980 through 2024 (the upper bound of range() is exclusive).
entropies, f1_scores = random_hypothesis(range(1980, 2025))

NameError: name 'random_hypothesis' is not defined

In [8]:
import matplotlib.pyplot as plt
import numpy as np

In [17]:
def get_positionless_borderlines(entropies, percentile=1, positions=None):
    """Compute a percentile of the null-model entropy per season and position.

    Args:
        entropies: dict keyed by season; each value is a list of
            ``{position: entropy}`` dicts, one per iteration of the random
            classifier.
        percentile: percentile (0-100) of the per-iteration entropies to
            report. Defaults to 1, the value that was previously hard-coded.
        positions: position labels to evaluate. Defaults to the module-level
            ``POSITIONS`` collection.

    Returns:
        Nested dict ``{season: {position: percentile_value}}``.
    """
    if positions is None:
        positions = POSITIONS

    return {
        season: {
            position: np.percentile(
                [iteration[position] for iteration in season_entropies],
                percentile,
            )
            for position in positions
        }
        for season, season_entropies in entropies.items()
    }

In [15]:
# Per season and position: the 1st percentile of the entropies collected from the random classifier runs.
positionless_borderlines = get_positionless_borderlines(entropies)