In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import math
import numpy as np
import xlsxwriter
from openpyxl import load_workbook
import scipy.stats as sts
from sklearn.linear_model import LinearRegression

In [2]:
# Three-letter abbreviations for every team covered by the analysis.
TEAMS = [
    "ATL", "BOS", "BRK", "CHI", "CHO", "CLE", "DAL", "DEN", "DET", "GSW",
    "HOU", "IND", "LAC", "LAL", "MEM", "MIA", "MIL", "MIN", "NOP", "NYK",
    "OKC", "ORL", "PHI", "PHO", "POR", "SAC", "SAS", "TOR", "UTA", "WAS",
]

# Conference membership.
WESTERN_CONF_TEAMS = ["DAL", "DEN", "GSW", "HOU", "LAC", "LAL", "MEM", "MIN", "NOP", "OKC", "PHO", "POR", "SAC", "SAS", "UTA"]
EASTERN_CONF_TEAMS = ["ATL", "BOS", "BRK", "CHI", "CHO", "CLE", "DET", "IND", "MIA", "MIL", "NYK", "ORL", "PHI", "TOR", "WAS"]

# Eastern Conference divisions.
ATLANTIC_DIV_TEAMS = ["BOS", "BRK", "NYK", "PHI", "TOR"]
CENTRAL_DIV_TEAMS = ["CHI", "CLE", "DET", "IND", "MIL"]
SOUTHEAST_DIV_TEAMS = ["ATL", "CHO", "MIA", "ORL", "WAS"]

# Western Conference divisions.
# The Pacific and Southwest lists were previously swapped; the memberships
# below put the California teams and Phoenix in the Pacific and the
# Texas teams, Memphis and New Orleans in the Southwest.
NORTHWEST_DIV_TEAMS = ["DEN", "MIN", "OKC", "POR", "UTA"]
PACIFIC_DIV_TEAMS = ["GSW", "LAC", "LAL", "PHO", "SAC"]
SOUTHWEST_DIV_TEAMS = ["DAL", "HOU", "MEM", "NOP", "SAS"]
# Backward-compatible alias for the original name, which lacked the plural
# "S" used by the other division constants.
SOUTHWEST_DIV_TEAM = SOUTHWEST_DIV_TEAMS

# Each season is identified by the calendar year in which it ends
# (format_season turns 2021 into the "2020-21" file-name suffix).
SEASONS = list(range(2003, 2022))
POSITIONS = ["PG", "SG", "SF", "PF", "C"]

In [3]:
def mean(data):
    """Return the arithmetic mean of the values in ``data``.

    ``data`` must work with both ``sum`` and ``len``. An empty input
    raises ``ZeroDivisionError``.
    """
    total = sum(data)
    count = len(data)
    return total / count

In [4]:
def variance(data):
    """Return the population variance of ``data``.

    The sum of squared deviations from the mean is divided by
    ``len(data)`` (not ``len(data) - 1``).
    """
    count = len(data)
    center = mean(data)
    squared_diffs = ((value - center) ** 2 for value in data)
    return sum(squared_diffs) / count

In [5]:
def std(data):
    """Return the standard deviation of ``data``: the square root of ``variance(data)``."""
    return variance(data) ** 0.5

In [6]:
def growth_rate(data):
    """Return the average period-over-period growth of ``data`` as a string.

    Each consecutive pair contributes ``(current - previous) / previous``.
    The plain average of those ratios is returned as a percentage with two
    decimals, e.g. ``"10.00%"``.

    Raises ``ZeroDivisionError`` when ``data`` has fewer than two elements
    or when a previous value is zero.
    """
    period_changes = [
        (data[idx] - data[idx - 1]) / data[idx - 1]
        for idx in range(1, len(data))
    ]
    average_change = sum(period_changes) / len(period_changes)
    return "{:.2f}%".format(average_change * 100)

In [7]:
def merge_standings(eastern_standings, western_standings):
    """Combine both conference standings into one league-wide ranking.

    The two frames are concatenated and ordered by ``Made_Playoffs``
    (descending), ``W`` (descending), then ``Rank`` and ``Team``
    (ascending). Positions are numbered from 1.

    Returns a dict mapping each team to its position in that ordering.
    """
    ranked = (
        pd.concat([western_standings, eastern_standings])
        .sort_values(
            by=["Made_Playoffs", "W", "Rank", "Team"],
            ascending=[False, False, True, True],
        )
        .reset_index()
        .drop('index', axis=1)
    )
    # Shift the fresh 0-based index so positions start at 1.
    ranked.index += 1
    return {team: position for position, team in ranked["Team"].items()}

In [8]:
def spearman_rank_corr(rank1, rank2):
    """Return Spearman's rank correlation between two rankings.

    Uses ``1 - 6 * sum(d^2) / (n * (n^2 - 1))``, where ``d`` is the
    difference between paired ranks. No tie correction is applied.

    Returns ``None`` when the two rankings have different lengths.
    Raises ``ZeroDivisionError`` when they hold fewer than two elements.
    """
    n = len(rank1)
    if n != len(rank2):
        return None

    squared_gap_total = sum((rank1[i] - rank2[i]) ** 2 for i in range(n))
    return 1 - ((6 * squared_gap_total) / (n * (n ** 2 - 1)))

In [9]:
def linear_regression_stat(x, y):
    """Run ``scipy.stats.linregress`` on ``x`` and ``y``.

    Returns the ``linregress`` result, or ``None`` when the two inputs
    have different lengths.
    """
    if len(x) == len(y):
        return sts.linregress(x, y)
    return None

In [10]:
def multiple_linear_regression_stat(X, y):
    """Fit a scikit-learn ``LinearRegression`` on ``X`` and ``y``.

    Returns whatever ``LinearRegression.fit`` returns.
    """
    model = LinearRegression()
    return model.fit(X, y)

In [12]:
def calculate_rank_corr_by_team_stat(df, eastern_standings, western_standings, stats):
    """Regress league standing on per-stat team rankings for one season.

    For every column in ``stats`` the teams in ``df`` are ranked from 1
    upwards by that column (descending, ties broken by ``Team`` ascending).
    Each team that appears in the merged standings then contributes one
    row of stat ranks (features) and its standings position (target).

    Parameters:
        df: frame with a ``Team`` column and one column per entry in ``stats``.
        eastern_standings, western_standings: conference standings frames,
            passed to ``merge_standings``.
        stats: a column name or an iterable of column names to rank by.

    Returns:
        The result of ``multiple_linear_regression_stat`` fitted on the
        rank matrix and the standings positions.

    Raises:
        KeyError: if a team from ``TEAMS`` is in the standings but not in ``df``.
    """
    # A bare column name would otherwise be iterated character by character.
    if isinstance(stats, str):
        stats = [stats]

    stat_rank_by_team = []
    for stat in stats:
        ordered_teams = df.sort_values(by=[stat, 'Team'], ascending=[False, True])['Team']
        # Rank every row present rather than assuming exactly 30 teams.
        stat_rank_by_team.append(
            {team: position for position, team in enumerate(ordered_teams, start=1)}
        )

    standings = merge_standings(eastern_standings, western_standings)

    feature_rows = []
    target_ranks = []
    for team in TEAMS:
        if team not in standings:
            continue
        feature_rows.append([ranking[team] for ranking in stat_rank_by_team])
        target_ranks.append(standings[team])

    return multiple_linear_regression_stat(np.array(feature_rows), np.array(target_ranks))

In [14]:
def rank_team_corrs_to_excel(dfs, eastern_standings_dfs, western_standings_dfs, stat, mode='spearman'):
    """Append one row of per-season results for ``stat`` to the results workbook.

    The row starts with ``stat`` itself, followed by the result of
    ``calculate_rank_corr_by_team_stat`` for each index of ``dfs``. The
    three input lists are indexed in parallel. The row is appended to the
    first worksheet of ``Linear_Regression_Team_Stats.xlsx``, which must
    already exist, and the workbook is saved in place.

    NOTE(review): ``mode`` is accepted but never used.
    NOTE(review): ``calculate_rank_corr_by_team_stat`` returns a fitted
    model object rather than a number; confirm that the worksheet can
    store it, or write a numeric summary instead.
    """
    workbook_path = "Linear_Regression_Team_Stats.xlsx"

    row = [stat] + [
        calculate_rank_corr_by_team_stat(
            dfs[season_idx],
            eastern_standings_dfs[season_idx],
            western_standings_dfs[season_idx],
            stat,
        )
        for season_idx in range(len(dfs))
    ]

    workbook = load_workbook(workbook_path)
    workbook.worksheets[0].append(row)
    workbook.save(workbook_path)

In [15]:
def format_season(season):
    """Split a season's end year into the two parts used in data file names.

    Example: ``2021`` becomes ``(2020, "21")``, i.e. the "2020-21" season.

    Returns a tuple of the starting year as an int and the characters of
    ``str(season)`` from index 2 onwards as a string.
    """
    start_year = season - 1
    end_year_suffix = str(season)[2:]
    return start_year, end_year_suffix

In [16]:
def get_csvs_by_season(season):
    """Load the six CSV files for one season from ``DataCollection/``.

    ``season`` is the season's end year; ``format_season`` supplies the
    two values substituted into each file name.

    Returns a tuple of DataFrames in this order: player stats, team stats,
    advanced team stats, opponent stats, eastern standings, western standings.
    """
    first_year, second_year = format_season(season)

    # Order matters: it defines the order of the returned tuple.
    path_templates = (
        "DataCollection/Player_Stats/player_stats_{0}-{1}.csv",
        "DataCollection/Team_Stats/team_stats_{0}-{1}.csv",
        "DataCollection/Advanced_Team_Stats/adv_team_stats_{0}-{1}.csv",
        "DataCollection/Opponent_Stats/opponent_stats_{0}-{1}.csv",
        "DataCollection/Standings/standings_eastern_conference_{0}-{1}.csv",
        "DataCollection/Standings/standings_western_conference_{0}-{1}.csv",
    )

    return tuple(
        pd.read_csv(template.format(first_year, second_year))
        for template in path_templates
    )

In [17]:
# Per-season DataFrames: position i in every list corresponds to SEASONS[i].
player_dfs = []
team_dfs = []
adv_team_dfs = []
opponent_dfs = []
eastern_standings_dfs = []
western_standings_dfs = []

for season in SEASONS:
    (
        player_df,
        team_df,
        adv_team_df,
        opponent_df,
        eastern_standings_df,
        western_standings_df,
    ) = get_csvs_by_season(season)

    # Add each freshly loaded frame to its matching per-season list.
    for collected, season_frame in (
        (player_dfs, player_df),
        (team_dfs, team_df),
        (adv_team_dfs, adv_team_df),
        (opponent_dfs, opponent_df),
        (eastern_standings_dfs, eastern_standings_df),
        (western_standings_dfs, western_standings_df),
    ):
        collected.append(season_frame)

In [18]:
# Column names used as the per-team stats to rank by.
stats = [
    # Record, expected record and schedule-adjusted margins
    "W", "L", "PW", "PL", "MOV", "SOS", "SRS",
    # Ratings, pace and shot profile
    "ORtg", "DRtg", "NRtg", "Pace", "FTr", "3PAr", "TS%",
    # Columns prefixed "Off_"
    "Off_eFG%", "Off_TOV%", "Off_ORB%", "Off_FT/FGA",
    # Columns prefixed "Def_"
    "Def_eFG%", "Def_TOV%", "Def_DRB%", "Def_FT/FGA",
    # Arena and attendance
    "Arena", "Attend.", "Attend./G",
]