# Inżynieria cech
 - połączenie danych drużynowych z podsumowaniami meczów
 - sprowadzenie cech do postaci optymalnej do użycia w modelach (normalizacja)

In [None]:
import pandas as pd
import sqlite3
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Render up to 40 columns when a DataFrame is displayed.
pd.options.display.max_columns = 40

In [None]:
def _read_tables(db_path, table_names):
    """Read whole tables from one SQLite file.

    Args:
        db_path: Path of the SQLite database file.
        table_names: Names of the tables to read. They are interpolated into
            the SQL text as quoted identifiers, so pass only trusted,
            hard-coded names.

    Returns:
        A list of DataFrames, one per requested table, in the same order.
    """
    con = sqlite3.connect(db_path)
    try:
        return [
            pd.read_sql_query(f'SELECT * FROM "{name}"', con)
            for name in table_names
        ]
    finally:
        # Close the connection even when one of the reads fails.
        con.close()


team_last_20, team_last_30, team_last_40, team_all_season = _read_tables(
    'data/transformed/team_moving_avgs.sqlite',
    ['team_last_20', 'team_last_30', 'team_last_40', 'team_all_season'],
)

(games,) = _read_tables('data/transformed/games.sqlite', ['games'])

In [None]:
# Preview the `team_last_20` table as loaded from SQLite.
team_last_20

In [None]:
# Preview the `games` table as loaded from SQLite.
games

In [None]:
def _shift_game_dates(frame, days):
    """Return a copy of *frame* with 'Date' moved by *days* days.

    The resulting 'Date' column holds strings formatted as YYYY-MM-DD.
    """
    shifted = frame.copy()
    shifted['Date'] = (
        pd.to_datetime(shifted['Date']) + pd.Timedelta(days=days)
    ).dt.strftime('%Y-%m-%d')
    return shifted


# Three copies of `games`: the original date and the date moved one day back
# and one day forward. Presumably this lets the later merge on 'Date' match
# even when the two sources disagree by a day -- TODO confirm.
games_minus1 = _shift_game_dates(games, -1)
games_plus1 = _shift_game_dates(games, 1)
games_original = _shift_game_dates(games, 0)

# Combine all date variants of `games` into a single DataFrame.
games_expanded = pd.concat([games_original, games_minus1, games_plus1])

# Use the copy with the normalised 'Date' here as well; concatenating the raw
# `games` could mix date formats and make exact-date rows miss in the merge.
# NOTE: re-running this cell expands an already expanded `games` again.
games = games_expanded

###

In [None]:
# Put the two rows of each game side by side: rows with an even index label
# are taken as the away side, rows with an odd label as the home side.
# Assumes the table alternates away/home rows per game -- TODO confirm.
row_parity = team_last_20.index % 2
home_df = team_last_20[row_parity == 1].reset_index(drop=True)
away_df = team_last_20[row_parity == 0].reset_index(drop=True).add_prefix('away_')

# Columns from position 4 onward of the original table get a `home_` prefix.
home_column_names = {col: f'home_{col}' for col in team_last_20.columns[4:40]}
# Game-level fields are kept once, from the home half; `id` is dropped from both.
dropped_columns = ['away_id', 'id', 'away_game_id', 'away_Season', 'away_win', 'away_Date']

team_last_20 = pd.concat([home_df, away_df], axis=1)
team_last_20 = team_last_20.drop(columns=dropped_columns)
team_last_20 = team_last_20.rename(columns=home_column_names)
team_last_20

In [None]:
# Same away/home pairing as for `team_last_20`, applied to the other tables.
# Even index labels are taken as the away side and odd labels as the home
# side; assumes rows alternate away/home per game -- TODO confirm.
paired_frames = []
for frame in (team_last_30, team_last_40, team_all_season):
    row_parity = frame.index % 2
    home_df = frame[row_parity == 1].reset_index(drop=True)
    away_df = frame[row_parity == 0].reset_index(drop=True).add_prefix('away_')

    side_by_side = pd.concat([home_df, away_df], axis=1)
    # Game-level fields are kept once, from the home half; `id` is dropped from both.
    side_by_side = side_by_side.drop(
        columns=['away_id', 'id', 'away_game_id', 'away_Season', 'away_win', 'away_Date'])
    # Columns from position 4 onward of the original table get a `home_` prefix.
    home_column_names = {col: f'home_{col}' for col in frame.columns[4:40]}
    paired_frames.append(side_by_side.rename(columns=home_column_names))

team_last_30, team_last_40, team_all_season = paired_frames

In [None]:
# Inspect `team_last_20` in its current (home/away paired) form.
team_last_20

In [None]:
# Team abbreviation -> full team name. The abbreviations are the values of the
# `home_team` / `away_team` columns; the full names are matched against the
# `TEAM_NAME` columns of `games` when the tables are merged.
nba_teams_mapping = {
    "ATL": "Atlanta Hawks",
    "BOS": "Boston Celtics",
    "BRK": "Brooklyn Nets",
    "CHO": "Charlotte Hornets",
    "CHI": "Chicago Bulls",
    "CLE": "Cleveland Cavaliers",
    "DAL": "Dallas Mavericks",
    "DEN": "Denver Nuggets",
    "DET": "Detroit Pistons",
    "GSW": "Golden State Warriors",
    "HOU": "Houston Rockets",
    "IND": "Indiana Pacers",
    "LAC": "LA Clippers",
    "LAL": "Los Angeles Lakers",
    "MEM": "Memphis Grizzlies",
    "MIA": "Miami Heat",
    "MIL": "Milwaukee Bucks",
    "MIN": "Minnesota Timberwolves",
    "NOP": "New Orleans Pelicans",
    "NYK": "New York Knicks",
    "OKC": "Oklahoma City Thunder",
    "ORL": "Orlando Magic",
    "PHI": "Philadelphia 76ers",
    "PHO": "Phoenix Suns",
    "POR": "Portland Trail Blazers",
    "SAC": "Sacramento Kings",
    "SAS": "San Antonio Spurs",
    "TOR": "Toronto Raptors",
    "UTA": "Utah Jazz",
    "WAS": "Washington Wizards",
}

In [None]:
# Add full team names for both sides; abbreviations that are missing from
# `nba_teams_mapping` become NaN.
for side in ('home', 'away'):
    team_last_20[f'{side}_team_full'] = team_last_20[f'{side}_team'].map(nba_teams_mapping)

In [None]:
# Left-join the paired team table with `games` on the date and the full names
# of both teams, then drop the helper columns and the `games` columns that are
# not kept.
keys_in_team_table = ['Date', 'home_team_full', 'away_team_full']
keys_in_games = ['Date', 'TEAM_NAME', 'TEAM_NAME.1']
columns_to_discard = [
    'GP.1', 'TEAM_NAME.1', 'W.1', 'L.1', 'GP', 'TEAM_NAME',
    'W', 'L', 'index', 'home_team_full', 'away_team_full',
]

team_avgs_last_20 = team_last_20.merge(
    games,
    how='left',
    left_on=keys_in_team_table,
    right_on=keys_in_games,
)
team_avgs_last_20 = team_avgs_last_20.drop(columns=columns_to_discard)

In [None]:
# NOTE: a notebook cell renders only the value of its last expression, so the
# bare `team_avgs_last_20` line has no visible effect; the cell output is the
# column index.
team_avgs_last_20
team_avgs_last_20.columns

In [None]:
# Same steps as for the 20-game table: add full team names (unmapped
# abbreviations become NaN), left-join with `games` on date and both team
# names, then drop the helper columns and the `games` columns that are not kept.
merged_frames = []
for frame in (team_last_30, team_last_40, team_all_season):
    for side in ('home', 'away'):
        frame[f'{side}_team_full'] = frame[f'{side}_team'].map(nba_teams_mapping)

    joined = frame.merge(
        games,
        how='left',
        left_on=['Date', 'home_team_full', 'away_team_full'],
        right_on=['Date', 'TEAM_NAME', 'TEAM_NAME.1'],
    )
    merged_frames.append(
        joined.drop(columns=['GP.1', 'TEAM_NAME.1', 'W.1', 'L.1', 'GP', 'TEAM_NAME',
                             'W', 'L', 'index', 'home_team_full', 'away_team_full'])
    )

team_avgs_last_30, team_avgs_last_40, team_avgs_all_season = merged_frames

Użycie `.rename` w powyższym łańcuchu operacji nie zmieniało nazw kolumn, stąd bezpośrednie przypisanie nazw kolumn poniżej.

In [None]:
# Per-team statistics, in the order in which they appear for both sides.
team_stat_names = [
    'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%',
    'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
    'TS%', 'eFG%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%',
    'STL%', 'BLK%', 'TOV%', 'ORtg', 'DRtg', 'Pace',
]

# Final column labels shared by all four merged tables.
# NOTE(review): the labels are assigned by position, so this relies on every
# merged table having exactly these columns in exactly this order -- confirm
# whenever the merge or the source tables change.
merged_column_names = (
    ['game_id', 'Date', 'Season',
     'home_team', 'home_win', 'home_streak', 'home_last10']
    + [f'home_{stat}' for stat in team_stat_names]
    + ['away_team', 'away_streak', 'away_last10']
    + [f'away_{stat}' for stat in team_stat_names]
    + ['home_W_pct', 'home_GP_rank', 'home_W_pct_rank', 'home_+/-_rank',
       'away_W_pct', 'away_GP_rank', 'away_W_pct_rank', 'away_+/-_rank',
       'home_days_rest', 'away_days_rest']
)

for merged_table in (team_avgs_all_season, team_avgs_last_40,
                     team_avgs_last_30, team_avgs_last_20):
    merged_table.columns = list(merged_column_names)


In [None]:
# Show the column labels of the whole-season table after the assignment above.
team_avgs_all_season.columns

In [None]:
def missing_data_summary(data):
    """Summarise missing values in a DataFrame.

    Args:
        data: The DataFrame to inspect.

    Returns:
        A tuple ``(rows, percentages)``: ``rows`` holds the rows of *data*
        with at least one missing value, and ``percentages`` is a Series
        indexed by column with the share of missing values in percent,
        rounded to three decimals.
    """
    missing_mask = data.isna()
    incomplete_rows = data.loc[missing_mask.any(axis=1)]
    percent_missing = missing_mask.mean().mul(100).round(3)
    return incomplete_rows, percent_missing

# Rows of `team_avgs_last_30` with at least one missing value, plus the
# per-column percentage of missing values.
rows_with_nan, missing_percentages = missing_data_summary(team_avgs_last_30)
# Only the incomplete rows are shown as the cell output.
rows_with_nan

In [None]:
# Pairwise correlations between the numeric columns of `team_avgs_last_20`;
# rows and columns that are entirely NaN are removed.
numeric_part = team_avgs_last_20.select_dtypes(include=[np.number])
corr_matrix = numeric_part.corr()
corr_matrix = corr_matrix.dropna(axis=0, how='all')
corr_matrix = corr_matrix.dropna(axis=1, how='all')

# Draw an annotated heatmap, write it to disk and close the figure; it is not
# shown inline.
plt.figure(figsize=(55, 55))
heatmap1 = sns.heatmap(data=corr_matrix, annot=True, fmt=".2f", cmap="coolwarm")
plt.savefig("graphs/corr_processed_20.png", bbox_inches='tight', dpi=300)
plt.close()

In [None]:
# Pairwise correlations between the numeric columns of `team_avgs_last_30`;
# rows and columns that are entirely NaN are removed.
numeric_part = team_avgs_last_30.select_dtypes(include=[np.number])
corr_matrix = numeric_part.corr()
corr_matrix = corr_matrix.dropna(axis=0, how='all')
corr_matrix = corr_matrix.dropna(axis=1, how='all')

# Draw an annotated heatmap, write it to disk and close the figure; it is not
# shown inline.
plt.figure(figsize=(55, 55))
heatmap1 = sns.heatmap(data=corr_matrix, annot=True, fmt=".2f", cmap="coolwarm")
plt.savefig("graphs/corr_processed_30.png", bbox_inches='tight', dpi=300)
plt.close()

In [None]:
# Pairwise correlations between the numeric columns of `team_avgs_last_40`;
# rows and columns that are entirely NaN are removed.
numeric_part = team_avgs_last_40.select_dtypes(include=[np.number])
corr_matrix = numeric_part.corr()
corr_matrix = corr_matrix.dropna(axis=0, how='all')
corr_matrix = corr_matrix.dropna(axis=1, how='all')

# Draw an annotated heatmap, write it to disk and close the figure; it is not
# shown inline.
plt.figure(figsize=(55, 55))
heatmap1 = sns.heatmap(data=corr_matrix, annot=True, fmt=".2f", cmap="coolwarm")
plt.savefig("graphs/corr_processed_40.png", bbox_inches='tight', dpi=300)
plt.close()

In [None]:
# Pairwise correlations between the numeric columns of `team_avgs_all_season`;
# rows and columns that are entirely NaN are removed. `corr_matrix` is read
# again by the following cell.
numeric_part = team_avgs_all_season.select_dtypes(include=[np.number])
corr_matrix = numeric_part.corr()
corr_matrix = corr_matrix.dropna(axis=0, how='all')
corr_matrix = corr_matrix.dropna(axis=1, how='all')

# Draw an annotated heatmap, write it to disk and close the figure; it is not
# shown inline.
plt.figure(figsize=(55, 55))
heatmap1 = sns.heatmap(data=corr_matrix, annot=True, fmt=".2f", cmap="coolwarm")
plt.savefig("graphs/corr_processed_all_season.png", bbox_inches='tight', dpi=300)
plt.close()

In [None]:
# Correlation of every numeric column of the whole-season table with the
# target `home_win`, using both Pearson and Spearman coefficients.
# The Pearson matrix is computed here instead of reusing a `corr_matrix` left
# behind by an earlier cell, which holds a different table when cells are run
# out of order.
numeric_all_season = team_avgs_all_season.select_dtypes(include=[np.number])

pearson_corr_matrix = numeric_all_season \
    .corr() \
    .dropna(axis=0, how='all') \
    .dropna(axis=1, how='all')
pearson_corr_with_target = pearson_corr_matrix["home_win"].dropna()

spearman_corr_matrix = numeric_all_season \
    .corr(method="spearman") \
    .dropna(axis=0, how='all') \
    .dropna(axis=1, how='all')
spearman_corr_with_target = spearman_corr_matrix["home_win"].dropna()

# Order by the Spearman coefficient, then leave out the target itself and the
# game identifier. `errors="ignore"` keeps this working when `game_id` is not
# a numeric column and therefore absent from the matrix.
spearman_corr_sorted = spearman_corr_with_target.sort_values() \
    .drop(["home_win", "game_id"], errors="ignore")
# Align Pearson values to the same order; a label without a Pearson value
# becomes NaN instead of raising KeyError.
pearson_corr_sorted = pearson_corr_with_target.reindex(spearman_corr_sorted.index)

# Columns whose labels are drawn in `highlight_color_prc`.
prc_columns = ['home_TS%', 'home_eFG%', 'home_3PAr', 'home_FTr',
       'home_ORB%', 'home_DRB%', 'home_TRB%', 'home_AST%', 'home_STL%',
       'home_BLK%', 'home_TOV%', 'home_ORtg', 'home_DRtg', 'away_TS%', 'away_eFG%',
       'away_3PAr', 'away_FTr', 'away_ORB%', 'away_DRB%', 'away_TRB%',
       'away_AST%', 'away_STL%', 'away_BLK%', 'away_TOV%', 'away_ORtg',
       'away_DRtg']
highlight_color_prc = "red"

# Columns whose labels are drawn in `highlight_color_totals`.
# 'home_TS%' is listed here as well, but `prc_columns` is checked first.
totals_columns = ['home_FG', 'home_FGA', 'home_FG%', 'home_3P', 'home_3PA',
   'home_3P%', 'home_FT', 'home_FTA', 'home_FT%', 'home_ORB', 'home_DRB',
   'home_TRB', 'home_AST', 'home_STL', 'home_BLK', 'home_TOV', 'home_PF',
   'home_PTS',  'home_TS%', 'away_FG', 'away_FGA',
   'away_FG%', 'away_3P', 'away_3PA', 'away_3P%', 'away_FT', 'away_FTA',
   'away_FT%', 'away_ORB', 'away_DRB', 'away_TRB', 'away_AST', 'away_STL',
   'away_BLK', 'away_TOV', 'away_PF', 'away_PTS', 'home_Pace', 'away_Pace']
highlight_color_totals = "blue"

# Remaining columns, i.e. those in neither list (their labels are drawn in black).
add_columns = [col for col in spearman_corr_sorted.index if col not in prc_columns and col not in totals_columns]

plt.figure(figsize=(10, 14))

bar_width = 0.4
indices = range(len(spearman_corr_sorted))

# Two horizontal bars per column: Pearson at the index, Spearman just above it.
plt.barh(indices, pearson_corr_sorted.values, bar_width, label="Pearson", color="dodgerblue")
plt.barh([i + bar_width for i in indices], spearman_corr_sorted.values, bar_width, label="Spearman", color="orange")

# Column names are written as text at a fixed x position, centred between the
# two bars and coloured by the list they belong to.
for i, label in enumerate(spearman_corr_sorted.index):
    if label in prc_columns:
        color = highlight_color_prc
    elif label in totals_columns:
        color = highlight_color_totals
    else:
        color = "black"

    plt.text(
        -0.4,
        i + bar_width / 2,
        label,
        color=color,
        va="center",
        fontsize=12,
    )

plt.title("Korelacje z 'home_win'", fontsize=16)
plt.xlabel("Współczynniki korelacji", fontsize=14)
plt.tight_layout()
plt.legend(fontsize=12)

# The figure is written to disk and closed. The former `plt.show()` after
# `plt.close()` had no figure left to display and was removed.
plt.savefig("graphs/corr_target.png", dpi=300, bbox_inches='tight')
plt.close()

In [None]:
# Persist the four merged tables, replacing existing tables of the same name.
# `with con:` only commits (or rolls back) the transaction; sqlite3 does not
# close the connection when the block exits, so it is closed explicitly.
con = sqlite3.connect('data/transformed/team_moving_avgs_merged.sqlite')
try:
    with con:
        team_avgs_last_20.to_sql('team_last_20', con, if_exists='replace', index=False)
        team_avgs_last_30.to_sql('team_last_30', con, if_exists='replace', index=False)
        team_avgs_last_40.to_sql('team_last_40', con, if_exists='replace', index=False)
        team_avgs_all_season.to_sql('team_all_season', con, if_exists='replace', index=False)
finally:
    con.close()