# Eksploracyjna analiza danych

### Badanie miar położenia i rozproszenia
- Użycie metody describe() do wyświetlenia podstawowych miar

### Badanie rozkładu danych
- Boxploty
- Histogramy 
    
### Korelacje
- Wykresy korelacji


In [None]:
import pandas as pd
import sqlite3 
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Raise pandas' 'display.max_columns' option to 40 for DataFrame output below.
pd.set_option('display.max_columns', 40)

In [None]:
# Open the SQLite database file and list the names of the tables it holds.
con = sqlite3.connect('data/start/NBA-Game-Database-combined.sqlite')
table_names_sql = "SELECT name FROM sqlite_master WHERE type='table'"
tables = pd.read_sql_query(table_names_sql, con)
tables

### Game Info 

In [None]:
# Load every row of the game_info table, ordered by its `date` column.
# The table name is a constant, so a plain string literal is used instead of
# an f-string that merely interpolates a literal.
game_info = pd.read_sql_query('SELECT * FROM "game_info" ORDER BY date', con)
game_info

In [None]:
# Summary statistics for game_info, skipping the first two columns of the
# describe() output.
game_info.describe().iloc[:,2:]

### Team Stats

In [None]:
# Load the whole team_stats table. The table name is a constant, so a plain
# string literal is used instead of an f-string interpolating a literal.
team_stats = pd.read_sql_query('SELECT * FROM "team_stats"', con)
# Disabled preprocessing steps, kept for reference (decode byte-encoded
# game_id values into integers; drop the BPM column):
# team_stats['game_id'] = team_stats['game_id'] \
#                         .apply(lambda x: int.from_bytes(x, byteorder='little') if isinstance(x, bytes) else x)
# team_stats = team_stats.drop(columns='BPM')
# Names of every column from the fourth one onward, printed for a quick look.
num_cols = team_stats.columns[3:].tolist()
print(num_cols)
team_stats

In [None]:
# Summary statistics for team_stats, first part: positions 1 through 15
# (0-based) of the describe() output.
team_stats.describe().iloc[:,1:16]

In [None]:
# Summary statistics for team_stats, second part: position 16 (0-based)
# of the describe() output onward.
team_stats.describe().iloc[:,16:]

In [None]:
# 4x4 grid of box plots, one per selected team_stats column, saved to
# graphs/boxplots_team.png.
team_box_columns = [
    'FGA', '3PA', '3P%', 'FT%',
    'TRB', 'AST', 'PF', 'PTS',
    'TOV', 'eFG%', '3PAr', 'ORB%',
    'AST%', 'TOV%', 'ORtg', 'Pace',
]
fig, axs = plt.subplots(4, 4, figsize=(12, 12))
fig.patch.set_facecolor('white')
fig.suptitle('Boxploty dla danych drużynowych', fontsize=16)
# axs.flat walks the grid row by row, so the n-th column lands on the n-th
# subplot in reading order.
for ax, column in zip(axs.flat, team_box_columns):
    sns.boxplot(team_stats, y=column, ax=ax, showmeans=True)
plt.tight_layout()
plt.savefig("graphs/boxplots_team.png", dpi=400, bbox_inches='tight')
plt.show()

In [None]:
# 4x4 grid of histograms, one per selected team_stats column, saved to
# graphs/histograms_team.png.
# NOTE(review): this list plots 'STL%' where the team box-plot grid uses
# 'AST%' — confirm the difference is intentional.
team_hist_columns = [
    'FGA', '3PA', '3P%', 'FT%',
    'TRB', 'AST', 'PF', 'PTS',
    'TOV', 'eFG%', '3PAr', 'ORB%',
    'STL%', 'TOV%', 'ORtg', 'Pace',
]
# Columns drawn with a fixed bin width of 1 (presumably integer-valued
# counts — verify against the data); all others use seaborn's default bins.
unit_bin_columns = {'TOV', 'PF', 'AST', 'FGA', '3PA', 'TRB', 'PTS'}

fig, axs = plt.subplots(4, 4, figsize=(12, 12))
fig.patch.set_facecolor('white')
fig.suptitle('Histogramy dla danych drużynowych', fontsize=16)
# axs.flat walks the grid row by row, matching the order of the column list.
for ax, column in zip(axs.flat, team_hist_columns):
    bin_kwargs = {'binwidth': 1} if column in unit_bin_columns else {}
    sns.histplot(team_stats, x=column, element='bars', ax=ax, **bin_kwargs)

plt.tight_layout()
plt.savefig("graphs/histograms_team.png", dpi=400, bbox_inches='tight')
plt.show()

In [None]:
# Flag the winner of each game: 1 for rows whose PTS equals the highest PTS
# within the same game_id, 0 otherwise (rows tied for the maximum all get 1).
# Comparing against the built-in 'max' transform avoids invoking a Python
# lambda once per game.
game_max_pts = team_stats.groupby('game_id')['PTS'].transform('max')
team_stats['win'] = (team_stats['PTS'] == game_max_pts).astype(int)
# team_stats.to_sql('team_stats', con, if_exists='replace', index=False)

In [None]:
# Pairwise correlations between the numeric team_stats columns; rows and
# columns of the matrix that are entirely NaN are dropped before plotting.
numeric_team_stats = team_stats.select_dtypes(include=[np.number])
corr_matrix = (
    numeric_team_stats.corr()
    .dropna(axis=0, how='all')
    .dropna(axis=1, how='all')
)

# Annotated heatmap of the correlation matrix, saved to graphs/corr_team.png.
plt.figure(figsize=(25, 25))
heatmap1 = sns.heatmap(corr_matrix, cmap="coolwarm", annot=True)
heatmap1.set_title('Macierz korelacji\n\n', fontsize=17)

# _ = heatmap1.set_yticklabels(corr_matrix.index, size=14)
# _ = heatmap1.set_xticklabels(heatmap1.get_xticklabels(), size=14, rotation=45)
plt.savefig("graphs/corr_team.png", dpi=300, bbox_inches='tight')
plt.show()

### Player stats

In [None]:
# Load the whole player_stats table. The table name is a constant, so a plain
# string literal is used instead of an f-string interpolating a literal.
player_stats = pd.read_sql_query('SELECT * FROM "player_stats"', con)
player_stats

In [None]:
# Summary statistics for player_stats, first part: positions 1 through 18
# (0-based) of the describe() output.
player_stats.describe().iloc[:,1:19]

In [None]:
# Summary statistics for player_stats, second part: position 19 (0-based)
# of the describe() output onward.
player_stats.describe().iloc[:,19:]

Kolumna AST% zawiera błąd gruby (wartość minimalna wynosi -1000).

In [None]:
# Derive a NetRtg column as the row-wise difference ORtg - DRtg; it is plotted
# in the player box-plot and histogram grids.
player_stats['NetRtg'] = player_stats['ORtg'] - player_stats['DRtg']

In [None]:
# 4x4 grid of box plots, one per selected player_stats column, saved to
# graphs/boxplots_players.png.
player_box_columns = [
    'PTS', 'FTA', '3P%', 'FT%',
    'TRB', 'AST', 'PF', '3PA',
    'TOV', 'eFG%', '3PAr', '+/-',
    'AST%', 'TOV%', 'NetRtg', 'USG%',
]
fig, axs = plt.subplots(4, 4, figsize=(12, 12))
fig.patch.set_facecolor('white')
fig.suptitle('Boxploty dla danych wg zawodników', fontsize=16)
# axs.flat walks the grid row by row, so the n-th column lands on the n-th
# subplot in reading order.
for ax, column in zip(axs.flat, player_box_columns):
    sns.boxplot(player_stats, y=column, ax=ax, showmeans=True)

plt.tight_layout()
plt.savefig("graphs/boxplots_players.png", dpi=400, bbox_inches='tight')
plt.show()

In [None]:
# 4x4 grid of histograms, one per selected player_stats column, saved to
# graphs/histograms_players.png.
player_hist_columns = [
    'PTS', 'FTA', '3P%', 'FT%',
    'TRB', 'AST', 'PF', '3PA',
    'TOV', 'eFG%', '3PAr', '+/-',
    'AST%', 'TOV%', 'NetRtg', 'USG%',
]
# Columns drawn with a fixed bin width of 1 (presumably integer-valued
# counts — verify against the data); all others use seaborn's default bins.
unit_bin_columns = {'TOV', 'PF', 'AST', 'FGA', '3PA', 'TRB', 'PTS', 'FTA'}

fig, axs = plt.subplots(4, 4, figsize=(12, 12))
fig.patch.set_facecolor('white')
fig.suptitle('Histogramy dla danych wg zawodników', fontsize=16)
# axs.flat walks the grid row by row, matching the order of the column list.
for ax, column in zip(axs.flat, player_hist_columns):
    bin_kwargs = {'binwidth': 1} if column in unit_bin_columns else {}
    sns.histplot(player_stats, x=column, element='bars', ax=ax, **bin_kwargs)

plt.tight_layout()
plt.savefig("graphs/histograms_players.png", dpi=400, bbox_inches='tight')
plt.show()

In [None]:
# Close the SQLite connection now that all tables have been read.
con.close()