In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
from scipy.stats import shapiro

In [None]:
# Read the four CSV inputs from the working directory.
stats = pd.read_csv('clean_stats.csv')
player = pd.read_csv('clean_players.csv')
team = pd.read_csv('clean_team.csv')
game = pd.read_csv('clean_games.csv')

# Attach each player's last name to the stat rows. The player table's `id`
# is relabelled to `player_id` so both sides join on one shared column; the
# result holds the stats columns plus `last_name`. The merge uses pandas'
# default inner join, so stat rows without a matching player are dropped.
data = stats.merge(
    player[['id', 'last_name']].rename(columns={'id': 'player_id'}),
    on='player_id',
)


In [None]:
# List the column labels of `data` after the stats/player merge
data.columns


In [None]:
# Preview the first five rows of the team table
team.head(n=5)

In [None]:
# Add the team name to every row of `data`. The team table's `id` is
# relabelled to `team_id` so the join runs on one shared column; only the
# `name` column is brought over (default inner join).
data = data.merge(
    team[['id', 'name']].rename(columns={'id': 'team_id'}),
    on='team_id',
)

In [None]:
# Preview the first five rows now that the team name has been merged in
data.head(n=5)

In [None]:
# Drop the identifier columns once the player and team names are merged in.
# `columns=` replaces the positional axis argument: pandas 2.0 removed
# positional `axis` from DataFrame.drop, so `drop(labels, 1, ...)` raises
# TypeError there. Reassigning (instead of inplace=True) keeps the same
# `data` name bound to the reduced frame.
data = data.drop(columns=['player_id', 'team_id', 'game_id', 'visitor_team_id', 'home_team_id'])

In [None]:
# Preview the first five rows after the id columns were removed
data.head(n=5)

In [None]:
# Plot the distribution of the 'PER' column of `data` as a 100-bin histogram
# on a 6x5 inch figure, title it, and write it to 'per_distr.jpg'.
per_fig, per_ax = plt.subplots(figsize=(6, 5))
sns.histplot(data=data, x='PER', bins=100, ax=per_ax)
per_ax.set_title('PER Distribution')
per_fig.savefig('per_distr.jpg')


In [None]:
# Shapiro-Wilk normality test on 'PER' (H0: the sample is normally distributed).
# Missing values are removed first: a NaN in the input can make the test
# return NaN, and `NaN > alpha` is False, which would print "reject H0"
# regardless of what the data actually looks like.
# NOTE(review): scipy documents that the p-value may be inaccurate for
# N > 5000 — confirm the sample size before relying on this verdict.
statistic, p_value = shapiro(data['PER'].dropna())

# Significance level for the test
alpha = 0.05

# Fail to reject H0 only when the p-value exceeds alpha
if p_value > alpha:
    print("Sample looks Gaussian (fail to reject H0)")
else:
    print("Sample does not look Gaussian (reject H0)")


In [None]:
# Correlate 'PER' with every numeric column of `data`.
# Non-numeric columns (such as the `last_name` / `name` columns merged in
# earlier, presumably strings) are filtered out explicitly: since pandas 2.0
# `DataFrame.corr()` no longer drops them by default and raises ValueError
# on string data. Boolean columns are kept to match the old default.
correlation_matrix = data.select_dtypes(include=['number', 'bool']).corr()

# Keep only the 'PER' column of the matrix: one correlation per feature
per_correlations = correlation_matrix['PER']

# A single-column heatmap, hence the tall, narrow figure
plt.figure(figsize=(1, 5))

# Annotated heatmap of the PER correlations, diverging colours, no colour bar
sns.heatmap(per_correlations.to_frame(), annot=True, cmap='coolwarm', cbar=False)

# Title with a small font so it fits the narrow figure
plt.title('Correlation Matrix', fontsize=9)

# Save as 'per_heatmap.jpg'. bbox_inches='tight' expands the saved canvas to
# include the tick labels and title, which can otherwise fall outside a
# 1-inch-wide figure even though the inline display shows them.
plt.savefig('per_heatmap.jpg', bbox_inches='tight')

# Display the heatmap
plt.show()


In [None]:
# Build one row per `last_name`: total points, number of distinct seasons,
# and total minutes played.
# NOTE(review): grouping on last name alone merges different players who
# share a surname — confirm that is acceptable for this analysis.
player_aggregations = {'pts': 'sum', 'season': 'nunique', 'min_played': 'sum'}
players = data.groupby('last_name').agg(player_aggregations).reset_index()

In [None]:
# Scatter the per-player totals from `players`: points on the x-axis,
# minutes played on the y-axis.
g = sns.relplot(data=players, x='pts', y='min_played')

# Overlay a straight reference line through (10, 2) with slope 2.5
g.ax.axline((10, 2), slope=2.5)

# Title describing the relationship being shown
g.ax.set_title('Relationship between Minutes Played and Points Gained')


In [None]:
# Take the 10 rows of `data` with the highest PER and chart them by last name.
# NOTE(review): if a name appears more than once among these rows, barplot
# collapses them into a single bar (its default estimator is the mean), so
# fewer than 10 names may be shown — confirm that is intended.
# NOTE(review): `errwidth` is deprecated as of seaborn 0.13 in favour of
# `err_kws={'linewidth': 0}` — update once the seaborn version is confirmed.
top_10 = data.nlargest(10, 'PER')
ax = sns.barplot(data=top_10, x='PER', y='last_name', errwidth=0)
# Optional value labels on each bar (left disabled):
#ax.bar_label(ax.containers[0])
ax.set_title('Top 10 players')
plt.tight_layout()