In [55]:
%run ./Imports.ipynb

✅ pandas is already installed.
✅ numpy is already installed.
✅ scikit-learn is already installed.
✅ scipy is already installed.
✅ umap-learn is already installed.
✅ matplotlib is already installed.
✅ seaborn is already installed.
✅ plotly is already installed.
✅ nbformat is already installed.


#### Removal of players with low minutes

In [56]:
# NOTE: Plots the cumulative distribution function (CDF) of a specific column.
# The function highlights a reference cut-off line at 400 (can be changed if needed).

def cdf_graph(dataset: pd.DataFrame, column: str, title: str, cutoff: float = 400):
    """Save the empirical CDF of ``column`` with a vertical cut-off marker.

    Parameters
    ----------
    dataset : pd.DataFrame
        Data containing ``column``.
    column : str
        Column whose cumulative distribution is drawn (log-scaled x axis).
    title : str
        Output file stem; the figure is written to ``f"{title}.png"``.
    cutoff : float, default 400
        x position of the red dashed reference line. The default keeps the
        previously hard-coded value.
    """
    fig, ax = plt.subplots(figsize=(4, 3))
    sns.ecdfplot(data=dataset, x=column, log_scale=(True, False), ax=ax)

    ax.axvline(cutoff, color='red', linestyle='--', label='Cut-off Point')

    ax.set_xlabel('Minutes')
    ax.set_ylabel('Percentage')

    ax.legend()
    fig.tight_layout()

    fig.savefig(f"{title}.png", format="png", dpi=300)
    # Close explicitly so repeated calls do not accumulate open figures.
    plt.close(fig)

#### Highly correlated features


In [57]:
def matrix_of_correlation_graph(
    df: pd.DataFrame,
    columns: "list[str] | None" = None,
    output_file: str = "heatmap.png",
):
    """Save a lower-triangle correlation heatmap of the selected columns.

    Parameters
    ----------
    df : pd.DataFrame
        Source data.
    columns : list[str] or None, default None
        Columns to correlate. ``None`` or an empty sequence uses every column
        of ``df`` (an empty list behaved the same way before).
    output_file : str, default "heatmap.png"
        Path of the saved figure; the default keeps the previous file name.

    Notes
    -----
    Column names are shortened for the tick labels: the ``_MEAN`` suffix is
    dropped, underscores become spaces and shot types are abbreviated
    (CS, MRS, L MRS, TPS), matching the legend drawn above the plot.
    """

    def _short_label(name: str) -> str:
        # Order matters: "MID RANGE SHOT" must be abbreviated before "LONG"
        # so that "LONG MID RANGE SHOT" ends up as "L MRS".
        return (
            name.replace("_MEAN", "")
            .replace("_", " ")
            .replace("CLOSE SHOT", "CS")
            .replace("MID RANGE SHOT", "MRS")
            .replace("LONG", "L")
            .replace("THREE POINT SHOT", "TPS")
        )

    # Use an explicit length test: truthiness is ambiguous for Index/ndarray
    # inputs, and a None default avoids sharing one mutable list across calls.
    use_subset = columns is not None and len(columns) > 0
    filtered_df = df[list(columns)] if use_subset else df
    filtered_df = filtered_df.rename(columns=_short_label)

    corr = filtered_df.corr()

    # Hide the upper triangle (and diagonal): it mirrors the lower one.
    mask = np.triu(np.ones_like(corr, dtype=bool))

    fig, ax = plt.subplots(figsize=(5, 4))
    sns.heatmap(
        corr,
        mask=mask,
        annot=True,
        annot_kws={"size": 4},
        fmt=".2f",
        cmap="coolwarm",
        xticklabels=True,
        yticklabels=True,
        cbar=False,
        ax=ax,
    )

    ax.set_xticklabels(ax.get_xticklabels(), fontsize=8, rotation=90)
    ax.set_yticklabels(ax.get_yticklabels(), fontsize=8, rotation=0)

    # White patches act as invisible handles so the legend shows text only.
    legend_patches = [
        mpatches.Patch(color="white", label="CS = Close Shot"),
        mpatches.Patch(color="white", label="MRS = Mid Range Shot"),
        mpatches.Patch(color="white", label="L MRS = Long Mid Range Shot"),
        mpatches.Patch(color="white", label="TPS = Three Point Shot"),
    ]

    ax.legend(
        handles=legend_patches,
        loc="upper center",
        bbox_to_anchor=(0.5, 1.15),
        frameon=False,
        ncol=2,
        fontsize=8
    )

    fig.tight_layout()
    fig.savefig(output_file, dpi=300, bbox_inches="tight")
    plt.close(fig)

In [58]:
def plot_pca_loadings(df: pd.DataFrame, selected_columns, output_file: str = "pca_plot.png"):
    """Scatter the PC1/PC2 loadings of ``selected_columns`` and save the plot.

    The selected columns are standardized, a full PCA is fitted, and each
    feature is drawn at its (PC1, PC2) loading. Features are styled by the
    shot-type prefix of their name; features whose name matches none of the
    prefixes are not drawn. A fixed set of features is emphasized with a
    black edge and a text label, so that set must be edited when a different
    feature group is plotted. Each plotted loading is also printed.
    """
    group_styles = {
        "CLOSE_SHOT": {"color": "red", "marker": "o"},
        "MID_RANGE_SHOT": {"color": "green", "marker": "s"},
        "LONG_MID_RANGE_SHOT": {"color": "orange", "marker": "D"},
        "THREE_POINT_SHOT": {"color": "blue", "marker": "^"}
    }

    # Short labels for the emphasized features.
    short_names = {
        "CLOSE_SHOT_M_MEAN": "CS M",
        "MID_RANGE_SHOT_U_MEAN": "MRS U",
        "LONG_MID_RANGE_SHOT_M_MEAN": "L MRS M",
        "CLOSE_SHOT_PCT_MEAN": "CS PCT",
        "MID_RANGE_SHOT_PCT_MEAN": "MRS PCT",
    }

    emphasized = [
        "CLOSE_SHOT_M_MEAN",
        "MID_RANGE_SHOT_U_MEAN",
        "LONG_MID_RANGE_SHOT_M_MEAN",
        "CLOSE_SHOT_PCT_MEAN",
        "MID_RANGE_SHOT_PCT_MEAN",
    ]

    standardized = StandardScaler().fit_transform(df[selected_columns])
    pca_model = PCA()
    pca_model.fit(standardized)
    # Rows = features, columns = principal components.
    weights = pca_model.components_.T

    loadings_table = pd.DataFrame({
        "Variable": selected_columns,
        "PC1": weights[:, 0],
        "PC2": weights[:, 1]
    })

    plt.figure(figsize=(5, 4))
    plt.axhline(0, color="gray", linewidth=0.8, linestyle="--")
    plt.axvline(0, color="gray", linewidth=0.8, linestyle="--")

    for prefix, look in group_styles.items():
        in_group = loadings_table["Variable"].str.startswith(prefix)
        for _, entry in loadings_table[in_group].iterrows():
            name = entry["Variable"]
            pc1 = entry["PC1"]
            pc2 = entry["PC2"]
            print(f"{name}: PC1={pc1:.4f}, PC2={pc2:.4f}")

            if name not in emphasized:
                # Plain marker, no outline, no label.
                plt.scatter(
                    pc1, pc2,
                    s=30,
                    color=look["color"],
                    marker=look["marker"],
                    edgecolor="none",
                    alpha=0.75
                )
                continue

            # Emphasized marker: slightly larger, black outline, text label.
            plt.scatter(
                pc1, pc2,
                s=35,
                color=look["color"],
                marker=look["marker"],
                edgecolor="black",
                linewidths=1,
                alpha=0.75
            )
            plt.text(
                pc1 - 0.003, pc2,
                short_names.get(name, name),
                fontsize=10,
                ha="right",
                va="center"
            )

    # One legend entry per feature group, drawn as a bare marker.
    legend_handles = []
    for prefix, look in group_styles.items():
        legend_handles.append(
            plt.Line2D(
                [0], [0],
                marker=look["marker"],
                color="w",
                markerfacecolor=look["color"],
                markersize=10,
                label=prefix
            )
        )

    plt.legend(
        handles=legend_handles,
        fontsize=8,
        ncol=2,
        loc="upper center",
        bbox_to_anchor=(0.5, 1.15),
        frameon=False,
        handlelength=1,
        handletextpad=1,
        columnspacing=1,
        labelspacing=0.8
    )
    plt.xlabel("Principal Component 1")
    plt.ylabel("Principal Component 2")
    plt.tight_layout()
    plt.grid(False)
    plt.savefig(output_file, dpi=300, bbox_inches="tight")
    plt.close()

#### Heavy Tails

In [59]:
# Returns a DataFrame with mean, standard deviation, coefficient of variation, quartiles, median, kurtosis, and skewness for each numeric feature.
# Non-numeric columns are ignored.
# Useful for quick descriptive analysis before modeling or visualization.

def summarize_features(df: pd.DataFrame, output_file: str = "summarized_features") -> pd.DataFrame:
    """Compute descriptive statistics for every numeric column and save them.

    Parameters
    ----------
    df : pd.DataFrame
        Input data; non-numeric columns are ignored.
    output_file : str, default "summarized_features"
        Path of the CSV that is written. The default keeps the previous
        (extension-less) file name.

    Returns
    -------
    pd.DataFrame
        One row per numeric feature with mean, standard deviation,
        coefficient of variation, quartiles, median, kurtosis and skewness.
        Earlier versions wrote the CSV but returned ``None``.
    """
    mean_vals = df.mean(numeric_only=True)
    std_vals = df.std(numeric_only=True)
    # One quantile call for all three cut points (rows are indexed by q).
    quartiles = df.quantile([0.25, 0.50, 0.75], numeric_only=True)

    summary_stats = pd.DataFrame({
        'Mean': mean_vals,
        'Standard Deviation': std_vals,
        'Coefficient of Variation': std_vals / mean_vals,
        'Q1 (25%)': quartiles.loc[0.25],
        'Median (50%)': quartiles.loc[0.50],
        'Q3 (75%)': quartiles.loc[0.75],
        'Kurtosis': df.kurtosis(numeric_only=True),
        'Skewness': df.skew(numeric_only=True),
    })

    summary_stats.to_csv(output_file)

    return summary_stats

In [60]:
def plot_histograms_in_batches(df: pd.DataFrame, batch_size: int = 10, output_prefix: str = 'histograms'):
    """Save histograms of the DataFrame's columns, ``batch_size`` per figure.

    Every column of ``df`` is plotted, so pass a frame that contains only
    numeric columns. Each figure is written to
    ``f"{output_prefix}_{start}.png"`` where ``start`` is the position of the
    first column in that batch.

    Parameters
    ----------
    df : pd.DataFrame
        Data whose columns are plotted.
    batch_size : int, default 10
        Number of histograms per figure. Must be at least 1.
    output_prefix : str, default 'histograms'
        Prefix of the generated PNG files.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # The grid used to be fixed at 2x5, which crashed for batch_size > 10.
    # Keep 5 columns and at least 2 rows (identical layout for batch_size
    # <= 10) and add rows only when a bigger batch needs them.
    grid_cols = 5
    grid_rows = max(2, -(-batch_size // grid_cols))  # ceiling division

    total_columns = len(df.columns)

    for start in range(0, total_columns, batch_size):
        batch_cols = df.columns[start:start + batch_size]

        # 5 inches of height per row reproduces the original (20, 10) figure.
        fig = plt.figure(figsize=(20, 5 * grid_rows))

        for position, col in enumerate(batch_cols, 1):
            ax = fig.add_subplot(grid_rows, grid_cols, position)
            df[col].hist(bins=30, edgecolor='black', ax=ax)
            ax.set_title(col)

        fig.tight_layout()

        fig.savefig(f'{output_prefix}_{start}.png')
        plt.close(fig)

In [61]:
def transform_skewed_features(df: pd.DataFrame, skew_threshold: float = 1) -> pd.DataFrame:
    """Return a copy of ``df`` with strongly skewed columns transformed.

    For every column:

    * if it contains negative values it is shifted so its minimum becomes 1
      (this happens whether or not the column is later transformed);
    * skewness above ``skew_threshold`` triggers a natural-log transform;
    * skewness below ``-skew_threshold`` triggers a Box-Cox transform;
    * if the *original* column contained zeros, 1 is added before either
      transform.

    Skewness and the zero/negative checks are evaluated on the original
    values, before any shift. The input frame is not modified.
    """
    result = df.copy()

    for name in result.columns:
        raw = result[name]
        skew_value = raw.skew()

        contains_zero = (raw == 0).any()

        if (raw < 0).any():
            # Shift so that the smallest value becomes exactly 1.
            result[name] = raw + abs(raw.min()) + 1

        if skew_value > skew_threshold:
            print(f"Transforming (log) column '{name}' with skewness {skew_value:.4f}")
            log_input = result[name] + 1 if contains_zero else result[name]
            result[name] = np.log(log_input)

        elif skew_value < -skew_threshold:
            print(f"Transforming (Box-Cox) column '{name}' with skewness {skew_value:.4f}")
            boxcox_input = result[name] + 1 if contains_zero else result[name]
            # boxcox returns (transformed values, fitted lambda); only the
            # values are kept.
            boxcox_values, _ = boxcox(boxcox_input)
            result[name] = boxcox_values

    return result

In [62]:
def transform_log_boxcox(df, skew_threshold=1):
    """Backward-compatible alias of ``transform_skewed_features``.

    This function used to be a line-for-line copy of
    ``transform_skewed_features`` whose only difference was a pair of
    leftover debug prints. It now delegates to that function so the
    transformation logic lives in one place. The returned DataFrame is
    unchanged; the progress messages are the informative ones printed by
    ``transform_skewed_features``.

    Parameters
    ----------
    df : pd.DataFrame
        Numeric features to transform (the input is not modified).
    skew_threshold : float, default 1
        Absolute skewness above which a column is transformed.

    Returns
    -------
    pd.DataFrame
        Transformed copy of ``df``.
    """
    return transform_skewed_features(df, skew_threshold=skew_threshold)

#### Min-Max

In [63]:
def min_max(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` rescaled column-wise with ``MinMaxScaler``.

    The result keeps the input's column labels and index.
    """
    scaled_values = MinMaxScaler().fit_transform(df)
    return pd.DataFrame(scaled_values, index=df.index, columns=df.columns)

def time_fixer(dataset, n_trailing_columns: int = 12):
    """Min-max scale the statistic columns within each season.

    Columns whose name ends in ``_MEAN``, ``_SKEW``, ``_VAR`` or ``_KUR`` are
    rescaled with a separate ``MinMaxScaler`` per ``SEASON_ID`` group. The
    last ``n_trailing_columns`` columns are then dropped from the result.

    Parameters
    ----------
    dataset : pd.DataFrame
        Must contain a ``SEASON_ID`` column. The input is not modified.
    n_trailing_columns : int, default 12
        Number of right-most columns to drop from the returned frame. The
        default keeps the previously hard-coded value.
        NOTE(review): presumably these are the metadata columns - confirm
        against the caller's column layout.

    Returns
    -------
    pd.DataFrame
        Scaled copy of ``dataset`` without the trailing columns.

    Raises
    ------
    ValueError
        If ``n_trailing_columns`` is negative.
    """
    if n_trailing_columns < 0:
        raise ValueError("n_trailing_columns must be non-negative")

    df = dataset.copy()

    suffixes = ("_MEAN", "_SKEW", "_VAR", "_KUR")
    stat_columns = [col for col in df.columns if col.endswith(suffixes)]

    original_order = df.columns.tolist()

    def scale_group(group):
        # Work on a copy so the slice handed over by groupby is not mutated.
        group = group.copy()
        scaled = MinMaxScaler().fit_transform(group[stat_columns])
        group[stat_columns] = pd.DataFrame(scaled, columns=stat_columns, index=group.index)
        return group

    # MinMaxScaler rejects an empty feature set, so skip scaling when no
    # column carries one of the statistic suffixes.
    if stat_columns:
        df = df.groupby('SEASON_ID', group_keys=False).apply(scale_group)
    df = df[original_order]

    # Compute an explicit stop position: a ``:-0`` slice would drop every
    # column instead of none.
    n_kept = max(df.shape[1] - n_trailing_columns, 0)
    dataset_features = df.iloc[:, :n_kept].copy()

    return dataset_features

#### UMAP

In [64]:
#NOTE
# Applies UMAP (Uniform Manifold Approximation and Projection) for dimensionality reduction.
# Reduces the input DataFrame to 'n_components' dimensions.
# 'n_neighbors' controls the local neighborhood size for manifold approximation.
# 'min_dist=0.0' lets points pack tightly together, which emphasizes local structure and produces denser clumps (useful for clustering).
# Returns a NumPy array with the embedded coordinates.

def umap_method(df: pd.DataFrame, n_components: int, n_neighbors: int, random_state: int) -> np.ndarray:
    """Fit UMAP on ``df`` and return the embedded coordinates.

    ``n_components``, ``n_neighbors`` and ``random_state`` are forwarded to
    ``umap.UMAP``; ``min_dist`` is fixed at 0.0.
    """
    umap_settings = {
        "n_components": n_components,
        "n_neighbors": n_neighbors,
        "min_dist": 0.0,
        "random_state": random_state,
    }
    return umap.UMAP(**umap_settings).fit_transform(df)

In [65]:
def plot_umap_embedding(embedding: np.ndarray, n_neighbors: int, seed: int):
    """Project ``embedding`` with UMAP to 2D and 3D and save both scatter plots.

    Two independent UMAP models are fitted (2 and 3 components, both with
    ``min_dist=0.0``). The figures are written to ``UMAP_2D.png`` and
    ``UMAP_3D.png``. The matplotlib backend is switched to ``Agg`` as a side
    effect.
    """
    # Settings shared by the 2D and 3D projections.
    shared_settings = {
        "n_neighbors": n_neighbors,
        "min_dist": 0.0,
        "random_state": seed,
    }

    # --- 2D projection ---
    coords_2d = umap.UMAP(n_components=2, **shared_settings).fit_transform(embedding)

    plt.switch_backend('Agg')  # NOTE: Ensures plots can be saved without a display (headless)
    plt.figure(figsize=(8, 6))
    plt.scatter(coords_2d[:, 0], coords_2d[:, 1], s=10, c='green')
    plt.xlabel("UMAP 1")
    plt.ylabel("UMAP 2")
    plt.tight_layout()
    plt.savefig('UMAP_2D.png')
    plt.close()

    # --- 3D projection ---
    coords_3d = umap.UMAP(n_components=3, **shared_settings).fit_transform(embedding)

    figure_3d = plt.figure(figsize=(8, 6))
    axes_3d = figure_3d.add_subplot(111, projection='3d')
    axes_3d.scatter(coords_3d[:, 0], coords_3d[:, 1], coords_3d[:, 2], s=10, c='green')
    axes_3d.set_xlabel("UMAP 1")
    axes_3d.set_ylabel("UMAP 2")
    axes_3d.set_zlabel("UMAP 3")
    plt.tight_layout()
    plt.savefig('UMAP_3D.png')
    plt.close()

#### Clustering


In [66]:
def kmeans_method(embedding: np.ndarray, n_clusters: int, seed: int, plot: bool):
    """Cluster ``embedding`` with KMeans and report quality diagnostics.

    Parameters
    ----------
    embedding : np.ndarray
        Points to cluster.
    n_clusters : int
        Number of clusters.
    seed : int
        ``random_state`` passed to ``KMeans`` (and to the optional plot).
    plot : bool
        When true, ``plot_2d_tsne_embedding`` is called with the labels and
        cluster centers.

    Returns
    -------
    tuple
        ``(labels, score, low_scores, closest_indices, inertia)`` where
        ``score`` is the mean silhouette, ``low_scores`` holds the indices of
        points with a negative silhouette, ``closest_indices`` holds, per
        cluster, the index of the point nearest its center, and ``inertia``
        is the KMeans within-cluster sum of squares.
    """
    kmeans = KMeans(n_clusters=n_clusters, random_state=seed)
    labels = kmeans.fit_predict(embedding)

    # The silhouette score is by definition the mean of the per-sample
    # values, so compute the pairwise-distance-based samples once instead of
    # calling both silhouette_score and silhouette_samples.
    silhouette_vals = silhouette_samples(embedding, labels)
    score = float(np.mean(silhouette_vals))
    low_scores = np.where(silhouette_vals < 0)[0]  # points with negative silhouette

    distances = euclidean_distances(embedding, kmeans.cluster_centers_)
    closest_indices = distances.argmin(axis=0)  # closest point to each cluster center

    if plot:
        # NOTE(review): the meaning of the literal 500 depends on
        # plot_2d_tsne_embedding, which is defined elsewhere - confirm.
        plot_2d_tsne_embedding(
            embedding,
            500,
            "k_means_outliers",
            seed,
            labels,
            kmeans.cluster_centers_
        )

    return labels, score, low_scores, closest_indices, kmeans.inertia_

In [67]:
# NOTE:
# Computes and plots silhouette scores for different numbers of clusters.
# Runs KMeans multiple times per cluster count to account for randomness.
# Plots mean silhouette score with a shaded 95% confidence interval.
# Useful for identifying the optimal number of clusters.

def plot_silhouette_analysis(
    embedding: np.ndarray,
    cluster_range=range(2, 11),
    n_runs=10,
    filename='silhouette_score_clusters.png',
    seed=None
) -> pd.DataFrame:
    """Plot the mean silhouette score (with a 95% CI) per cluster count.

    For every ``k`` in ``cluster_range`` KMeans is run ``n_runs`` times with
    different random seeds; the mean score and a normal-approximation 95%
    confidence interval are printed, plotted and returned.

    Parameters
    ----------
    embedding : np.ndarray
        Points to cluster.
    cluster_range : iterable of int, default range(2, 11)
        Cluster counts to evaluate.
    n_runs : int, default 10
        KMeans repetitions per cluster count. Must be at least 1.
    filename : str
        Path of the saved figure.
    seed : int or None, default None
        When given, the per-run KMeans seeds are drawn from a private
        generator seeded with this value, making the analysis reproducible.
        ``None`` keeps the previous behaviour (seeds drawn from the global
        ``random`` module).

    Returns
    -------
    pd.DataFrame
        Columns ``k``, ``mean`` and ``ci95``.

    Raises
    ------
    ValueError
        If ``n_runs`` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")

    # Both the random module and a Random instance expose randint().
    seed_source = random if seed is None else random.Random(seed)

    silhouette_results = []

    for k in cluster_range:
        silhouette_scores = []

        for _ in range(n_runs):
            run_seed = seed_source.randint(0, 2**32 - 1)
            # Only the score (second element) is needed here.
            run_output = kmeans_method(
                embedding=embedding, n_clusters=k, seed=run_seed, plot=False
            )
            silhouette_scores.append(run_output[1])

        mean_val = np.mean(silhouette_scores)
        # A sample standard deviation is undefined for a single run; report
        # a zero-width interval instead of NaN.
        std_val = np.std(silhouette_scores, ddof=1) if n_runs > 1 else 0.0
        ci95 = 1.96 * std_val / np.sqrt(n_runs)

        print(f"Clusters={k}, Mean Silhouette={mean_val:.4f}, 95% CI ±{ci95:.4f}")

        silhouette_results.append({
            'k': k,
            'mean': mean_val,
            'ci95': ci95
        })

    df_silhouette = pd.DataFrame(silhouette_results)

    plt.figure(figsize=(4, 3))
    sns.lineplot(
        data=df_silhouette,
        x='k',
        y='mean',
        marker='o',
        color='blue',
        linewidth=2,
        linestyle='-',
        label='Mean Silhouette Score'
    )

    # Draw the CI of each k as a narrow shaded rectangle around the marker.
    for _, row in df_silhouette.iterrows():
        plt.fill_betweenx(
            [row['mean'] - row['ci95'], row['mean'] + row['ci95']],
            row['k'] - 0.1, row['k'] + 0.1,
            color='blue',
            alpha=0.2
        )

    # Custom handles replace the automatic legend entry created by lineplot.
    line = mlines.Line2D([], [], color='blue', marker='o', linestyle='-', label='Mean')
    rectangle = mpatches.Patch(color='blue', alpha=0.2, label='Confidence Interval')
    plt.legend(handles=[line, rectangle], loc='best')

    plt.xlabel(r'\#\ Clusters')
    plt.ylabel('Silhouette Score')
    plt.xticks(df_silhouette['k'])
    plt.tight_layout()
    plt.savefig(filename, dpi=300, bbox_inches='tight')
    plt.close()

    return df_silhouette


In [68]:
# NOTE:
# Computes and plots the KMeans inertia (within-cluster sum of squares) for a range of cluster numbers.
# Runs KMeans multiple times per cluster count to reduce randomness effects.
# Plots mean inertia with a shaded area representing the 95% confidence interval.

def plot_inertia_analysis(
    embedding: np.ndarray,
    cluster_range=range(2, 10),
    n_runs=30,
    filename='inertia_clusters.png',
    seed=None
) -> pd.DataFrame:
    """Plot the mean KMeans inertia (with a 95% CI) per cluster count.

    For every ``k`` in ``cluster_range`` KMeans is run ``n_runs`` times with
    different random seeds. The shaded band is the normal-approximation 95%
    confidence interval of the mean, computed the same way as in
    ``plot_silhouette_analysis``. It previously showed +/- one standard
    deviation while being labelled "Confidence Interval".

    Parameters
    ----------
    embedding : np.ndarray
        Points to cluster.
    cluster_range : iterable of int, default range(2, 10)
        Cluster counts to evaluate.
    n_runs : int, default 30
        KMeans repetitions per cluster count. Must be at least 1.
    filename : str
        Path of the saved figure.
    seed : int or None, default None
        When given, the per-run KMeans seeds are drawn from a private
        generator seeded with this value. ``None`` keeps the previous
        behaviour (seeds drawn from the global ``random`` module).

    Returns
    -------
    pd.DataFrame
        Columns ``k``, ``mean``, ``std`` (as before) and the new ``ci95``.

    Raises
    ------
    ValueError
        If ``n_runs`` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")

    # Both the random module and a Random instance expose randint().
    seed_source = random if seed is None else random.Random(seed)

    inertia_results = []

    for k in cluster_range:
        inertia_scores = []

        for _ in range(n_runs):
            run_seed = seed_source.randint(0, 2**32 - 1)
            # Only the inertia (last element) is needed here.
            run_output = kmeans_method(
                embedding=embedding,
                n_clusters=k,
                seed=run_seed,
                plot=False
            )
            inertia_scores.append(run_output[4])

        mean_val = np.mean(inertia_scores)
        # A sample standard deviation is undefined for a single run.
        std_val = np.std(inertia_scores, ddof=1) if n_runs > 1 else 0.0

        print(k)
        print(mean_val)
        inertia_results.append({
            'k': k,
            'mean': mean_val,
            'std': std_val,
            'ci95': 1.96 * std_val / np.sqrt(n_runs)
        })

    df_inertia = pd.DataFrame(inertia_results)

    plt.figure(figsize=(4, 3))
    sns.lineplot(
        data=df_inertia, x='k', y='mean', marker='o', color='green',
        linewidth=2, linestyle='-', label='Média da Inércia'
    )

    # Draw the CI of each k as a narrow shaded rectangle around the marker.
    for _, row in df_inertia.iterrows():
        plt.fill_betweenx(
            [row['mean'] - row['ci95'], row['mean'] + row['ci95']],
            row['k'] - 0.1, row['k'] + 0.1,
            color='green', alpha=0.2
        )

    # Custom handles replace the automatic legend entry created by lineplot.
    line = mlines.Line2D([], [], color='green', marker='o', linestyle='-', label='Mean')
    rectangle = mpatches.Patch(color='green', alpha=0.2, label='Confidence Interval')

    plt.legend(handles=[line, rectangle], loc='best')

    plt.xlabel(r'\#\ Clusters')
    plt.ylabel('Inertia (WCSS)')
    plt.xticks(df_inertia['k'])
    plt.tight_layout()

    plt.savefig(filename, dpi=300, bbox_inches='tight')
    plt.show()
    # show() does not release the figure on non-interactive backends.
    plt.close()

    return df_inertia


In [69]:
# NOTE:
# Generates a 'blade-style' silhouette plot for KMeans clustering.
# Each cluster's silhouette scores are sorted and plotted as horizontal 'blades'.
# Helps visualize cohesion and separation of clusters.
# A vertical red dashed line indicates the average silhouette score.

def plot_silhouette_blades(embedding: np.ndarray, n_clusters: int, seed: int):
    """Save a 'blade' silhouette plot for a KMeans clustering.

    KMeans is fitted on ``embedding``; each cluster's per-sample silhouette
    values are sorted and drawn as a horizontal blade, stacked from the top
    of the axis downwards. A red dashed line marks the average silhouette,
    which is also printed. The figure is written to
    ``f"silhouette_analysis_{n_clusters}.png"``.

    Parameters
    ----------
    embedding : np.ndarray
        Points to cluster.
    n_clusters : int
        Number of clusters.
    seed : int
        ``random_state`` passed to ``KMeans``.
    """
    kmeans = KMeans(n_clusters=n_clusters, random_state=seed)
    labels = kmeans.fit_predict(embedding)

    # The average silhouette is the mean of the per-sample values, so the
    # pairwise distances only need to be computed once.
    silhouette_vals = silhouette_samples(embedding, labels)
    silhouette_avg = float(np.mean(silhouette_vals))
    print(f"Average Silhouette Score: {silhouette_avg:.4f}")

    n_points = len(embedding)

    fig, ax = plt.subplots(figsize=(4, 3))
    # Keep the usual -0.1 lower bound but widen it when some points have a
    # more negative silhouette, otherwise those blades would be cut off.
    x_lower = min(-0.1, float(silhouette_vals.min()))
    ax.set_xlim([x_lower, 1])
    ax.set_ylim([0, n_points])

    y_upper = n_points
    for i in range(n_clusters):
        cluster_vals = np.sort(silhouette_vals[labels == i])
        size = cluster_vals.shape[0]
        y_lower = y_upper - size

        # Color using a spectral colormap
        color = cm.nipy_spectral(float(i) / n_clusters)
        ax.fill_betweenx(
            np.arange(y_lower, y_upper), 0, cluster_vals,
            facecolor=color, edgecolor=color, alpha=0.7
        )

        # Cluster number label (1-based for readability)
        ax.text(-0.05, y_lower + 0.5 * size, str(i + 1))

        y_upper = y_lower

    ax.set_xlabel("Silhouette Score")
    ax.set_ylabel("Clusters", labelpad=15)
    ax.axvline(x=silhouette_avg, color="red", linestyle="--", label="Average Silhouette")
    ax.set_xticks([0, 0.2, 0.4, 0.6, 0.8, 1])
    ax.set_yticks([])
    ax.legend()

    fig.tight_layout()
    fig.savefig(f"silhouette_analysis_{n_clusters}.png", dpi=300, bbox_inches="tight")
    plt.close(fig)

In [70]:
# NOTE: 
# Prepares feature and metadata DataFrames for analysis.
# Resets indices, concatenates features and metadata, and scales features to [0,1] using MinMaxScaler.
# If cluster labels are provided, they are added as a 'CLUSTER' column to all returned DataFrames.

def formatted_data_to_analysis(features: pd.DataFrame, metadata: pd.DataFrame, labels=None):
    """Build the three frames used by the analysis plots.

    Both inputs get a fresh 0..n-1 index. Returns a tuple
    ``(features, combined, features_scaled)`` where ``combined`` is
    features and metadata side by side and ``features_scaled`` is the
    features rescaled with ``MinMaxScaler``. When ``labels`` is given it is
    added to all three frames as a ``CLUSTER`` column.
    """
    features = features.reset_index(drop=True)
    metadata = metadata.reset_index(drop=True)

    combined = pd.concat([features, metadata], axis=1)

    # Min-max scaling is used by the Gini plots, which need all features on
    # the same scale.
    features_scaled = pd.DataFrame(
        MinMaxScaler().fit_transform(features.copy()),
        columns=features.columns,
    )

    if labels is None:
        return features, combined, features_scaled

    cluster_labels = np.array(labels)
    for frame in (features, combined, features_scaled):
        frame["CLUSTER"] = cluster_labels

    return features, combined, features_scaled

#### Analysis

In [71]:
# NOTE:
# Choose the clusters colors

# One hex color per cluster, in cluster order; shared by the cluster plots.
colors = [
    '#97d8b2',
    '#9e7b9b',
    '#e7bb41',
    '#531253',
    '#ef3054',
]

In [72]:
# NOTE: Plots the size of each cluster by comparing:
#   1. The total number of rows assigned to each cluster.
#   2. The number of unique players (PLAYER_ID) per cluster.
# Produces a side-by-side bar chart for comparison.
# Cluster labels from KMeans usually start at 0, but the x-axis is shifted (+1) for readability.

def cluster_size(df: pd.DataFrame, title: str= "cluster_size"):
    """Save a grouped bar chart of total rows vs. unique players per cluster.

    ``df`` needs ``CLUSTER`` and ``PLAYER_ID`` columns. Tick labels are the
    cluster labels plus one. The figure is written to ``f'{title}.png'``.
    """
    per_cluster = df.groupby('CLUSTER')[["PLAYER_ID"]]

    # Rows per cluster (a player may appear more than once).
    row_counts = per_cluster.size()
    # Distinct PLAYER_ID values per cluster.
    distinct_counts = per_cluster.nunique()

    cluster_labels = row_counts.index
    bar_width = 0.3

    left_positions = np.arange(len(cluster_labels))
    right_positions = left_positions + bar_width

    plt.figure(figsize=(4, 3))
    plt.bar(left_positions, row_counts.values, color='#A0ACAD', width=bar_width, label='Total')
    plt.bar(right_positions, distinct_counts['PLAYER_ID'].values, color='#97D8B2', width=bar_width, label='Unique')

    plt.xlabel('Clusters')
    plt.ylabel(r"\# Players")
    # Center each tick between the two bars of its cluster.
    plt.xticks(left_positions + bar_width / 2, cluster_labels + 1)

    plt.legend(
        bbox_to_anchor=(0.5, 1),
        loc='upper right',
        ncol=2,
        handlelength=1,
        columnspacing=0.5
    )

    plt.tight_layout()
    plt.savefig(f'{title}.png', dpi=300, bbox_inches='tight')
    plt.close()


In [73]:
# NOTE: 
# Plots the evolution of cluster membership counts over seasons.
# For each cluster, plots a time series showing how many players belong to it each season.
# Cluster indices are incremented by +1 for readability.

def cluster_members_by_year(df: pd.DataFrame, title: str= "cluster_members_by_year"):
    """Save a line chart of the number of rows per cluster for each season.

    ``df`` needs ``CLUSTER`` and ``SEASON_ID`` columns. One line is drawn per
    cluster, labelled with the cluster label plus one and colored from the
    module-level ``colors`` list (cycled). Output: ``f'{title}.png'``.
    """
    season_counts = (
        df.groupby(['CLUSTER', 'SEASON_ID'])
        .size()
        .reset_index(name='Count')
    )

    plt.figure(figsize=(4, 3))

    for position, cluster_id in enumerate(season_counts['CLUSTER'].unique()):
        rows = season_counts.loc[season_counts['CLUSTER'] == cluster_id]
        line_color = colors[position % len(colors)]
        plt.plot(
            rows['SEASON_ID'],
            rows['Count'],
            label=f'Cluster {cluster_id + 1}',
            color=line_color
        )

    plt.xticks(fontsize=6, rotation=90)
    plt.xlabel('Season')
    plt.ylabel(r'\# Players')
    plt.legend(loc="upper left", ncol=2, fontsize=8)
    plt.tight_layout()
    plt.grid(False)

    plt.savefig(f'{title}.png', dpi=300, bbox_inches='tight')
    plt.close()

In [74]:
# NOTE: 
# Plots cumulative cluster membership over years.
# Groups dataset by cluster and season (SEASON_ID).
# Computes cumulative player count per cluster across seasons.
# Each cluster’s curve shows how many players joined cumulatively up to each year.
# X-axis uses the starting year of each season for better readability.

def cluster_members_total(df: pd.DataFrame, title: str= "accumulate_cluster_by_year"):
    """Save a line chart of cumulative rows per cluster over the seasons.

    ``df`` needs ``CLUSTER`` and ``SEASON_ID`` columns; ``SEASON_ID`` must be
    a string whose part before the first ``-`` is an integer year, which is
    used on the x axis. Lines are labelled with the cluster label plus one
    and colored from the module-level ``colors`` list (cycled).
    Output: ``f'{title}.png'``.
    """
    timeline = (
        df.groupby(['CLUSTER', 'SEASON_ID'])
        .size()
        .reset_index(name='Count')
        .sort_values(['SEASON_ID', 'CLUSTER'])
    )

    # Running total inside each cluster, following the season order above.
    timeline['Cumulative_Count'] = timeline.groupby('CLUSTER')['Count'].cumsum()

    # Starting year of the season, e.g. the text before the first '-'.
    timeline['Year'] = timeline['SEASON_ID'].str.split('-').str[0].astype(int)

    plt.figure(figsize=(4, 3))

    for position, cluster_id in enumerate(timeline['CLUSTER'].unique()):
        rows = timeline.loc[timeline['CLUSTER'] == cluster_id]
        line_color = colors[position % len(colors)]
        plt.plot(
            rows['Year'],
            rows['Cumulative_Count'],
            label=f'Cluster {cluster_id + 1}',
            color=line_color
        )

    plt.xlabel('Year', loc='center')
    plt.ylabel(r'\# Players')

    plt.xticks(fontsize=8, rotation=90)
    plt.yticks(fontsize=8)

    plt.legend(loc="upper left", ncol=2, fontsize=8)

    plt.grid(False)
    plt.tight_layout()

    plt.savefig(f'{title}.png', dpi=300, bbox_inches='tight')
    plt.close()


In [75]:
# NOTE:
# Plots bar chart of individual awards (MVP, DPOY, MIP, 6MOY) by cluster.
# Counts how many players in each cluster won each award.
# Uses grouped bar plot for side-by-side comparison.
# Each award type has a distinct color.

def awards_by_cluster(df: pd.DataFrame, title: str= "cluster_awards"):
    """Save a grouped bar chart of individual awards won per cluster.

    For each award column (``MVP``, ``DPOY``, ``MIP``, ``SMOY``) the rows
    flagged with 1 are counted per cluster. The bar groups follow the
    cluster labels actually present in ``df`` instead of a fixed list of
    five, so the plot works for any number of clusters; for labels 0..4 the
    output is the same as before. Tick labels are the cluster labels plus
    one. The figure is written to ``f'{title}.png'``.

    Parameters
    ----------
    df : pd.DataFrame
        Needs ``CLUSTER`` plus the four award columns.
    title : str
        Output file stem.
    """
    cluster_ids = sorted(df["CLUSTER"].unique())

    # (column in df, bar color, legend label), in drawing order.
    award_specs = [
        ("MVP", '#531253', 'MVP'),
        ("DPOY", '#e09891', 'DPOY'),
        ("MIP", '#a0acad', 'MIP'),
        ("SMOY", '#97d8b2', '6MOY'),
    ]

    width = 0.15
    base_positions = np.arange(len(cluster_ids))

    plt.figure(figsize=(4, 3))

    for slot, (column, color, label) in enumerate(award_specs):
        winners_per_cluster = (
            df[df[column] == 1]
            .groupby("CLUSTER")[column]
            .count()
            # Clusters without a winner must still get a (zero) bar.
            .reindex(cluster_ids, fill_value=0)
        )
        plt.bar(
            base_positions + width * slot,
            winners_per_cluster.values,
            color=color,
            width=width,
            label=label
        )

    plt.xlabel('Cluster')
    plt.ylabel('Individual Awards')

    # Center each tick under its group of four bars.
    plt.xticks(base_positions + width * 1.5, [cluster_id + 1 for cluster_id in cluster_ids])

    plt.legend(loc='upper left', fontsize=6, ncol=4, handlelength=0.8)

    plt.tight_layout()
    plt.savefig(f'{title}.png', dpi=300, bbox_inches='tight')
    plt.close()

In [None]:
# NOTE
# Computes Gini index and Lorenz curve utilities.
# gini(arr): returns the Gini coefficient for inequality measurement.
# lorenz(arr): returns cumulative Lorenz curve values.

def gini(arr):
    """Return the Gini coefficient of ``arr``.

    Parameters
    ----------
    arr : array-like
        Values to measure; sorted internally, so order does not matter.
        Negative values are clipped to 0 because the coefficient is not
        defined for them.

    Returns
    -------
    float or int
        The Gini coefficient. ``0`` is returned for empty input and when
        all (clipped) values are zero, instead of dividing by zero.
    """
    values = np.sort(np.asarray(arr, dtype=float).ravel())
    values = np.clip(values, 0, None)  # clipping keeps the sorted order

    count = values.size
    if count == 0:
        return 0  # NOTE: empty input used to raise ZeroDivisionError

    total = values.sum()
    if total == 0:
        return 0  # NOTE: Avoid division by zero

    # Rank-weighted formula: G = 2 * sum(i * x_i) / (n * sum(x)) - (n + 1) / n
    ranks = np.arange(1, count + 1)
    weighted_sum = (ranks * values).sum()

    return 2 / count * weighted_sum / total - (count + 1) / count


def lorenz(arr):
    """Return the Lorenz curve of ``arr`` as cumulative shares.

    Parameters
    ----------
    arr : array-like
        Values to accumulate. They are sorted in ascending order internally
        (like ``gini`` does), so already sorted input gives the same result
        as before and unsorted input now yields a proper Lorenz curve.

    Returns
    -------
    np.ndarray
        Array of length ``len(arr) + 1`` that starts at 0 and, for a
        non-zero total, ends at 1. When the total is zero the line of
        perfect equality is returned instead of NaNs, matching ``gini``,
        which reports 0 in that case.
    """
    values = np.sort(np.asarray(arr, dtype=float).ravel())
    total = values.sum()

    if total == 0:
        # All-zero (or empty) input: fall back to the equality line.
        return np.linspace(0.0, 1.0, values.size + 1)

    cumulative_share = values.cumsum() / total
    return np.insert(cumulative_share, 0, 0)  # NOTE: Ensure curve starts at 0


# NOTE:
# Computes average Gini per variable across clusters and ranks variables.
# Groups dataset by CLUSTER.
# Averages the per-cluster Gini values of each variable.
# Returns top N variables with highest mean inequality.

def gini_most(df: pd.DataFrame, num_vars: int) -> list:
    """Rank the ``*MEAN*`` features by their average within-cluster Gini.

    For every column whose name contains ``MEAN`` the Gini coefficient is
    computed inside each cluster and averaged over the clusters. The
    ``num_vars`` features with the highest average are printed, rendered as
    a matplotlib table and returned.

    Parameters
    ----------
    df : pd.DataFrame
        Needs a ``CLUSTER`` column.
    num_vars : int
        Number of top features to report.

    Returns
    -------
    list of str
        Names of the top features, highest average Gini first. Earlier
        versions built this list but returned ``None``.

    Notes
    -----
    The table figure is neither saved nor closed here.
    NOTE(review): presumably it is left open so the notebook shows it
    inline - confirm.
    """
    grouped = df.groupby('CLUSTER')

    mean_gini_by_feature = {}
    for column in df.columns:
        if column == 'CLUSTER' or 'MEAN' not in column:
            continue
        # gini() sorts its input, so the values can be passed as they are.
        per_cluster = [gini(group[column].to_numpy()) for _, group in grouped]
        mean_gini_by_feature[column] = np.mean(per_cluster)

    ranked = sorted(mean_gini_by_feature.items(), key=lambda item: item[1], reverse=True)
    top_vars = ranked[:num_vars]

    features = []
    for var, mean_gini in top_vars:
        print(f'Feature: {var}, Gini Index: {mean_gini}')
        features.append(var)

    df_top = pd.DataFrame(top_vars, columns=['Features', 'Mean'])

    fig, ax = plt.subplots(figsize=(6, 3))
    ax.axis('tight')
    ax.axis('off')

    table = ax.table(
        cellText=df_top.values,
        colLabels=df_top.columns,
        cellLoc='center',
        loc='center'
    )

    table.auto_set_font_size(False)
    table.set_fontsize(10)
    table.scale(1.2, 1.2)

    return features

In [77]:
# NOTE: 
# Plots mean + 95% confidence intervals of a feature per cluster.
# For each cluster: computes mean, SEM, and 95% CI.
# Displays as bar plot with error bars.

def ginix_diference_plot(df: pd.DataFrame, column: str, title: str):
    """Save a bar chart of the per-cluster mean of ``column`` with 95% CIs.

    For each cluster the mean, the standard error and a Student-t 95%
    confidence interval are computed. Clusters with a single row, or whose
    standard error is zero or undefined, get a zero-width interval instead
    of NaN error bars. Bars are positioned at the cluster label plus one.
    The figure is written to ``f'{title}.png'``.

    Parameters
    ----------
    df : pd.DataFrame
        Needs a ``CLUSTER`` column and ``column``.
    column : str
        Feature to summarize; also used as the y-axis label.
    title : str
        Output file stem.

    Notes
    -----
    The figure is not closed after saving.
    NOTE(review): presumably so the notebook shows it inline - confirm.
    """
    graph_data = []

    for cluster, group in df.groupby('CLUSTER')[column]:
        mean = np.mean(group)
        # Named ``dof`` so it no longer shadows the ``df`` DataFrame argument.
        dof = len(group) - 1
        # The standard error needs at least two observations.
        sem = stats.sem(group) if dof > 0 else 0.0

        if dof > 0 and sem > 0:
            interval = stats.t.interval(
                confidence=0.95,
                df=dof,
                loc=mean,
                scale=sem
            )
        else:
            # t.interval yields NaN for a non-positive scale; collapse the
            # interval onto the mean instead.
            interval = (mean, mean)

        graph_data.append({
            'cluster': cluster,
            'mean': mean,
            'lower': interval[0],
            'upper': interval[1]
        })

    graph_df = pd.DataFrame(graph_data).set_index('cluster')
    x_vals = graph_df.index + 1

    def decimal_format(x, pos):
        return f'{x:.1f}'
    formatter = FuncFormatter(decimal_format)

    plt.figure(figsize=(4, 3))

    plt.bar(x=x_vals, height=graph_df['mean'], color='red', alpha=0.7)

    # Asymmetric error bars: distance from the mean to each CI bound.
    plt.errorbar(
        x=x_vals,
        y=graph_df['mean'],
        yerr=[graph_df['mean'] - graph_df['lower'],
              graph_df['upper'] - graph_df['mean']],
        fmt='none',
        ecolor='black',
        capsize=3,
        elinewidth=0.8
    )

    plt.xticks(x_vals)
    plt.xlabel('Clusters')
    plt.ylabel(column)
    plt.gca().yaxis.set_major_formatter(formatter)

    plt.tight_layout()
    plt.savefig(f'{title}.png', dpi=300, bbox_inches='tight')

In [78]:
# Default feature set for the radar charts; 'CLUSTER' must stay in the list
# because plotradar selects these columns before grouping by cluster.
plotradar_columns = [
    "FG3M_MEAN",
    "LONG_MID_RANGE_SHOT_M_MEAN",
    "BLK_MEAN",
    "MID_RANGE_SHOT_U_MEAN",
    "AST_MEAN",
    "FTU_MEAN",
    "OREB_MEAN",
    "CLUSTER",
]

# NOTE: 
# Plots radar charts for the mean values of selected features per cluster.
# Each subplot corresponds to a cluster. The radar shows normalized mean values for the given features.
# Requires a 'CLUSTER' column in the dataset.

def plotradar(data: pd.DataFrame, colors=colors, columns=plotradar_columns, title: str = "plot_radar"):
    """Draw one radar chart per cluster showing the mean of each feature.

    Only clusters labelled 0-4 are considered; those present in `data` get one
    polar subplot each, in ascending order. Spokes are labelled with the
    1-based position of the feature in `columns`, not with the feature name.
    Values are plotted as-is on a radial axis limited to [0, 0.5]; presumably
    the features are already scaled -- TODO confirm.
    The figure is saved as '<title>.png'.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain every name in `columns`, including 'CLUSTER'.
    colors : sequence
        Colour per subplot, indexed by subplot position.
    columns : list[str]
        Feature columns to plot plus 'CLUSTER'.
    title : str
        Output file name without the '.png' extension.

    Raises
    ------
    ValueError
        If no row has a CLUSTER value in 0-4.
    """
    df = pd.DataFrame(data)[columns]

    in_scope = df["CLUSTER"].isin([0, 1, 2, 3, 4])
    clusters = np.sort(df.loc[in_scope, "CLUSTER"].unique())
    num_clusters = len(clusters)
    if num_clusters == 0:
        # Fail with a clear message instead of matplotlib's zero-column grid error.
        raise ValueError("No rows with CLUSTER in 0-4; nothing to plot.")

    # The spoke geometry depends only on the feature count, so compute it once.
    num_features = len(df.drop(columns=["CLUSTER"]).columns)
    angles = np.linspace(0, 2 * np.pi, num_features, endpoint=False).tolist()
    closed_angles = angles + angles[:1]  # NOTE: Close the radar polygon
    spoke_labels = [str(k) for k in range(1, num_features + 1)]
    label_radius = 0.2  # radial coordinate at which spoke labels are placed

    # squeeze=False always yields a 2-D axes array, even for a single cluster.
    fig, axes = plt.subplots(
        1, num_clusters,
        figsize=(num_clusters * 1.5, 1.5),
        subplot_kw={'polar': True},
        squeeze=False
    )

    for i, (ax, cluster) in enumerate(zip(axes[0], clusters)):
        cluster_data = df[df["CLUSTER"] == cluster].drop(columns=["CLUSTER"])

        # Start at the top and go clockwise.
        ax.set_theta_offset(np.pi / 2)
        ax.set_theta_direction(-1)

        ax.set_xticks(angles)
        ax.set_xticklabels(spoke_labels, fontsize=10)

        ax.set_rlabel_position(0)
        ax.set_yticks(np.arange(0, 1.1, 0.2))
        ax.set_yticklabels(["0.0", "0.2", "0.4", "0.6", "0.8", "1.0"], color="grey", fontsize=8)
        ax.set_ylim(0, 0.5)

        mean_values = cluster_data.mean().tolist()
        mean_values += mean_values[:1]

        ax.plot(closed_angles, mean_values, linewidth=1, linestyle="solid", color=colors[i])
        ax.fill(closed_angles, mean_values, alpha=0.2, color=colors[i])

        for label, angle in zip(ax.get_xticklabels(), angles):
            label.set_position((angle, label_radius))
            label.set_fontsize(10)

    fig.tight_layout()
    fig.savefig(f'{title}.png', dpi=300, bbox_inches='tight')

In [79]:
# NOTE: 
# Computes cluster migration percentages for players across seasons.
# 'Pct_Leaving' = % of players leaving the origin cluster to a specific destination cluster.
# 'Pct_Total' = % of total migrations represented by this specific transfer.

def analyze_cluster_migration_percentages(df: pd.DataFrame, season_ids: list, n_clusters: int) -> pd.DataFrame:
    """Compute how players move between clusters across the selected seasons.

    For every player, rows are ordered by SEASON_ID and each row is paired with
    the player's next row among the selected seasons. Pairs where the cluster
    changes count as one transfer. Only transfers between clusters in
    ``range(n_clusters)`` are reported.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'PLAYER_ID', 'SEASON_ID' and 'CLUSTER' (0-based labels).
    season_ids : list
        Seasons to include.
    n_clusters : int
        Number of clusters; every ordered pair (i, j), i != j, gets a row.

    Returns
    -------
    pd.DataFrame
        Columns 'FROM_CLUSTER', 'TO_CLUSTER' (both 1-based), 'Pct_Leaving'
        (share of the origin cluster's departures going to this destination)
        and 'Pct_Total' (share of all transfers). Both percentages are floats
        and are 0.0 where the denominator is zero.
    """
    years_data = (
        df[df["SEASON_ID"].isin(season_ids)]
        .sort_values(by=["PLAYER_ID", "SEASON_ID"])
        # Cluster of the same player's next row; NaN for a player's last row.
        .assign(NEXT_CLUSTER=lambda d: d.groupby("PLAYER_ID")["CLUSTER"].shift(-1))
    )

    moved = years_data.dropna(subset=["NEXT_CLUSTER"])
    moved = moved[moved["CLUSTER"] != moved["NEXT_CLUSTER"]]

    observed = (
        moved
        .groupby(["CLUSTER", "NEXT_CLUSTER"])
        .size()
        .reset_index(name="Transfers")
    )

    # Every ordered pair of distinct clusters, so unobserved moves show up as 0.
    all_pairs = pd.DataFrame(
        [(i, j) for i in range(n_clusters) for j in range(n_clusters) if i != j],
        columns=["FROM_CLUSTER", "TO_CLUSTER"]
    )

    migration_counts = (
        all_pairs
        .merge(
            observed,
            left_on=["FROM_CLUSTER", "TO_CLUSTER"],
            right_on=["CLUSTER", "NEXT_CLUSTER"],
            how="left"
        )
        .fillna({"Transfers": 0})
        .drop(columns=["CLUSTER", "NEXT_CLUSTER"])
    )

    transfers = migration_counts["Transfers"]
    total_leaving = migration_counts.groupby("FROM_CLUSTER")["Transfers"].transform("sum")
    total_transfers = transfers.sum()

    # Mask zero denominators to NaN, divide, then report those rows as 0.0.
    migration_counts["Pct_Leaving"] = (
        transfers / total_leaving.where(total_leaving > 0) * 100
    ).fillna(0.0)

    if total_transfers > 0:
        migration_counts["Pct_Total"] = transfers / total_transfers * 100
    else:
        migration_counts["Pct_Total"] = 0.0

    # Report clusters 1-based.
    migration_counts["FROM_CLUSTER"] = migration_counts["FROM_CLUSTER"] + 1
    migration_counts["TO_CLUSTER"] = migration_counts["TO_CLUSTER"] + 1

    return migration_counts[["FROM_CLUSTER", "TO_CLUSTER", "Pct_Leaving", "Pct_Total"]]


In [80]:
# NOTE: 
# Plots the distribution of player positions across clusters.
# Groups similar positions into general categories: Center, Forward, Guard.
# Uses a grouped bar chart to show counts per cluster for each position.

def plot_players_position_by_cluster(df: pd.DataFrame, title: str = "player_position_by_cluster"):
    """Grouped bar chart of player counts per cluster for Center/Forward/Guard.

    Hybrid positions are folded into a general category before counting:
    'Center-Forward' -> 'Center'; 'Forward-Center' and 'Forward-Guard' ->
    'Forward'; 'Guard-Forward' -> 'Guard'. The input frame is not modified.
    The figure is saved as '<title>.png'.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'CLUSTER' (0-based numeric labels) and 'POSITION'.
    title : str
        Output file name without the '.png' extension.
    """
    position_groups = {
        "Center-Forward": "Center",
        "Forward-Center": "Forward",
        "Forward-Guard": "Forward",
        "Guard-Forward": "Guard",
    }
    # Series.replace returns a new Series, so the caller's column is untouched.
    general_position = df["POSITION"].replace(position_groups)

    positions = (
        df.groupby([df["CLUSTER"], general_position])
        .size()
        .unstack(fill_value=0)
        # Guarantee the three plotted categories exist even if one has no players.
        .reindex(columns=["Center", "Forward", "Guard"], fill_value=0)
    )

    # Derive tick labels from the data (shown 1-based) instead of assuming five clusters.
    cluster_labels = positions.index + 1

    width = 0.15
    x = np.arange(len(positions))

    plt.figure(figsize=(4, 3))

    plt.bar(x - width, positions["Center"].values, width=width, color='#531253', label='Center')
    plt.bar(x, positions["Forward"].values, width=width, color='#e09891', label='Forward')
    plt.bar(x + width, positions["Guard"].values, width=width, color='#e7bb41', label='Guard')

    plt.xlabel('Clusters')
    plt.ylabel('# Players')
    plt.xticks(x, cluster_labels)
    plt.legend(loc='upper left', fontsize=8, ncol=3, handlelength=0.6)
    plt.tight_layout()
    plt.savefig(f'{title}.png', dpi=300, bbox_inches='tight')

In [None]:
def draw_court(ax=None, color='black', lw=2, outer_lines=False):
    """Add the line markings of a basketball half court to a matplotlib axes.

    Parameters
    ----------
    ax : matplotlib axes, optional
        Axes to draw on; the current axes (``plt.gca()``) is used when omitted.
    color : str
        Colour applied to every court element.
    lw : float
        Line width applied to every court element.
    outer_lines : bool
        When true, also draw the rectangle bounding the half court.

    Returns
    -------
    The axes the patches were added to.
    """
    target_ax = plt.gca() if ax is None else ax

    # Shared look for every patch.
    line_style = {'linewidth': lw, 'color': color}

    # Patches are listed in the order they are added to the axes.
    patches = [
        # Hoop
        Circle((0, 0), radius=7.5, fill=False, **line_style),
        # Backboard
        Rectangle((-30, -7.5), 60, -1, **line_style),
        # Paint: outer box
        Rectangle((-80, -47.5), 160, 190, fill=False, **line_style),
        # Paint: inner box
        Rectangle((-60, -47.5), 120, 190, fill=False, **line_style),
        # Free-throw circle, top half
        Arc((0, 142.5), 120, 120, theta1=0, theta2=180, fill=False,
            **line_style),
        # Free-throw circle, bottom half (dashed)
        Arc((0, 142.5), 120, 120, theta1=180, theta2=0, linestyle='dashed',
            **line_style),
        # Restricted area arc
        Arc((0, 0), 80, 80, theta1=0, theta2=180, **line_style),
        # Corner three-point lines (zero-width rectangles, i.e. straight lines)
        Rectangle((-220, -47.5), 0, 140, **line_style),
        Rectangle((220, -47.5), 0, 140, **line_style),
        # Three-point arc
        Arc((0, 0), 475, 475, theta1=22, theta2=158, **line_style),
        # Center-court arcs
        Arc((0, 422.5), 120, 120, theta1=180, theta2=0, **line_style),
        Arc((0, 422.5), 40, 40, theta1=180, theta2=0, **line_style),
    ]

    if outer_lines:
        # Half-court boundary
        patches.append(
            Rectangle((-250, -47.5), 500, 470, fill=False, **line_style)
        )

    for patch in patches:
        target_ax.add_patch(patch)

    return target_ax

# NOTE: 
# Plots a shot map for a given player.
# Made shots are shown as green circles, missed shots as red Xs.
# Accepts any player_id and output filename/title.

def plot_players_shot_map(shot_df: pd.DataFrame, player_id: int, title: str = "shot_map"):
    """Save a shot map for one player as '<title>.png'.

    Shots with SHOT_MADE_FLAG == 1 are drawn as green circles; all other shots
    are drawn as red crosses. The court markings are added with `draw_court`.

    Parameters
    ----------
    shot_df : pd.DataFrame
        Must contain 'PLAYER_ID', 'SHOT_MADE_FLAG', 'LOC_X' and 'LOC_Y'.
    player_id : int
        Player whose shots are plotted.
    title : str
        Output file name without the '.png' extension.
    """
    player_shots = shot_df[shot_df["PLAYER_ID"] == player_id]

    made_mask = player_shots["SHOT_MADE_FLAG"] == 1
    made = player_shots[made_mask]
    missed = player_shots[~made_mask]

    fig, ax = plt.subplots(figsize=(3.5, 4))

    # One scatter call per outcome instead of one call per shot.
    ax.scatter(made['LOC_X'], made['LOC_Y'], color='#76b947', marker='o')      # Green circle
    ax.scatter(missed['LOC_X'], missed['LOC_Y'], color='#ba0f30', marker='x')  # Red X

    draw_court(ax)

    ax.set_xlim(-250, 250)
    # Reversed limits flip the y axis.
    ax.set_ylim(422.5, -47.5)

    ax.set_xticks([])
    ax.set_yticks([])

    fig.tight_layout()
    fig.savefig(f'{title}.png', dpi=300, bbox_inches='tight')
    plt.close(fig)