In [55]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from typing import List

# Load the points data and aggregate per-player totals for a single season.
def _most_frequent_value(values):
    """Return the most frequent non-null value in *values*, or NaN if there is none."""
    counts = values.value_counts()  # value_counts() ignores missing values
    # An all-missing group yields an empty result; indexing it would raise IndexError.
    return counts.index[0] if not counts.empty else np.nan


def load_and_process_data(file_path, season=1):
    """Aggregate per-player statistics for one season.

    Args:
        file_path: Path (or anything ``pd.read_excel`` accepts) of the points
            workbook. An already-loaded ``pd.DataFrame`` is also accepted so the
            workbook can be read once and aggregated for several seasons.
        season: Value of the ``Season`` column to keep. Defaults to 1, which is
            the season the original code was hard-wired to.

    Returns:
        A DataFrame indexed by ``Player`` with summed point columns,
        ``Games Played`` (count of non-null ``Date`` entries), the most
        frequent ``Position`` and ``Average Points per Game``.
    """
    if isinstance(file_path, pd.DataFrame):
        data = file_path
    else:
        data = pd.read_excel(file_path)

    # NOTE: this changes a process-wide pandas display option; it is kept
    # because the notebook relies on it to show full tables.
    pd.set_option('display.max_rows', None)

    # Boolean indexing returns a new frame, so a caller-supplied DataFrame is
    # never modified.
    data = data[data["Season"] == season]

    agg_data = data.groupby('Player').agg({
        'Total Points': 'sum',
        'Goal Points': 'sum',
        'Defensive Score Points': 'sum',
        'Midfield Score': 'sum',
        'MVP Points': 'sum',
        'Date': 'count',
        'Position': _most_frequent_value,
    }).rename(columns={'Date': 'Games Played'})

    agg_data['Average Points per Game'] = agg_data['Total Points'] / agg_data['Games Played']

    return agg_data

# Cluster the players and attach a price to every cluster.
# NOTE: a later definition with the same name in this file replaces this one
# once it has been executed.
def assign_clusters_and_prices(agg_data, scaled_data):
    """Label each player with a KMeans cluster and a cluster-level price.

    Adds 'Cluster' and 'Price (in $M)' columns to ``agg_data`` in place and
    returns that same DataFrame.
    """
    n_clusters = determine_optimal_clusters(scaled_data)
    model = KMeans(n_clusters=n_clusters, init='k-means++', n_init=10, random_state=42)
    agg_data['Cluster'] = model.fit_predict(scaled_data)

    # Cluster ids ordered from highest to lowest mean points per game.
    mean_points_by_cluster = agg_data.groupby('Cluster')['Average Points per Game'].mean()
    ranked_cluster_ids = mean_points_by_cluster.sort_values(ascending=False).index

    # Evenly spaced integer prices from 10 down to 2, one per cluster.
    price_ladder = np.linspace(10, 2, num=n_clusters).round().astype(int)

    # The best-scoring cluster receives the highest price, and so on down.
    price_by_cluster = dict(zip(ranked_cluster_ids, price_ladder))
    agg_data['Price (in $M)'] = agg_data['Cluster'].map(price_by_cluster)

    return agg_data

# Clustering helpers (assign_clusters_and_prices is redefined below and replaces the earlier definition)

def determine_optimal_clusters(scaled_data, min_clusters=3, max_clusters=5):
    """Choose a cluster count from the within-cluster sum of squares (WCSS).

    KMeans is fitted for every count in ``[min_clusters, max_clusters]`` and
    the count reached by the single largest drop in WCSS is returned, i.e. if
    going from k to k+1 clusters reduces WCSS the most, the result is k+1.

    Args:
        scaled_data: Feature matrix passed to ``KMeans.fit``.
        min_clusters: Smallest cluster count to try.
        max_clusters: Largest cluster count to try (inclusive).

    Returns:
        The selected number of clusters as a plain ``int``.

    Raises:
        ValueError: If ``max_clusters`` is smaller than ``min_clusters``.
    """
    if max_clusters < min_clusters:
        raise ValueError(
            f"max_clusters ({max_clusters}) must be >= min_clusters ({min_clusters})"
        )
    if max_clusters == min_clusters:
        # Only one candidate: there is no WCSS difference to compare, and
        # np.argmin on an empty array would raise.
        return int(min_clusters)

    wcss = []
    for n_clusters in range(min_clusters, max_clusters + 1):
        kmeans = KMeans(n_clusters=n_clusters, init='k-means++', n_init=10, random_state=42)
        kmeans.fit(scaled_data)
        wcss.append(kmeans.inertia_)

    # diffs[i] = wcss[i + 1] - wcss[i]; the most negative entry is the
    # steepest drop in WCSS.
    diffs = np.diff(wcss)

    # diffs[i] is the step from (min_clusters + i) to (min_clusters + i + 1)
    # clusters, hence the "+ 1" to land on the count after the drop.
    return int(np.argmin(diffs)) + min_clusters + 1

def assign_clusters_and_prices(agg_data, scaled_data, *, max_price=10, min_price=2):
    """Cluster players with KMeans and assign one price per cluster.

    The number of clusters comes from ``determine_optimal_clusters``. Clusters
    are ranked by their mean 'Average Points per Game'; the best cluster gets
    ``max_price``, the worst gets ``min_price`` and the ones in between get
    evenly spaced prices rounded to whole numbers.

    Args:
        agg_data: Per-player DataFrame containing 'Average Points per Game'.
            It is modified in place: 'Cluster' and 'Price (in $M)' columns are
            added (or overwritten).
        scaled_data: Feature matrix, row-aligned with ``agg_data``, that is
            passed to KMeans.
        max_price: Price of the highest-scoring cluster (default 10).
        min_price: Price of the lowest-scoring cluster (default 2).

    Returns:
        The same ``agg_data`` object, with the two columns added.
    """
    optimal_clusters = determine_optimal_clusters(scaled_data)
    kmeans = KMeans(n_clusters=optimal_clusters, init='k-means++', n_init=10, random_state=42)
    agg_data['Cluster'] = kmeans.fit_predict(scaled_data)

    # Mean points per game for each cluster, best cluster first.
    cluster_averages = (
        agg_data.groupby('Cluster')['Average Points per Game']
        .mean()
        .sort_values(ascending=False)
    )

    # One price per cluster, linearly decreasing from max_price to min_price.
    prices = np.linspace(max_price, min_price, num=optimal_clusters).round().astype(int)

    # Pair the ranked cluster ids with the descending prices.
    price_mapping = dict(zip(cluster_averages.index, prices))

    agg_data['Price (in $M)'] = agg_data['Cluster'].map(price_mapping)

    return agg_data

def cluster_summary(clustered_data):
    """Return one row of aggregate statistics per cluster.

    The result has the columns 'Cluster', 'Number of Players',
    'Avg. Total Points', 'Average Games Played', 'Average Points/Game' and
    'Price (in $M)'.
    """
    # Output column name -> (source column, aggregation).
    aggregations = {
        'Number of Players': ('Total Points', 'count'),
        'Avg. Total Points': ('Total Points', 'mean'),
        'Average Games Played': ('Games Played', 'mean'),
        'Average Points/Game': ('Average Points per Game', 'mean'),
        # Taking the first value is enough when every player in a cluster
        # carries the same price.
        'Price (in $M)': ('Price (in $M)', 'first'),
    }

    per_cluster = clustered_data.groupby('Cluster').agg(**aggregations)
    return per_cluster.reset_index()
# Input workbook (presumably one row per player per game -- confirm against the file).
file = "/workspaces/SPL/Middleput/points.xlsx"
# Per-player season aggregates, indexed by player.
aggregated_data = load_and_process_data(file)
scaler = StandardScaler()
# Scale every column except the non-numeric 'Position'. This must run before
# assign_clusters_and_prices, which adds 'Cluster' and 'Price (in $M)' columns
# to aggregated_data in place.
scaled_data = scaler.fit_transform(aggregated_data.drop(columns='Position'))

# reset_index() turns the 'Player' index into a regular column for later cells.
clustered_data = assign_clusters_and_prices(aggregated_data, scaled_data).reset_index().round(2)
clustered_summary = cluster_summary(clustered_data).round(2)
# Bare expression: displayed as the notebook cell's output.
clustered_summary

Unnamed: 0,Cluster,Number of Players,Avg. Total Points,Average Games Played,Average Points/Game,Price (in $M)
0,0,26,20.92,2.62,7.85,2
1,1,7,206.86,18.71,11.2,7
2,2,8,157.12,15.5,10.1,5
3,3,23,51.17,4.26,12.81,10


In [57]:

def _whole_number_as_int(value):
    """Return integral numbers such as 26.0 as ``int`` (26); pass anything else through."""
    try:
        as_float = float(value)
    except (TypeError, ValueError):
        return value
    return int(as_float) if as_float.is_integer() else value


def generate_html_table_for_cluster(
    cluster_id: int,
    summary: pd.Series,
    player_data: pd.DataFrame,
    link_base: str = "Middleput/player_data/s1/player_graphs/",
) -> str:
    """Generate the HTML for one cluster: a summary table plus a player list.

    Args:
        cluster_id: Cluster number shown in both table headings.
        summary: Row with the keys 'Number of Players', 'Avg. Total Points',
            'Average Games Played', 'Average Points/Game' and 'Price (in $M)'.
        player_data: DataFrame with a 'Player' column; one link row is emitted
            per entry.
        link_base: URL prefix prepended to each player's name to build the link.

    Returns:
        The two ``<table>`` elements concatenated into one HTML string.
    """
    import html
    from urllib.parse import quote

    # A row obtained through DataFrame.iterrows() is a single-dtype Series, so
    # integer columns can arrive as floats; show counts and prices as "26",
    # not "26.0".
    number_of_players = _whole_number_as_int(summary['Number of Players'])
    price = _whole_number_as_int(summary['Price (in $M)'])

    # Creating the summary table (the heading spans the 5 data columns)
    summary_table = f"""
    <table border="1">
        <thead>
            <tr>
                <th colspan="5">Cluster {cluster_id} Summary</th>
            </tr>
            <tr>
                <th>Number of Players</th>
                <th>Avg. Total Points</th>
                <th>Average Games Played</th>
                <th>Average Points/Game</th>
                <th>Price (in $M)</th>
            </tr>
        </thead>
        <tbody>
            <tr>
                <td>{number_of_players}</td>
                <td>{summary['Avg. Total Points']}</td>
                <td>{summary['Average Games Played']}</td>
                <td>{summary['Average Points/Game']}</td>
                <td>{price}</td>
            </tr>
        </tbody>
    </table>
    """

    # Creating the players table. Names come from the data file, so they are
    # percent-encoded for the URL and HTML-escaped for both the attribute and
    # the visible text; a name such as O'Brien would otherwise terminate the
    # single-quoted href.
    players_rows = []
    for player in player_data['Player']:
        player_name = str(player)
        player_link = html.escape(link_base + quote(player_name, safe=''), quote=True)
        players_rows.append(
            f"<tr><td><a href='{player_link}'>{html.escape(player_name, quote=True)}</a></td></tr>"
        )

    players_table = f"""
    <table border="1" style="margin-top: 20px;">
        <thead>
            <tr>
                <th>Players in Cluster {cluster_id}</th>
            </tr>
        </thead>
        <tbody>
            {''.join(players_rows)}
        </tbody>
    </table>
    """

    return summary_table + players_table

# Build one HTML page containing a summary table and a player list per cluster.
# Requires clustered_summary and clustered_data from the clustering cell.
html_content = ["<html><head><title>Clustered Data</title></head><body>"]
for _, summary_row in clustered_summary.iterrows():
    # iterrows() yields each row as a single-dtype Series, so the cluster id
    # can come back as a float; convert it back to int before filtering.
    cluster_id = int(summary_row['Cluster'])
    players_in_cluster = clustered_data[clustered_data['Cluster'] == cluster_id]
    html_content.append(generate_html_table_for_cluster(cluster_id, summary_row, players_in_cluster))
html_content.append("</body></html>")

# Joining all parts to create the final HTML
final_html = '\n'.join(html_content)

# Saving the HTML content to a file. The handle is named html_file so that it
# does not overwrite the module-level `file` variable holding the input
# workbook path.
output_path = "/workspaces/SPL/Middleput/clustered_data_web.html"
with open(output_path, 'w', encoding='utf-8') as html_file:
    html_file.write(final_html)

# Bare expression: displayed as the notebook cell's output.
output_path

'/workspaces/SPL/Middleput/clustered_data_web.html'