In [29]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Load the points workbook and aggregate one row per player for a single season
def load_and_process_data(file_path, season=1):
    """Read the points workbook and build per-player aggregates for one season.

    Parameters
    ----------
    file_path : str or path-like
        Location of the Excel file; passed unchanged to ``pd.read_excel``.
    season : optional
        Value of the ``Season`` column to keep. Defaults to ``1``, the value
        that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Player`` with the summed point columns, ``Games Played``
        (number of non-null ``Date`` entries), ``Position`` (the player's most
        frequent position, NaN when none is recorded) and
        ``Average Points per Game`` (``Total Points / Games Played``).

    Notes
    -----
    As a side effect this sets the pandas option ``display.max_rows`` to
    ``None`` for the whole session.
    """
    data = pd.read_excel(file_path)
    # NOTE(review): global display option set inside a loader; kept because
    # later cells display complete frames.
    pd.set_option('display.max_rows', None)
    data = data[data["Season"] == season]

    def most_frequent(values):
        # value_counts() ignores missing values, so it is empty when a player
        # has no recorded position; return NaN instead of raising IndexError.
        counts = values.value_counts()
        return counts.index[0] if len(counts) else np.nan

    agg_data = data.groupby('Player').agg({
        'Total Points': 'sum',
        'Goal Points': 'sum',
        'Defensive Score Points': 'sum',
        'Midfield Score': 'sum',
        'MVP Points': 'sum',
        'Date': 'count',
        'Position': most_frequent
    }).rename(columns={'Date': 'Games Played'})

    agg_data['Average Points per Game'] = agg_data['Total Points'] / agg_data['Games Played']

    return agg_data

# Cluster the players with K-Means and give every cluster a price.
# NOTE(review): a function with this same name is defined again further down
# in this cell; Python keeps the later definition, so this one is shadowed.
def assign_clusters_and_prices(agg_data, scaled_data, max_price=10, min_price=2):
    """Label each player with a K-Means cluster and a per-cluster price.

    Parameters
    ----------
    agg_data : pandas.DataFrame
        Per-player frame; must contain ``Average Points per Game``.
    scaled_data : array-like
        Feature matrix handed to ``determine_optimal_clusters`` and to K-Means;
        its rows must line up with the rows of ``agg_data``.
    max_price, min_price : numeric, optional
        Price for the best and the worst cluster. The defaults (10 and 2) are
        the values that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        A copy of ``agg_data`` with two extra columns, ``Cluster`` and
        ``Price (in $M)``. The input frame itself is left untouched.
    """
    optimal_clusters = determine_optimal_clusters(scaled_data)
    kmeans = KMeans(n_clusters=optimal_clusters, init='k-means++', n_init=10, random_state=42)

    # Work on a copy so re-running the calling code never sees leftover
    # 'Cluster' / 'Price' columns in its input frame.
    clustered = agg_data.copy()
    clustered['Cluster'] = kmeans.fit_predict(scaled_data)

    # Rank clusters by their mean points per game, best cluster first
    cluster_averages = (
        clustered.groupby('Cluster')['Average Points per Game']
        .mean()
        .sort_values(ascending=False)
    )

    # Prices decrease linearly from max_price to min_price, rounded to whole numbers
    prices = np.linspace(max_price, min_price, num=optimal_clusters).round().astype(int)

    # Best-ranked cluster gets the highest price, and so on down the ranking
    price_mapping = dict(zip(cluster_averages.index, prices))

    clustered['Price (in $M)'] = clustered['Cluster'].map(price_mapping)

    return clustered

# Helper functions, defined again with modifications (the assign_clusters_and_prices below overrides the earlier definition of the same name)

def determine_optimal_clusters(scaled_data, min_clusters=3, max_clusters=5):
    """Pick a cluster count from the drop in K-Means inertia (WCSS).

    K-Means is fitted once for every ``k`` in ``[min_clusters, max_clusters]``.
    The returned ``k`` is the one whose inertia fell the most compared with
    ``k - 1``. Because each candidate is compared with its predecessor, the
    result lies in ``[min_clusters + 1, max_clusters]`` whenever the range
    holds more than one candidate.

    Parameters
    ----------
    scaled_data : array-like
        Feature matrix passed to ``KMeans.fit``.
    min_clusters, max_clusters : int, optional
        Inclusive range of cluster counts to try.

    Returns
    -------
    int
        The selected number of clusters. When ``min_clusters == max_clusters``
        that single value is returned without fitting anything.

    Raises
    ------
    ValueError
        If ``max_clusters`` is smaller than ``min_clusters``.
    """
    if max_clusters < min_clusters:
        raise ValueError(
            f"max_clusters ({max_clusters}) must be >= min_clusters ({min_clusters})"
        )
    if max_clusters == min_clusters:
        # A single candidate leaves nothing to compare; np.argmin on the empty
        # difference array would raise.
        return int(min_clusters)

    wcss = []
    for n_clusters in range(min_clusters, max_clusters + 1):
        kmeans = KMeans(n_clusters=n_clusters, init='k-means++', n_init=10, random_state=42)
        kmeans.fit(scaled_data)
        wcss.append(kmeans.inertia_)

    # diffs[i] = wcss[i + 1] - wcss[i]; the most negative entry is the largest drop
    diffs = np.diff(wcss)
    # diffs[i] belongs to k = min_clusters + i + 1
    return int(np.argmin(diffs)) + min_clusters + 1

# NOTE(review): this redefines the function of the same name declared earlier
# in this cell; being the later definition, this is the one that takes effect.
def assign_clusters_and_prices(agg_data, scaled_data, max_price=10, min_price=2):
    """Label each player with a K-Means cluster and a per-cluster price.

    Parameters
    ----------
    agg_data : pandas.DataFrame
        Per-player frame; must contain ``Average Points per Game``.
    scaled_data : array-like
        Feature matrix handed to ``determine_optimal_clusters`` and to K-Means;
        its rows must line up with the rows of ``agg_data``.
    max_price, min_price : numeric, optional
        Price for the best and the worst cluster. The defaults (10 and 2) are
        the values that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        A copy of ``agg_data`` with two extra columns, ``Cluster`` and
        ``Price (in $M)``. The input frame itself is left untouched.
    """
    optimal_clusters = determine_optimal_clusters(scaled_data)
    kmeans = KMeans(n_clusters=optimal_clusters, init='k-means++', n_init=10, random_state=42)

    # Work on a copy so re-running the calling code never sees leftover
    # 'Cluster' / 'Price' columns in its input frame.
    clustered = agg_data.copy()
    clustered['Cluster'] = kmeans.fit_predict(scaled_data)

    # Rank clusters by their mean points per game, best cluster first
    cluster_averages = (
        clustered.groupby('Cluster')['Average Points per Game']
        .mean()
        .sort_values(ascending=False)
    )

    # Prices decrease linearly from max_price to min_price, rounded to whole numbers
    prices = np.linspace(max_price, min_price, num=optimal_clusters).round().astype(int)

    # Best-ranked cluster gets the highest price, and so on down the ranking
    price_mapping = dict(zip(cluster_averages.index, prices))

    clustered['Price (in $M)'] = clustered['Cluster'].map(price_mapping)

    return clustered

def cluster_summary(clustered_data):
    """Return one summary row per cluster.

    The result has the columns ``Cluster``, ``Number of Players``,
    ``Avg. Total Points``, ``Average Games Played``, ``Average Points/Game``
    and ``Price (in $M)``.
    """
    # Output column name -> (input column, aggregation)
    column_specs = {
        'Number of Players': ('Total Points', 'count'),
        'Avg. Total Points': ('Total Points', 'mean'),
        'Average Games Played': ('Games Played', 'mean'),
        'Average Points/Game': ('Average Points per Game', 'mean'),
        # Every player of a cluster carries the same price, so the first one is enough
        'Price (in $M)': ('Price (in $M)', 'first'),
    }

    per_cluster = clustered_data.groupby('Cluster').agg(**column_specs)
    return per_cluster.reset_index()
# --- Driver: load the data, scale the features, cluster, summarise ---
# NOTE(review): hard-coded absolute path; the notebook only runs where this
# exact location exists. Consider a configurable data directory.
file = "/workspaces/SPL/Middleput/points.xlsx"
# One aggregated row per player
aggregated_data = load_and_process_data(file)
scaler = StandardScaler()
# Only 'Position' is left out; every other aggregated column (totals,
# 'Games Played' and 'Average Points per Game') goes into the scaled features.
scaled_data = scaler.fit_transform(aggregated_data.drop(columns='Position'))

# reset_index() turns the 'Player' group key back into an ordinary column
clustered_data = assign_clusters_and_prices(aggregated_data, scaled_data).reset_index()
clustered_summary = cluster_summary(clustered_data)
# Bare last expression so the notebook renders the per-cluster summary table
clustered_summary

Unnamed: 0,Cluster,Number of Players,Avg. Total Points,Average Games Played,Average Points/Game,Price (in $M)
0,0,26,20.923077,2.615385,7.852564,2
1,1,7,206.857143,18.714286,11.197758,7
2,2,8,157.125,15.5,10.101056,5
3,3,23,51.173913,4.26087,12.812767,10


In [22]:
# Display the per-player table with each player's assigned cluster and price.
# NOTE(review): this cell's execution count (22) is lower than that of the cell
# above (29), so this output may predate the latest run of that cell; re-run
# the notebook top to bottom to confirm.
clustered_data

Unnamed: 0,Player,Total Points,Goal Points,Defensive Score Points,Midfield Score,MVP Points,Games Played,Position,Average Points per Game,Cluster,Price (in $M)
0,Ale (Fabri),5,0,0,0,0,1,Offensive,5.0,0,2
1,Ale (Mazzu),17,0,3,0,0,2,Defensive,8.5,0,2
2,Ale Guati,26,2,7,0,3,2,Defensive,13.0,3,10
3,Alessandro Gibertini,28,2,0,2,0,3,Offensive,9.333333,0,2
4,Amico Gio,14,0,0,6,0,1,Offensive,14.0,3,10
5,Amico Matteo,6,0,0,0,0,1,Offensive,6.0,0,2
6,Andrea DeGa,19,0,5,0,0,3,Outfield,6.333333,0,2
7,Andrea Limonta,231,3,72,0,0,22,Defensive,10.5,2,5
8,Andrea Scalambra,26,1,8,1,0,3,Outfield,8.666667,0,2
9,Andrea Silverstri,141,10,38,0,0,14,Defensive,10.071429,2,5
