In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances

# Helper: rank every player's nearest neighbours in standardized feature space
def _nearest_players(stats, name_col, feature_cols, top_n):
    """Map each player in *stats* to the names of its closest peers.

    Args:
        stats: DataFrame with one row per player.
        name_col: Column of *stats* holding the player name.
        feature_cols: Numeric columns used as the similarity features.
        top_n: Maximum number of neighbours to keep per player.

    Returns:
        dict mapping each player name to a list of up to *top_n* other
        player names, nearest first.
    """
    names = stats[name_col].tolist()
    if len(names) < 2:
        # Nothing to compare against; this also avoids fitting the scaler
        # on an empty frame, which would raise.
        return {name: [] for name in names}

    scaled = StandardScaler().fit_transform(stats[feature_cols])
    distances = pairwise_distances(scaled, metric='euclidean')

    neighbours = {}
    for i, name in enumerate(names):
        # A stable sort keeps tie order deterministic. Self is dropped by
        # index, not by position: a player with identical stats is also at
        # distance 0, so self is not guaranteed to sort first.
        order = np.argsort(distances[i], kind='stable')
        neighbours[name] = [names[j] for j in order if j != i][:top_n]
    return neighbours


# Function to compute player similarities
def compute_player_similarities(df, top_n=3, min_batsman_balls=10, min_bowler_balls=60):
    """Find the most similar batsmen and bowlers from aggregated statistics.

    Batsmen are compared on batting average, strike rate and boundary
    percentage; bowlers on bowling average, economy rate and strike rate.
    Features are standardized and compared by Euclidean distance.

    Args:
        df: DataFrame with the columns 'Runs', 'is_wicket', 'ball',
            'batsman', 'batsman_role', 'bowler' and 'bowler_role'
            (presumably one row per delivery -- confirm against the
            dataset). An optional 'derived_boundaries' column is used if
            present. The caller's frame is not modified.
        top_n: Number of similar players to report per player.
        min_batsman_balls: Minimum row count for a batsman to be included.
        min_bowler_balls: Minimum row count for a bowler to be included
            (the default of 60 is approx. 10 overs).

    Returns:
        Tuple ``(similar_batsmen, similar_bowlers)``; each is a dict mapping
        a player name to a list of up to *top_n* similar player names. A
        dict is empty when no player meets the corresponding threshold.

    Raises:
        ValueError: If *top_n* is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Derive boundaries if not present; assign() returns a new frame, so the
    # caller's DataFrame is left untouched.
    if 'derived_boundaries' not in df.columns:
        df = df.assign(derived_boundaries=(df['Runs'] >= 4).astype(int))

    # Compute aggregated statistics for batsmen
    batting_rows = df[df['batsman_role'].isin(['Batters', 'Wicket-Keepers', 'All-Rounders'])]
    batsmen_stats = batting_rows.groupby('batsman').agg({
        'Runs': 'sum',
        'is_wicket': 'sum',
        'ball': 'count',
        'derived_boundaries': 'sum'
    }).reset_index()

    # Filter batsmen with insufficient data; copy so the column assignments
    # below operate on an independent frame rather than a slice.
    batsmen_stats = batsmen_stats[batsmen_stats['ball'] >= min_batsman_balls].copy()

    # Compute metrics (a wicket count of 0 is treated as 1 to avoid dividing by zero)
    dismissals = batsmen_stats['is_wicket'].where(batsmen_stats['is_wicket'] > 0, 1)
    batsmen_stats['batting_average'] = batsmen_stats['Runs'] / dismissals
    batsmen_stats['strike_rate'] = (batsmen_stats['Runs'] / batsmen_stats['ball']) * 100
    batsmen_stats['boundary_percentage'] = (batsmen_stats['derived_boundaries'] / batsmen_stats['ball']) * 100

    # Compute aggregated statistics for bowlers
    bowling_rows = df[df['bowler_role'].isin(['Bowlers', 'All-Rounders'])]
    bowlers_stats = bowling_rows.groupby('bowler').agg({
        'Runs': 'sum',
        'is_wicket': 'sum',
        'ball': 'count'
    }).reset_index()

    # Filter bowlers with insufficient data
    bowlers_stats = bowlers_stats[bowlers_stats['ball'] >= min_bowler_balls].copy()

    # Compute metrics (same zero-wicket handling as for batsmen)
    wickets = bowlers_stats['is_wicket'].where(bowlers_stats['is_wicket'] > 0, 1)
    bowlers_stats['bowling_average'] = bowlers_stats['Runs'] / wickets
    bowlers_stats['economy_rate'] = (bowlers_stats['Runs'] / bowlers_stats['ball']) * 6
    bowlers_stats['strike_rate'] = bowlers_stats['ball'] / wickets

    # Standardize features, compute pairwise distances and pick the nearest peers
    similar_batsmen = _nearest_players(
        batsmen_stats,
        'batsman',
        ['batting_average', 'strike_rate', 'boundary_percentage'],
        top_n,
    )
    similar_bowlers = _nearest_players(
        bowlers_stats,
        'bowler',
        ['bowling_average', 'economy_rate', 'strike_rate'],
        top_n,
    )

    return similar_batsmen, similar_bowlers

# Function to print similarity results
def print_similarity_results(similar_batsmen, similar_bowlers):
    """Print each player's similar players, batsmen first and then bowlers.

    Args:
        similar_batsmen: dict mapping a batsman name to a list of similar
            batsman names.
        similar_bowlers: dict mapping a bowler name to a list of similar
            bowler names.
    """
    # The bowlers heading carries a leading newline so a blank line
    # separates the two sections.
    sections = (
        ("Similar Batsmen:", similar_batsmen),
        ("\nSimilar Bowlers:", similar_bowlers),
    )
    for heading, mapping in sections:
        print(heading)
        for player in mapping:
            peers = ', '.join(mapping[player])
            print(f"Most similar to {player}: {peers}")

# Main execution
def main(file_path):
    """Load the dataset, compute player similarities, print and return them.

    Args:
        file_path: Path of the CSV file passed to ``pd.read_csv``.

    Returns:
        Tuple ``(similar_batsmen, similar_bowlers)`` as produced by
        ``compute_player_similarities``.
    """
    dataset = pd.read_csv(file_path)

    batsmen_matches, bowler_matches = compute_player_similarities(dataset)
    print_similarity_results(batsmen_matches, bowler_matches)

    return batsmen_matches, bowler_matches

# Example usage: runs only when this file is executed as a script, not when
# it is imported. The results are kept in module-level names.
if __name__ == "__main__":
    file_path = 'final.csv'  # Replace with your dataset path
    similar_batsmen, similar_bowlers = main(file_path)