In [1]:
import pandas as pd

In [32]:
# Load the CSV at ../raw_data/clean_data.csv into `df`; the path is relative
# to the directory the notebook is run from.
df = pd.read_csv("../raw_data/clean_data.csv")

In [34]:
# Step 1: Keep only the rows whose 'club' value is something other than 'Other'
is_named_club = df['club'].ne('Other')
df = df[is_named_club]

# second way

## df grouped by teams

In [29]:
def filter_and_append(avg_clubs, df, club, position, max_count):
    """Return `avg_clubs` with the rows picked by `filter_position` appended.

    `df`, `club`, `position` and `max_count` are forwarded unchanged to
    `filter_position`; its result is stacked below `avg_clubs` with
    `pd.concat`, which builds a new DataFrame rather than modifying
    `avg_clubs`.
    """
    selection = filter_position(df, club, position, max_count)
    frames = [avg_clubs, selection]
    return pd.concat(frames)

# max_count passed to filter_position for each position
# (presumably the number of players kept per club -- confirm against filter_position).
position_quotas = {
    'goalkeeper': 2,
    'centerback': 3,
    'fullback': 4,
    'winger': 4,
    'midfielder': 5,
    'striker': 2,
}

# Collect every (club, position) selection first and concatenate once at the
# end: calling pd.concat inside the loop re-copies all previously gathered rows
# on every iteration.
selected_frames = []
for club in df['club'].unique():
    for position, max_count in position_quotas.items():
        selected_frames.append(filter_position(df, club, position, max_count))

# pd.concat raises on an empty list, so fall back to an empty frame.
avg_clubs = pd.concat(selected_frames) if selected_frames else pd.DataFrame()

# Step 2: Group by 'club' and take the mean values.
# numeric_only=True skips the text columns (name, nat, position, dob, ...);
# without it, pandas >= 2.0 raises a TypeError when averaging strings.
df = avg_clubs.groupby('club').mean(numeric_only=True)

# Remove the columns that are not used in the following steps.
columns_to_remove = ['age', 'height', 'weight', "wage", "last trans. fee", "value"]

# Reassign instead of dropping in place so the step reads as a plain transformation.
df = df.drop(columns=columns_to_remove)

Unnamed: 0,name,club,nat,position,dob,age,height,weight,wage,last trans. fee,...,midfielder,striker,winger,division_rating,nat_rating,club_rating,either_left,either_right,left,right
24,Ederson,Man City,BRA,GK,17/8/1993 (26 years old),26.0,188.0,86.0,474000.0,38500000.0,...,0,0,0,93.8,1812.20,2013.0,0,0,1,0
246,Zack Steffen,Man City,USA,GK,2/4/1995 (25 years old),25.0,191.0,86.0,237000.0,7750000.0,...,0,0,0,93.8,1675.89,2013.0,0,0,0,1
10,Aymeric Laporte,Man City,FRA,D (C),27/5/1994 (26 years old),26.0,191.0,85.0,569000.0,63000000.0,...,0,0,0,93.8,1853.11,2013.0,0,0,1,0
19,Rúben Dias,Man City,POR,D (C),14/5/1997 (23 years old),23.0,187.0,83.0,332000.0,68000000.0,...,0,0,0,93.8,1739.83,2013.0,0,0,0,1
45,John Stones,Man City,ENG,D (C),28/5/1994 (26 years old),26.0,188.0,72.0,474000.0,52000000.0,...,0,0,0,93.8,1807.88,2013.0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174316,Marcos Leal,Charlotte FC Academy,USA,AM (RLC),8/12/2004 (15 years old),15.0,166.0,55.0,9.0,0.0,...,1,0,0,40.8,1675.89,1156.0,0,0,1,0
174317,Nathan Palmer,Charlotte FC Academy,USA,AM (C),11/2/2004 (16 years old),16.0,178.0,67.0,9.0,0.0,...,1,0,0,40.8,1675.89,1156.0,0,0,0,1
174322,Yeferson Suárez,Charlotte FC Academy,USA,M (C),4/1/2004 (16 years old),16.0,162.0,55.0,9.0,0.0,...,1,0,0,40.8,1675.89,1156.0,0,0,0,1
174318,Darren Cox,Charlotte FC Academy,USA,ST (C),15/12/2003 (16 years old),16.0,174.0,62.0,9.0,0.0,...,0,1,0,40.8,1675.89,1156.0,0,0,0,1


## grouped features

### definition

In [55]:
def create_grouped_features(df):
    """Add grouped skill columns to `df` and return them as a separate frame.

    Each grouped feature is either the row-wise mean of several attribute
    columns or a direct copy of a single attribute column. The new columns
    are written onto `df` itself (the caller's frame gains them), in the
    order listed below.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every attribute column named in the table below, plus
        'club_rating'.

    Returns
    -------
    pandas.DataFrame
        The grouped feature columns followed by 'club_rating', selected from `df`.
    """
    # (new column, source): a list is averaged across its columns, a string
    # names a single column that is copied as-is.
    feature_sources = (
        # Offensive Skills
        ('shooting', ['fin', 'lon', 'fre', 'pen']),
        ('dribbling_control', ['dri', 'fir', 'fla', 'tec']),
        ('passing_vision', ['pas', 'vis', 'l th', 'cro', 'cor', 'otb']),
        # Defensive Skills
        ('tackling_interception', ['tck', 'mar', 'pos', 'ant']),
        ('aerial_defense', ['hea', 'jum', 'aer']),
        # Physical Attributes
        ('speed_agility', ['acc', 'pac', 'agi']),
        ('strength_stamina', ['str', 'sta', 'bal']),
        # Mental and Tactical Attributes
        ('decision_making', ['dec', 'cmp']),
        ('work_ethic_effort', ['wor', 'det', 'bra']),
        # Leadership and Teamwork
        ('leadership', ['ldr', 'com']),
        ('teamwork', 'tea'),
        # Goalkeeping Abilities
        ('goalkeeping_abilities', ['han', 'ref', 'kic', 'thr', '1v1', 'ecc', 'cmd']),
        # Behavioral Attributes
        ('behavioral_attributes', ['agg', 'pun']),
        # Player Traits
        ('player_traits', 'tro'),
    )

    for feature, source in feature_sources:
        if isinstance(source, str):
            df[feature] = df[source]
        else:
            df[feature] = df[source].mean(axis=1)

    # Grouped features in creation order, with the club rating kept alongside.
    output_columns = [feature for feature, _ in feature_sources]
    output_columns.append('club_rating')
    return df[output_columns]

In [9]:
# Find the team with the highest rating for each feature
# For every column of df, report the index label holding the maximum value
for column in df.columns:
    top_label = df[column].idxmax()
    best_team = df.loc[top_label]

    # Header line, the winning value (the Series name is the team), then a blank line
    print(f"Best team for {column}:", best_team[[column]], "", sep="\n")

Best team for shooting:
shooting    10.9375
Name: Tottenham, dtype: float64

Best team for dribbling_control:
dribbling_control    14.3125
Name: Barcelona, dtype: float64

Best team for passing_vision:
passing_vision    11.666667
Name: Barcelona, dtype: float64

Best team for tackling_interception:
tackling_interception    12.3125
Name: Liverpool, dtype: float64

Best team for aerial_defense:
aerial_defense    8.866667
Name: Burnley, dtype: float64

Best team for speed_agility:
speed_agility    14.733333
Name: Man City, dtype: float64

Best team for strength_stamina:
strength_stamina    13.916667
Name: Man Utd, dtype: float64

Best team for decision_making:
decision_making    14.225
Name: Barcelona, dtype: float64

Best team for work_ethic_effort:
work_ethic_effort    14.666667
Name: Lazio, dtype: float64

Best team for leadership:
leadership    7.45
Name: Milan, dtype: float64

Best team for teamwork:
teamwork    14.9
Name: Arsenal, dtype: float64

Best team for goalkeeping_abilities:

## team_scores for each style

In [14]:
def calculate_team_score(df, playing_style):
    """Score every row of `df` against a playing style.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per team; must contain the style's relevant feature columns.
    playing_style : dict
        Holds 'relevant_features' (column names) and 'weights' (one weight
        per feature, matched by position).

    Returns
    -------
    tuple
        (index label of the highest-scoring row, Series of all scores).
    """
    feature_names = playing_style['relevant_features']
    feature_weights = playing_style['weights']

    selected = df[feature_names]

    # Min-max scale each feature column to the 0..1 range
    column_min = selected.min()
    column_range = selected.max() - column_min
    scaled = (selected - column_min) / column_range

    # Weighted sum across the features of each row
    team_scores = (scaled * feature_weights).sum(axis=1)

    return team_scores.idxmax(), team_scores

# Playing-style definitions: each weight applies to the feature at the same
# position in 'relevant_features'.

# Pressing-based style
pressing_style = dict(
    relevant_features=['decision_making', 'work_ethic_effort', 'teamwork', 'speed_agility'],
    weights=[0.4, 0.25, 0.15, 0.2],
)

# Possession-based style (all features weighted equally)
possession_style = dict(
    relevant_features=['passing_vision', 'decision_making', 'teamwork', 'dribbling_control', 'work_ethic_effort'],
    weights=[0.2, 0.2, 0.2, 0.2, 0.2],
)

# Score every team against the possession-based style
best_team, team_scores = calculate_team_score(df, possession_style)

print(f"Best team for Possession-Based Style: {best_team}", "Team Scores:", sep="\n")
# Show the scores as a one-column table, highest first
pd.DataFrame(team_scores).sort_values(by=0, ascending=False)



Best team for Possession-Based Style: Liverpool
Team Scores:


Unnamed: 0_level_0,0
club,Unnamed: 1_level_1
Liverpool,0.982066
Barcelona,0.953833
Man City,0.947978
A. Madrid,0.947358
Tottenham,0.933830
...,...
Tabuan U18s,0.059826
BSRC,0.054816
Setia Perdana,0.053720
Tunas,0.035930


In [15]:
# Score every team against the pressing-based style
best_team, team_scores = calculate_team_score(df, pressing_style)

print(f"Best team for Pressing-Based Style: {best_team}", "Team Scores:", sep="\n")
# Show the scores as a one-column table, highest first
pd.DataFrame(team_scores).sort_values(by=0, ascending=False)

Best team for Pressing-Based Style: Liverpool
Team Scores:


Unnamed: 0_level_0,0
club,Unnamed: 1_level_1
Liverpool,0.981821
Man City,0.948671
A. Madrid,0.942543
Barcelona,0.935466
Paris SG,0.932657
...,...
BSRC,0.148985
Setia Perdana,0.143505
Tabuan U18s,0.125009
Tunas,0.119441


## comparator and scaler

In [63]:
def custom_scaler(dfgf_no_name):
    """Divide every value by its row total so that each row sums to 1.

    Parameters
    ----------
    dfgf_no_name : pandas.DataFrame
        Numeric frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Same shape as the input. A row whose total is exactly 0 cannot be
        scaled and is returned as NaN instead of the +/-inf values that a
        division by zero would produce.
    """
    total_score = dfgf_no_name.sum(axis=1)
    # Mask zero totals (they become NaN) so the division never yields inf.
    safe_total = total_score.where(total_score != 0)
    scaled_dfgf_no_name = dfgf_no_name.div(safe_total, axis=0)
    return scaled_dfgf_no_name

In [48]:
def compare_teams(df, team1, team2):
    """Return the scaled feature-by-feature difference between two teams.

    The rows labelled `team1` and `team2` are looked up in `df`, `team2` is
    subtracted from `team1`, the 'club_rating' and 'goalkeeping_abilities'
    columns are removed, and the remaining one-row frame is passed through
    `custom_scaler`.
    """
    difference = df.loc[team1] - df.loc[team2]
    # Turn the difference Series into a single-row frame (features as columns)
    difference_row = pd.DataFrame(difference).T
    excluded = ['club_rating', 'goalkeeping_abilities']
    return custom_scaler(difference_row.drop(columns=excluded))

In [62]:
# Compare Man City against Club Brugge.
# NOTE(review): `test` is not defined in the visible cells -- presumably the
# per-club grouped-feature frame indexed by club name; confirm before re-running.
Nino = compare_teams(test, 'Man City', 'Club Brugge')

In [65]:
# Display the comparison result.
# NOTE(review): execution counts are out of order, so the saved output may
# come from an earlier version of compare_teams -- re-run to confirm.
Nino

Unnamed: 0,shooting,dribbling_control,passing_vision,tackling_interception,aerial_defense,speed_agility,strength_stamina,decision_making,work_ethic_effort,leadership,teamwork,goalkeeping_abilities,behavioral_attributes,player_traits,club_rating
0,2.5,3.5,2.1,1.7,-0.8,3.266667,2.4,2.7,0.8,-0.6,0.6,0.142857,-1.5,0.4,356.0
