In [1]:
import matplotlib.pyplot as plt
import numpy as np
np.seterr(divide='ignore', invalid='ignore')
import pandas as pd
# for the progress bar
from tqdm.auto import tqdm

- [`pandas` Cheat Sheet](https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf)
- [`matplotlib` Cheat Sheet](https://matplotlib.org/cheatsheets/_images/cheatsheets-1.png)
- [SQL Cheat Sheet](https://www.sqltutorial.org/sql-cheat-sheet/)

# Loading the data

In [2]:
df = pd.read_csv('https://drive.switch.ch/index.php/s/UEpTFv2Bfa5C1dd/download')
df.head()

Unnamed: 0,playerShort,player,club,leagueCountry,birthday,height,weight,position,games,victories,...,rater2,refNum,refCountry,Alpha_3,meanIAT,nIAT,seIAT,meanExp,nExp,seExp
0,lucas-wilchez,Lucas Wilchez,Real Zaragoza,Spain,31.08.1983,177.0,72.0,Attacking Midfielder,1,0,...,0.5,1,1,GRC,0.326391,712.0,0.000564,0.396,750.0,0.002696
1,john-utaka,John Utaka,Montpellier HSC,France,08.01.1982,179.0,82.0,Right Winger,1,0,...,0.75,2,2,ZMB,0.203375,40.0,0.010875,-0.204082,49.0,0.061504
2,abdon-prats,Abdón Prats,RCD Mallorca,Spain,17.12.1992,181.0,79.0,,1,0,...,,3,3,ESP,0.369894,1785.0,0.000229,0.588297,1897.0,0.001002
3,pablo-mari,Pablo Marí,RCD Mallorca,Spain,31.08.1993,191.0,87.0,Center Back,1,1,...,,3,3,ESP,0.369894,1785.0,0.000229,0.588297,1897.0,0.001002
4,ruben-pena,Rubén Peña,Real Valladolid,Spain,18.07.1991,172.0,70.0,Right Midfielder,1,1,...,,3,3,ESP,0.369894,1785.0,0.000229,0.588297,1897.0,0.001002


This week we keep the `NaN` values.

In [3]:
# df = df.dropna()

# Repeat: Estimating position

For the 'position' column, estimate the position of the missing value, by finding the player that is most similar to the one whose value is missing, and using their position.

Use the following formula for the *cosine similarity* of the two vectors _a_ and _b_.:

$$
d(a, b) = \frac{a \cdot b}{ \|a\| \|b\| }
$$

Note, that the "multiplication" of two vectors is the **dot** product and the $ \| $ gives the **magnitude** (or **norm**) of a vector.

In [4]:
feature_columns = ['height', 'weight', 'games', 'victories', 'ties', 'defeats', 'goals', 'yellowCards', 'yellowReds', 'redCards']

In [5]:
missing_positions = df[df['position'].isna()]

# drop all rows with missing features
missing_positions = missing_positions.dropna(subset=feature_columns)

# vectorize the features
missing_positions_vec = missing_positions[feature_columns].values

missing_positions_vec

array([[181.,  79.,   1., ...,   1.,   0.,   0.],
       [185.,  74.,   1., ...,   0.,   0.,   0.],
       [180.,  68.,   1., ...,   0.,   0.,   0.],
       ...,
       [182.,  88.,   1., ...,   0.,   0.,   0.],
       [191.,  84.,   1., ...,   1.,   0.,   0.],
       [188.,  81.,   1., ...,   0.,   0.,   0.]])

# Repeat - Vectorize the positions

For each position, calculate a vector representing that position. To do so, take the values of each player, and calculate an average vector each position.

In [6]:
# calculate vector for each position
positions = df[~df['position'].isna()]['position'].unique()
position_vectors = {}
for position in positions:
    position_players = df[df['position'] == position]
    position_players = position_players.dropna()  # drop those with missing values
    
    # vectorize
    position_vector = position_players[feature_columns].values
    
    # calculate average vector
    position_vectors[position] = position_vector.mean(axis=0)

In [7]:
positions

array(['Attacking Midfielder', 'Right Winger', 'Center Back',
       'Right Midfielder', 'Left Fullback', 'Defensive Midfielder',
       'Goalkeeper', 'Right Fullback', 'Left Winger', 'Left Midfielder',
       'Center Forward', 'Center Midfielder'], dtype=object)

In [8]:
position_vectors

{'Attacking Midfielder': array([1.77547457e+02, 7.21203867e+01, 2.90759505e+00, 1.34823836e+00,
        6.70633076e-01, 8.88723622e-01, 4.71971739e-01, 3.59579808e-01,
        8.18071953e-03, 7.34405503e-03]),
 'Right Winger': array([1.77538147e+02, 7.30531335e+01, 3.00797976e+00, 1.40618918e+00,
        7.05527443e-01, 8.96263137e-01, 7.37251849e-01, 3.13156870e-01,
        6.22810432e-03, 1.20669521e-02]),
 'Center Back': array([1.87258714e+02, 8.08175303e+01, 3.03548969e+00, 1.38283040e+00,
        7.22858675e-01, 9.29800614e-01, 1.56924877e-01, 4.86228246e-01,
        1.79398430e-02, 2.00848243e-02]),
 'Right Midfielder': array([1.76354720e+02, 7.08107583e+01, 3.16655865e+00, 1.35515230e+00,
        7.78137827e-01, 1.03326852e+00, 3.78915533e-01, 4.06567293e-01,
        1.08014690e-02, 7.99308706e-03]),
 'Left Fullback': array([1.79579545e+02, 7.48155080e+01, 2.95978164e+00, 1.33511586e+00,
        7.09447415e-01, 9.15218360e-01, 1.14193405e-01, 4.37834225e-01,
        1.39260250e-

In [9]:
def cosine_similarity(
        a: np.array, 
        b: np.array
) -> float:
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

In [10]:
def estimate_player_position(
        player_vectors: np.array,
        position_vectors: np.array
) -> list[str]:
    estimated_positions = []
    for player_vec in player_vectors:
        sims = []
        for position, position_vector in position_vectors.items():
            # calculate cosine similarity
            similarity = cosine_similarity(a=player_vec, b=position_vector)
            sims.append((position, similarity))
        # sort by similarity
        sims = sorted(sims, key=lambda x: x[1], reverse=True)
        # take the most similar position
        estimated_positions.append((sims[0][0]))
    return estimated_positions

missing_estimates = estimate_player_position(missing_positions_vec, position_vectors)

In [11]:
missing_estimates

['Center Back',
 'Left Midfielder',
 'Left Midfielder',
 'Center Midfielder',
 'Left Midfielder',
 'Goalkeeper',
 'Left Midfielder',
 'Center Back',
 'Right Fullback',
 'Center Forward',
 'Left Midfielder',
 'Left Fullback',
 'Right Winger',
 'Center Midfielder',
 'Center Midfielder',
 'Left Fullback',
 'Left Midfielder',
 'Center Back',
 'Attacking Midfielder',
 'Left Midfielder',
 'Left Fullback',
 'Left Midfielder',
 'Left Fullback',
 'Left Fullback',
 'Left Midfielder',
 'Left Midfielder',
 'Goalkeeper',
 'Left Midfielder',
 'Left Midfielder',
 'Attacking Midfielder',
 'Left Midfielder',
 'Left Midfielder',
 'Center Forward',
 'Center Forward',
 'Goalkeeper',
 'Center Back',
 'Goalkeeper',
 'Attacking Midfielder',
 'Left Midfielder',
 'Center Back',
 'Right Midfielder',
 'Left Midfielder',
 'Left Midfielder',
 'Goalkeeper',
 'Left Midfielder',
 'Attacking Midfielder',
 'Right Midfielder',
 'Left Midfielder',
 'Right Midfielder',
 'Center Forward',
 'Left Midfielder',
 'Left Fullbac

# Repeat - Evaluation

Evaluate the quality of our approach by comparing the estimated positions with the true positions.

In [12]:
non_missing = df[~df['position'].isna()]
non_missing_vecs = non_missing[feature_columns].values

# calculate the estimated positions, for the non_missing players
estimated_positions = estimate_player_position(non_missing_vecs, position_vectors)

correct = (non_missing['position'] == estimated_positions).sum()
wrong = (non_missing['position'] != estimated_positions).sum()

print('Correct ratio:', correct / (correct + wrong))

Correct ratio: 0.16627176505432495


# Repeat - Random guessing

Compare the quality of our approach with random guessing. That is, generate random guesses for the missing positions, and compare them with the true positions. How does our approach compare? Can we beat random guessing?

In [13]:
# compare with random guessing
random_positions = np.random.choice(positions, size=len(non_missing))
correct_random = (non_missing['position'] == random_positions).sum()
wrong_random = (non_missing['position'] != random_positions).sum()

print('Correct ratio (random):', correct_random / (correct_random + wrong_random))

Correct ratio (random): 0.0843868373057318


# Task 1 - Forward / Backward selection

In [14]:
feature_columns = ['height', 'weight', 'games', 'victories', 'ties', 'defeats', 'goals', 'yellowCards', 'yellowReds', 'redCards']

In [15]:
def cosine_similarity(
        a: np.array, 
        b: np.array
) -> float:
    """
    Calculates the cosine similarity of two vectors.
    Cosine similarity is a measure of similarity between two non-zero vectors, it uses the cosine of the angle between them.
    That means, the closer / more similar the vectors, the higher the similarity.
    """
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

In [16]:
def position_vectors4features(
        features: list[str],
        player_df: pd.DataFrame
) -> dict[str, np.array]:
    """
    Calculates the vector for each position, using the given features.
    """
    # calculate vector for each position
    # the ~ is a negation
    # player_df['position'].isna() gives us all rows where the position is missing
    # ~player_df['position'].isna() gives us all rows where the position is not missing (because of the ~ negation)
    # then we use this to filter player_df: player_df[~player_df['position'].isna()]
    # next we select the position column
    # and only keep the unique values
    # positions = player_df[~player_df['position'].isna()]['position'].unique()
    
    # a split up way of writing the same thing, which is easier to understand
    cond = player_df['position'].isna()
    positions = player_df[~cond]['position'].unique()
    
    position_vectors = {}
    
    # iterate over all the positions
    for position in positions:
        
        # filter player_df by where the 'position' value is equal to the current position
        position_players = player_df[player_df['position'] == position]
        
        # drop all rows with missing features
        # we do this to have only 'nice' vectors, without anything missing
        position_players = position_players.dropna() 
        
        # vectorize
        position_vector = position_players[features].values
        
        # calculate average vector
        position_vectors[position] = position_vector.mean(axis=0)
    return position_vectors

In [17]:
def estimate_player_position(
        player_vectors: np.array,
        position_vectors: np.array
) -> list[str]:
    """
    Estimates the position of the players, based on the given player vectors and position vectors.
    """
    estimated_positions = []
    
    # iterate over the player_vectors - this is a list of vectors, one for each player
    for player_vec in player_vectors:
        sims = []
        
        # iterate over the position_vectors - this is a dictionary, with the position as key and the vector as value
        # so for example at position_vectors['Attacker'] we have the vector for the attacker position
        for position, position_vector in position_vectors.items():
            
            # calculate cosine similarity between the player vector and the position vector
            similarity = cosine_similarity(a=player_vec, b=position_vector)
            
            # add the similarity to the list of similarities
            # we use a tuple here so we know which similarity belongs to which position
            # we could also use a dictionary here, but since we want to sort by similarity later this is more convenient
            sims.append((position, similarity))
            
        # sort by similarity
        # sims is a list of tuples, where the first element is the position and the second element is the similarity
        # we use the second element (x[1]) as the key for sorting, and we sort in reverse order (highest similarity first)
        sims = sorted(sims, key=lambda x: x[1], reverse=True)
        
        # take the most similar position
        # sims[0] is the tuple with the highest similarity
        # sims[0][0] is the position of the tuple with the highest similarity
        # sims[0][1] is the similarity of the tuple with the highest similarity
        estimated_positions.append((sims[0][0]))
    return estimated_positions

missing_estimates = estimate_player_position(missing_positions_vec, position_vectors)

In [18]:
# compare with random guessing
# we randomly select len(non_missing) positions from the list of all positions
random_positions = np.random.choice(positions, size=len(non_missing))

# then we count how often the actual position (non_missing['position']) is equal to the random position
correct_random = (non_missing['position'] == random_positions).sum()

# and how often it is not equal
wrong_random = (non_missing['position'] != random_positions).sum()

# and then we calculate the ratio of correct guesses
print('Correct ratio (random):', correct_random / (correct_random + wrong_random))

Correct ratio (random): 0.0833969852379542


In [19]:
def evaluate_position_estimation(
        features: list[str],
        player_df: pd.DataFrame
) -> float:
    """
    Evaluates the position estimation, by calculating the ratio of correct guesses.
    """
    # get all rows where the position is not missing
    # this works exactly the same way as in estimate_player_position
    non_missing = player_df[~player_df['position'].isna()]
    non_missing_vecs = non_missing[features].values
    
    # we use this to calculate the position vectors
    positionvecs4features = position_vectors4features(
        features,
        player_df
    )
    
    # calculate the estimated positions, for the non_missing players
    estimated_positions = estimate_player_position(non_missing_vecs, positionvecs4features)
    
    # count how often the estimated position is equal to the actual position
    correct = (non_missing['position'] == estimated_positions).sum()
    
    # count how often the estimated position is not equal to the actual position
    wrong = (non_missing['position'] != estimated_positions).sum()
    
    # print('Correct ratio:', correct / (correct + wrong))
    return correct / (correct + wrong)
    
evaluate_position_estimation(feature_columns, df)

0.16627176505432495

In [20]:
from typing import Optional


def forward_selection(
        max_features: int,
        player_df: pd.DataFrame,
        selected_features: Optional[list[str]],
) -> list[str]:
    """
    Selects the best features, by adding one feature at a time.
    """
    feature2score = {}
    
    # note how when we call this the first time selected_features can be empty
    # we could start with some features pre-selected, if we know they are good or want to use them
    
    # iterate over all features
    for feature in feature_columns:
        
        # if the feature is already selected, we skip it
        if feature in selected_features:
            continue
            
        # otherwise we add it to the features we currently evaluate
        current_features = selected_features + [feature]
        
        # here we use the evaluate_position_estimation function we defined above to calculate the score
        # we check here what our score would be if we use 'current_features' for the prediction
        score = evaluate_position_estimation(
            current_features,
            player_df=player_df
        )
        
        # now we record, for the feature we just added, what the score is
        feature2score[feature] = score
        # then the for-loop continues, and we try out all the possible features
        
    # we sort feature2score.items, where each item is a tuple of (feature, score)
    # we sort by the score, and we sort in reverse order (highest score first)
    scores = sorted(feature2score.items(), key=lambda k: k[1], reverse=True)
    
    # get best feature from score
    # scores[0][1] is the score of the best feature
    # that is: if we add this feature to the selected features, we get the highest score
    best_feature = scores[0][0]
    print(f'Adding feature: {best_feature} with score {scores[0][1]:4f}')
    
    # we go until we reach the maximum number of features
    if max_features > 0:
        
        # we add the best feature to the selected features, and recursively call this function again
        # but max-features is reduced by 1 because we added a feature
        # that means we have to add one less feature
        # once we have added enough features we return selected_features, these are then the best features
        return forward_selection(max_features - 1, player_df, selected_features + [best_feature])
    return selected_features

In [21]:
def backward_selection(
        max_features: int,
        player_df: pd.DataFrame,
        selected_features: Optional[list[str]],
) -> list[str]:
    """
    Selects the best features, by removing one feature at a time.
    """
    feature2score = {}
    
    # iterate over all features
    for feature in feature_columns:
        
        # if the feature is already selected, we skip it
        if feature not in selected_features:
            continue
            
        # we make a copy of the selected_features, because we will remove stuff
        current_features = selected_features.copy()
        
        # remove the feature we are currently evaluating
        current_features.remove(feature)
        
        # now we use the evaluate_position_estimation function we defined above to calculate the score, WITHOUT this feature
        score = evaluate_position_estimation(
            current_features,
            player_df=player_df
        )
        
        # now we record, for the feature we just removed, what the score is
        feature2score[feature] = score
    # get best feature from score
    scores = sorted(feature2score.items(), key=lambda k: k[1], reverse=True)
    
    # we recorded for each feature what the score is, that is, if we remove this feature, what does our score become?
    # thats why we sort in reverse order, because we want to remove the feature that gives us the highest score when removed
    worst_feature = scores[0][0]
    
    # now we properly remove the feature from the selected features
    selected_features.remove(worst_feature)
    print(f'Removing feature: {worst_feature} with score {scores[0][1]:4f}')
    
    # if we have not yet reached the maximum number of features, we recursively call this function again
    if len(selected_features) > max_features:
        return backward_selection(max_features, player_df, selected_features)
    return selected_features

In [22]:
# start with no features, go up to 3
r = forward_selection(5, df, [])
r

Adding feature: height with score 0.092103
Adding feature: goals with score 0.161135
Adding feature: yellowCards with score 0.180660
Adding feature: yellowReds with score 0.180660
Adding feature: redCards with score 0.180660
Adding feature: ties with score 0.176178


['height', 'goals', 'yellowCards', 'yellowReds', 'redCards']

In [23]:
# start with no features, go up to 3
r = backward_selection(5, df, feature_columns.copy())
r

Removing feature: victories with score 0.168407
Removing feature: games with score 0.169460
Removing feature: defeats with score 0.170714
Removing feature: height with score 0.176552
Removing feature: ties with score 0.179779


['weight', 'goals', 'yellowCards', 'yellowReds', 'redCards']

In [24]:
score = evaluate_position_estimation(feature_columns, df)
score

0.16627176505432495

The other tasks from Exercise 7 that were in the blank notebook will be moved to / discussed in Exercise 8.