In [1]:
import matplotlib.pyplot as plt
import numpy as np
np.seterr(divide='ignore', invalid='ignore')
import pandas as pd
# for the progress bar
from tqdm.auto import tqdm

- [`pandas` Cheat Sheet](https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf)
- [`matplotlib` Cheat Sheet](https://matplotlib.org/cheatsheets/_images/cheatsheets-1.png)
- [SQL Cheat Sheet](https://www.sqltutorial.org/sql-cheat-sheet/)

# Loading the data

In [3]:
# Load the player data set straight from the remote CSV download link
# (this cell needs network access and can take a while).
df = pd.read_csv('https://drive.switch.ch/index.php/s/UEpTFv2Bfa5C1dd/download')
# Show the first rows as a quick sanity check of the loaded columns.
df.head()

KeyboardInterrupt: 

This week we keep the `NaN` values.

In [None]:
# df = df.dropna()

# Repeat: Estimating position

For the 'position' column, estimate the position of the missing value, by finding the player that is most similar to the one whose value is missing, and using their position.

Use the following formula for the *cosine similarity* of two vectors _a_ and _b_:

$$
d(a, b) = \frac{a \cdot b}{ \|a\| \|b\| }
$$

Note that the "multiplication" of the two vectors in the numerator is the **dot** product, and $\| \cdot \|$ denotes the **magnitude** (or **norm**) of a vector.

In [None]:
# Numeric player attributes that make up a player's feature vector.
feature_columns = [
    'height',
    'weight',
    'games',
    'victories',
    'ties',
    'defeats',
    'goals',
    'yellowCards',
    'yellowReds',
    'redCards',
]

In [None]:
# Players whose position is unknown, keeping only those for which every
# feature column has a value.
missing_positions = (
    df.loc[df['position'].isna()]
    .dropna(subset=feature_columns)
)

# Feature matrix: one row per player, one column per entry of feature_columns.
missing_positions_vec = missing_positions[feature_columns].to_numpy()

missing_positions_vec

# Repeat - Vectorize the positions

For each position, calculate a vector representing that position. To do so, take the feature values of every player with that position and calculate the average vector for each position.

In [None]:
# Build one reference vector per position: the mean feature vector of all
# players of that position whose feature values are complete.
positions = df[~df['position'].isna()]['position'].unique()
position_vectors = {}
for position in positions:
    position_players = df[df['position'] == position]
    # Only the feature columns have to be present. A bare dropna() would also
    # discard players that have a gap in any unrelated column, shrinking (or
    # even emptying) the sample the average is computed from.
    position_players = position_players.dropna(subset=feature_columns)

    # vectorize: one row per player, one column per feature
    position_vector = position_players[feature_columns].values

    # average over the players (axis 0) -> one value per feature
    position_vectors[position] = position_vector.mean(axis=0)

In [None]:
positions

In [None]:
position_vectors

In [None]:
def cosine_similarity(
        a: np.ndarray,
        b: np.ndarray
) -> float:
    """Return the cosine similarity of the two vectors ``a`` and ``b``.

    The result is ``dot(a, b) / (norm(a) * norm(b))``.

    Args:
        a: First vector.
        b: Second vector, with the same length as ``a``.

    Returns:
        The cosine similarity. If either vector has zero magnitude the
        quotient is undefined (0 / 0); ``0.0`` is returned in that case
        instead of a silent NaN. NaN entries in the inputs still propagate
        to a NaN result.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    # Guard against 0 / 0, which would otherwise yield NaN (the notebook
    # silences the corresponding numpy warnings via np.seterr).
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

In [None]:
def estimate_player_position(
        player_vectors: np.ndarray,
        position_vectors: dict[str, np.ndarray]
) -> list[str]:
    """Assign each player the position whose reference vector is most similar.

    Args:
        player_vectors: Iterable of player feature vectors (one per player).
        position_vectors: Mapping from position name to that position's
            reference vector (iterated with ``.items()``).

    Returns:
        One position name per player, in the order of ``player_vectors``.
        Ties go to the position that comes first in ``position_vectors``.
        A NaN similarity is never preferred over a real one; if every
        similarity for a player is NaN (e.g. the player has a missing
        feature) the first position in ``position_vectors`` is returned.

    Raises:
        IndexError: If there is at least one player but ``position_vectors``
            is empty.
    """
    def _rank(entry):
        # NaN compares False against everything, which would make the choice
        # depend on iteration order; rank it below every real similarity.
        similarity = entry[1]
        return -np.inf if np.isnan(similarity) else similarity

    estimated_positions = []
    for player_vec in player_vectors:
        sims = [
            (position, cosine_similarity(a=player_vec, b=position_vector))
            for position, position_vector in position_vectors.items()
        ]
        if not sims:
            raise IndexError('position_vectors is empty; nothing to compare against')
        # Only the best entry is needed, so take the maximum instead of sorting.
        # max() returns the first of several equally good entries.
        estimated_positions.append(max(sims, key=_rank)[0])
    return estimated_positions

missing_estimates = estimate_player_position(missing_positions_vec, position_vectors)

In [None]:
missing_estimates

# Repeat - Evaluation

Evaluate the quality of our approach by comparing the estimated positions with the true positions.

In [None]:
# Evaluate only players whose position is known AND whose features are complete.
# A single NaN feature turns every cosine similarity into NaN, so such a player
# would silently receive an arbitrary position (in practice the first one
# listed) and distort the ratio below.
non_missing = df.dropna(subset=['position', *feature_columns])
non_missing_vecs = non_missing[feature_columns].values

# calculate the estimated positions, for the non_missing players
estimated_positions = estimate_player_position(non_missing_vecs, position_vectors)

correct = (non_missing['position'] == estimated_positions).sum()
wrong = (non_missing['position'] != estimated_positions).sum()

print('Correct ratio:', correct / (correct + wrong))

# Repeat - Random guessing

Compare the quality of our approach with random guessing. That is, generate random guesses for the players whose position is known, and compare them with the true positions. How does our approach compare? Can we beat random guessing?

In [None]:
# compare with random guessing
# Use a seeded generator so the baseline is identical on every
# "Restart Kernel -> Run All" instead of changing from run to run.
baseline_rng = np.random.default_rng(42)
random_positions = baseline_rng.choice(positions, size=len(non_missing))
correct_random = (non_missing['position'] == random_positions).sum()
wrong_random = (non_missing['position'] != random_positions).sum()

print('Correct ratio (random):', correct_random / (correct_random + wrong_random))

# Task 0 - Setup

Here are some functions that you need for the tasks below.

In [None]:
# Numeric player attributes that make up a player's feature vector.
feature_columns = [
    'height',
    'weight',
    'games',
    'victories',
    'ties',
    'defeats',
    'goals',
    'yellowCards',
    'yellowReds',
    'redCards',
]

In [None]:
def cosine_similarity(
        a: np.ndarray,
        b: np.ndarray
) -> float:
    """Return the cosine similarity of the two vectors ``a`` and ``b``.

    The result is ``dot(a, b) / (norm(a) * norm(b))``.

    Args:
        a: First vector.
        b: Second vector, with the same length as ``a``.

    Returns:
        The cosine similarity. If either vector has zero magnitude the
        quotient is undefined (0 / 0); ``0.0`` is returned in that case
        instead of a silent NaN. NaN entries in the inputs still propagate
        to a NaN result.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    # Guard against 0 / 0, which would otherwise yield NaN (the notebook
    # silences the corresponding numpy warnings via np.seterr).
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

In [None]:
def position_vectors4features(
        features: list[str],
        player_df: pd.DataFrame
) -> dict[str, np.ndarray]:
    """Compute the mean feature vector of every position in ``player_df``.

    Args:
        features: Names of the columns that form the feature vector.
        player_df: Player table with a ``'position'`` column and the
            ``features`` columns.

    Returns:
        Mapping from each non-missing position value to the average of the
        ``features`` columns over all players of that position whose
        ``features`` values are all present. A position without any such
        player maps to a vector of NaN.
    """
    # calculate vector for each position
    positions = player_df[~player_df['position'].isna()]['position'].unique()
    position_vectors = {}
    for position in positions:
        position_players = player_df[player_df['position'] == position]
        # Only the selected features have to be present. A bare dropna() would
        # also discard players with a gap in any unrelated column and make the
        # result depend on columns that are not part of the feature vector.
        position_players = position_players.dropna(subset=features)

        # vectorize: one row per player, one column per feature
        position_vector = position_players[features].values

        # average over the players (axis 0) -> one value per feature
        position_vectors[position] = position_vector.mean(axis=0)
    return position_vectors

In [None]:
def estimate_player_position(
        player_vectors: np.ndarray,
        position_vectors: dict[str, np.ndarray]
) -> list[str]:
    """Assign each player the position whose reference vector is most similar.

    Args:
        player_vectors: Iterable of player feature vectors (one per player).
        position_vectors: Mapping from position name to that position's
            reference vector (iterated with ``.items()``).

    Returns:
        One position name per player, in the order of ``player_vectors``.
        Ties go to the position that comes first in ``position_vectors``.
        A NaN similarity is never preferred over a real one; if every
        similarity for a player is NaN (e.g. the player has a missing
        feature) the first position in ``position_vectors`` is returned.

    Raises:
        IndexError: If there is at least one player but ``position_vectors``
            is empty.
    """
    def _rank(entry):
        # NaN compares False against everything, which would make the choice
        # depend on iteration order; rank it below every real similarity.
        similarity = entry[1]
        return -np.inf if np.isnan(similarity) else similarity

    estimated_positions = []
    for player_vec in player_vectors:
        sims = [
            (position, cosine_similarity(a=player_vec, b=position_vector))
            for position, position_vector in position_vectors.items()
        ]
        if not sims:
            raise IndexError('position_vectors is empty; nothing to compare against')
        # Only the best entry is needed, so take the maximum instead of sorting.
        # max() returns the first of several equally good entries.
        estimated_positions.append(max(sims, key=_rank)[0])
    return estimated_positions

missing_estimates = estimate_player_position(missing_positions_vec, position_vectors)

In [None]:
# compare with random guessing
# Use a seeded generator so the baseline is identical on every
# "Restart Kernel -> Run All" instead of changing from run to run.
baseline_rng = np.random.default_rng(42)
random_positions = baseline_rng.choice(positions, size=len(non_missing))
correct_random = (non_missing['position'] == random_positions).sum()
wrong_random = (non_missing['position'] != random_positions).sum()

print('Correct ratio (random):', correct_random / (correct_random + wrong_random))

In [None]:
def evaluate_position_estimation(
        features: list[str],
        player_df: pd.DataFrame
) -> float:
    """Score the position estimate obtained from the given feature columns.

    The per-position reference vectors are computed from ``player_df`` with
    ``position_vectors4features``; every evaluated player is then assigned a
    position by ``estimate_player_position`` and compared with the recorded
    position.

    Args:
        features: Names of the columns used as the feature vector.
        player_df: Player table with a ``'position'`` column and the
            ``features`` columns.

    Returns:
        Fraction of evaluated players whose estimated position equals the
        recorded one, or NaN when no player can be evaluated.
    """
    # Evaluate only players with a known position AND complete features.
    # A NaN feature turns every similarity into NaN, so such a player would
    # get an arbitrary position and distort the score.
    non_missing = player_df.dropna(subset=['position', *features])
    if non_missing.empty:
        return float('nan')
    non_missing_vecs = non_missing[features].values

    positionvecs4features = position_vectors4features(
        features,
        player_df
    )

    # calculate the estimated positions, for the non_missing players
    estimated_positions = estimate_player_position(non_missing_vecs, positionvecs4features)

    # mean of the boolean matches == correct / (correct + wrong)
    return float((non_missing['position'] == estimated_positions).mean())

evaluate_position_estimation(feature_columns, df)

# Task 1 - Forward / Backward selection

Implement forward and backward selection. Use the `evaluate_position_estimation` function to evaluate the quality of the selected features.

In [None]:
from typing import Optional


def forward_selection(
        max_features: int,
        player_df: pd.DataFrame,
        selected_features: Optional[list[str]],
) -> list[str]:
    """Greedy forward feature selection for the position estimate.

    Starting from ``selected_features``, repeatedly add the candidate from the
    notebook-level ``feature_columns`` that gives the highest
    ``evaluate_position_estimation`` score together with the features chosen
    so far. Ties go to the candidate listed first in ``feature_columns``.

    Args:
        max_features: Upper bound on the number of features in the result.
        player_df: Player table passed on to ``evaluate_position_estimation``.
        selected_features: Features to start from; ``None`` or an empty list
            starts from scratch. The list is not modified.

    Returns:
        The selected features in the order they were chosen (starting
        features first). Selection stops once ``max_features`` is reached or
        no candidate is left.
    """
    # work on a copy so the caller's list is never mutated
    selected = list(selected_features or [])
    remaining = [name for name in feature_columns if name not in selected]

    while remaining and len(selected) < max_features:
        # score every candidate together with what has been selected so far
        scores = {
            candidate: evaluate_position_estimation(selected + [candidate], player_df)
            for candidate in tqdm(remaining, desc='forward selection', leave=False)
        }
        # a NaN score must never win the round
        best = max(
            remaining,
            key=lambda name: -np.inf if np.isnan(scores[name]) else scores[name],
        )
        selected.append(best)
        remaining.remove(best)
    return selected

In [None]:
def backward_selection(
        max_features: int,
        player_df: pd.DataFrame,
        selected_features: Optional[list[str]],
) -> list[str]:
    """Greedy backward feature elimination for the position estimate.

    Starting from ``selected_features``, repeatedly remove the feature whose
    removal leaves the highest ``evaluate_position_estimation`` score, until
    at most ``max_features`` features remain. Ties remove the feature that
    comes first in the current selection.

    Args:
        max_features: Number of features to keep (values below 0 are treated
            as 0).
        player_df: Player table passed on to ``evaluate_position_estimation``.
        selected_features: Features to start from; ``None`` starts from all
            of the notebook-level ``feature_columns``. The list is not
            modified.

    Returns:
        The remaining features, in their original relative order.
    """
    # work on a copy so the caller's list is never mutated
    if selected_features is None:
        selected = list(feature_columns)
    else:
        selected = list(selected_features)
    keep = max(max_features, 0)

    while len(selected) > keep:
        # score the selection with each single feature left out in turn
        scores = {
            candidate: evaluate_position_estimation(
                [name for name in selected if name != candidate],
                player_df,
            )
            for candidate in tqdm(selected, desc='backward selection', leave=False)
        }
        # drop the feature that is missed the least; a NaN score never wins
        least_useful = max(
            selected,
            key=lambda name: -np.inf if np.isnan(scores[name]) else scores[name],
        )
        selected.remove(least_useful)
    return selected

In [None]:
# start with no features, go up to 5
r = forward_selection(5, df, [])
r

In [None]:
# start with ALL features, keep only 5
r = backward_selection(5, df, feature_columns.copy())
r

In [None]:
score = evaluate_position_estimation(feature_columns, df)
score

# Task 2 - Normalization
Normalize all values in the `feature_columns` to be between 0 and 1.

# Task 3 - Binarization
Binarize all values in the `leagueCountry` column. Add a column for each possible value, and set it to 1 if the value is present, and 0 otherwise.

# Task 4 - PCA

Use PCA to reduce the number of features to 2. Then, use the 2 features to estimate the position of the players.

*Hint*: Take a look at the [scikit-learn documentation for PCA](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html).

In [None]:
from sklearn.decomposition import PCA
