In [13]:
import numpy as np
np.seterr(divide='ignore', invalid='ignore')
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from math import factorial


In [14]:
df = pd.read_csv('https://drive.switch.ch/index.php/s/UEpTFv2Bfa5C1dd/download')
df.head()

Unnamed: 0,playerShort,player,club,leagueCountry,birthday,height,weight,position,games,victories,...,rater2,refNum,refCountry,Alpha_3,meanIAT,nIAT,seIAT,meanExp,nExp,seExp
0,lucas-wilchez,Lucas Wilchez,Real Zaragoza,Spain,31.08.1983,177.0,72.0,Attacking Midfielder,1,0,...,0.5,1,1,GRC,0.326391,712.0,0.000564,0.396,750.0,0.002696
1,john-utaka,John Utaka,Montpellier HSC,France,08.01.1982,179.0,82.0,Right Winger,1,0,...,0.75,2,2,ZMB,0.203375,40.0,0.010875,-0.204082,49.0,0.061504
2,abdon-prats,Abdón Prats,RCD Mallorca,Spain,17.12.1992,181.0,79.0,,1,0,...,,3,3,ESP,0.369894,1785.0,0.000229,0.588297,1897.0,0.001002
3,pablo-mari,Pablo Marí,RCD Mallorca,Spain,31.08.1993,191.0,87.0,Center Back,1,1,...,,3,3,ESP,0.369894,1785.0,0.000229,0.588297,1897.0,0.001002
4,ruben-pena,Rubén Peña,Real Valladolid,Spain,18.07.1991,172.0,70.0,Right Midfielder,1,1,...,,3,3,ESP,0.369894,1785.0,0.000229,0.588297,1897.0,0.001002


# Task 0 - Intermission - Combinations

In [15]:
import itertools

feature_columns = ['height', 'weight', 'games', 'victories', 'ties', 'defeats', 'goals', 'yellowCards', 'yellowReds', 'redCards']

def generate_combinations(features):
    all_combinations = []
    for r in range(1, len(features) + 1):
        # Generate combinations of size r
        combinations = itertools.combinations(features, r)
        all_combinations.extend(combinations)
    return all_combinations

print(generate_combinations(feature_columns))

[('height',), ('weight',), ('games',), ('victories',), ('ties',), ('defeats',), ('goals',), ('yellowCards',), ('yellowReds',), ('redCards',), ('height', 'weight'), ('height', 'games'), ('height', 'victories'), ('height', 'ties'), ('height', 'defeats'), ('height', 'goals'), ('height', 'yellowCards'), ('height', 'yellowReds'), ('height', 'redCards'), ('weight', 'games'), ('weight', 'victories'), ('weight', 'ties'), ('weight', 'defeats'), ('weight', 'goals'), ('weight', 'yellowCards'), ('weight', 'yellowReds'), ('weight', 'redCards'), ('games', 'victories'), ('games', 'ties'), ('games', 'defeats'), ('games', 'goals'), ('games', 'yellowCards'), ('games', 'yellowReds'), ('games', 'redCards'), ('victories', 'ties'), ('victories', 'defeats'), ('victories', 'goals'), ('victories', 'yellowCards'), ('victories', 'yellowReds'), ('victories', 'redCards'), ('ties', 'defeats'), ('ties', 'goals'), ('ties', 'yellowCards'), ('ties', 'yellowReds'), ('ties', 'redCards'), ('defeats', 'goals'), ('defeats',

# Task 1 - Player numbers per position
For each 'position', calculate how many players we have.

In [16]:
# count how often each position occurs
position_counts = df.groupby('position').size()
position_counts

position
Attacking Midfielder    11817
Center Back             22466
Center Forward          18687
Center Midfielder        6113
Defensive Midfielder    15753
Goalkeeper              12321
Left Fullback            9874
Left Midfielder          6569
Left Winger              4814
Right Fullback           9199
Right Midfielder         5298
Right Winger             5391
dtype: int64

# Task 2 - Different possible teams of 'Attacking Midfielder' and 'Center Back'
How many pairs of 'Attacking Midfielder' and 'Center Back' players can you form? There is always exactly one from each position in a pair.

In [17]:
# multiply up the number of players in each position
# we do this on a column level again
# with integers you would just use *, nothing changes despite using columns, we can just do this
total_combinations = position_counts['Attacking Midfielder'] * position_counts['Center Back']
total_combinations

265480722

# Task 3 - Photoshoot
You want to take photos of 100 of the players. How many different ways can you arrange the players in a line for the photoshoot?

In [18]:
arangements = factorial(100)
arangements

93326215443944152681699238856266700490715968264381621468592963895217599993229915608941463976156518286253697920827223758251185210916864000000000000000000000000

# Task 4 - Photoshoot - Redux
You noticed that your original plan doesn't work. There are too many players to fit in one photo.

Instead of taking a single photo with all players, you decide to create unique photo compositions by arranging groups of 5 players at a time. Each arrangement in the photo should be unique, with the order of the players being significant. 

Your task is to calculate the number of different ways you can arrange 5 players in each group, considering the order of players. How many different ways can you arrange the players in groups of 5 for the photoshoot?

In [19]:
from math import factorial

# order doesn't matter
arrangements = factorial(len(df)) / (factorial(5) * factorial(len(df) - 5))
arrangements

5.533115028624767e+23

In [20]:
# order matters
arrangements = factorial(len(df)) / factorial(len(df) - 5)
arrangements

6.63973803434972e+25

# Task 5 - Building a team
Build a team of 11 players, with the following constraints:
- 1 goalkeeper
- 4 defenders
- 4 midfielders
- 2 forwards

Calculate how many different teams you can build. The players for each position should come from the players who actually play in that position. For example, the goalkeeper should be a player who plays as a goalkeeper.
Do this by hand (disclaimer: please don't actually do this) or with code.

In [21]:
position_mapping = {
    'Goalkeeper': 'Goalkeepers',
    'Center Back': 'Defenders',
    'Left Fullback': 'Defenders',
    'Right Fullback': 'Defenders',
    'Attacking Midfielder': 'Midfielders',
    'Center Midfielder': 'Midfielders',
    'Defensive Midfielder': 'Midfielders',
    'Left Midfielder': 'Midfielders',
    'Right Midfielder': 'Midfielders',
    'Left Winger': 'Forwards',
    'Right Winger': 'Forwards',
    'Center Forward': 'Forwards'
}

# rename the position in the dataframe to the broader position category
df['position'] = df['position'].map(position_mapping)

In [22]:
df.head()

Unnamed: 0,playerShort,player,club,leagueCountry,birthday,height,weight,position,games,victories,...,rater2,refNum,refCountry,Alpha_3,meanIAT,nIAT,seIAT,meanExp,nExp,seExp
0,lucas-wilchez,Lucas Wilchez,Real Zaragoza,Spain,31.08.1983,177.0,72.0,Midfielders,1,0,...,0.5,1,1,GRC,0.326391,712.0,0.000564,0.396,750.0,0.002696
1,john-utaka,John Utaka,Montpellier HSC,France,08.01.1982,179.0,82.0,Forwards,1,0,...,0.75,2,2,ZMB,0.203375,40.0,0.010875,-0.204082,49.0,0.061504
2,abdon-prats,Abdón Prats,RCD Mallorca,Spain,17.12.1992,181.0,79.0,,1,0,...,,3,3,ESP,0.369894,1785.0,0.000229,0.588297,1897.0,0.001002
3,pablo-mari,Pablo Marí,RCD Mallorca,Spain,31.08.1993,191.0,87.0,Defenders,1,1,...,,3,3,ESP,0.369894,1785.0,0.000229,0.588297,1897.0,0.001002
4,ruben-pena,Rubén Peña,Real Valladolid,Spain,18.07.1991,172.0,70.0,Midfielders,1,1,...,,3,3,ESP,0.369894,1785.0,0.000229,0.588297,1897.0,0.001002


In [23]:
# count how often each position category occurs
broader_position_counts = df.groupby('position').size()

In [24]:
broader_position_counts_df = broader_position_counts.reset_index()
broader_position_counts_df.columns = ['Position Category', 'Number']

broader_position_counts_df

Unnamed: 0,Position Category,Number
0,Defenders,41539
1,Forwards,28892
2,Goalkeepers,12321
3,Midfielders,45550


In [35]:
from math import comb

combinations_goalkeepers = comb(12321, 1)
combinations_goalkeepers

12321

In [26]:
from math import comb

# this looks horrible, but it's simple
# we filter broader_position_counts_df by the position category
# then we pick the 'Number' columns, and take the value of it out of it
# we do this for all the position categories
combinations_goalkeepers = comb(broader_position_counts_df[broader_position_counts_df['Position Category'] == 'Goalkeepers']['Number'].values[0], 1)

combinations_defenders = comb(broader_position_counts_df[broader_position_counts_df['Position Category'] == 'Defenders']['Number'].values[0], 4)

combinations_midfielders = comb(broader_position_counts_df[broader_position_counts_df['Position Category'] == 'Midfielders']['Number'].values[0], 4)

combinations_forwards = comb(broader_position_counts_df[broader_position_counts_df['Position Category'] == 'Forwards']['Number'].values[0], 2)

total_combinations = combinations_goalkeepers * combinations_defenders * combinations_midfielders * combinations_forwards
total_combinations

114390877761817207419313078513240414033753264800

# Task 6 - Dream Team - Redux
You noticed that some players can be assigned to multiple positions. For example, a player can be both a 'Center Back' and a 'Defensive Midfielder'. So what you do instead is select 11 players, without any constraints on their original position. Afterwards you will assign each player to a position, using the same constraints as in Task 5. Except now this team you can put any player in any position. That means a 'Goalkeeper' player for example can be in a 'Defender' position.
You can also have multiple players in the same position. For example, you can have 2 'Goalkeeper' players in your team (players who are actually goalkeepers) as long as you respect the constraints on the number of players per position.

How many different teams can you build?

Before you calculate it: do you think the number of possible teams is higher or lower than in Task 5? Why?

In [34]:
num_combinations = factorial(11) / (factorial(1) * factorial(4) * factorial(4) * factorial(2))
num_combinations

34650.0

In [27]:
num_combinations = factorial(11) / (factorial(1) * factorial(4) * factorial(4) * factorial(2))

possible_selections = comb(len(df), 11)

possible_selections * num_combinations

5.587035783908066e+53

# Task 7 - Same league
If you randomly construct a team of players while respecting their original position (like we did in Task 5), what is the probability that all players are in the England league?

In [28]:
league_country_counts = df.groupby(['position', 'leagueCountry']).size()
league_country_counts

position     leagueCountry
Defenders    England          12395
             France            7146
             Germany          12511
             Spain             9487
Forwards     England           8904
             France            4930
             Germany           8530
             Spain             6528
Goalkeepers  England           4063
             France            1760
             Germany           3927
             Spain             2571
Midfielders  England          11999
             France            7972
             Germany          13815
             Spain            11764
dtype: int64

In [29]:
league_country_counts = df.groupby(['position', 'leagueCountry']).size()
comb_gk_england = comb(league_country_counts[('Goalkeepers', 'England')], 1)
comb_def_england = comb(league_country_counts[('Defenders', 'England')], 4)
comb_mid_england = comb(league_country_counts[('Midfielders', 'England')], 4)
comb_fwd_england = comb(league_country_counts[('Forwards', 'England')], 2)

total_combinations_england = comb_gk_england * comb_def_england * comb_mid_england * comb_fwd_england
probability_same_league = total_combinations_england / total_combinations


In [30]:
probability_same_league

1.1947124230859298e-06