In [1]:
import pandas as pd
import numpy as np

# 1. Load the data
Find a way to load the data into a pandas dataframe. You can find the data here: https://osf.io/fv8c3.

In [2]:
df = pd.read_csv('CrowdstormingDataJuly1st.csv')
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'CrowdstormingDataJuly1st.csv'

In [None]:
df

In [None]:
df.columns

In [None]:
df.shape

# 2. Clean the data
Here we use a very simple approach to clean the data. We remove all the rows that contain missing values. You can use a more sophisticated approach if you want.

In [None]:
df = df.dropna()
df.shape

In [None]:
len(df)

# 3. Simple statistics
Calculate the mean, median, min and maximum values for all columns.

In [None]:
df.describe()

df.describe() did not calculate the median for us. Let's do it manually.
First, find all numeric columns.

In [None]:
numeric_columns = df.select_dtypes(include=[np.number]).columns
numeric_columns

Next, select the numeric columns.

In [None]:
df[numeric_columns].head()

Finally, calculate the median.

In [None]:
df[numeric_columns].median()  # by default column-wise!

# 4. Average cards per game
Calculate the average number of yellow and red cards per game for each player. Then print out the 5 players with the highest average number of cards per game.

## 4.1 Count the number of cards each player has gotten.
As an intermediate step, let's first calculate the number of cards each player has gotten.

In [None]:
df['total_cards'] = df['yellowCards'] + df['redCards']
df['total_cards'].head()

## 4.2 Calculate the average number of cards per game for each player.
Next, we can now use this column to calculate the average number of cards per game for each player.

In [None]:
df['avg_cards_per_game'] = df['total_cards'] / df['games']
df['avg_cards_per_game'].head()

## 4.3 Sort the players by the average number of cards per game.
Then we sort by this column.

In [None]:
avg_cards_per_game_df = df.sort_values(by='avg_cards_per_game', ascending=False)
avg_cards_per_game_df

## 4.4 Print out the top 5 players.
This is now very easy to do. We will not use .head() this time though.

In [None]:
avg_cards_per_game_df[:5]

# 5. Average number of cards per country
Do the same as in 4. but this time for each country. This means we need to group the countries!

## 5.1 Group the data by country.
This is our first step. It will be annoying to calculate the average for each country otherwise.

In [None]:
grouped_by_country = df.groupby('leagueCountry')
grouped_by_country

We can also use this to check what countries we have.

In [None]:
grouped_by_country.groups.keys()

## 5.2 Calculate the average number of cards per game for each country.

In [None]:
df.iloc[:10, 3:10:2]

In [None]:
(grouped_by_country['yellowCards'].sum() + grouped_by_country['redCards'].sum()) / grouped_by_country['games'].sum()

# 6. Correlation
Let's calculate some correlations.

In [None]:
numeric_columns = df.select_dtypes(include=[np.number]).columns
df[numeric_columns]

In [None]:
df[numeric_columns].corr()

## 6.1 Highest correlations per variable
For each of the variables, let's find the variable that has the highest correlation with it.

In [None]:
# df[["height", "weight", "ties", ...]]

numeric_columns = df.select_dtypes(include=[np.number]).columns
corrs = df[numeric_columns].corr()
corrs

In [None]:
corrs = df[numeric_columns].corr()
for column in numeric_columns:
    high_corrs = corrs[column].sort_values(ascending=False)[1:1+3]
    for index, corr in high_corrs.items():
        print(column, index, corr)

In [None]:
c = df[numeric_columns].corr().replace(1.0, np.nan)
cdf = pd.DataFrame(
    {
        "other": c.idxmax(),
        "correlation": c.max()
    }
)
cdf.sort_values("correlation", ascending=False)

In [None]:
c.nlargest(5, numeric_columns)

In [None]:
c.max()

In [None]:
c.idxmax()

In [None]:
corrs["yellowReds"]

## 6.2 Interesting correlations
Pick out some correlations and explain why you think they are interesting and what might be the cause of them.

- height weight 0.7914972050607055: This is not surprising. Taller people tend to be heavier.
- height redCards 0.007067698331657767: This is slightly surprising. Taller people might more easily unintentionally foul smaller players, so it seems like they would get more cards. The correlation is positive but not very strong.

All of these make sense, but may look confusing at first:
- games victories 0.8558576680313157
- games defeats 0.7812185900320455
- games ties 0.7607619655309061
A player who plays more games will have more victories, defeats and ties than players who do not play more games.

# 7. Simple Analysis
Let's do a simple analysis of skin color and the average number of cards. For this we are going to use a boxplot.

First we prepare the data.

In [None]:
boxplot_values = df[['avg_cards_per_game', 'rater1', 'rater2']].copy()
boxplot_values

Then we calculate the average skin color.

In [None]:
boxplot_values['avg_skin_color'] = (boxplot_values['rater1'] + boxplot_values['rater2']) / 2
boxplot_values

Finally, we can plot the boxplot.

In [None]:
import matplotlib.pyplot as plt

grouped = boxplot_values.groupby('avg_skin_color')['avg_cards_per_game']

labels = sorted(boxplot_values['avg_skin_color'].unique())

data_for_boxplot = [grouped.get_group(label).values for label in labels]

plt.figure(figsize=(10, 6))
plt.boxplot(data_for_boxplot, labels=labels, notch=True)
plt.title('Boxplot of avg_cards_per_game grouped by avg_skin_color')
plt.xlabel('Average Skin Color')
plt.ylabel('Average Cards Per Game')
plt.show()

In [None]:
grouped = boxplot_values.groupby('avg_skin_color')['avg_cards_per_game']
grouped