In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Download the CSV dataset used throughout this notebook and preview its first rows.
PLAYERS_CSV_URL = 'https://drive.switch.ch/index.php/s/UEpTFv2Bfa5C1dd/download'
df = pd.read_csv(PLAYERS_CSV_URL)
df.head()

In [None]:
# Keep only the rows that have no missing value in any column,
# then display the cleaned frame.
df = df.dropna(axis=0, how='any')
df

# Task 6 - Bayes' Theorem

Remember Bayes' Theorem from the lecture (Theorem 32):

$P(B|A_1, \dots , A_k) = \frac{P(B) \cdot P(A_1, \dots , A_k|B)}{P(A_1, \dots , A_k)}$

$P(B)$ is called the **prior probability**, and $P(B|A_1, \dots , A_k)$ is called the **posterior probability**.
$P(A_1, \dots , A_k|B)$ is called the **likelihood**.

Assume there is a test for a disease that is 99% accurate. That means that the probability of a false positive is 1% and the probability of a false negative is 1%. The probability of having the disease is 0.1%.


**What is the probability you have the disease if you tested positive?**

In [None]:
# simulation of this problem
NUM_PEOPLE = 1_000_000

# generate people, where 0.1% have the disease
# bonus: what happens if 1% have the disease? 10%?
disease_prob = 0.001
people = np.random.choice([0, 1], size=NUM_PEOPLE, p=[1 - disease_prob, disease_prob])

def test_person(person):
    if person == 1:
        # If the person has the disease, 99% chance of a positive test result
        return np.random.choice([0, 1], p=[0.01, 0.99])
    else:
        # If the person does not have the disease, 1% chance of a positive test result
        return np.random.choice([0, 1], p=[0.99, 0.01])

# apply the test function to all people
results = []
for p in people:
    result = test_person(p)
    results.append(result)

people_w_positive_test = [(result, person) for result, person in zip(results, people) if result == 1]

# calculate how many people have a positive test result AND are sick
people_w_positive_and_sick = [(result, person) for result, person in people_w_positive_test if person == 1]

# calculate the probability
prob_being_sick_given_positive = len(people_w_positive_and_sick) / len(people_w_positive_test)
prob_being_sick_given_positive

# Task 8 - Marginal distribution of player heights
Plot the marginal distribution of player heights.

*Hint*: You can use the seaborn function [`sns.histplot`](https://seaborn.pydata.org/generated/seaborn.histplot.html) to plot a histogram.


# Task 9 - Joint Distribution of player heights and weights
Plot the joint distribution of player heights and weights.

*Hint*: You can use the seaborn function [`sns.jointplot`](https://seaborn.pydata.org/generated/seaborn.jointplot.html) to plot a joint distribution.

# 1. Average cards per game
Calculate the average number of yellow and red cards per game for each player. Then print out the 5 players with the highest average number of cards per game.

# 2. Average cards per game per country
Calculate the average number of yellow and red cards per game for each country.

# 3. Correlations
For all numeric columns, find the highest correlation with another column that is not itself.

# 4. Scatter Plot
Make a scatter plot of weight vs. height.

Then use PCA to reduce the dimensionality of the player data to 2, and create another scatter plot.

# 5. Joins

Load the `countries` API data, then join it with the player data on the `leagueCountry` column. What is the `fifa` code for each player?

In [None]:
# Fetch the countries data (JSON) and flatten the nested `name` field so that
# the column holds only the value stored under its "common" key.
COUNTRIES_JSON_URL = "https://drive.switch.ch/index.php/s/x0zUM0seQqigcU1/download"


def _common_name(name_record):
    """Return the value stored under the "common" key of a `name` entry."""
    return name_record["common"]


countries_df = pd.read_json(COUNTRIES_JSON_URL).assign(
    name=lambda frame: frame["name"].apply(_common_name)
)
countries_df.head()

Combine the two DataFrames on the `leagueCountry` column.

# 6. Forward and Backward Selection

Train a Naive Bayes classifier on our dataset.

*Hint*: Refer to the scikit-learn [documentation](https://scikit-learn.org/stable/modules/naive_bayes.html)

Now repeat this, but use forward selection to select 5 features.

*Hint*: You can use [`SequentialFeatureSelector`](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SequentialFeatureSelector.html).