In [None]:
import sqlite3

# to make sure sqlite3 connections are closed after use
from contextlib import closing
from pathlib import Path

# to measure runtime
from time import perf_counter

import pandas as pd
import requests  # used later to download additional data

# for the progress bar
from tqdm.auto import tqdm

- [`pandas` Cheat Sheet](https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf)
- [`matplotlib` Cheat Sheet](https://matplotlib.org/cheatsheets/_images/cheatsheets-1.png)
- [SQL Cheat Sheet](https://www.sqltutorial.org/sql-cheat-sheet/)

# Loading the data

In [None]:
# Read the CSV behind the download link into a DataFrame and preview the first rows
crowdstorming_csv_url = 'https://drive.switch.ch/index.php/s/UEpTFv2Bfa5C1dd/download'
df = pd.read_csv(crowdstorming_csv_url)
df.head()

We repeat our simple data cleaning here by dropping every row that contains a `NaN` value.

In [None]:
# Drop every row (axis=0) in which at least one column (how="any") is missing
df = df.dropna(axis=0, how="any")

We will be using [`sqlalchemy`](https://www.sqlalchemy.org/) here. First we store the data from the DataFrame in a sqlite3 database.

You can find an `sqlalchemy` cheat sheet [here](https://www.pythonsheets.com/notes/python-sqlalchemy.html).

In [None]:
# Write the DataFrame to the "crowdstorming" table, replacing any table left over
# from an earlier run. sqlite3's own context manager only commits or rolls back and
# leaves the connection open, so closing() is used to release it afterwards.
with closing(sqlite3.connect("crowdstorming.db")) as c:
    with c:  # commit on success, roll back if writing fails
        df.to_sql("crowdstorming", c, if_exists="replace")

from sqlalchemy import create_engine, MetaData

# Point SQLAlchemy at the same SQLite file and read the table definitions from it
engine = create_engine("sqlite:///crowdstorming.db")
metadata = MetaData()
metadata.reflect(bind=engine)

# Names of all tables that the reflection found
table_names = metadata.tables.keys()
print(table_names)

# Task 3.1 - Loading additional data

To enrich our data we will collect information about the countries. For this we will use an API.

- Make a GET request to https://restcountries.com/v3.1/all. You can use the [`requests` library](https://requests.readthedocs.io/en/latest/user/quickstart/) for this.
- Create a DataFrame called `countries_df` from the response
- Alternative: Load the data from the file `countries.json` (in case the API dies)
- You may need either [`pd.DataFrame.from_records`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.from_records.html) or [`pd.read_json`](https://pandas.pydata.org/docs/reference/api/pandas.read_json.html).

# Task 3.2 - Data Cleaning
The `name` column contains dictionaries. This makes it annoying for us to work with.
Simplify the column by replacing all entries in it with the value in `common` in that dictionary.

*Hint*: You did something very similar in Task 1.2 last week!

In [None]:
# Fallback for task 3.1: if you are completely stuck with it, or the API dies,
# load the country data from this download link instead
countries_json_url = "https://drive.switch.ch/index.php/s/x0zUM0seQqigcU1/download"
countries_df = pd.read_json(countries_json_url)

# Task 3.3 - Joining DataFrames

Combine the two DataFrames on the `leagueCountry` column. You can use [`pd.merge`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html) for this.
For the DataFrame with the countries, you only need the `name` and `fifa` columns.

Here is how we can do it with SQLAlchemy.
First we save the data from the DataFrame in the database.

In [None]:
from sqlalchemy.orm import sessionmaker

# Write the three country columns to the "countries" table, replacing an existing one.
# closing() releases the connection afterwards; sqlite3's own context manager
# only commits or rolls back and would leave it open.
with closing(sqlite3.connect("./crowdstorming.db")) as c:
    with c:  # commit on success, roll back if writing fails
        countries_df[["name", "fifa", "unMember"]].to_sql(
            "countries", c, if_exists="replace"
        )

# Reflect the database again and pick out the two tables we work with
metadata = MetaData()
metadata.reflect(bind=engine)
crowdstorming_table = metadata.tables["crowdstorming"]
countries_table = metadata.tables["countries"]

# One session bound to the engine; the following cells reuse it
Session = sessionmaker(bind=engine)
session = Session()

# Fetch all rows of the countries table and show the first five
all_countries_query = session.query(countries_table)
results = all_countries_query.all()
print(results[:5])

Then we do the join, but with SQLAlchemy.

In [None]:
# Join condition: the league country of a crowdstorming row equals the country name
league_matches_country = crowdstorming_table.c.leagueCountry == countries_table.c.name

# All crowdstorming columns plus the `fifa` column of the matching country
inner_join_query = session.query(crowdstorming_table.c, countries_table.c.fifa).join(
    countries_table, league_matches_country
)
results = inner_join_query.all()
len(results)

# Task 4 - Joining crowdstorming data and country data with SQL

Select all columns from the `crowdstorming` table, and `fifa` and `name` columns from the `countries` table.
Then join the two tables on the `leagueCountry` column of the `crowdstorming` table and the `name` column of the `countries` table.

What JOIN do you need to replicate the results of task 3.3?

First we save the data from the DataFrame in the database.

In [None]:
# Write the three country columns to the "countries" table, replacing an existing one.
# closing() releases the connection afterwards; sqlite3's own context manager
# only commits or rolls back and would leave it open.
with closing(sqlite3.connect("./crowdstorming.db")) as c:
    with c:  # commit on success, roll back if writing fails
        countries_df[["name", "fifa", "unMember"]].to_sql(
            "countries", c, if_exists="replace"
        )


Then we can do the query.


In [None]:
# you need to write the query to make it work
# (closing() makes sure the connection is closed once the result is loaded;
# sqlite3's own context manager would leave it open)
with closing(sqlite3.connect("./crowdstorming.db")) as c:
    result = pd.read_sql(
        """
        SELECT *
        FROM crowdstorming
        """,
        c
    )
result

And here is how we can do the same with SQLAlchemy.

In [None]:
# Everything from crowdstorming, plus `name` and `fifa` from countries
selected_columns = (
    crowdstorming_table.c,
    countries_table.c.name,
    countries_table.c.fifa,
)

# isouter=True turns the join into a LEFT OUTER JOIN
outer_join_query = session.query(*selected_columns).join(
    countries_table,
    crowdstorming_table.c.leagueCountry == countries_table.c.name,
    isouter=True,
)
results = outer_join_query.all()
print(results[0])

# Task 5 - Calculating the mean

Calculate the mean height and the mean weight of all players in the database.

*Hint*: Check the SQL Aggregate Functions sections in the sidebar in the [cheat sheet](https://www.sqltutorial.org/sql-cheat-sheet/).

In [None]:
# Query template for this task: adapt the SQL below.
# closing() makes sure the connection is closed once the result is loaded;
# sqlite3's own context manager would leave it open.
with closing(sqlite3.connect("./crowdstorming.db")) as c:
    result = pd.read_sql(
        """
        SELECT *
        FROM crowdstorming
        """,
        c
    )
result

Now repeat this, but on the DataFrame. Are the results the same?

And the solution with SQLAlchemy:

In [None]:
from sqlalchemy import func

# AVG over the whole table: one expression per measured column
avg_height = func.avg(crowdstorming_table.c.height)
avg_weight = func.avg(crowdstorming_table.c.weight)

# calculate the mean height and weight in the database
results = session.query(avg_height, avg_weight).all()

print(results)

# Task 6 - Calculating the mean per position

Calculate the mean height and the mean weight of the players per position in the database.

*Hint*: Remember how to do this in pandas. Then check the [cheat sheet](https://www.sqltutorial.org/sql-cheat-sheet/) if there is something similar in SQL.

In [None]:
# Query template for this task: adapt the SQL below.
# closing() makes sure the connection is closed once the result is loaded;
# sqlite3's own context manager would leave it open.
with closing(sqlite3.connect("./crowdstorming.db")) as c:
    result = pd.read_sql(
        """
        SELECT *
        FROM crowdstorming
        """,
        c
    )
result

Now do the same with the DataFrame. Are the results the same?

And the solution with SQLAlchemy:

In [None]:
# calculate mean height and weight PER POSITION in the database
position_col = crowdstorming_table.c.position
avg_height = func.avg(crowdstorming_table.c.height)
avg_weight = func.avg(crowdstorming_table.c.weight)

# One result row per position: (position, mean height, mean weight)
per_position_query = session.query(position_col, avg_height, avg_weight).group_by(
    position_col
)
results = per_position_query.all()
results

# Task 7 - Calculating the mean per position and league
Calculate the mean height and the mean weight of the players per position and per league in the database.

*Hint*: This is almost identical to task 6. Try to not overcomplicate things: What would be the most intuitive way to extend the solution of task 6 to two conditions? (Task 6: 'per position', here: 'per position **and per league**'.)

In [None]:
# Query template for this task: adapt the SQL below.
# closing() makes sure the connection is closed once the result is loaded;
# sqlite3's own context manager would leave it open.
with closing(sqlite3.connect("./crowdstorming.db")) as c:
    results = pd.read_sql(
        """
        SELECT *
        FROM crowdstorming
        """,
        c,
    )
# Index the result by the two grouping columns
results = results.set_index(['position', 'leagueCountry'])
results

Now do the same with the DataFrame. Are the results the same?

And the solution with SQLAlchemy:

In [None]:
# calculate mean height and weight PER POSITION and PER LEAGUE in the database
# The same two columns are both selected and used for grouping
group_columns = (
    crowdstorming_table.c.position,
    crowdstorming_table.c.leagueCountry,
)
per_position_and_league_query = session.query(
    *group_columns,
    func.avg(crowdstorming_table.c.height),
    func.avg(crowdstorming_table.c.weight),
).group_by(*group_columns)
results = per_position_and_league_query.all()
results

# Task 8.1  - People with unusual names
Select all people whose first name starts with an X from `people_database.db`.

*Hint*: If you can't figure out how to do this, check [here](https://www.w3schools.com/sql/sql_like.asp).

In [None]:
# Size in bytes the local file must have to count as already downloaded.
# A missing file, or one with any other size (e.g. after an interrupted
# download), is fetched again.
EXPECTED_PEOPLE_DB_SIZE = 691134464
PEOPLE_DB_URL = "https://drive.switch.ch/index.php/s/OIsWhbxdTY6h5n7/download"

people_db_file = Path("people_database.db")
if people_db_file.exists() and people_db_file.stat().st_size == EXPECTED_PEOPLE_DB_SIZE:
    print("File already downloaded.")
else:
    print("Will download database")
    # Without a timeout, requests waits forever on a server that stops responding.
    # The value applies to connecting and to each wait for data, not to the whole download.
    with requests.get(PEOPLE_DB_URL, stream=True, timeout=30) as response:
        response.raise_for_status()
        # Stream to disk in small chunks instead of holding the whole file in memory
        with people_db_file.open('wb') as fo:
            for chunk in tqdm(response.iter_content(chunk_size=8192), desc="Chunks written"):
                fo.write(chunk)

In [None]:
# Measure how long it takes to run the query in SQLite and load its result
start = perf_counter()

# closing() makes sure the connection is closed once the result is loaded;
# sqlite3's own context manager would leave it open.
with closing(sqlite3.connect(people_db_file)) as c:
    results = pd.read_sql(
        """
        SELECT *
        FROM people
        """,
        c,
    )
end = perf_counter()
print(f"Duration: {end - start:.3f}s")
results

Repeat this, but load the data into a DataFrame first, and time both loading it into a DataFrame, and the actual querying.

In [None]:
# alternative: load into dataframe, do it there
start = perf_counter()

# closing() makes sure the connection is closed once the table is loaded;
# sqlite3's own context manager would leave it open.
with closing(sqlite3.connect(people_db_file)) as c:
    people_df = pd.read_sql(
        """
        SELECT * FROM people
        """,
        c,
    )
end = perf_counter()
print(f"Loading duration: {end - start:.3f}s")


start = perf_counter()

# ADD HERE: replace None with your selection on people_df.
# (A bare `result =` is a syntax error and would stop the whole cell,
# including the loading code above, from running.)
result = None

end = perf_counter()
print(f"Just the query: {end - start:.3f}s")
result

# Task 8.2 - Joining with football players
Select all people from `people_database.db` who share a name with a player from the `crowdstorming` table, together with the position of that player. Also include the `fifa` column from the `countries` table for those players who have a match in that table.

- *Hint*: You can use the `||` operator to concatenate strings in SQL.
- *Hint2*: You can use the `DISTINCT` keyword to get rid of duplicates.
- *Hint3*: You can have multiple JOINs per query.
- *Hint4*: If you can't solve it in pure SQL, break it down into multiple smaller problems. For example, first get all the names of the players, then join that with the people table.

In [None]:
# Query template for this task: write your SQL in the empty string below.
# closing() makes sure the connection, and with it the attached database, is
# closed once the result is loaded; sqlite3's own context manager would leave it open.
with closing(sqlite3.connect(people_db_file)) as c:
    # Make the tables of crowdstorming.db reachable in the same query as `crowd.<table>`
    c.execute("ATTACH DATABASE './crowdstorming.db' AS crowd")
    results = pd.read_sql(
        """

        """,
        c,
    )

results