In [1]:
import sqlite3
from contextlib import closing
from pathlib import Path

# to measure runtime
from time import perf_counter

import pandas as pd
import requests  # used later to download additional data

# for the progress bar
from tqdm.auto import tqdm

- [`pandas` Cheat Sheet](https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf)
- [`matplotlib` Cheat Sheet](https://matplotlib.org/cheatsheets/_images/cheatsheets-1.png)
- [SQL Cheat Sheet](https://www.sqltutorial.org/sql-cheat-sheet/)

# Loading the data

In [2]:
# Read the crowdstorming CSV directly from its download link and preview
# the first rows.
crowdstorming_csv_url = 'https://drive.switch.ch/index.php/s/UEpTFv2Bfa5C1dd/download'
df = pd.read_csv(crowdstorming_csv_url)
df.head()

Unnamed: 0,playerShort,player,club,leagueCountry,birthday,height,weight,position,games,victories,...,rater2,refNum,refCountry,Alpha_3,meanIAT,nIAT,seIAT,meanExp,nExp,seExp
0,lucas-wilchez,Lucas Wilchez,Real Zaragoza,Spain,31.08.1983,177.0,72.0,Attacking Midfielder,1,0,...,0.5,1,1,GRC,0.326391,712.0,0.000564,0.396,750.0,0.002696
1,john-utaka,John Utaka,Montpellier HSC,France,08.01.1982,179.0,82.0,Right Winger,1,0,...,0.75,2,2,ZMB,0.203375,40.0,0.010875,-0.204082,49.0,0.061504
2,abdon-prats,Abdón Prats,RCD Mallorca,Spain,17.12.1992,181.0,79.0,,1,0,...,,3,3,ESP,0.369894,1785.0,0.000229,0.588297,1897.0,0.001002
3,pablo-mari,Pablo Marí,RCD Mallorca,Spain,31.08.1993,191.0,87.0,Center Back,1,1,...,,3,3,ESP,0.369894,1785.0,0.000229,0.588297,1897.0,0.001002
4,ruben-pena,Rubén Peña,Real Valladolid,Spain,18.07.1991,172.0,70.0,Right Midfielder,1,1,...,,3,3,ESP,0.369894,1785.0,0.000229,0.588297,1897.0,0.001002


We repeat our simple data cleaning here by dropping every row that contains a `NaN` value.

In [3]:
# Drop every row that has a missing value in any column.
df = df.dropna(axis="index", how="any")

We will be using [`sqlalchemy`](https://www.sqlalchemy.org/) here. First we store the data from the DataFrame in a sqlite3 database.

You can find an `sqlalchemy` cheat sheet [here](https://www.pythonsheets.com/notes/python-sqlalchemy.html).

In [4]:
# Store the cleaned DataFrame as the `crowdstorming` table, replacing any
# table of that name that already exists in the file.
# A sqlite3 connection used as a context manager only commits (or rolls back)
# the transaction; it does not close the connection. `closing` releases it.
with closing(sqlite3.connect('crowdstorming.db')) as c:
    with c:  # commit on success, roll back on error
        df.to_sql('crowdstorming', c, if_exists="replace")

from sqlalchemy import create_engine, MetaData

# Point SQLAlchemy at the same SQLite file and read its schema back.
engine = create_engine('sqlite:///crowdstorming.db')
metadata = MetaData()
metadata.reflect(engine)

# Show which tables the reflected schema contains.
table_names = metadata.tables.keys()
print(table_names)

dict_keys(['countries', 'crowdstorming'])


# Task 3.1 - Loading additional data

To enrich our data we will collect information about the countries. For this we will use an API.

- Make a GET request to https://restcountries.com/v3.1/all. You can use the [`requests` library](https://requests.readthedocs.io/en/latest/user/quickstart/) for this.
- Create a DataFrame called `countries_df` from the response
- Alternative: Load the data from the file `countries.json` (in case the API dies)
- You may need either [`pd.DataFrame.from_records`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.from_records.html) or [`pd.read_json`](https://pandas.pydata.org/docs/reference/api/pandas.read_json.html).

In [5]:
# Download the country records from the REST Countries API.
# The timeout keeps the cell from hanging forever if the API is unreachable,
# and raise_for_status() turns an HTTP error response into an exception
# instead of letting its body be parsed as if it were country data.
with requests.get('https://restcountries.com/v3.1/all', timeout=30) as response:
    response.raise_for_status()
    response_json = response.json()
    countries_df = pd.DataFrame.from_records(response_json)

countries_df

Unnamed: 0,name,tld,cca2,ccn3,cca3,cioc,independent,status,unMember,currencies,...,gini,fifa,car,timezones,continents,flags,coatOfArms,startOfWeek,capitalInfo,postalCode
0,"{'common': 'Uzbekistan', 'official': 'Republic...",[.uz],UZ,860,UZB,UZB,True,officially-assigned,True,"{'UZS': {'name': 'Uzbekistani soʻm', 'symbol':...",...,{'2003': 35.3},UZB,"{'signs': ['UZ'], 'side': 'right'}",[UTC+05:00],[Asia],"{'png': 'https://flagcdn.com/w320/uz.png', 'sv...",{'png': 'https://mainfacts.com/media/images/co...,monday,"{'latlng': [41.32, 69.25]}","{'format': '######', 'regex': '^(\d{6})$'}"
1,"{'common': 'Austria', 'official': 'Republic of...",[.at],AT,040,AUT,AUT,True,officially-assigned,True,"{'EUR': {'name': 'Euro', 'symbol': '€'}}",...,{'2018': 30.8},AUT,"{'signs': ['A'], 'side': 'right'}",[UTC+01:00],[Europe],"{'png': 'https://flagcdn.com/w320/at.png', 'sv...",{'png': 'https://mainfacts.com/media/images/co...,monday,"{'latlng': [48.2, 16.37]}","{'format': '####', 'regex': '^(\d{4})$'}"
2,"{'common': 'Sierra Leone', 'official': 'Republ...",[.sl],SL,694,SLE,SLE,True,officially-assigned,True,"{'SLL': {'name': 'Sierra Leonean leone', 'symb...",...,{'2018': 35.7},SLE,"{'signs': ['WAL'], 'side': 'right'}",[UTC],[Africa],"{'png': 'https://flagcdn.com/w320/sl.png', 'sv...",{'png': 'https://mainfacts.com/media/images/co...,monday,"{'latlng': [8.48, -13.23]}",
3,"{'common': 'Cocos (Keeling) Islands', 'officia...",[.cc],CC,166,CCK,,False,officially-assigned,False,"{'AUD': {'name': 'Australian dollar', 'symbol'...",...,,,"{'signs': ['AUS'], 'side': 'left'}",[UTC+06:30],[Asia],"{'png': 'https://flagcdn.com/w320/cc.png', 'sv...",{},monday,"{'latlng': [-12.17, 96.83]}",
4,"{'common': 'Mauritania', 'official': 'Islamic ...",[.mr],MR,478,MRT,MTN,True,officially-assigned,True,"{'MRU': {'name': 'Mauritanian ouguiya', 'symbo...",...,{'2014': 32.6},MTN,"{'signs': ['RIM'], 'side': 'right'}",[UTC],[Africa],"{'png': 'https://flagcdn.com/w320/mr.png', 'sv...",{'png': 'https://mainfacts.com/media/images/co...,monday,"{'latlng': [18.07, -15.97]}",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,"{'common': 'New Caledonia', 'official': 'New C...",[.nc],NC,540,NCL,,False,officially-assigned,False,"{'XPF': {'name': 'CFP franc', 'symbol': '₣'}}",...,,NCL,"{'signs': ['F'], 'side': 'right'}",[UTC+11:00],[Oceania],"{'png': 'https://flagcdn.com/w320/nc.png', 'sv...",{'png': 'https://mainfacts.com/media/images/co...,monday,"{'latlng': [-22.27, 166.45]}","{'format': '#####', 'regex': '^(\d{5})$'}"
246,"{'common': 'Libya', 'official': 'State of Liby...",[.ly],LY,434,LBY,LBA,True,officially-assigned,True,"{'LYD': {'name': 'Libyan dinar', 'symbol': 'ل....",...,,LBY,"{'signs': ['LAR'], 'side': 'right'}",[UTC+01:00],[Africa],"{'png': 'https://flagcdn.com/w320/ly.png', 'sv...",{'png': 'https://mainfacts.com/media/images/co...,sunday,"{'latlng': [32.88, 13.17]}",
247,"{'common': 'Åland Islands', 'official': 'Åland...",[.ax],AX,248,ALA,,False,officially-assigned,False,"{'EUR': {'name': 'Euro', 'symbol': '€'}}",...,,,"{'signs': [''], 'side': 'right'}",[UTC+02:00],[Europe],"{'png': 'https://flagcdn.com/w320/ax.png', 'sv...",{'png': 'https://mainfacts.com/media/images/co...,monday,"{'latlng': [60.12, 19.9]}",
248,"{'common': 'Martinique', 'official': 'Martiniq...",[.mq],MQ,474,MTQ,,False,officially-assigned,False,"{'EUR': {'name': 'Euro', 'symbol': '€'}}",...,,,"{'signs': ['F'], 'side': 'right'}",[UTC-04:00],[North America],"{'png': 'https://flagcdn.com/w320/mq.png', 'sv...",{'png': 'https://mainfacts.com/media/images/co...,monday,"{'latlng': [14.6, -61.08]}","{'format': '#####', 'regex': '^(\d{5})$'}"


# Task 3.2 - Data Cleaning
The `name` column contains dictionaries. This makes it annoying for us to work with.
Simplify the column by replacing all entries in it with the value in `common` in that dictionary.

*Hint*: You did something very similar in Task 1.2 last week!

In [6]:
# Minimal reminder of how a value is looked up in a dictionary by its key.
x = dict(foo="bar")
print(x["foo"])

bar


In [7]:
# Load the country data from a hosted JSON file.
countries_json_url = "https://drive.switch.ch/index.php/s/x0zUM0seQqigcU1/download"
countries_df = pd.read_json(countries_json_url)

# Replace each entry of `name` with the value stored under its "common" key.
countries_df["name"] = countries_df["name"].map(
    lambda name_record: name_record["common"]
)
countries_df.head()

Unnamed: 0,name,tld,cca2,ccn3,cca3,independent,status,unMember,currencies,idd,...,continents,flags,coatOfArms,startOfWeek,capitalInfo,postalCode,borders,cioc,gini,fifa
0,French Polynesia,[.pf],PF,258.0,PYF,0.0,officially-assigned,False,"{'XPF': {'name': 'CFP franc', 'symbol': '₣'}}","{'root': '+6', 'suffixes': ['89']}",...,[Oceania],"{'png': 'https://flagcdn.com/w320/pf.png', 'sv...",{'png': 'https://mainfacts.com/media/images/co...,monday,"{'latlng': [-17.53, -149.56]}","{'format': '#####', 'regex': '^((97|98)7\d{2})$'}",,,,
1,Saint Martin,"[.fr, .gp]",MF,663.0,MAF,0.0,officially-assigned,False,"{'EUR': {'name': 'Euro', 'symbol': '€'}}","{'root': '+5', 'suffixes': ['90']}",...,[North America],"{'png': 'https://flagcdn.com/w320/mf.png', 'sv...",{},monday,"{'latlng': [18.07, -63.08]}",{'format': '### ###'},[SXM],,,
2,Venezuela,[.ve],VE,862.0,VEN,1.0,officially-assigned,True,{'VES': {'name': 'Venezuelan bolívar soberano'...,"{'root': '+5', 'suffixes': ['8']}",...,[South America],"{'png': 'https://flagcdn.com/w320/ve.png', 'sv...",{'png': 'https://mainfacts.com/media/images/co...,monday,"{'latlng': [10.48, -66.87]}","{'format': '####', 'regex': '^(\d{4})$'}","[BRA, COL, GUY]",VEN,{'2006': 44.8},VEN
3,Réunion,[.re],RE,638.0,REU,0.0,officially-assigned,False,"{'EUR': {'name': 'Euro', 'symbol': '€'}}","{'root': '+2', 'suffixes': ['62']}",...,[Africa],"{'png': 'https://flagcdn.com/w320/re.png', 'sv...",{},monday,"{'latlng': [-20.88, 55.45]}","{'format': '#####', 'regex': '^((97|98)(4|7|8)...",,,,
4,El Salvador,[.sv],SV,222.0,SLV,1.0,officially-assigned,True,"{'USD': {'name': 'United States dollar', 'symb...","{'root': '+5', 'suffixes': ['03']}",...,[North America],"{'png': 'https://flagcdn.com/w320/sv.png', 'sv...",{'png': 'https://mainfacts.com/media/images/co...,monday,"{'latlng': [13.7, -89.2]}","{'format': 'CP ####', 'regex': '^(?:CP)*(\d{4}...","[GTM, HND]",ESA,{'2019': 38.8},SLV


# Task 3.3 - Joining DataFrames

Combine the two DataFrames on the `leagueCountry` column. You can use [`pd.merge`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html) for this.
For the DataFrame with the countries, you only need the `name` and `fifa` columns.

In [8]:
# Left join: keep every row of `df` and attach the country's `name` and
# `fifa` columns wherever `leagueCountry` equals a country `name`.
country_columns = countries_df[["name", "fifa"]]
df.merge(
    country_columns,
    how="left",
    left_on="leagueCountry",
    right_on="name",
)

Unnamed: 0,playerShort,player,club,leagueCountry,birthday,height,weight,position,games,victories,...,refCountry,Alpha_3,meanIAT,nIAT,seIAT,meanExp,nExp,seExp,name,fifa
0,lucas-wilchez,Lucas Wilchez,Real Zaragoza,Spain,31.08.1983,177.0,72.0,Attacking Midfielder,1,0,...,1,GRC,0.326391,712.0,0.000564,0.396000,750.0,0.002696,Spain,ESP
1,john-utaka,John Utaka,Montpellier HSC,France,08.01.1982,179.0,82.0,Right Winger,1,0,...,2,ZMB,0.203375,40.0,0.010875,-0.204082,49.0,0.061504,France,FRA
2,aaron-hughes,Aaron Hughes,Fulham FC,England,08.11.1979,182.0,71.0,Center Back,1,0,...,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752,,
3,aleksandar-kolarov,Aleksandar Kolarov,Manchester City,England,10.11.1985,187.0,80.0,Left Fullback,1,1,...,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752,,
4,alexander-tettey,Alexander Tettey,Norwich City,England,04.04.1986,180.0,68.0,Defensive Midfielder,1,0,...,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115452,tom-huddlestone,Tom Huddlestone,Tottenham Hotspur,England,28.12.1986,188.0,80.0,Defensive Midfielder,1,0,...,21,HUN,0.376127,574.0,0.000714,0.498350,606.0,0.002968,,
115453,tomas-rosicky,Tomáš Rosický,Arsenal FC,England,04.10.1980,178.0,67.0,Attacking Midfielder,1,1,...,21,HUN,0.376127,574.0,0.000714,0.498350,606.0,0.002968,,
115454,winston-reid,Winston Reid,West Ham United,England,03.07.1988,190.0,87.0,Center Back,1,0,...,21,HUN,0.376127,574.0,0.000714,0.498350,606.0,0.002968,,
115455,xherdan-shaqiri,Xherdan Shaqiri,Bayern München,Germany,10.10.1991,169.0,72.0,Left Midfielder,1,1,...,21,HUN,0.376127,574.0,0.000714,0.498350,606.0,0.002968,Germany,GER


Here is how we can do it with SQLAlchemy.
First we save the data from the DataFrame in the database.

In [9]:
from sqlalchemy.orm import sessionmaker

# Write the three country columns into the `countries` table, replacing any
# existing table of that name.
# A sqlite3 connection used as a context manager only commits (or rolls back);
# it does not close the connection, so `closing` is used to release it.
with closing(sqlite3.connect("./crowdstorming.db")) as c:
    with c:  # commit on success, roll back on error
        countries_df[["name", "fifa", "unMember"]].to_sql(
            "countries", c, if_exists="replace"
        )

# Reflect the schema again so the metadata includes the table just written.
metadata = MetaData()
metadata.reflect(engine)
countries_table = metadata.tables["countries"]
crowdstorming_table = metadata.tables["crowdstorming"]

# Open a session on the engine and peek at the first few country rows.
Session = sessionmaker(bind=engine)
session = Session()
results = session.query(countries_table).all()
print(results[:5])

[(0, 'French Polynesia', None, 0), (1, 'Saint Martin', None, 0), (2, 'Venezuela', 'VEN', 1), (3, 'Réunion', None, 0), (4, 'El Salvador', 'SLV', 1)]


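Then we do the join, but with SQLAlchemy. Note that `.join()` performs an INNER JOIN by default, so rows whose `leagueCountry` has no matching country `name` are dropped. This is why the result has fewer rows than the left join above.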

In [10]:
# Join each crowdstorming row to the country whose name equals its
# `leagueCountry`, selecting all crowdstorming columns plus the `fifa` column.
same_country = crowdstorming_table.c.leagueCountry == countries_table.c.name
joined_query = session.query(
    crowdstorming_table.c,
    countries_table.c.fifa,
).join(countries_table, same_country)
results = joined_query.all()
len(results)

84358

# Task 4 - Joining crowdstorming data and country data with SQL

Select all columns from the `crowdstorming` table, and `fifa` and `name` columns from the `countries` table.
Then join the two tables on the `leagueCountry` column of the `crowdstorming` table and the `name` column of the `countries` table.

What JOIN do you need to replicate the results of task 3.3?

First we save the data from the DataFrame in the database.

In [11]:
# Write the three country columns into the `countries` table, replacing any
# existing table of that name.
# A sqlite3 connection used as a context manager only commits (or rolls back);
# it does not close the connection, so `closing` is used to release it.
with closing(sqlite3.connect("./crowdstorming.db")) as c:
    with c:  # commit on success, roll back on error
        countries_df[["name", "fifa", "unMember"]].to_sql(
            "countries", c, if_exists="replace"
        )


Then we can do the query.


In [12]:
# LEFT JOIN keeps every crowdstorming row, even when no country matches.
# `closing` is used because a sqlite3 connection's own context manager does
# not close the connection when the block ends.
with closing(sqlite3.connect("./crowdstorming.db")) as c:
    result = pd.read_sql(
        """
        SELECT crowdstorming.*, countries.name, countries.fifa
        FROM crowdstorming
        LEFT JOIN countries ON crowdstorming.leagueCountry = countries.name
        """,
        c,
    )
result

Unnamed: 0,index,playerShort,player,club,leagueCountry,birthday,height,weight,position,games,...,refCountry,Alpha_3,meanIAT,nIAT,seIAT,meanExp,nExp,seExp,name,fifa
0,0,lucas-wilchez,Lucas Wilchez,Real Zaragoza,Spain,31.08.1983,177.0,72.0,Attacking Midfielder,1,...,1,GRC,0.326391,712.0,0.000564,0.396000,750.0,0.002696,Spain,ESP
1,1,john-utaka,John Utaka,Montpellier HSC,France,08.01.1982,179.0,82.0,Right Winger,1,...,2,ZMB,0.203375,40.0,0.010875,-0.204082,49.0,0.061504,France,FRA
2,5,aaron-hughes,Aaron Hughes,Fulham FC,England,08.11.1979,182.0,71.0,Center Back,1,...,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752,,
3,6,aleksandar-kolarov,Aleksandar Kolarov,Manchester City,England,10.11.1985,187.0,80.0,Left Fullback,1,...,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752,,
4,7,alexander-tettey,Alexander Tettey,Norwich City,England,04.04.1986,180.0,68.0,Defensive Midfielder,1,...,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115452,146022,tom-huddlestone,Tom Huddlestone,Tottenham Hotspur,England,28.12.1986,188.0,80.0,Defensive Midfielder,1,...,21,HUN,0.376127,574.0,0.000714,0.498350,606.0,0.002968,,
115453,146023,tomas-rosicky,Tomáš Rosický,Arsenal FC,England,04.10.1980,178.0,67.0,Attacking Midfielder,1,...,21,HUN,0.376127,574.0,0.000714,0.498350,606.0,0.002968,,
115454,146024,winston-reid,Winston Reid,West Ham United,England,03.07.1988,190.0,87.0,Center Back,1,...,21,HUN,0.376127,574.0,0.000714,0.498350,606.0,0.002968,,
115455,146025,xherdan-shaqiri,Xherdan Shaqiri,Bayern München,Germany,10.10.1991,169.0,72.0,Left Midfielder,1,...,21,HUN,0.376127,574.0,0.000714,0.498350,606.0,0.002968,Germany,GER


And here is how we can do the same with SQLAlchemy.

In [13]:
# Outer (LEFT) join via SQLAlchemy: crowdstorming rows are kept even when no
# country name matches their `leagueCountry`.
selected_columns = (
    crowdstorming_table.c,
    countries_table.c.name,
    countries_table.c.fifa,
)
same_country = crowdstorming_table.c.leagueCountry == countries_table.c.name
left_join_query = session.query(*selected_columns).join(
    countries_table,
    same_country,
    isouter=True,
)
results = left_join_query.all()
print(results[0])

(0, 'lucas-wilchez', 'Lucas Wilchez', 'Real Zaragoza', 'Spain', '31.08.1983', 177.0, 72.0, 'Attacking Midfielder', 1, 0, 0, 1, 0, 0, 0, 0, '95212.jpg', 0.25, 0.5, 1, 1, 'GRC', 0.326391469021736, 712.0, 0.0005641123543345, 0.396, 750.0, 0.0026964901062936, 'Spain', 'ESP')


# Task 5 - Calculating the mean

Calculate the mean height and the mean weight over all rows of the `crowdstorming` table in the database.

*Hint*: Check the SQL Aggregate Functions sections in the sidebar in the [cheat sheet](https://www.sqltutorial.org/sql-cheat-sheet/).

In [14]:
# Average height and weight over all rows of the crowdstorming table.
# `closing` is used because a sqlite3 connection's own context manager does
# not close the connection when the block ends.
with closing(sqlite3.connect("./crowdstorming.db")) as c:
    result = pd.read_sql(
        """
        SELECT AVG(height), AVG(weight)
        FROM crowdstorming
        """,
        c
    )
result

Unnamed: 0,AVG(height),AVG(weight)
0,182.176135,76.517413


Now repeat this, but on the DataFrame. Are the results the same?

In [15]:
# Column-wise mean of the height and weight columns.
df.loc[:, ["height", "weight"]].mean()

height    182.176135
weight     76.517413
dtype: float64

And the solution with SQLAlchemy:

In [16]:
from sqlalchemy import func

# Average height and weight over all rows of the crowdstorming table.
crowdstorming_columns = crowdstorming_table.c
mean_height = func.avg(crowdstorming_columns.height)
mean_weight = func.avg(crowdstorming_columns.weight)
results = session.query(mean_height, mean_weight).all()

print(results)

[(182.17613483807824, 76.51741340932122)]


# Task 6 - Calculating the mean per position

Calculate the mean height and weight per position in the database.

*Hint*: Remember how to do this in pandas. Then check the [cheat sheet](https://www.sqltutorial.org/sql-cheat-sheet/) if there is something similar in SQL.

In [17]:
# Average height and weight within each position.
# `closing` is used because a sqlite3 connection's own context manager does
# not close the connection when the block ends.
with closing(sqlite3.connect("./crowdstorming.db")) as c:
    result = pd.read_sql(
        """
        SELECT position, AVG(height), AVG(weight)
        FROM crowdstorming
        GROUP BY position
        """,
        c
    )
result

Unnamed: 0,position,AVG(height),AVG(weight)
0,Attacking Midfielder,177.547457,72.120387
1,Center Back,187.258714,80.81753
2,Center Forward,183.354108,77.769815
3,Center Midfielder,180.631655,74.969339
4,Defensive Midfielder,181.404502,75.278369
5,Goalkeeper,189.847237,84.549437
6,Left Fullback,179.579545,74.815508
7,Left Midfielder,176.942494,70.929285
8,Left Winger,179.196087,73.800356
9,Right Fullback,179.767063,74.262307


Now do the same with the DataFrame. Are the results the same?

In [18]:
# Keep only the grouping key and the two measured columns, then average
# within each position.
df[["position", "height", "weight"]].groupby("position").mean()

Unnamed: 0_level_0,height,weight
position,Unnamed: 1_level_1,Unnamed: 2_level_1
Attacking Midfielder,177.547457,72.120387
Center Back,187.258714,80.81753
Center Forward,183.354108,77.769815
Center Midfielder,180.631655,74.969339
Defensive Midfielder,181.404502,75.278369
Goalkeeper,189.847237,84.549437
Left Fullback,179.579545,74.815508
Left Midfielder,176.942494,70.929285
Left Winger,179.196087,73.800356
Right Fullback,179.767063,74.262307


And the solution with SQLAlchemy:

In [19]:
# Group the crowdstorming rows by position and average height and weight
# within each group.
position_column = crowdstorming_table.c.position
per_position_query = session.query(
    position_column,
    func.avg(crowdstorming_table.c.height),
    func.avg(crowdstorming_table.c.weight),
).group_by(position_column)
results = per_position_query.all()
results

[('Attacking Midfielder', 177.5474574695547, 72.1203867249233),
 ('Center Back', 187.25871398625262, 80.81753034660947),
 ('Center Forward', 183.3541076487252, 77.76981495991802),
 ('Center Midfielder', 180.63165527242595, 74.96933913403771),
 ('Defensive Midfielder', 181.4045016539623, 75.27836904933122),
 ('Goalkeeper', 189.84723736822974, 84.54943656852053),
 ('Left Fullback', 179.57954545454547, 74.81550802139037),
 ('Left Midfielder', 176.94249355670104, 70.92928479381443),
 ('Left Winger', 179.19608714984437, 73.80035571365052),
 ('Right Fullback', 179.76706302021404, 74.26230677764566),
 ('Right Midfielder', 176.35472024195292, 70.81075826312379),
 ('Right Winger', 177.53814713896458, 73.05313351498637)]

# Task 7 - Calculating the mean per position and league
Calculate the mean height and weight per position and per league in the database.

*Hint*: This is almost identical to task 6. Try to not overcomplicate things: What would be the most intuitive way to extend the solution of task 6 to two conditions? (Task 6: 'per position', here: 'per position **and per league**'.)

In [20]:
# Average height and weight within each (position, league) combination.
# `closing` is used because a sqlite3 connection's own context manager does
# not close the connection when the block ends.
with closing(sqlite3.connect("./crowdstorming.db")) as c:
    results = pd.read_sql(
        """
        SELECT position, leagueCountry, AVG(height), AVG(weight)
        FROM crowdstorming
        GROUP BY position, leagueCountry
        """,
        c,
    )
# Use the two grouping columns as the index of the displayed frame.
results = results.set_index(["position", "leagueCountry"])
results

Unnamed: 0_level_0,Unnamed: 1_level_0,AVG(height),AVG(weight)
position,leagueCountry,Unnamed: 2_level_1,Unnamed: 3_level_1
Attacking Midfielder,England,175.738573,70.66211
Attacking Midfielder,France,177.742502,72.419898
Attacking Midfielder,Germany,178.725081,73.475267
Attacking Midfielder,Spain,177.293126,71.311963
Center Back,England,186.521912,78.782205
Center Back,France,185.349614,78.330885
Center Back,Germany,189.624508,83.832435
Center Back,Spain,185.912707,80.479094
Center Forward,England,182.909722,76.85335
Center Forward,France,182.828865,75.820878


Now do the same with the DataFrame. Are the results the same?

In [21]:
# Average height and weight within each (league, position) combination.
group_keys = ["leagueCountry", "position"]
df[group_keys + ["height", "weight"]].groupby(group_keys).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,height,weight
leagueCountry,position,Unnamed: 2_level_1,Unnamed: 3_level_1
England,Attacking Midfielder,175.738573,70.66211
England,Center Back,186.521912,78.782205
England,Center Forward,182.909722,76.85335
England,Center Midfielder,180.414167,74.579167
England,Defensive Midfielder,182.704415,75.322457
England,Goalkeeper,191.672706,85.01188
England,Left Fullback,179.328821,75.440611
England,Left Midfielder,177.22721,71.02279
England,Left Winger,179.471328,72.350353
England,Right Fullback,180.169567,75.178046


And the solution with SQLAlchemy:

In [22]:
# Group the crowdstorming rows by position and league, then average height
# and weight within each group.
grouping_columns = (
    crowdstorming_table.c.position,
    crowdstorming_table.c.leagueCountry,
)
per_position_and_league_query = session.query(
    *grouping_columns,
    func.avg(crowdstorming_table.c.height),
    func.avg(crowdstorming_table.c.weight),
).group_by(*grouping_columns)
results = per_position_and_league_query.all()
results

[('Attacking Midfielder', 'England', 175.7385732592909, 70.66211020931226),
 ('Attacking Midfielder', 'France', 177.74250182882224, 72.41989758595464),
 ('Attacking Midfielder', 'Germany', 178.72508078548347, 73.47526721352224),
 ('Attacking Midfielder', 'Spain', 177.29312623925975, 71.31196298744217),
 ('Center Back', 'England', 186.5219123505976, 78.78220451527224),
 ('Center Back', 'France', 185.3496143958869, 78.33088505325009),
 ('Center Back', 'Germany', 189.62450823255136, 83.83243479527903),
 ('Center Back', 'Spain', 185.91270650622067, 80.47909443198041),
 ('Center Forward', 'England', 182.90972222222223, 76.85334967320262),
 ('Center Forward', 'France', 182.82886480319453, 75.82087849401027),
 ('Center Forward', 'Germany', 185.41949521322888, 81.1437771975631),
 ('Center Forward', 'Spain', 181.26471289015964, 75.03454848701453),
 ('Center Midfielder', 'England', 180.41416666666666, 74.57916666666667),
 ('Center Midfielder', 'France', 186.0, 75.0),
 ('Center Midfielder', 'Germ

# Task 8.1  - People with unusual names
Select all people whose first name starts with an X from `people_database.db`.

*Hint*: If you can't figure out how to do this, check [here](https://www.w3schools.com/sql/sql_like.asp).

In [23]:
# File size in bytes that the local copy must have to count as already
# downloaded; any other size (or a missing file) triggers a fresh download.
PEOPLE_DB_EXPECTED_SIZE = 691134464

people_db_file = Path("people_database.db")
if people_db_file.exists() and people_db_file.stat().st_size == PEOPLE_DB_EXPECTED_SIZE:
    print("File already downloaded.")
else:
    print("Will download database")
    # The timeout bounds the connection attempt and each wait for more data,
    # so an unresponsive server cannot hang the cell indefinitely.
    with requests.get(
        "https://drive.switch.ch/index.php/s/OIsWhbxdTY6h5n7/download",
        stream=True,
        timeout=30,
    ) as response:
        response.raise_for_status()
        # Stream the body to disk in 8 KiB chunks instead of holding it in memory.
        with people_db_file.open('wb') as fo:
            for chunk in tqdm(response.iter_content(chunk_size=8192), desc="Chunks written"):
                fo.write(chunk)

File already downloaded.


In [24]:
# Time the filter when it is executed by the database itself.
start = perf_counter()

# `closing` is used because a sqlite3 connection's own context manager does
# not close the connection when the block ends.
with closing(sqlite3.connect(people_db_file)) as c:
    results = pd.read_sql(
        """
        SELECT * FROM people
        WHERE first_name LIKE 'X%'
        """,
        c,
    )
end = perf_counter()
print(f"Duration: {end - start:.3f}s")
results

Duration: 7.287s


Unnamed: 0,id,first_name,last_name,email,age,phone_number,gender
0,2068,Xavier,Hyde,tripadvisor2019@example.com,45,+17635978290,Female
1,2647,Xenia,Wade,imports1866@gmail.com,31,+1-747-884-1447,Male
2,2782,Xiao,Conley,hobby1832@duck.com,51,+15858378666,Female
3,3664,Xiao,Chang,dial2080@protonmail.com,28,+1-563-540-7138,Male
4,3998,Xenia,Raymond,jack2018@gmail.com,37,+1-561-201-4185,Male
...,...,...,...,...,...,...,...
12990,9997658,Xiao,Bartlett,impacts2018@yahoo.com,40,+1-618-841-0962,Female
12991,9998247,Xenia,Reynolds,leon1877@duck.com,66,+1-740-537-8529,Male
12992,9998502,Xavier,Larson,examination1912@protonmail.com,47,+1-470-362-5782,Male
12993,9998553,Xiao,James,hugh1907@yandex.com,63,+1-203-437-5999,Female


Repeat this, but load the data into a DataFrame first. Time both steps separately: loading the table into the DataFrame, and the actual query.

**Beware: this will load the entire database into memory.**

In [None]:
# Alternative: load the whole table into a DataFrame and filter it there.
# The load and the filter are timed separately.
start = perf_counter()

# `closing` is used because a sqlite3 connection's own context manager does
# not close the connection when the block ends.
with closing(sqlite3.connect(people_db_file)) as c:
    people_df = pd.read_sql(
        """
        SELECT * FROM people
        """,
        c,
    )
end = perf_counter()
print(f"Loading duration: {end - start:.3f}s")

# Filter in pandas: keep rows whose first name begins with "X".
start = perf_counter()
result = people_df[people_df["first_name"].str.startswith("X")]
end = perf_counter()
print(f"Just the query: {end - start:.3f}s")
result