In [1]:
import sqlite3
from pathlib import Path

import kaggle
import numpy as np
import pandas as pd
from IPython.display import display

pd.options.display.max_rows = 200

In [2]:
# Authenticate the Kaggle client before any dataset call is made.
# NOTE(review): presumably this reads credentials from the local Kaggle
# config (kaggle.json or environment variables) — confirm before sharing.
kaggle.api.authenticate()

In [3]:
# Directory (relative to this notebook) that receives the extracted dataset.
# Keep the trailing slash: the database path is built further down by plain
# string concatenation.
path = "../../../data/sql_data/"

In [4]:
# Skip the download when the extracted database is already on disk, so that
# re-running the notebook does not fetch and unzip the archive again.
# Delete the file to force a fresh download.
if not (Path(path) / "database.sqlite").exists():
    kaggle.api.dataset_download_files(
        'hugomathien/soccer',
        path=path,
        unzip=True
    )

In [5]:
ls ../../../data/sql_data

database.sqlite


In [6]:
# Full path of the SQLite file inside the download directory.
database = f"{path}database.sqlite"

In [7]:
def create_connection(db_file):
    """Open a connection to the SQLite database stored at ``db_file``.

    Parameters
    ----------
    db_file : str or path-like
        Location of the SQLite file (``":memory:"`` also works).

    Returns
    -------
    sqlite3.Connection or None
        The open connection, or ``None`` when SQLite cannot open the
        database; in that case the error is printed rather than raised.
    """
    try:
        return sqlite3.connect(db_file)
    except sqlite3.Error as e:
        # Catch the sqlite3 exception class explicitly: a bare `Error` is not
        # defined in this notebook, so a failed connect would surface as a
        # NameError instead of being reported here.
        print(e)
        return None

In [8]:
# Notebook-wide connection reused by the helper functions and queries below.
conn = create_connection(database)

In [9]:
def make_metadata_df(con):
    """List every column of every table in the database as a tidy frame.

    Each table name is printed as it is visited.

    Parameters
    ----------
    con : sqlite3.Connection
        Open connection to the database to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per (table, column) pair with the columns ``table_name``
        and ``col``. When the database has no tables the frame is empty but
        still carries both columns.
    """
    # Use the connection that was passed in, not the notebook-level global.
    cur = con.cursor()
    try:
        cur.execute("SELECT name FROM sqlite_master WHERE type='table';")
        # Materialise the names first so no cursor is left half-iterated
        # while the per-table queries run.
        table_names = [name for (name,) in cur.fetchall()]
    finally:
        cur.close()

    res = []
    for table_name in table_names:
        print(table_name)
        # Quote the identifier so names with spaces or keywords still work.
        quoted = '"' + table_name.replace('"', '""') + '"'
        col_cur = con.cursor()
        try:
            col_cur.execute(f"SELECT * from {quoted} LIMIT 1")
            # description holds one entry per result column; item 0 is its name.
            res.extend(
                dict(table_name=table_name, col=desc[0])
                for desc in col_cur.description
            )
        finally:
            col_cur.close()

    return pd.DataFrame(res, columns=["table_name", "col"])

In [10]:
# Catalogue of (table_name, col) pairs for the whole database; the table
# names are printed while the catalogue is built.
meta_df = make_metadata_df(conn)

sqlite_sequence
Player_Attributes
Player
Match
League
Country
Team
Team_Attributes


In [11]:
# Column names of the Match table, taken from the metadata catalogue.
meta_df.query("table_name == 'Match'")["col"].unique()

array(['id', 'country_id', 'league_id', 'season', 'stage', 'date',
       'match_api_id', 'home_team_api_id', 'away_team_api_id',
       'home_team_goal', 'away_team_goal', 'home_player_X1',
       'home_player_X2', 'home_player_X3', 'home_player_X4',
       'home_player_X5', 'home_player_X6', 'home_player_X7',
       'home_player_X8', 'home_player_X9', 'home_player_X10',
       'home_player_X11', 'away_player_X1', 'away_player_X2',
       'away_player_X3', 'away_player_X4', 'away_player_X5',
       'away_player_X6', 'away_player_X7', 'away_player_X8',
       'away_player_X9', 'away_player_X10', 'away_player_X11',
       'home_player_Y1', 'home_player_Y2', 'home_player_Y3',
       'home_player_Y4', 'home_player_Y5', 'home_player_Y6',
       'home_player_Y7', 'home_player_Y8', 'home_player_Y9',
       'home_player_Y10', 'home_player_Y11', 'away_player_Y1',
       'away_player_Y2', 'away_player_Y3', 'away_player_Y4',
       'away_player_Y5', 'away_player_Y6', 'away_player_Y7',
       'aw

In [12]:
# Distinct table names recorded in the metadata catalogue.
meta_df["table_name"].unique()

array(['sqlite_sequence', 'Player_Attributes', 'Player', 'Match',
       'League', 'Country', 'Team', 'Team_Attributes'], dtype=object)

In [13]:
def peek_sql_res(statement, con=conn):
    """Run a SQL query and show a quick preview of its result.

    Prints the shape of the full result set and displays its first 10 rows.

    Parameters
    ----------
    statement : str
        SQL query to execute.
    con : connection, optional
        Connection to run the query on; defaults to the notebook-level
        ``conn`` as it exists when this function is defined.

    Returns
    -------
    None
    """
    # Pass the caller's connection through instead of always using the
    # global `conn`, so an explicitly supplied `con` is honoured.
    res_df = pd.read_sql(statement, con=con)
    print(f"shape: {res_df.shape}")
    display(res_df.head(10))

## CASE

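#### Retrieve information about matches played between Barcelona (id = 8634) and Real Madrid (id = 8633)

As a warm-up, the first query labels the outcome of every match Barcelona played away from home; the two queries after it narrow the results to Barcelona vs. Real Madrid fixtures.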

In [14]:
# Barcelona's away matches: the WHERE clause keeps rows where Barcelona
# (team_api_id 8634) is the away side, the join to Team supplies the home
# team's name as `opponent`, and the CASE reads the score from the away
# side's point of view (fewer home goals than away goals = 'Barcelona win').
statement = \
"""
SELECT  
    m.date,
    t.team_long_name AS opponent,
    CASE WHEN m.home_team_goal < m.away_team_goal THEN 'Barcelona win'
        WHEN m.home_team_goal > m.away_team_goal THEN 'Barcelona loss' 
        ELSE 'Tie' END AS outcome
FROM Match AS m
-- Join team to match
LEFT JOIN Team AS t 
ON m.home_team_api_id = t.team_api_id
WHERE m.away_team_api_id = 8634;
"""

In [15]:
# pd.read_sql("select * from Match limit 2", con=conn).T

In [16]:
# Print the result shape and preview the first rows of the away-match query.
peek_sql_res(statement)

shape: (152, 3)


Unnamed: 0,date,opponent,outcome
0,2008-08-31 00:00:00,CD Numancia,Barcelona loss
1,2008-11-16 00:00:00,RC Recreativo,Barcelona win
2,2008-11-29 00:00:00,Sevilla FC,Barcelona win
3,2008-12-21 00:00:00,Villarreal CF,Barcelona win
4,2009-01-11 00:00:00,CA Osasuna,Barcelona win
5,2009-02-01 00:00:00,Racing Santander,Barcelona win
6,2009-02-14 00:00:00,Real Betis Balompié,Tie
7,2009-03-01 00:00:00,Atlético Madrid,Barcelona loss
8,2009-03-15 00:00:00,UD Almería,Barcelona win
9,2009-04-04 00:00:00,Real Valladolid,Barcelona win


In [17]:
# Barcelona (8634) vs Real Madrid (8633) fixtures in either home/away order.
# The WHERE clause admits only these two teams, so in the HOME/AWAY CASEs
# "not Barcelona" can be labelled 'Madrid'. OUTCOME is expressed relative to
# the venue (home team win / away team win / tie).
statement = \
"""
SELECT
    date,
    CASE WHEN m.home_team_api_id = 8634 THEN 'Barcelona' ELSE 'Madrid' END AS HOME,
    CASE WHEN m.away_team_api_id = 8634 THEN 'Barcelona' ELSE 'Madrid' END AS AWAY,
    CASE WHEN m.home_team_goal > m.away_team_goal THEN 'Home team win'
         WHEN m.away_team_goal > m.home_team_goal THEN 'Away team win'
         ELSE 'tie'
         END
    AS OUTCOME
FROM Match as m
WHERE (m.home_team_api_id = 8634 AND m.away_team_api_id = 8633)
OR
(m.home_team_api_id = 8633 AND m.away_team_api_id = 8634)
"""

In [18]:
# Print the result shape and preview the venue-relative outcomes.
peek_sql_res(statement)

shape: (16, 4)


Unnamed: 0,date,HOME,AWAY,OUTCOME
0,2008-12-13 00:00:00,Barcelona,Madrid,Home team win
1,2009-05-02 00:00:00,Madrid,Barcelona,Away team win
2,2009-11-29 00:00:00,Barcelona,Madrid,Home team win
3,2010-04-10 00:00:00,Madrid,Barcelona,Away team win
4,2010-11-29 00:00:00,Barcelona,Madrid,Home team win
5,2011-04-16 00:00:00,Madrid,Barcelona,tie
6,2011-12-10 00:00:00,Madrid,Barcelona,Away team win
7,2012-04-21 00:00:00,Barcelona,Madrid,Away team win
8,2013-03-02 00:00:00,Madrid,Barcelona,Home team win
9,2012-10-07 00:00:00,Barcelona,Madrid,tie


In [19]:
# Barcelona (8634) vs Real Madrid (8633) fixtures again, but OUTCOME now names
# the winning club: each WHEN pairs the side Barcelona played on (home or
# away) with the goal comparison, and any remaining row — equal scores —
# falls through to 'tie'.
statement = \
"""
SELECT
    date,
    CASE WHEN m.home_team_api_id = 8634 THEN 'Barcelona' ELSE 'Madrid' END AS HOME,
    CASE WHEN m.away_team_api_id = 8634 THEN 'Barcelona' ELSE 'Madrid' END AS AWAY,
    CASE WHEN m.home_team_api_id = 8634 AND m.home_team_goal > m.away_team_goal THEN 'Barcelona win'
         WHEN m.home_team_api_id = 8634 AND m.home_team_goal < m.away_team_goal THEN 'Madrid win'
         WHEN m.away_team_api_id = 8634 AND m.home_team_goal > m.away_team_goal THEN 'Madrid win'
         WHEN m.away_team_api_id = 8634 AND m.home_team_goal < m.away_team_goal THEN 'Barcelona win'
         ELSE 'tie'
         END
         AS OUTCOME
    
FROM Match as m
WHERE (m.home_team_api_id = 8634 AND m.away_team_api_id = 8633)
OR
(m.home_team_api_id = 8633 AND m.away_team_api_id = 8634)
"""

In [20]:
# Print the result shape and preview the club-named outcomes.
peek_sql_res(statement)

shape: (16, 4)


Unnamed: 0,date,HOME,AWAY,OUTCOME
0,2008-12-13 00:00:00,Barcelona,Madrid,Barcelona win
1,2009-05-02 00:00:00,Madrid,Barcelona,Barcelona win
2,2009-11-29 00:00:00,Barcelona,Madrid,Barcelona win
3,2010-04-10 00:00:00,Madrid,Barcelona,Barcelona win
4,2010-11-29 00:00:00,Barcelona,Madrid,Barcelona win
5,2011-04-16 00:00:00,Madrid,Barcelona,tie
6,2011-12-10 00:00:00,Madrid,Barcelona,Barcelona win
7,2012-04-21 00:00:00,Barcelona,Madrid,Madrid win
8,2013-03-02 00:00:00,Madrid,Barcelona,Madrid win
9,2012-10-07 00:00:00,Barcelona,Madrid,tie
