In [1]:
from sqlalchemy import create_engine, inspect, text, Engine
import pandas as pd
import numpy as np

In [2]:
# Connection URL for the local `country_club` PostgreSQL database; the `host`
# query parameter points at a directory path rather than a network host.
# Optional create_engine settings kept for debugging: echo=True, isolation_level="AUTOCOMMIT".
DB_URL = "postgresql+psycopg2:///country_club?host=/var/run/postgresql"
engine = create_engine(DB_URL)

# Names of the tables reported by the inspector; the preview loop further down iterates over them.
table_names = inspect(engine).get_table_names()

In [3]:
# Show the reflected column metadata (name, type, nullability, default, ...) of the `bookings` table.
inspect(engine).get_columns("bookings")

[{'name': 'bookid',
  'type': SMALLINT(),
  'nullable': False,
  'default': None,
  'autoincrement': False,
  'comment': None},
 {'name': 'facid',
  'type': SMALLINT(),
  'nullable': True,
  'default': None,
  'autoincrement': False,
  'comment': None},
 {'name': 'memid',
  'type': SMALLINT(),
  'nullable': True,
  'default': None,
  'autoincrement': False,
  'comment': None},
 {'name': 'starttime',
  'type': TIMESTAMP(),
  'nullable': True,
  'default': None,
  'autoincrement': False,
  'comment': None},
 {'name': 'slots',
  'type': SMALLINT(),
  'nullable': True,
  'default': None,
  'autoincrement': False,
  'comment': None}]

In [4]:
def get_results_frame(query, engine: Engine, **query_vars):
    """Execute a SQL statement and return its rows as a pandas DataFrame.

    Args:
        query: SQL text; it is wrapped in ``text()`` before execution.
        engine: SQLAlchemy engine used to open a connection for this call.
        **query_vars: Keyword arguments forwarded to ``execute`` as the
            statement's parameter dictionary.

    Returns:
        A DataFrame holding every fetched row, with columns named after the
        keys of the result set.
    """
    statement = text(query)
    with engine.connect() as connection:
        cursor = connection.execute(statement, query_vars)
        # Materialise the rows while the connection is still open.
        rows = cursor.fetchall()
        frame = pd.DataFrame(rows, columns=cursor.keys())
    return frame

# Preview the first row of every table, transposed so that each column is printed on its own line.
for table_name in table_names:
    first_row = get_results_frame(f"SELECT * FROM {table_name} LIMIT 1;", engine)
    print(table_name, first_row.T)

bookings                              0
bookid                       0
facid                        3
memid                        1
starttime  2012-07-03 11:00:00
slots                        2
members                                  0
memid                            0
surname                      GUEST
firstname                    GUEST
address                      GUEST
zipcode                          0
telephone           (000) 000-0000
recommendedby                 None
joindate       2012-07-01 00:00:00
facilities                                  0
facid                            0
name                Tennis Court 1
membercost                     5.0
guestcost                     25.0
initialoutlay                10000
monthlymaintenance             200
expense_label                 None


# Notes on Syntax

In general, SQL queries are whitespace indifferent and end in a `;`. Commands and table names are case-insensitive, though by convention the former are typed in all caps.

Comments are indicated with `--` or `/*` ... `*/`. The latter can encompass multiple lines.

String expressions are enclosed in single quotes, and column or table names are enclosed in double quotes when necessary (e.g. when they contain spaces). For example:
```sql
SELECT * FROM "this is a table" WHERE "this is a column" = 'a string';
```

In some dialects, including SQLite and PostgreSQL, strings are concatenated with `||`, while in others like MySQL, this is done with the `CONCAT()` function:
```sql
'this' || ' and that'
CONCAT('this', ' and that')
```

When there is no ambiguity, columns can be referred to by name alone. If there are multiple tables referenced in a query with shared column names, they can be distinguished with `tbl.col`.

Selected values and queried tables can be aliased with `AS`. The alias can then be used in elements of the query that are processed later (see below).

# Basic SELECT queries

## Elements and their order of operations

A `SELECT` statement has the following basic format:

```sql
SELECT
    [Columns and expressions separated by commas]
FROM
    [Table expressions and joins]
[Options]
```

Options include:
- Filtering rows: `WHERE [condition]`
- Grouping results: `GROUP BY [column or columns, separated by commas]`. To filter on properties of groups, follow this `GROUP BY` expression with `HAVING [condition]`.
- Ordering results: `ORDER BY [column or columns, separated by commas]`. By default, sorting is ascending; to sort in descending order, add `DESC` after the column name.
- Limiting results: `LIMIT [n]`.

The order of processing is:
1. Sources: the `FROM` clause, including subqueries and joins
1. Filters: the `WHERE` clause
1. The `GROUP BY` clause and its filters
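1. Selection: querying column values or evaluating functions, as well as aliases. Note that this means that column aliases cannot be referred to in `WHERE` or `HAVING` filters (standard SQL also disallows them in `GROUP BY`, although PostgreSQL accepts them there as an extension).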
1. Set operations (see below)
1. The `ORDER BY` clause
1. The `LIMIT` clause

## Functions on column values

The expressions in a select statement or filter can be either column names or functions. These include basic arithmetic (`+`, `-`, `*`, `/`; NB: if both values are integers, `/` performs integer division).

To round values for display, use `ROUND(value, places)`. The number of `places` defaults to 0, i.e. whole numbers. Negative values will round to the nearest 10, 100, etc.

The components of date and time types can be accessed with `EXTRACT(COMPONENT FROM dtval)`. Valid `COMPONENT`s  include `YEAR`, `MONTH`, `DAY`, `HOUR`, etc.

Values can be cast to different types with the `::` operator. This will attempt to parse values from strings, e.g. date or time.

## Filtering results

As noted above, filtering can be accomplished in two ways:
1. `WHERE` clauses, which filter on a row-by-row basis
1. `GROUP BY ... HAVING` clauses, which filter on values evaluated on groups

Certain filters can be accomplished using one or the other, for example choosing only certain groups.

Conditional expressions include:
- Boolean operators: `AND`, `OR`, `NOT`. The last can be combined with many of the keywords below: `NOT IN`, `NOT LIKE`, etc.
- Standard mathematical comparison operators: `=`, `<>`, `<`, `<=`, `>`, `>=`. Note that the second, `<>`, means "not equal" (PostgreSQL also accepts `!=`).
- Group comparison: these have syntax `[column expression] [operator(s)] ()` where the `()` contains a set, which can be either a literal list, e.g. `(1, 2, 3)` or a `SELECT` subquery.
    - `IN`: returns true if the value is found in the list.
    - `[comparison operator] ANY`: returns true if the operator returns true for the expression and any item in the list. So `= ANY` is equivalent to `IN`. See the examples in the next cell.
    - `[comparison operator] ALL`: returns true if the operator returns true for the expression and all items in the list. See the example in the cell after next for one use case: finding rows with the maximum values in a column.
- String comparison: the comparison operators work for strict equivalence or alphabetical order. It is also possible to pattern match with the `LIKE` or `ILIKE` operators. In pattern strings, `%` represents a free wildcard, so `LIKE 'a%'` will match `'a'`, `'aa'`, `'aba'`, etc. The `_` wildcard matches exactly one character. In PostgreSQL, `LIKE` is case sensitive, while `ILIKE` is not. In SQLite, `LIKE` is case insensitive and the latter keyword does not exist.
- Ranges of values: `BETWEEN [lower] AND [upper]`. This range includes the lower and upper bounds.
- Missing values can be captured or excluded with `IS NULL` or `IS NOT NULL`.

In [5]:
# Three equivalent ways of listing the facilities whose monthly maintenance exceeds 200.
# Every query ends with the same ORDER BY so the frames can be compared row by row:
# without it, SQL makes no promise about the order in which rows come back.

# Subquery filter written with `= ANY`.
query_any = """
SELECT
    name
FROM
    facilities
WHERE facid = ANY (SELECT facid FROM facilities WHERE monthlymaintenance > 200)
ORDER BY facid;
"""

# Direct row filter, used as the reference result.
query_any2 = """
SELECT
    name
FROM
    facilities
WHERE monthlymaintenance > 200
ORDER BY facid;
"""

# Subquery filter written with `IN`.
query_in = """
SELECT
    name
FROM
    facilities
WHERE facid IN (SELECT facid FROM facilities WHERE monthlymaintenance > 200)
ORDER BY facid;
"""

names_any = get_results_frame(query_any, engine)
names_direct = get_results_frame(query_any2, engine)
names_in = get_results_frame(query_in, engine)

# DataFrame.equals returns a plain bool and, unlike an element-wise `==`,
# does not raise when the two frames differ in shape.
print(names_any.equals(names_direct))
print(names_direct.equals(names_in))

True
True


In [13]:
# This query returns the names of records with the maximum value in the monthlymaintenance column.
# `>= ALL (subquery)` keeps a row only when its value is at least as large as every value
# returned by the subquery, so all rows tied for the maximum are kept.
# NOTE(review): if monthlymaintenance can contain NULL, the comparison evaluates to NULL
# for the maximum rows and nothing is returned — confirm the column has no NULLs.
query_all = """
SELECT
    name,
    monthlymaintenance
FROM
    facilities
WHERE monthlymaintenance >= ALL (SELECT monthlymaintenance FROM facilities);
"""

get_results_frame(query_all, engine)

Unnamed: 0,name,monthlymaintenance
0,Massage Room 1,3000
1,Massage Room 2,3000


# Combining Table and Queries

## JOIN statements

A join extends one table horizontally by matching records on some key. This can be one-to-one, which simply links records. One-to-many or many-to-one joins link details about some entity that appears multiple times in a table. A `JOIN` statement identifies the type of join and the key with which to link records:
```sql
FROM tbl1
    JOIN tbl2
    ON tbl1.col1 = tbl2.col2
```
To match rows based on the values of multiple columns, combine the equivalences with `AND`. Strictly, this can be any logical expression, including transformations of columns. If the key has the same name in both tables, the `ON` clause can be replaced with `USING (key)`.

The types of joins define how matches between the tables are handled:
- `INNER JOIN`, also the default, returns only records for which matching keys are found in both tables
- `LEFT JOIN` or `RIGHT JOIN` returns all records in the first or second table (respectively). For records without matches in the other table, the columns from the other table will be `NULL`.
- `FULL JOIN` includes all rows from both tables.
- `CROSS JOIN` is a distinct type, which instead of using a key, simply returns every possible combination of rows from the two tables (i.e., with $n$ rows in one table and $m$ rows in the other, the joined table will have $n \times m$ rows).

Repeated `JOIN` statements are evaluated sequentially. It can be helpful to alias tables to make references to columns more concise.

## Self Joins

A special case of joins is when one column in a table links the record to other rows in the same table. A self join unrolls the linked information. This uses the same syntax as any other join, except that the same table name is referred to twice. However, in this case, aliasing is required in order to refer to the two different roles of the same table.

## Set operations to concatenate query results

Tables can also be combined vertically if the number and type of columns match.
```sql
SELECT ...
UNION
SELECT ...
[display options]
```
Each component `SELECT` has its own filtering options. `ORDER BY` and `LIMIT` at the end apply to the combined results. Column names are determined by the *first* `SELECT` statement.

The set operators are:
- `UNION`: combine all rows, once for each distinct record. `UNION ALL` includes duplicates.
- `INTERSECT`: include only matching rows.
- `EXCEPT`: includes only rows that appear in the first query but not in the second.

## Subqueries

An additional way to combine values in different tables is through `SELECT` statements inside other clauses of a query.

For example, filtering rows in one table based on values in another table can be accomplished with an inner join:
```sql
SELECT
    tbl1.col
FROM tbl1
    INNER JOIN tbl2
    ON tbl1.key = tbl2.id;
```

Or by a subquery filter:
```sql
SELECT
    tbl1.col
FROM tbl1
WHERE tbl1.key in (
    SELECT id FROM tbl2
);
```

Often complex subqueries will be less efficient because they require a search to be done for each row separately, whereas a join creates a temporary table once.

## Views and common table expressions

If a particular query is going to be referred to frequently, it can be efficient to save it.

Within a single query, a subquery can be defined in advance with a **common table expression (CTE)**.
```sql
WITH cte_name AS (SELECT ...)
SELECT ... FROM cte_name ...;
```
Multiple CTE definitions can be joined by commas.

Queries can also be saved for later reference as **views**:
```sql
CREATE VIEW view_name AS SELECT ...;
```
The view can subsequently be referred to like a normal table.

## Exporting query results to csv

Data can be exported to files using the `COPY` command or `\copy` meta-command. The syntax is the same, but the former is strictly speaking a server command, while the latter is executed by the client. In some cases, users only have permission for the latter, because `COPY ... TO 'file'` writes the file on the database server and requires server-side file privileges.

The `...` below represents what data is to be exported. This can be a table name, a table name with selected columns indicated in parentheses, or a query (also enclosed in parentheses).
```sql
COPY ... 
    TO 'file path' 
    WITH (
        FORMAT csv, 
        HEADER true
    );
```

# Aggregation

## Example functions

SQL includes various functions that serve to produce aggregate values for groups of records. These can be used as columns for a summary query or with window functions to combine row-level and aggregate values.
- `AVG(col)`, `MODE() WITHIN GROUP (ORDER BY col)`
- `SUM(col)`
- `COUNT([DISTINCT] col)`. Use `*` to give a count of records.
- `MIN(col)`, `MAX(col)`
- `CORR(x, y)`. Note that there are various other measures of distribution and association.


## Summary functions for all results or for groups

Summary functions included in a simple `SELECT` statement will return the summary for all records or by groups if these are defined by a `GROUP BY`.

Note that non-summary-function elements can be included in the same `SELECT` clause only if these are the bases for group formation. Note that in the query below, there is a one-for-one relationship between `memid` and name, but the engine still requires they all be included as groups. (It is possible that keys are not properly configured here.)

In [10]:
# Number of members per value of `recommendedby` (the recommender's id), largest first.
# This query is only defined here; the cell executes group_query2 below.
group_query1 = """
    SELECT 
        recommendedby,
        COUNT(*) as "Members recommended"
    FROM members
    GROUP BY recommendedby
    ORDER BY "Members recommended" DESC;
"""

# Same count, with the recommender's name resolved through a self join of `members`
# (aliased once as the recommended member and once as the recommender).
# The name columns are listed in GROUP BY because they appear in SELECT outside an aggregate.
# The HAVING clause filters on a grouping column; the inner join condition
# `recommended.recommendedby = recommender.memid` can never match a NULL recommendedby,
# so the HAVING filter should not remove any further groups.
group_query2 = """
    SELECT 
        recommender.firstname || ' ' || recommender.surname as "Name",
        COUNT(*) as "Members recommended"
    FROM members as recommended
        JOIN members as recommender
        ON recommended.recommendedby = recommender.memid
    GROUP BY recommended.recommendedby, recommender.firstname, recommender.surname
    HAVING recommended.recommendedby IS NOT NULL
    ORDER BY "Members recommended" DESC;
"""

get_results_frame(group_query2, engine)

Unnamed: 0,Name,Members recommended
0,Darren Smith,5
1,Tracy Smith,3
2,Ponder Stibbons,2
3,Janice Joplette,2
4,Jemima Farrell,2
5,Florence Bader,1
6,Millicent Purview,1
7,Matthew Genting,1
8,Tim Rownam,1
9,Gerald Butters,1


## Window functions

Window functions using the `OVER ()` keyword allow rows to output values that depend on other rows in the table. Some of these operations can be performed using subqueries, but generally speaking window functions are more efficient and less verbose.

The parentheses following `OVER` contain keywords indicating the relevant window, if required.

### Functions based on order

#### Row numbers

A relatively simple example involves numbering results.

Arbitrary row numbers based on the order in which records are returned (i.e. before any *subsequent* `ORDER BY` clauses) can be generated with 
```sql 
ROW_NUMBER() OVER () AS row_number
```
To alter the order of rows before numbers are assigned, `ORDER BY` needs to be within the `OVER` clause:
```sql 
ROW_NUMBER() OVER (ORDER BY col) AS row_number
```
As with `ORDER BY` in general, this gives ascending order; use `DESC` to rank from largest to smallest.

#### Ranking

The `RANK` function is similar to `ROW_NUMBER` except that rows with equivalent values are given the same rank, with the rank incremented accordingly for the next distinct value.
```sql
RANK() OVER (ORDER BY col) AS col_rank
```

#### Comparing values across rows

The `LAG(col, n)` function returns the value in a column `col`, offset by `n` rows. Positive values refer to previous rows, negative values to future rows. This can be used to capture changing values:
```SQL
SELECT
    tdate,
    val <> LAG(val, 1) OVER (ORDER BY tdate) AS changed
FROM
    tbl;
```

The directional inverse of `LAG` is `LEAD`.

#### Percentiles

```sql
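PERCENTILE_CONT(0.1) WITHIN GROUP (ORDER BY col) AS tenth_percentile
```
This returns the value within `col` corresponding to the specified percentile, interpolating if necessary. `PERCENTILE_DISC` returns the specific value in the data that is the first at or beyond the specified percentile. In PostgreSQL these are ordered-set aggregates: they act as summary functions, take `WITHIN GROUP (ORDER BY ...)`, and cannot be combined with `OVER`; some other dialects additionally allow an `OVER` clause to use them as window functions.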

Ranking example:

In [None]:
# Rank facilities by member cost, most expensive first.
# RANK() gives tied rows the same rank and then skips ahead (1, 1, 3, ...).
rank_query1 = """
    SELECT 
        membercost, 
        RANK() OVER(ORDER BY membercost DESC) as member_cost_rank
    FROM facilities;
"""

get_results_frame(rank_query1, engine)

Unnamed: 0,membercost,member_cost_rank
0,9.9,1
1,9.9,1
2,5.0,3
3,5.0,3
4,3.5,5
5,0.0,6
6,0.0,6
7,0.0,6
8,0.0,6


Lag example:

In [None]:
# Daily booking counts for August, with a flag telling whether each day had more
# bookings than the previous day. The first day has no predecessor, so its flag is NULL.
#
# The subquery exposes `month` and `day` as columns so that they can be used in
# WHERE and GROUP BY. The window's ORDER BY only decides which row LAG() looks at;
# the final ORDER BY is what guarantees the order of the returned rows.
lag_query = """
    SELECT 
        day,
        COUNT(*) as number_of_bookings,
        COUNT(*) > LAG(COUNT(*), 1) OVER (ORDER BY day) as increased
    FROM ( SELECT
            *,
            EXTRACT(MONTH FROM starttime ) AS month,
            EXTRACT(DAY FROM starttime ) AS day 
        FROM bookings
    ) as daily
    WHERE month = 8
    GROUP BY day
    ORDER BY day;
"""

get_results_frame(lag_query, engine)

Unnamed: 0,day,number_of_bookings,increased
0,1,29,
1,2,43,True
2,3,39,False
3,4,37,False
4,5,33,False
5,6,42,True
6,7,35,False
7,8,49,True
8,9,36,False
9,10,51,True


### Grouping

Within window functions, groups are defined with `PARTITION BY`. For example, to output both the value of a row and the group and global averages (note the use of `OVER()`):
```sql
SELECT
    col,
    group,
    AVG(col) OVER () as global_average,
    AVG(col) OVER (PARTITION BY group) as group_average
FROM
    tbl;
```

In [23]:
# For each booking, show its slot count next to two window averages:
#   - `OVER()` with an empty window: the average over all joined booking rows;
#   - `OVER(PARTITION BY Bookings.facid)`: the average over bookings of the same facility.
# Window functions are evaluated before ORDER BY / LIMIT, so the averages describe the
# whole joined table even though only the 15 earliest bookings are returned.
group_avg_query = """
    SELECT
        name as "Facility Name",
        slots,
        ROUND(AVG(slots) OVER(), 1) as "Global Average Slots",
        ROUND(AVG(slots) OVER(PARTITION BY Bookings.facid), 1) as "Facility Average Slots"
    FROM
        Bookings
        JOIN Facilities USING(facid)
    ORDER BY starttime
    LIMIT 15;
"""

get_results_frame(group_avg_query, engine)

Unnamed: 0,Facility Name,slots,Global Average Slots,Facility Average Slots
0,Massage Room 1,2,2.3,2.2
1,Pool Table,1,2.3,1.1
2,Table Tennis,2,2.3,2.1
3,Pool Table,1,2.3,1.1
4,Squash Court,2,2.3,2.5
5,Snooker Table,2,2.3,2.0
6,Tennis Court 1,3,2.3,3.2
7,Pool Table,1,2.3,1.1
8,Squash Court,2,2.3,2.5
9,Massage Room 1,2,2.3,2.2


### Sliding windows

In addition to calculating on groups, window functions can be calculated on moving windows within the data. E.g., a 5-step, centered moving average can be calculated as:
```sql
SELECT
    AVG(col) OVER (ORDER BY seq ROWS BETWEEN 2 PRECEDING AND 2 FOLLOWING) AS rolling_average
FROM
    tbl
ORDER BY seq;
```

The row order used by the frame should be defined inside the window with `OVER (ORDER BY ...)`; the query-level `ORDER BY` only sorts the final output. Other range keywords include:
- `CURRENT ROW`
- `UNBOUNDED` in place of a number value to express every observation `PRECEDING` or `FOLLOWING`.

In [35]:
# This query calculates the cumulative sum of slots booked within each month,
# for one member (memid = 1) and one facility (facid = 1).
#
# The window carries its own ORDER BY: a ROWS frame counts "preceding" rows in the
# window's order, and without an ORDER BY inside OVER that order is unspecified, so the
# running total would only be right by accident. The query-level ORDER BY sorts the
# final output and has no influence on the window.
rolling_query = """
SELECT
    starttime,
    slots,
    SUM(slots) OVER (
        PARTITION BY EXTRACT(MONTH FROM starttime)
        ORDER BY starttime
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) as "Cumulative total for the month"
FROM
    Bookings
WHERE memid = 1 AND facid = 1
ORDER BY starttime;
"""

get_results_frame(rolling_query, engine)

Unnamed: 0,starttime,slots,Cumulative total for the month
0,2012-07-08 15:00:00,3,3
1,2012-07-08 17:30:00,3,6
2,2012-07-09 19:00:00,3,9
3,2012-07-12 11:30:00,3,12
4,2012-07-13 15:00:00,3,15
5,2012-07-16 08:00:00,3,18
6,2012-07-17 10:00:00,3,21
7,2012-07-19 11:30:00,3,24
8,2012-07-21 09:30:00,3,27
9,2012-07-23 10:00:00,3,30
