I will be working with the Northwind database, a popular sample database that is often used for data-warehouse (DWH) examples.

In [30]:
import pandas as pd

# Load the six Northwind CSV exports, one DataFrame per file.
# Order matters: df1..df6 map to the table names below, top to bottom.
df1, df2, df3, df4, df5, df6 = (
    pd.read_csv(f'./northwind/{table}.csv')
    for table in (
        'products',       # df1
        'orders',         # df2
        'order_details',  # df3
        'customers',      # df4
        'categories',     # df5
        'employees',      # df6
    )
)


In [31]:
from sqlalchemy import create_engine

# Create SQLite in-memory database.
# Nothing is written to disk, so the data has to be reloaded from the CSVs
# after every kernel restart.
engine = create_engine('sqlite:///:memory:')

In [32]:
# Add the CSVs to SQLite database as tables (DataFrame index is not stored).
# if_exists='replace' makes this cell safe to re-run: the default ('fail')
# raises ValueError as soon as a table from a previous run is still present.
# Replacing drops the old table first, so any column added to it afterwards
# with ALTER TABLE has to be re-created after re-running this cell.
df1.to_sql('products', engine, index=False, if_exists='replace')
df2.to_sql('orders', engine, index=False, if_exists='replace')
df3.to_sql('order_details', engine, index=False, if_exists='replace')
df4.to_sql('customers', engine, index=False, if_exists='replace')
df5.to_sql('categories', engine, index=False, if_exists='replace')
# Last expression of the cell: its return value is what the notebook displays.
df6.to_sql('employees', engine, index=False, if_exists='replace')

9

Describe the tables.

In [4]:
# Inspect the customers table: table_info yields one row per column
# (cid, name, type, notnull, dflt_value, pk).
query = """
PRAGMA table_info([customers]);
"""

result = pd.read_sql(sql=query, con=engine)
print(result)

    cid          name  type  notnull dflt_value  pk
0     0    customerID  TEXT        0       None   0
1     1   companyName  TEXT        0       None   0
2     2   contactName  TEXT        0       None   0
3     3  contactTitle  TEXT        0       None   0
4     4       address  TEXT        0       None   0
5     5          city  TEXT        0       None   0
6     6        region  TEXT        0       None   0
7     7    postalCode  TEXT        0       None   0
8     8       country  TEXT        0       None   0
9     9         phone  TEXT        0       None   0
10   10           fax  TEXT        0       None   0


In [5]:
# Inspect the products table: table_info yields one row per column
# (cid, name, type, notnull, dflt_value, pk).
query = """
PRAGMA table_info([products]);
"""

result = pd.read_sql(sql=query, con=engine)
print(result)

   cid             name    type  notnull dflt_value  pk
0    0        productID  BIGINT        0       None   0
1    1      productName    TEXT        0       None   0
2    2       supplierID  BIGINT        0       None   0
3    3       categoryID  BIGINT        0       None   0
4    4  quantityPerUnit    TEXT        0       None   0
5    5        unitPrice   FLOAT        0       None   0
6    6     unitsInStock  BIGINT        0       None   0
7    7     unitsOnOrder  BIGINT        0       None   0
8    8     reorderLevel  BIGINT        0       None   0
9    9     discontinued  BIGINT        0       None   0


In [7]:
# Inspect the orders table: table_info yields one row per column
# (cid, name, type, notnull, dflt_value, pk).
query = """
PRAGMA table_info([orders]);
"""

result = pd.read_sql(sql=query, con=engine)
print(result)

    cid            name    type  notnull dflt_value  pk
0     0         orderID  BIGINT        0       None   0
1     1      customerID    TEXT        0       None   0
2     2      employeeID  BIGINT        0       None   0
3     3       orderDate    TEXT        0       None   0
4     4    requiredDate    TEXT        0       None   0
5     5     shippedDate    TEXT        0       None   0
6     6         shipVia  BIGINT        0       None   0
7     7         freight   FLOAT        0       None   0
8     8        shipName    TEXT        0       None   0
9     9     shipAddress    TEXT        0       None   0
10   10        shipCity    TEXT        0       None   0
11   11      shipRegion    TEXT        0       None   0
12   12  shipPostalCode    TEXT        0       None   0
13   13     shipCountry    TEXT        0       None   0


In [8]:
# Inspect the order_details table: table_info yields one row per column
# (cid, name, type, notnull, dflt_value, pk).
query = """
PRAGMA table_info([order_details]);
"""

result = pd.read_sql(sql=query, con=engine)
print(result)

   cid       name    type  notnull dflt_value  pk
0    0    orderID  BIGINT        0       None   0
1    1  productID  BIGINT        0       None   0
2    2  unitPrice   FLOAT        0       None   0
3    3   quantity  BIGINT        0       None   0
4    4   discount   FLOAT        0       None   0


In [13]:
# Inspect the categories table: table_info yields one row per column
# (cid, name, type, notnull, dflt_value, pk).
query = """
PRAGMA table_info([categories]);
"""

result = pd.read_sql(sql=query, con=engine)
print(result)

   cid          name    type  notnull dflt_value  pk
0    0    categoryID  BIGINT        0       None   0
1    1  categoryName    TEXT        0       None   0
2    2   description    TEXT        0       None   0
3    3       picture    TEXT        0       None   0


In [14]:
# Inspect the employees table: table_info yields one row per column
# (cid, name, type, notnull, dflt_value, pk).
query = """
PRAGMA table_info([employees]);
"""

result = pd.read_sql(sql=query, con=engine)
print(result)

    cid             name    type  notnull dflt_value  pk
0     0       employeeID  BIGINT        0       None   0
1     1         lastName    TEXT        0       None   0
2     2        firstName    TEXT        0       None   0
3     3            title    TEXT        0       None   0
4     4  titleOfCourtesy    TEXT        0       None   0
5     5        birthDate    TEXT        0       None   0
6     6         hireDate    TEXT        0       None   0
7     7          address    TEXT        0       None   0
8     8             city    TEXT        0       None   0
9     9           region    TEXT        0       None   0
10   10       postalCode    TEXT        0       None   0
11   11          country    TEXT        0       None   0
12   12        homePhone    TEXT        0       None   0
13   13        extension  BIGINT        0       None   0
14   14            photo    TEXT        0       None   0
15   15            notes    TEXT        0       None   0
16   16        reportsTo   FLOAT        0       None   0

--- QUESTIONS PART ---

Q1. Find the top 5 employees who have taken the most orders.

First, I will alter the `employees` table to add another column named 'fullName'.

In [33]:
from sqlalchemy import text

# Add a generated column that joins first and last name with a space, so
# later queries can select one display name instead of concatenating each time.
# The column name is written as a plain identifier: the original single-quoted
# form is a string literal that SQLite only tolerates as a compatibility quirk.
# This statement is not idempotent: running it twice on the same table fails
# because the column already exists.
query = """
    ALTER TABLE employees
    ADD COLUMN fullName GENERATED ALWAYS AS (firstName || ' ' || lastName);
"""

# Engine.execute() is legacy in SQLAlchemy 1.4 and was removed in 2.0.
# engine.begin() works on both: it opens a connection, runs the block inside a
# transaction, commits on success and rolls back if an exception is raised.
with engine.begin() as connection:
    connection.execute(text(query))

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x1e562e1f3d0>

Now, I write the answer.

In [34]:
# Count the orders handled by each employee and keep the five highest counts.
# e.fullName is the generated column added to employees by the ALTER TABLE cell,
# so that cell must have been run first.
query = """
    SELECT e.fullName AS 'Full Name', COUNT(o.orderId) AS 'Orders'
    FROM employees e
    JOIN orders o
    ON o.employeeId = e.employeeId
    GROUP BY e.employeeId
    ORDER BY [Orders] DESC
    LIMIT 5;
"""

result = pd.read_sql(sql=query, con=engine)
print(result)

          Full Name  Orders
0  Margaret Peacock     156
1   Janet Leverling     127
2     Nancy Davolio     123
3    Laura Callahan     104
4     Andrew Fuller      96


Q2. Find the customers with the least varied orders (fewest distinct product categories).

In [47]:
# For every customer, walk customers -> orders -> order_details -> products and
# count the distinct product categories they have ordered; the five smallest
# counts are returned.
query = """
    SELECT c.companyName, COUNT(DISTINCT p.categoryId) AS 'Categories'
    FROM customers c
    JOIN orders o 
        ON o.customerId = c.customerId
    JOIN order_details od 
        ON od.orderId = o.orderId
    JOIN products p
        ON p.productId = od.productId
    GROUP BY c.customerId
    ORDER BY [Categories] ASC
    LIMIT 5; 
"""

result = pd.read_sql(sql=query, con=engine)
print(result)

                  companyName  Categories
0  Centro comercial Moctezuma           2
1        Lazy K Kountry Store           2
2                 North/South           3
3   Vins et alcools Chevalier           3
4   Bólido Comidas preparadas           4


Closing the engine.

In [29]:
# Release the engine's connections now that all queries have run.
# NOTE(review): the database is in-memory, so the loaded tables presumably do
# not survive this call; keep it as the last cell of the notebook. Confirm.
engine.dispose()