# SQL-Python Integration

## Project Part 3 with SQL

This is incomplete, but illustrates the power of SQL.

Run this first to get the required data files. Though you may not know the methods used, you should be able to follow the code and get a sense of what it does.

In [None]:
from pathlib import Path
from urllib.request import urlretrieve

def download_if_missing(url, filename, data_dir=Path('data')):
    """
    Download a file from a URL unless it is already in the local data directory.

    Args:
        url: Source URL to download from
        filename: Name to save file as locally
        data_dir: Path (or string) of the data directory (default: Path('data'))

    Returns:
        Path: Path object pointing to the local file. The path is returned
        even when the download fails; in that case no file exists at that
        location and an error message has been printed.
    """

    # accept plain strings as well as Path objects
    data_dir = Path(data_dir)

    # make sure the data directory exists, creating missing parents too
    data_dir.mkdir(parents=True, exist_ok=True)

    # build a full Path object
    filepath = data_dir / filename

    if filepath.exists():
        print(f"{filepath} exists, skipping download.")
        return filepath

    # there is no file in the data directory called filename
    print(f"Downloading {filename} to {data_dir}/...")
    try:
        urlretrieve(url, filepath)
    except Exception as e:
        # best effort: report the problem instead of aborting the notebook
        print(f"Error downloading {filename}: {e}")
        # remove any partial file so the next call retries the download
        # rather than treating the fragment as a finished copy
        filepath.unlink(missing_ok=True)
    else:
        print("Download complete!")

    return filepath

# where the data comes from and where it is kept locally
BASE_URL = 'https://raw.githubusercontent.com/olearydj/INSY3010-Fall24/main/notebooks/data/'
DATA_DIR = Path('data')
CSV_FILE = '5k-sales.csv'
DB_FILE = '5k-sales.db'

# fetch the CSV unless a local copy is already present
csv_path = download_if_missing(f"{BASE_URL}{CSV_FILE}", CSV_FILE, DATA_DIR)

# remove any database left over from an earlier run so the table can be
# created from scratch; missing_ok avoids an error when there is none
db_path = DATA_DIR.joinpath(DB_FILE)
db_path.unlink(missing_ok=True)


---
Import libraries, including Pandas (alternative method for building a table).

In [None]:
# sqlite3 is included with python
import sqlite3
import os
import pandas as pd

---
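Create the connection and cursor, then build the table using **sqlite**. Next, populate it with data using the **Pandas** `read_csv` and `to_sql` methods. Because `to_sql` is called with `if_exists='replace'`, Pandas drops the table made by the `CREATE TABLE` statement and rebuilds it from the CSV, so the queries below use the CSV's column names (e.g. `Item_Type`, `Unit_Cost`).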

In [None]:
# open the database file and get a cursor for running statements
con = sqlite3.connect(db_path)
cur = con.cursor()

# assemble the CREATE TABLE statement from one definition per column
SQL = "CREATE TABLE sales(" + ",".join([
    "id INTEGER PRIMARY KEY",
    "region TEXT",
    "country TEXT",
    "type TEXT",
    "channel TEXT",
    "priority TEXT",
    "odate DATE",
    "sdate DATE",
    "usold INTEGER",
    "uprice FLOAT",
    "ucost FLOAT",
]) + ")"

# run the statement so the (empty) sales table exists
cur.execute(SQL)

# load the csv file into a Pandas DataFrame
data = pd.read_csv(csv_path)

# write the DataFrame to the sales table through the open connection;
# if_exists='replace' drops the table created above and rebuilds it from the
# DataFrame's own columns, and index=False leaves out the Pandas row index
data.to_sql(name='sales', con=con, if_exists='replace', index=False)

Check the outcome by executing a simple query and fetching all the results.

In [None]:
# run the query on the cursor, then collect every row of its result;
# fetchall hands the rows back as a list of tuples
cur.execute("SELECT * FROM sales LIMIT 1")
cur.fetchall()

---
### Display rows of data...

SQL is not very helpful here, as most of the work in the project was in getting valid input and printing it nicely.

In [None]:
res = cur.execute("SELECT * FROM sales LIMIT 5")
# each row of the result is a tuple; print only its first two fields,
# every field followed by two spaces, one row per line
for record in res:
    print(''.join(f"{field}  " for field in record[:2]))

Double-check the column names, which are stored in `res.description`.

In [None]:
# res still holds the result of the previous query; its description
# attribute has one entry per column of that result
print('Columns in sales table:')
for name, *_ in res.description:
    # the column name is the first item of each entry; sqlite3 pads the rest
    # of the 7-item tuple with None
    print(name)

---
### List unique values in a column...

```python
def get_col_data(col):
    """Return the value at index `col` from every row of `sample`, as a list."""
    return [record[col] for record in sample]

def get_unique_vals(data):
    """Return the distinct values in `data` as a sorted list.

    Args:
        data: Iterable of hashable, mutually comparable values

    Returns:
        list: Each distinct value once, in ascending order
    """
    # sorted() accepts any iterable and always returns a new list, so the
    # set does not need to be converted to a list first
    return sorted(set(data))
```

Here we begin to see benefits of SQL.

In [None]:
# let the database remove duplicates and sort: one row per distinct Region
res = cur.execute("SELECT DISTINCT Region from sales ORDER BY Region")
for (name,) in res:
    # every row is a one-item tuple, unpacked here into the region name
    print(name)

---
### Calculate average units per order for selected item type...  

```python
units = 0
num = 0
for order in data:
    if order[3] == item_name:
        units += order[8]
        num += 1
avg_units = units / num
print(f"\nThe average quantity for each {item_name} order is {avg_units:.0f} units.")
```

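SQL can achieve a similar result with a simple query. The query below reports the average unit cost (rather than the average units sold) for every item type at once.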

In [None]:
# average unit cost for every item type, one result row per type
SQL = "SELECT Item_Type, AVG(Unit_Cost) FROM sales GROUP BY Item_Type"
res = cur.execute(SQL)
for item_type, avg_cost in res:
    # item type padded or cut to exactly 17 characters, then the average
    # right-aligned in 6 characters with 2 decimal places
    print(f"{item_type:17.17}$ {avg_cost:>6.2f}")

---
### Identify the most profitable order for each country in selected region...

```python
# build dictionary of most profitable orders for country in reg
c_data = {}
for row, order in enumerate(sample):
    if order[1] == region:
        country = order[2]
        units = order[8]
        u_price = order[9]
        u_cost = order[10]
        profit = units * (u_price - u_cost)
        # for each country, store highest profit order and row num
        if country in c_data:
            if c_data[country][1] < profit:
                c_data[country] = [row, profit]
        else:
            c_data[country] = [row, profit]

print([col_names[0]] + col_names[2:5])
for country, value in c_data.items():
    row = sample[value[0]]
    print([row[0]] + row[2:5])
```

This requires a sub-query in SQL, but it is still much easier.

In [None]:
# step 1 - inner query: all original columns plus the largest profit for each
# country in Asia; with a single MAX() SQLite fills the other columns from
# the row that holds that maximum
best_order_sql = (
    "SELECT *, MAX(Units_Sold * (Unit_Price - Unit_Cost)) AS Profit "
    "FROM sales "
    "WHERE Region = 'Asia' "
    "GROUP BY Country"
)

# step 2 - outer query: keep only the desired columns of that result
SQL = "SELECT Order_ID, Country, Item_Type, Sales_Channel FROM(" + best_order_sql + ")"

cur.execute(SQL).fetchall()

---
Close the connection!

In [None]:
# close the database connection opened with sqlite3.connect
con.close()