In [1]:
from pathlib import Path
from zipfile import ZipFile

import duckdb
from tempfile import TemporaryDirectory

In [2]:
# Directory holding the raw input archives and the DuckDB database file,
# given relative to the notebook's working directory.
RAW_DATA = Path('../raw')

In [3]:
# Connection to the DuckDB database file under RAW_DATA; every later cell
# issues its SQL through this handle.
db = duckdb.connect(RAW_DATA / 'company-data.db')

In [4]:
# Load the first member of the company-data archive into the CompanyData table.
# The archive is opened through a context manager so its file handle is released
# as soon as the load finishes instead of staying open for the kernel's lifetime.
with ZipFile(RAW_DATA / 'company-data.zip') as ch_zip, TemporaryDirectory() as d:
    filename = ch_zip.namelist()[0]
    # extract() returns the path of the file it wrote inside the temporary directory.
    companies = ch_zip.extract(filename, d)
    # Embed the path as a single-quoted SQL string literal (doubling any embedded
    # quote); double quotes are identifier quotes in SQL, not string quotes.
    companies_literal = companies.replace("'", "''")
    # The table must be materialised before the temporary directory is removed.
    db.sql(f'''CREATE OR REPLACE TABLE CompanyData AS
            SELECT * FROM read_csv('{companies_literal}')''')

In [6]:
# Sanity check: number of rows loaded into CompanyData, displayed as a DataFrame.
company_count = db.sql("SELECT count(*) as 'Count of companies' FROM CompanyData;")
company_count.df()

Unnamed: 0,Count of companies
0,5632838


In [7]:
# Stage the first member of each charity archive as a temporary table:
#   tCharities   - rows from charity-data.zip with no date_of_removal
#   tCharityArea - all rows from charity-area-data.zip
# Both archives are opened through context managers so their file handles are
# closed once the tables have been created instead of leaking until kernel exit.
with ZipFile(RAW_DATA / 'charity-data.zip') as cc_zip, \
        ZipFile(RAW_DATA / 'charity-area-data.zip') as cc_area_zip, \
        TemporaryDirectory() as d:
    # extract() returns the path of the file it wrote inside the temporary directory.
    c = cc_zip.extract(cc_zip.namelist()[0], d)
    ca = cc_area_zip.extract(cc_area_zip.namelist()[0], d)
    # Embed the paths as single-quoted SQL string literals (doubling any embedded
    # quote); double quotes are identifier quotes in SQL, not string quotes.
    c_literal = c.replace("'", "''")
    ca_literal = ca.replace("'", "''")
    # quote=NULL is kept from the original load; presumably it turns off quote
    # handling for these extracts - TODO confirm against the source file format.
    db.sql(f'''
           CREATE OR REPLACE TEMP TABLE tCharities AS
            SELECT * FROM read_csv('{c_literal}', quote=NULL) WHERE date_of_removal IS NULL;
           CREATE OR REPLACE TEMP TABLE tCharityArea AS
            SELECT * FROM read_csv('{ca_literal}', quote=NULL);
            ''')

In [9]:
# Build the persistent Charities table from the staged temp tables: keep the
# selected columns of every tCharities row whose organisation_number appears in
# tCharityArea with the area description 'Newcastle Upon Tyne City'.
charities_sql = '''
         CREATE OR REPLACE TABLE Charities AS
            SELECT
                c.registered_charity_number,
                charity_name,
                latest_income,
                latest_expenditure,
                charity_contact_postcode,
                charity_contact_web,
                charity_company_registration_number,
         
            FROM tCharities c
            JOIN
                (
                  SELECT DISTINCT organisation_number, registered_charity_number FROM tCharityArea
                WHERE 
                geographic_area_description IN ( 'Newcastle Upon Tyne City' )
            ) n
         ON n.organisation_number == c.organisation_number
         

         '''
db.query(charities_sql)

In [10]:
# Sanity check: number of rows that ended up in the Charities table.
charity_count = db.query('''
         SELECT COUNT(*) AS 'Count of Charities' FROM Charities
         ;''')
charity_count

┌────────────────────┐
│ Count of Charities │
│       int64        │
├────────────────────┤
│               1112 │
└────────────────────┘

In [12]:
# Close the DuckDB connection opened at the top of the notebook.
db.close()