In [1]:
from pathlib import Path
from zipfile import ZipFile

from tempfile import TemporaryDirectory

import pipeline_utils.db as database

In [2]:
# Directory containing the raw input archives; being a relative path, it is
# resolved against the notebook's working directory.
RAW_DATA = Path('../raw')

In [3]:
# Open the company archive and record the name of its first member; that
# member is the file extracted and loaded into the database further down.
ch_zip = ZipFile(RAW_DATA.joinpath('company-data.zip'))
first_member = ch_zip.infolist()[0]
filename = first_member.filename

In [4]:
# Open the project database connection through the pipeline_utils helper; the
# following cells issue all of their SQL through this handle.
# NOTE(review): the db.sql(...).df() and read_csv(...) usage below suggests a
# DuckDB connection — confirm against pipeline_utils.db.
db = database.connect()

In [5]:
# Load the company CSV into the CompanyData table.
# The archive member is unpacked into a throwaway directory, which is deleted
# as soon as the table has been created from it.
with TemporaryDirectory() as tmp_dir:
    companies = ch_zip.extract(filename, tmp_dir)
    # SQL string literals take single quotes (double quotes denote identifiers),
    # so pass the path as a single-quoted literal and double any embedded
    # single quote to keep it one literal.
    companies_sql = companies.replace("'", "''")
    db.sql(f"""CREATE OR REPLACE TABLE CompanyData AS
            SELECT * FROM read_csv('{ companies_sql }')""")

In [6]:
# Sanity check: count the rows loaded into CompanyData and display the result
# via .df().
db.sql("SELECT count(*) as 'Count of companies' FROM CompanyData;").df()

Unnamed: 0,Count of companies
0,5632838


In [7]:
# Open the three charity archives (main register, areas of operation,
# classifications) from the raw-data directory, in that order.
cc_zip, cc_area_zip, cc_classification_zip = (
    ZipFile(RAW_DATA / archive_name)
    for archive_name in (
        'charity-data.zip',
        'charity-area-data.zip',
        'charity-classification-data.zip',
    )
)

In [8]:
# Stage the three charity extracts as TEMP tables.
# The first member of each archive is unpacked into a throwaway directory that
# is deleted once the tables have been created.
with TemporaryDirectory() as tmp_dir:
    # SQL string literals take single quotes (double quotes denote identifiers);
    # each extracted path is therefore prepared as single-quoted literal content,
    # with any embedded single quote doubled.
    charities_sql, areas_sql, classifications_sql = (
        archive.extract(archive.namelist()[0], tmp_dir).replace("'", "''")
        for archive in (cc_zip, cc_area_zip, cc_classification_zip)
    )

    # tCharities keeps only rows whose date_of_removal is NULL.
    # quote=NULL is passed to read_csv as in the original query — presumably it
    # disables quote handling; confirm against the read_csv documentation.
    db.sql(f"""
           CREATE OR REPLACE TEMP TABLE tCharities AS
            SELECT * FROM read_csv('{ charities_sql }', quote=NULL) WHERE date_of_removal IS NULL;
           CREATE OR REPLACE TEMP TABLE tCharityArea AS
            SELECT * FROM read_csv('{ areas_sql }', quote=NULL);
           CREATE OR REPLACE TEMP TABLE tCharityClassification AS
            SELECT * FROM read_csv('{ classifications_sql }', quote=NULL);
            """)

In [9]:
# Exploratory lookup: list the distinct area descriptions containing 'radford'
# to find the exact spelling stored in tCharityArea (the value is used as a
# filter when building tCharitiesScope).
db.sql('''SELECT DISTINCT(geographic_area_description) FROM tCharityArea WHERE geographic_area_description LIKE '%radford%';''')

┌─────────────────────────────┐
│ geographic_area_description │
│           varchar           │
├─────────────────────────────┤
│ Bradford City               │
└─────────────────────────────┘

In [10]:
# Build tCharitiesScope: area rows for 'Bradford City' joined, on
# organisation_number, to classification rows of type 'What' with code '109'.
# SELECT DISTINCT * keeps the columns of both joined tables and drops rows that
# are exact duplicates.
# NOTE(review): the meaning of classification code '109' is not documented in
# this notebook — confirm against the classification data.
# NOTE(review): unlike the other t-prefixed tables, this one is created without
# TEMP — confirm that persisting it is intended.
db.sql( '''
        CREATE OR REPLACE TABLE tCharitiesScope AS
        SELECT DISTINCT *
          FROM tCharityArea a
          JOIN tCharityClassification c
            ON a.organisation_number = c.organisation_number
         WHERE a.geographic_area_description = 'Bradford City'
           AND c.classification_type = 'What'
           AND c.classification_code = '109';
       ''')

In [11]:
# Materialise the final Charities table: identity, finance and contact columns
# from tCharities, restricted by an inner join to the organisations present in
# tCharitiesScope.
# NOTE(review): the SELECT list ends with a trailing comma and the join uses
# `==`; both rely on the SQL dialect tolerating them — confirm for the backend.
# NOTE(review): this cell calls db.query while earlier cells call db.sql —
# presumably equivalent; verify against the connection's API.
db.query('''
         CREATE OR REPLACE TABLE Charities AS
            SELECT
                c.registered_charity_number,
                c.organisation_number,
                charity_name,
                latest_income,
                latest_expenditure,
                charity_contact_postcode,
                charity_contact_web,
                charity_company_registration_number,
         
            FROM tCharities c
            JOIN tCharitiesScope s
         ON s.organisation_number == c.organisation_number
         ''')

In [12]:
# Sanity check: count the rows in the Charities table and display the result
# via .df().
db.query('''
         SELECT COUNT(*) AS 'Count of Charities' FROM Charities
         ;''').df()

Unnamed: 0,Count of Charities
0,260


In [13]:
# Close the database connection opened earlier in the notebook.
# NOTE(review): the ZipFile handles (ch_zip, cc_zip, cc_area_zip,
# cc_classification_zip) are not closed in any visible cell.
db.close()