In [47]:
import pandas as pd
import requests
from pathlib import Path
from io import BytesIO
from cachecontrol import CacheControl
from cachecontrol.caches.file_cache import FileCache
from cachecontrol.heuristics import LastModified
from zipfile import ZipFile
import os
import numpy as np
from datetime import datetime
from pytz import timezone
import json
from string import Template

# Run metadata: the start time plus the CI build URL and job name taken from
# the environment (with placeholders when the variables are not set).
startedAt = datetime.now(timezone('Europe/London'))
buildUrl = os.environ.get('BUILD_URL', 'unknown-build')
jobName = os.environ.get('JOB_NAME', 'unknown-job')

# HTTP session wrapped with CacheControl so downloads are cached on disk in
# the .cache directory, using the LastModified heuristic.
session = CacheControl(requests.Session(),
                       cache=FileCache('.cache'),
                       heuristic=LastModified())

dataUrl = "https://www.uktradeinfo.com/Statistics/Documents/Data%20Downloads/"

# Preview: download the SMKA12 archive (a zip of zips) and display the single
# file held inside its first member only.
controlResponse = session.get(dataUrl + "SMKA12_2017archive.zip")
# Fail with a clear HTTP error rather than an opaque BadZipFile when the
# server answers with an error page.
controlResponse.raise_for_status()

with ZipFile(BytesIO(controlResponse.content)) as controlZips:
    for monthZipName in controlZips.namelist():
        # ZipFile.read() returns the member's bytes without leaving a file
        # handle open.
        with ZipFile(BytesIO(controlZips.read(monthZipName))) as monthControlZip:
            assert len(monthControlZip.namelist()) == 1
            with monthControlZip.open(monthControlZip.namelist()[0]) as monthControl:
                # NOTE(review): no header=None here, so the first line of the
                # file is used as the column names - confirm that is intended.
                table = pd.read_csv(monthControl, sep='|', encoding='latin-1')
                display(table)
        # Only the first member is previewed.
        break

Unnamed: 0,000000000,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,HMCUSTOMS CONTROL DATA,2017,01
0,10121000,0,00/00,00/00,00/00,00/00,0,00150,2,013,...,0,0,0,0,KG,NO,Pure-bred breeding horses ...,,,
1,10129100,0,00/00,00/00,00/00,00/00,0,00150,2,013,...,0,0,0,0,KG,NO,Horses for slaughter ...,,,
2,10129900,0,00/00,00/00,00/00,00/00,0,00150,2,013,...,0,0,0,0,KG,NO,"Live horses (excl. for slaughter, pure-bred fo...",,,
3,10130000,0,00/00,00/00,00/00,00/00,0,00150,2,013,...,0,0,0,0,KG,NO,Live asses ...,,,
4,10190000,0,00/00,00/00,00/00,00/00,0,00150,2,013,...,0,0,0,0,KG,NO,Live mules and hinnies ...,,,
5,10221100,0,00/00,00/00,00/00,00/00,0,00111,2,013,...,0,0,0,0,KG,NO,"Pure-bred breeding heifers ""female bovines tha...",,,
6,10221300,0,00/00,00/00,00/00,00/00,0,00111,2,013,...,0,0,0,0,KG,NO,Pure-bred breeding cows (excl. heifers) ...,,,
7,10221900,0,00/00,00/00,00/00,00/00,0,00111,2,013,...,0,0,0,0,KG,NO,Pure-bred cattle for breeding (excl. heifers a...,,,
8,10229050,0,00/00,00/00,00/00,00/00,0,00111,2,013,...,0,0,0,0,KG,NO,Live cattle of the sub-genus Bibos or Poephagu...,,,
9,10229100,0,00/00,00/00,00/00,00/00,0,00119,2,013,...,0,0,0,0,KG,NO,Live cattle of a weight <= 80 kg (excl. pure-b...,,,


In [48]:
# Non EU

# Field names, in file order, applied to the pipe-separated data rows via
# the names= argument of pd.read_csv (the files are read with header=None).
column_names = [
    "COMCODE",
    "SITC",
    "RECORD-TYPE",
    "COD-SEQUENCE",
    "COD-ALPHA",
    "MONTH",
    "PORT-SEQUENCE",
    "PORT-ALPHA",
    "FLAG-SEQUENCE",
    "FLAG-ALPHA",
    "TRADE-INDICATOR",
    "CONTAINER",
    "MODE-OF-TRANSPORT",
    "INLAND-MOT",
    "GOLO-SEQUENCE",
    "GOLO-ALPHA",
    "SUITE-INDICATOR",
    "PROCEDURE-CODE",
    "Value",
    "QUANTITY1",
    "QUANTITY2",
    "INDUSTRIAL-PLANT-COMCODE",
]

def get_cn8(comcode, month):
    """Build the 'cn_<year>#cn8_<code>' identifier for one row.

    comcode -- commodity code; only its first eight characters are used.
    month   -- period text; everything from index 3 onwards is used as the
               year (e.g. '01/2017' gives '2017').
    """
    code_prefix, cn_year = comcode[:8], month[3:]
    return 'cn_%s#cn8_%s' % (cn_year, code_prefix)

def get_period_slug(month):
    """Turn a period such as '01/2017' into the slug 'month/01-2017'.

    The first two characters become the month part; everything from index 3
    onwards becomes the year part (the character at index 2 is skipped).
    """
    month_part = month[:2]
    year_part = month[3:]
    return 'month/%s-%s' % (month_part, year_part)

# Directory that receives every generated CSV slice.
destFolder = Path('out')
if not destFolder.exists():
    destFolder.mkdir(parents=True)

# Provenance bookkeeping: one dict per input file read, and one
# (path, label) tuple per CSV slice written.
provSources = []
provOutputs = []

zipName = "SMKE19_2017archive.zip"
zipUrl = dataUrl + zipName

# Identifier of the provenance activity that the sources are linked to. It is
# computed here so this cell runs on a fresh kernel; it must stay equal to the
# '@id' given to the provenance activity record.
provActivityId = buildUrl + "#tidy_published"

# Columns, in order, of the tidy table written to disk.
outputColumns = ['Period', 'Flow', 'Combined Nomenclature', 'HMRC Partner Geography',
                 'Measure Type', 'Unit', 'Value']

# Maximum number of rows per output CSV file.
sliceSize = 50000

archiveResponse = session.get(zipUrl)
# Surface HTTP failures directly rather than as a BadZipFile further down.
archiveResponse.raise_for_status()

with ZipFile(BytesIO(archiveResponse.content)) as controlZips:
    for monthZipName in controlZips.namelist():
        # Each member of the archive is itself a zip holding exactly one file.
        with ZipFile(BytesIO(controlZips.read(monthZipName))) as monthControlZip:
            assert len(monthControlZip.namelist()) == 1
            innerZipName = monthControlZip.namelist()[0]
            with monthControlZip.open(innerZipName) as monthControl:
                # The first and last lines of the file are skipped; the
                # python engine is required for skipfooter.
                # COMCODE and MONTH are kept as text because they are sliced
                # by position in get_cn8 / get_period_slug.
                # NOTE(review): pandas' default NA handling turns a literal
                # 'NA' field into NaN - confirm COD-ALPHA never contains it.
                table = pd.read_csv(monthControl,
                                    engine='python',
                                    sep='|',
                                    encoding='latin-1',
                                    skiprows=1,
                                    skipfooter=1,
                                    header=None,
                                    names=column_names,
                                    dtype={"COMCODE": str, "MONTH": str})

                # Series.map / a list give concrete per-row values; a bare
                # map() object is a lazy iterator on Python 3.
                table['Period'] = table['MONTH'].map(get_period_slug)
                table['Flow'] = 'Exports'
                table['HMRC Partner Geography'] = table['COD-ALPHA']
                table['Combined Nomenclature'] = [
                    get_cn8(comcode, month)
                    for comcode, month in zip(table['COMCODE'], table['MONTH'])
                ]
                table['Measure Type'] = 'GBP Total'
                table['Unit'] = '£'
                # Selecting the output columns also discards every input
                # column that is not needed.
                table = table[outputColumns]

                # Write the table in files of at most sliceSize rows. Stepping
                # through the start offsets covers every row, including the
                # final partial slice.
                baseName = os.path.splitext(innerZipName)[0]
                for sliceIndex, start in enumerate(range(0, len(table), sliceSize)):
                    fileName = 'smke_{}_{:04}.csv'.format(baseName, sliceIndex)
                    destFile = destFolder / fileName
                    table.iloc[start:start + sliceSize].to_csv(destFile, index=False)

                    provOutputs.append((destFile, 'SMKE19_2017 table'))

                provSources.append({
                    '@id': zipUrl + '/' + monthZipName + '/' + innerZipName,
                    '@type': 'entity',
                    'label': zipName + '/' + monthZipName + '/' + innerZipName,
                    'wasUsedBy': provActivityId
                })

                display(table)
                
#write provenance metadata

def get_prov_context():
    """Return the parsed contents of metadata/prov_context.json.

    The path is relative to the current working directory.
    """
    with open('metadata/prov_context.json') as context_fh:
        parsed_context = json.load(context_fh)
    return parsed_context

context = get_prov_context()

# Describe this notebook run as a provenance activity; the end time is taken
# at the moment the record is assembled.
provActivity = {
    '@id': buildUrl + "#tidy_published",
    '@type': 'activity',
    'startedAtTime': startedAt.isoformat(),
    'label': 'Tidy published',
    'comment': 'Jupyter Python notebook as part of Jenkins job %s' % jobName,
    'endedAtTime': datetime.now(timezone('Europe/London')).isoformat(),
}

# The graph lists the activity first, then every recorded source, then one
# entity per generated output file.
provGraph = [provActivity]
provGraph.extend(provSources)
provGraph.extend(
    {
        '@id': buildUrl + '/artifact/' + str(filename),
        '@type': 'entity',
        'wasGeneratedBy': provActivity['@id'],
        'label': label,
    }
    for (filename, label) in provOutputs
)

prov = {
    '@context': context,
    '@graph': provGraph,
}

provPath = destFolder / 'prov.jsonld'
with open(str(provPath), 'w') as provFile:
    json.dump(prov, provFile, indent=2)

modified_date = datetime.now(timezone('Europe/London')).isoformat()

# Fill the $modified placeholder of the dataset template and write the result
# next to the other outputs.
with open('metadata/dataset.trig.template', 'r') as metadata_template_file:
    metadata_template = Template(metadata_template_file.read())

with open(str(destFolder / 'dataset.trig'), 'w') as metadata_file:
    metadata_file.write(metadata_template.substitute(modified=modified_date))
                
            

Unnamed: 0,Period,Flow,Combined Nomenclature,HMRC Partner Geography,Measure Type,Unit,Value
0,month/01-2017,Exports,cn_2017#cn8_01012100,NO,GBP Total,£,20000
1,month/01-2017,Exports,cn_2017#cn8_01012100,CH,GBP Total,£,346678
2,month/01-2017,Exports,cn_2017#cn8_01012100,TN,GBP Total,£,1898
3,month/01-2017,Exports,cn_2017#cn8_01012100,ZA,GBP Total,£,418663
4,month/01-2017,Exports,cn_2017#cn8_01012100,US,GBP Total,£,206577
5,month/01-2017,Exports,cn_2017#cn8_01012100,US,GBP Total,£,1283477
6,month/01-2017,Exports,cn_2017#cn8_01012100,CA,GBP Total,£,27403
7,month/01-2017,Exports,cn_2017#cn8_01012100,CA,GBP Total,£,117500
8,month/01-2017,Exports,cn_2017#cn8_01012100,SA,GBP Total,£,135124
9,month/01-2017,Exports,cn_2017#cn8_01012100,KW,GBP Total,£,15177


Unnamed: 0,Period,Flow,Combined Nomenclature,HMRC Partner Geography,Measure Type,Unit,Value
0,month/02-2017,Exports,cn_2017#cn8_01012100,NO,GBP Total,£,6325
1,month/02-2017,Exports,cn_2017#cn8_01012100,CH,GBP Total,£,950
2,month/02-2017,Exports,cn_2017#cn8_01012100,US,GBP Total,£,290600
3,month/02-2017,Exports,cn_2017#cn8_01012100,US,GBP Total,£,302269
4,month/02-2017,Exports,cn_2017#cn8_01012100,KW,GBP Total,£,91675
5,month/02-2017,Exports,cn_2017#cn8_01012100,BH,GBP Total,£,100000
6,month/02-2017,Exports,cn_2017#cn8_01012100,QA,GBP Total,£,177961
7,month/02-2017,Exports,cn_2017#cn8_01012100,PK,GBP Total,£,3000
8,month/02-2017,Exports,cn_2017#cn8_01012100,IN,GBP Total,£,60599
9,month/02-2017,Exports,cn_2017#cn8_01012100,SG,GBP Total,£,163000


Unnamed: 0,Period,Flow,Combined Nomenclature,HMRC Partner Geography,Measure Type,Unit,Value
0,month/03-2017,Exports,cn_2017#cn8_01012100,CH,GBP Total,£,38880
1,month/03-2017,Exports,cn_2017#cn8_01012100,US,GBP Total,£,170750
2,month/03-2017,Exports,cn_2017#cn8_01012100,US,GBP Total,£,121800
3,month/03-2017,Exports,cn_2017#cn8_01012100,SA,GBP Total,£,50000
4,month/03-2017,Exports,cn_2017#cn8_01012100,AE,GBP Total,£,76950
5,month/03-2017,Exports,cn_2017#cn8_01012100,AE,GBP Total,£,25473
6,month/03-2017,Exports,cn_2017#cn8_01012100,IN,GBP Total,£,996
7,month/03-2017,Exports,cn_2017#cn8_01012100,IN,GBP Total,£,50434
8,month/03-2017,Exports,cn_2017#cn8_01012100,SG,GBP Total,£,3474
9,month/03-2017,Exports,cn_2017#cn8_01012100,HK,GBP Total,£,365000


Unnamed: 0,Period,Flow,Combined Nomenclature,HMRC Partner Geography,Measure Type,Unit,Value
0,month/04-2017,Exports,cn_2017#cn8_01012100,NO,GBP Total,£,33100
1,month/04-2017,Exports,cn_2017#cn8_01012100,NO,GBP Total,£,13750
2,month/04-2017,Exports,cn_2017#cn8_01012100,CH,GBP Total,£,1165
3,month/04-2017,Exports,cn_2017#cn8_01012100,US,GBP Total,£,2004
4,month/04-2017,Exports,cn_2017#cn8_01012100,US,GBP Total,£,35000
5,month/04-2017,Exports,cn_2017#cn8_01012100,US,GBP Total,£,55000
6,month/04-2017,Exports,cn_2017#cn8_01012100,US,GBP Total,£,2205
7,month/04-2017,Exports,cn_2017#cn8_01012100,QA,GBP Total,£,30975
8,month/04-2017,Exports,cn_2017#cn8_01012100,AE,GBP Total,£,107369
9,month/04-2017,Exports,cn_2017#cn8_01012100,SG,GBP Total,£,34860


Unnamed: 0,Period,Flow,Combined Nomenclature,HMRC Partner Geography,Measure Type,Unit,Value
0,month/05-2017,Exports,cn_2017#cn8_01012100,NO,GBP Total,£,48650
1,month/05-2017,Exports,cn_2017#cn8_01012100,US,GBP Total,£,50000
2,month/05-2017,Exports,cn_2017#cn8_01012100,US,GBP Total,£,1714
3,month/05-2017,Exports,cn_2017#cn8_01012100,US,GBP Total,£,10420
4,month/05-2017,Exports,cn_2017#cn8_01012100,US,GBP Total,£,30000
5,month/05-2017,Exports,cn_2017#cn8_01012100,PE,GBP Total,£,40080
6,month/05-2017,Exports,cn_2017#cn8_01012100,QA,GBP Total,£,13456
7,month/05-2017,Exports,cn_2017#cn8_01012100,AE,GBP Total,£,341250
8,month/05-2017,Exports,cn_2017#cn8_01012100,HK,GBP Total,£,120000
9,month/05-2017,Exports,cn_2017#cn8_01012100,AU,GBP Total,£,151094


Unnamed: 0,Period,Flow,Combined Nomenclature,HMRC Partner Geography,Measure Type,Unit,Value
0,month/06-2017,Exports,cn_2017#cn8_01012100,CH,GBP Total,£,20288
1,month/06-2017,Exports,cn_2017#cn8_01012100,US,GBP Total,£,172352
2,month/06-2017,Exports,cn_2017#cn8_01012100,IL,GBP Total,£,1000
3,month/06-2017,Exports,cn_2017#cn8_01012100,KW,GBP Total,£,15000
4,month/06-2017,Exports,cn_2017#cn8_01012100,KW,GBP Total,£,61140
5,month/06-2017,Exports,cn_2017#cn8_01012100,AE,GBP Total,£,94000
6,month/06-2017,Exports,cn_2017#cn8_01012100,AE,GBP Total,£,151200
7,month/06-2017,Exports,cn_2017#cn8_01012100,SG,GBP Total,£,6500
8,month/06-2017,Exports,cn_2017#cn8_01012100,JP,GBP Total,£,48263
9,month/06-2017,Exports,cn_2017#cn8_01012100,HK,GBP Total,£,2000
