In [None]:
! pip install --quiet s3fs
! pip install --quiet plyvel
! pip install -e 'git://github.com/nandanrao/embed-software.git#egg=embed_software'

In [2]:
import plyvel
from os.path import join
import json, time
from time import sleep
from urllib.parse import urlparse, urlunparse, parse_qs, urlencode
from multiprocessing import Pool, Queue, Process, Manager
from itertools import takewhile, islice, count

import logging
# Configure the root logger (getLogger() with no name) to emit INFO and above.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

def chunk(n, it):
    """Lazily split iterable ``it`` into consecutive lists of at most ``n`` items.

    Iteration ends at the first empty list, i.e. once the source is exhausted
    (or immediately when ``n`` is 0).
    """
    source = iter(it)

    def next_batch():
        return list(islice(source, n))

    # Two-argument iter(): keep calling next_batch until it returns [].
    return iter(next_batch, [])

def remove_qs(url):
    """ Drops the query string from url, keeping only its jk parameter(s) """
    parts = urlparse(url)
    jk_values = parse_qs(parts.query).get('jk')
    # doseq=True: every jk value is emitted as its own jk=... pair.
    kept = urlencode({'jk': jk_values}, True) if jk_values else None
    return urlunparse(parts._replace(query=kept))

class ParallelProcessor():
    """Process JSON-lines files in a process pool and store the results in LevelDB.

    Worker processes (``process``) parse every line of their file with
    ``json.loads``, transform it and push batches of encoded ``(key, value)``
    pairs onto a shared queue. A single dedicated writer process (``write``)
    drains that queue into the database, so only one process ever holds the
    LevelDB handle.

    Parameters:
        inpath: path handed to ``fs.walk`` to discover the input files.
        fs: filesystem object; must provide ``walk(path)`` and ``open(filename)``.
        outpath: LevelDB directory (created if missing).
        string_processor: callable applied to each record's ``description``;
            a falsy result drops the record.
        threads: number of worker processes in the pool.
        **kwargs: forwarded to ``_process`` for every record, so it must
            contain exactly ``keys`` (the ordered list of fields to keep).
    """

    def __init__(self, inpath, fs, outpath, string_processor, threads, **kwargs):
        self.inpath = inpath
        self.outpath = outpath
        self.threads = threads
        self.fs = fs
        self.kwargs = kwargs
        self.string_processor = string_processor
        # Not updated anywhere in this class; kept for backward compatibility.
        self.processed = 0

    def _init_db(self, fi):
        """Open (or create) the LevelDB database at ``fi`` as ``self.db``."""
        self.db = plyvel.DB(fi,
                            create_if_missing=True,
                            block_size=8388608,             # 8 MiB
                            write_buffer_size=1073741824)   # 1 GiB

    def _insert_rows(self, li):
        """Write an iterable of ``(key, value)`` byte pairs in one batch."""
        with self.db.write_batch() as wb:
            for k, v in li:
                wb.put(k, v)

    def _process(self, d, keys):
        """Transform one parsed record (mutated in place).

        ``d['categories']`` must unpack into exactly two items, which become
        ``category`` and ``section``. Returns the values of ``keys`` in order,
        or ``None`` when the processed content is falsy.
        """
        d['category'], d['section'] = d['categories']
        d['content'] = self.string_processor(d['description'])
        d['url'] = remove_qs(d['url'])
        if not d['content']:
            return None
        return [d[k] for k in keys]

    def _make_entry(self, d):
        """Encode a processed row: key is its first element, value is the whole row as JSON."""
        return d[0].encode('utf8'), json.dumps(d).encode('utf8')

    def _get_files(self):
        """Return the input files found under ``self.inpath``.

        NOTE(review): every item returned here is passed to ``fs.open`` by
        ``process``, so ``fs.walk`` must yield plain file paths. fsspec-style
        filesystems yield ``(root, dirs, files)`` tuples from ``walk``
        instead -- confirm against the installed s3fs version.
        """
        files = self.fs.walk(self.inpath)
        # filter files
        return files

    def process(self, filename):
        """Worker task: process ``filename`` in chunks of 1000 lines and queue the results."""
        with self.fs.open(filename) as f:
            for c in chunk(1000, f):
                rows = [self._process(json.loads(l), **self.kwargs) for l in c]
                rows = [self._make_entry(d) for d in rows if d]
                self.q.put(rows)

    def write(self):
        """Writer loop: drain ``self.q`` into the database until a ``None`` sentinel arrives.

        Prints a progress line each time more than 50000 rows have been
        written since the previous one.
        """
        self._init_db(self.outpath)
        try:
            i = 0
            printed = 0
            # time.clock() was removed in Python 3.8; perf_counter() is the
            # monotonic replacement for measuring elapsed time.
            start = time.perf_counter()
            while True:
                rows = self.q.get()
                if rows is None:
                    break
                self._insert_rows(rows)

                i += len(rows)
                if i - printed > 50000:
                    new_start = time.perf_counter()
                    print(f'Processed {i - printed} in {new_start - start} seconds')
                    printed = i
                    start = new_start
        finally:
            # Close the database even if a batch fails to write.
            self.db.close()

    def process_all(self, files=None):
        """Process ``files`` (default: everything from ``_get_files``) and block until written.

        Exceptions raised by a worker propagate to the caller, but only after
        the writer process has been told to stop and has been joined.
        """
        if files is None:
            files = self._get_files()

        # The context manager shuts the manager's server process down on exit.
        with Manager() as m:
            self.q = m.Queue()
            writer = Process(target=self.write)
            writer.start()

            try:
                with Pool(self.threads) as pool:
                    pool.map(self.process, files)
                print('done processing')
            finally:
                # Always send the sentinel: without it the writer would block
                # on q.get() forever if a worker raised.
                self.q.put(None)
                writer.join()

In [3]:
from embed_software.preprocess import claims_processor, readme_processor, Preprocessor
import s3fs

def preprocess_country(inpath, dbfile, keys, threads):
    """Run the parallel preprocessing pipeline for one input location.

    Files found under ``inpath`` on S3 are cleaned with the readme
    preprocessor and stored in the LevelDB database at ``dbfile``, keeping the
    fields named in ``keys`` and using ``threads`` worker processes.
    """
    s3 = s3fs.S3FileSystem()
    # The second argument (4) is passed straight to Preprocessor; its meaning
    # is not visible in this notebook -- TODO confirm.
    preprocessor = Preprocessor(readme_processor, 4)
    runner = ParallelProcessor(
        inpath,
        s3,
        dbfile,
        preprocessor.process,
        threads=threads,
        keys=keys,
    )
    runner.process_all()

In [4]:
import plyvel
import re
from re import sub
import json

def prep_labels(s):
    """Normalise a label: replace every non-word character with '_' and lowercase it."""
    # Raw string: in a plain literal '\w' is an invalid escape sequence, which
    # newer Python versions warn about.
    return sub(r'[^\w]', '_', s).lower()

def extract_content(db, outfile, keys, labels=False):
    """Dump the ``content`` of every record in a LevelDB database to a text file.

    Parameters:
        db: path of an existing LevelDB database whose values are JSON lists.
        outfile: path of the text file to write, one record per line.
        keys: field names, in the same order as the stored JSON lists.
        labels: when true, tabs in the content are replaced by spaces and
            `` __label__<category> __label__<section>`` is appended, with
            both labels normalised by ``prep_labels``.
    """
    store = plyvel.DB(db)
    try:
        # Explicit UTF-8 so non-ASCII content does not depend on the locale.
        with open(outfile, 'w', encoding='utf-8') as f:
            for _, v in store:
                row = dict(zip(keys, json.loads(v)))
                if labels:
                    content = row['content'].replace('\t', ' ')
                    cat, sect = map(prep_labels, [row['category'], row['section']])
                    line = f'{content} __label__{cat} __label__{sect}'
                else:
                    line = row["content"]
                f.write(f'{line}\n')
    finally:
        # Release the database even if writing the output fails.
        store.close()

    print(f'Finished writing to {outfile}')

In [6]:
# Fields kept for each record, in storage order. 'url' must stay first:
# ParallelProcessor._make_entry uses element 0 as the LevelDB key, and
# extract_content zips this same list back onto the stored values.
keys = ['url', 'title', 'content', 'category', 'section', 'scrapeTimestamp']

In [None]:
# Preprocess the UK scrape into db/uk using 14 worker processes.
preprocess_country('oecd-scraping/indeed-uk', 'db/uk', keys, 14)

In [None]:
# Dump the UK database to two text files: the raw content, and the content
# followed by __label__ suffixes for category and section.
COUNTRY='uk'
extract_content(f'db/{COUNTRY}', f'data/{COUNTRY}/content.txt', keys)
extract_content(f'db/{COUNTRY}', f'data/{COUNTRY}/content-labelled.txt', keys, labels=True)

In [None]:
# Full pipeline for every country: preprocess into db/<country>, then write
# both text dumps under data/<country>/.
# NOTE(review): 'uk' is also handled by the UK-only cells earlier in this
# notebook, so a Run All processes it twice -- confirm whether those cells
# should be removed.
for COUNTRY in ['uk', 'us', 'india']:
    preprocess_country(f'oecd-scraping/indeed-{COUNTRY}', f'db/{COUNTRY}', keys, 14)
    extract_content(f'db/{COUNTRY}', f'data/{COUNTRY}/content.txt', keys)
    extract_content(f'db/{COUNTRY}', f'data/{COUNTRY}/content-labelled.txt', keys, labels=True)