In [6]:
from embed_software.preprocess import claims_processor, readme_processor, Preprocessor

In [11]:
from os.path import join
import json
import dataset
from time import sleep

from urllib.parse import urlparse, urlunparse, parse_qs, urlencode

from concurrent.futures import ThreadPoolExecutor as Pool
from tqdm import tqdm_notebook  
from itertools import takewhile, islice, count
from sqlite3 import connect, OperationalError

import logging
# getLogger() without a name returns the root logger; lowering its level to
# INFO lets the module-level logging.info(...) progress messages emitted in
# this notebook pass the level check.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

def chunk(n, it):
    """Lazily split the iterable ``it`` into consecutive lists of at most ``n`` items.

    Returns an iterator of lists. Every list holds ``n`` items except possibly
    the last one; iteration ends as soon as an empty list would be produced,
    so an empty input (or ``n == 0``) yields nothing.
    """
    source = iter(it)

    def next_batch():
        # Pull up to n further items from the shared underlying iterator.
        return list(islice(source, n))

    # Two-argument iter(): keep calling next_batch until it returns [].
    return iter(next_batch, [])

def remove_qs(url):
    """Return ``url`` with its query string reduced to the ``jk`` parameter.

    Every query parameter other than ``jk`` is dropped. When ``jk`` is absent
    (or has only blank values, which ``parse_qs`` discards by default) the
    query string is removed entirely. Scheme, host, path, params and fragment
    are left untouched.

    Parameters
    ----------
    url : str
        The URL to normalise.

    Returns
    -------
    str
        The URL with at most a ``jk=...`` query string.
    """
    parts = urlparse(url)
    jk_values = parse_qs(parts.query).get('jk')
    # parse_qs maps each key to a list of values, hence doseq=True so that
    # every jk value is re-encoded as its own jk=... pair.
    # An empty string (rather than None) is the documented way to express
    # "no query" for urlunparse when working with str components.
    query = urlencode({'jk': jk_values}, True) if jk_values else ''
    return urlunparse(parts._replace(query=query))

class ParallelProcessor():
    """Preprocess line-delimited JSON files with a thread pool and store the rows in SQLite.

    Every file listed under ``inpath`` is opened through ``fs``; each line is
    parsed as JSON, its ``description`` is passed through ``string_processor``
    and its ``url`` through ``remove_qs``. Records whose processed content is
    truthy are inserted into the ``processed`` table of the SQLite database at
    ``outpath``.

    Parameters
    ----------
    inpath : str
        Directory (or prefix) that is listed with ``fs.ls``.
    fs : object
        Filesystem-like object providing ``ls(path)`` and ``open(path)``.
    outpath : str
        Path of the SQLite database file.
    string_processor : callable
        Called with each record's ``description``; a falsy result drops the record.
    threads : int
        Number of worker threads used by ``process_all``.
    **kwargs
        Forwarded to ``_process`` for every record; must provide ``keys``.
    """

    # Number of input lines parsed and inserted per transaction.
    CHUNK_SIZE = 1000
    # Extra attempts made after an OperationalError before giving up on a batch.
    MAX_INSERT_RETRIES = 30
    # Seconds to wait between two insert attempts.
    RETRY_DELAY = .1

    _CREATE_SQL = """CREATE TABLE IF NOT EXISTS processed(id INTEGER PRIMARY KEY,
                                                            url VARCHAR UNIQUE,
                                                            title VARCHAR,
                                                            content VARCHAR,
                                                            timestamp DATETIME)
                  """
    _INSERT_SQL = ("insert or ignore into "
                   "processed(url,title,content,timestamp) "
                   "values (?,?,?,?)")

    def __init__(self, inpath, fs, outpath, string_processor, threads, **kwargs):
        # Function-scope import: threading is not imported at file level.
        from threading import Lock

        self.inpath = inpath
        self.outpath = outpath
        self.threads = threads
        self.fs = fs
        self.kwargs = kwargs
        self.string_processor = string_processor
        # Running count of input lines handled, shared by all worker threads.
        self.processed = 0
        self._processed_lock = Lock()

    def _create_table(self, outpath):
        """Create the ``processed`` table in the database at ``outpath`` if it is missing."""
        con = connect(outpath)
        try:
            cur = con.cursor()
            # IF NOT EXISTS keeps a second run against the same file from failing.
            cur.execute(self._CREATE_SQL)
            con.commit()
            cur.close()
        finally:
            con.close()

    def _insert_rows(self, cur, li, tries=0):
        """Insert the rows ``li`` through cursor ``cur``, retrying on ``OperationalError``.

        Each row is a ``(url, title, content, timestamp)`` sequence. Rows whose
        url already exists are skipped (``insert or ignore``). ``tries`` is the
        number of attempts already made. Once the retry budget is used up the
        failure is logged and the batch is dropped (best effort, no exception).
        """
        while True:
            try:
                cur.executemany(self._INSERT_SQL, li)
                return
            except OperationalError:
                # Presumably lock contention between the worker threads that
                # write to the same database file; back off and try again.
                if tries >= self.MAX_INSERT_RETRIES:
                    logging.error('failed to insert!')
                    return
                tries += 1
                sleep(self.RETRY_DELAY)

    def _process(self, d, keys):
        """Turn one parsed record into a row, or ``None`` if it has no content.

        ``d`` is updated in place: ``content`` is set to the processed
        ``description`` and ``url`` is normalised with ``remove_qs``. The
        returned row holds the values of ``keys`` in the given order.
        """
        d['content'] = self.string_processor(d['description'])
        d['url'] = remove_qs(d['url'])
        if not d['content']:
            return None
        return [d[k] for k in keys]

    def _get_files(self):
        """Return the paths of all entries directly under ``self.inpath``."""
        names = (path.split('/')[-1] for path in self.fs.ls(self.inpath))
        # A listing entry that ends in '/' yields an empty name; skip it.
        return [join(self.inpath, name) for name in names if name]

    def process(self, filename):
        """Process a single file and insert its rows into the database.

        The file is consumed in chunks of ``CHUNK_SIZE`` lines; each chunk is
        committed separately. Blank lines are skipped. The method opens its own
        connection so that it can run in a worker thread.
        """
        con = connect(self.outpath)
        try:
            cur = con.cursor()
            with self.fs.open(filename) as f:
                for c in chunk(self.CHUNK_SIZE, f):
                    rows = [self._process(json.loads(l), **self.kwargs)
                            for l in c if l.strip()]
                    rows = [d for d in rows if d]
                    self._insert_rows(cur, rows)
                    con.commit()
                    # Count the lines actually read (the last chunk is usually
                    # shorter) and guard the shared counter across threads.
                    with self._processed_lock:
                        self.processed += len(c)
                        total = self.processed
                    logging.info('Processed: {}'.format(total))
            cur.close()
        finally:
            # Release the connection even when parsing or processing fails.
            con.close()

    def process_all(self, files=None):
        """Process ``files`` (default: everything under ``inpath``) with a thread pool.

        Exceptions raised by a worker are re-raised here instead of being
        silently dropped.
        """
        self._create_table(self.outpath)
        if files is None:
            files = self._get_files()
        with Pool(self.threads) as pool:
            # ThreadPoolExecutor offers map(), not imap(); the results are
            # consumed so that worker exceptions propagate to the caller.
            list(pool.map(self.process, files))

In [12]:
import s3fs

# Filesystem handle used by ParallelProcessor to list and open the input files.
fs = s3fs.S3FileSystem()
# Location listed with fs.ls() to discover the input files.
inpath = 'oecd-scraping/indeed-uk'
# Record fields written per row, in the column order of the INSERT statement
# (url, title, content, timestamp).
keys = ['url', 'title', 'content', 'scrapeTimestamp']
# NOTE(review): the meaning of the second Preprocessor argument (4) is not
# visible in this notebook — confirm against embed_software.preprocess.
string_processor = Preprocessor(readme_processor, 4).process

# Results go to the local SQLite file 'uk-jobs.db'; 30 worker threads.
p = ParallelProcessor(inpath, fs, 'uk-jobs.db', string_processor, threads=30, keys=keys)
# The actual run is left disabled; uncomment to process every file under inpath.
# p.process_all()

In [3]:
import pandas as pd

def write_content(fi, outfi):
    """Copy the ``content`` column of the CSV file ``fi`` into ``outfi``.

    The values are written one per row, without the index and without a
    header row.
    """
    frame = pd.read_csv(fi)
    content = frame.content
    content.to_csv(outfi, index=False, header=None)

In [None]:
# write out from sqlite to csv

In [5]:
# Extract the `content` column of uk-everything.csv into content-uk.txt
# (no header row, no index).
write_content('uk-everything.csv', 'content-uk.txt')