In [1]:
from embed_software.preprocess import claims_processor, readme_processor, Preprocessor

In [75]:
from os.path import join
import json
import dataset
from time import sleep

from multiprocessing import Pool
from tqdm import tqdm_notebook  
from itertools import takewhile, islice, count
from sqlite3 import connect, OperationalError

def chunk(n, it):
    """Lazily split iterable *it* into consecutive lists of at most *n* items.

    The final list may be shorter than *n*. Iteration ends as soon as an
    empty batch is produced, so an exhausted input (or ``n == 0``) yields
    nothing at all.
    """
    source = iter(it)

    def next_batch():
        # Pull up to n items from the shared iterator; [] once it is drained.
        return list(islice(source, n))

    # Two-argument iter(): keep calling next_batch until it returns the
    # sentinel (an empty list).
    return iter(next_batch, [])


# NOTE(review): `con` is not defined in any cell visible here, so this line
# raises NameError on a fresh kernel. It looks like a leftover from an earlier
# notebook session — confirm whether it is still needed.
con.commit()

class ParallelProcessor():
    """Preprocess JSON-lines files in parallel and store the result in SQLite.

    Each input file is read line by line; every line is parsed as JSON, its
    ``description`` field is run through ``string_processor`` and the selected
    fields are inserted into the ``processed`` table of the database at
    ``outpath``. Files are distributed over a ``multiprocessing.Pool``.

    Parameters
    ----------
    inpath : str
        Directory (as understood by ``fs``) that holds the input files.
    fs : object
        Filesystem object providing ``ls(path)`` and ``open(path)``.
    outpath : str
        Path of the SQLite database file to write to.
    string_processor : callable
        Called with each record's ``description``; a falsy result drops the
        record.
    cores : int
        Number of worker processes.
    **kwargs
        Forwarded to ``_process`` for every record; must contain ``keys``,
        the record fields to store, in the column order
        ``(url, title, content, timestamp)``.
    """

    def __init__(self, inpath, fs, outpath, string_processor, cores, **kwargs):
        self.inpath = inpath
        self.outpath = outpath
        self.cores = cores
        self.fs = fs
        self.kwargs = kwargs
        self.string_processor = string_processor

    def _create_table(self, outpath=None):
        """Create the ``processed`` table if it does not exist yet.

        ``outpath`` defaults to ``self.outpath``. Using ``IF NOT EXISTS``
        makes the call idempotent, so a second run against the same database
        no longer fails with ``OperationalError``.
        """
        con = connect(self.outpath if outpath is None else outpath)
        try:
            con.execute("""CREATE TABLE IF NOT EXISTS processed(
                               id INTEGER PRIMARY KEY,
                               url VARCHAR UNIQUE,
                               title VARCHAR,
                               content VARCHAR,
                               timestamp DATETIME)
                        """)
            con.commit()
        finally:
            con.close()

    def _insert_rows(self, cur, li, tries=0, max_tries=20, delay=.2):
        """Insert the rows ``li`` through cursor ``cur``, retrying on errors.

        Rows whose ``url`` already exists are ignored. An
        ``OperationalError`` (e.g. the database being locked by another
        worker) is retried after ``delay`` seconds until ``tries`` reaches
        ``max_tries``.

        Raises
        ------
        sqlite3.OperationalError
            If the insert still fails once the retries are exhausted.
            Previously the batch was dropped silently in that case.
        """
        if not li:
            return
        while True:
            try:
                cur.executemany(
                    "insert or ignore into processed(url,title,content,timestamp) values (?,?,?,?)",
                    li)
                return
            except OperationalError:
                if tries >= max_tries:
                    raise
                tries += 1
                sleep(delay)

    def _process(self, d, keys):
        """Turn one parsed record into a row, or ``None`` to skip it.

        Stores ``string_processor(d['description'])`` under ``d['content']``
        and returns ``[d[k] for k in keys]``; returns ``None`` when the
        processed content is falsy.
        """
        d['content'] = self.string_processor(d['description'])
        if not d['content']:
            return None
        return [d[k] for k in keys]

    def _get_files(self):
        """Return the non-empty base names of the entries under ``inpath``."""
        names = (f.split('/')[-1] for f in self.fs.ls(self.inpath))
        # A listing entry ending in '/' has an empty base name; drop it.
        return [name for name in names if name]

    def process(self, filename):
        """Process one JSON-lines file and insert its rows into the database.

        Lines are handled in batches of 1000. Each batch is committed on its
        own so the write transaction is released between batches instead of
        being held for the whole file while other workers wait on it.
        """
        con = connect(self.outpath)
        try:
            cur = con.cursor()
            with self.fs.open(filename) as f:
                for batch in chunk(1000, f):
                    rows = [self._process(json.loads(line), **self.kwargs)
                            for line in batch
                            if line.strip()]  # skip blank lines
                    rows = [row for row in rows if row]
                    self._insert_rows(cur, rows)
                    con.commit()
        finally:
            # Always release the connection, even if a line fails to parse.
            con.close()

    def process_all(self, files=None):
        """Process ``files`` (default: every file under ``inpath``) in parallel.

        Ensures the output table exists, then maps ``process`` over the files
        with a pool of ``cores`` workers while showing a progress bar.
        """
        self._create_table(self.outpath)
        if files is None:
            files = [join(self.inpath, f) for f in self._get_files()]
        else:
            files = list(files)  # len() is needed for the progress bar
        pool = Pool(self.cores)
        try:
            for _ in tqdm_notebook(pool.imap(self.process, files), total=len(files)):
                pass
        finally:
            # Shut the workers down even when one of them raised.
            pool.close()
            pool.join()

In [None]:
import gcsfs

# Driver cell: preprocess the files under `inpath` into the local SQLite
# database 'us-jobs.db' using 5 worker processes.
fs = gcsfs.GCSFileSystem()
inpath = 'indeed-data/indeed-us'
# Record fields to store; the order must match the insert column order
# (url, title, content, timestamp) used by ParallelProcessor._insert_rows.
keys = ['url', 'title', 'content', 'scrapeTimestamp']
# NOTE(review): the meaning of the second Preprocessor argument (4) is not
# visible here — confirm against embed_software.preprocess.
string_processor = Preprocessor(readme_processor, 4).process

# `keys` travels through **kwargs and is forwarded to _process for each record.
p = ParallelProcessor(inpath, fs, 'us-jobs.db', string_processor, cores=5, keys=keys)
p.process_all()

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))