In [8]:
import os
import csv
import pandas as pd
from toolz import curry
from tqdm import tqdm_notebook
from embed_software.preprocess import claims_processor, readme_processor, Preprocessor

def chunked_reader(f, chunksize=1000, **kwargs):
    """Lazily yield the rows of a delimited file as plain Python lists.

    The file is parsed with ``pd.read_csv`` in chunks of ``chunksize`` rows,
    so only one chunk is held as a DataFrame at a time.

    Args:
        f: Path or file-like object handed to ``pd.read_csv``.
        chunksize: Number of rows parsed per chunk.
        **kwargs: Forwarded unchanged to ``pd.read_csv`` (e.g. ``sep``).

    Yields:
        list: The cell values of one row, in file order.
    """
    frames = pd.read_csv(f, chunksize=chunksize, **kwargs)
    for frame in frames:
        # iterrows() yields (index, Series) pairs; only the values are needed.
        yield from (row.values.tolist() for _, row in frame.iterrows())

def process_tsv(preprocessor, fi):
    """Lazily preprocess the text column of a three-column TSV.

    Every row read from ``fi`` must have exactly three fields: the first is
    passed through untouched, the second is the text given to
    ``preprocessor``, and the third is discarded.

    Args:
        preprocessor: Callable applied to each non-missing text value.
        fi: Path or file-like object containing tab-separated data.

    Returns:
        A generator of ``[first_field, processed_text]`` lists. Rows whose
        text is missing (NaN), or whose processed text is falsy, are skipped.
    """
    def _processed_rows():
        for key, text, _ignored in chunked_reader(fi, sep='\t'):
            if pd.isna(text):
                continue
            cleaned = preprocessor(text)
            if not cleaned:
                continue
            yield [key, cleaned]

    return _processed_rows()

def unzip(fi):
    """Download a zip archive and open its first member.

    The whole archive is fetched into memory; nothing is written to disk.

    Args:
        fi: URL of the zip archive.

    Returns:
        tuple: ``(name, handle)`` where ``name`` is the filename of the first
        archive member and ``handle`` is a binary file-like object for it.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        IndexError: If the archive contains no members.
    """
    response = requests.get(fi)
    # Fail on HTTP errors here; otherwise an error page would only surface
    # later as an opaque BadZipFile.
    response.raise_for_status()
    # Named `archive` rather than `zipfile` to avoid shadowing the stdlib module.
    archive = ZipFile(BytesIO(response.content))
    name = archive.namelist()[0]
    return name, archive.open(name)

def write_out(fs, outfile, lines):
    """Write rows to ``outfile`` as CSV with every field quoted.

    Args:
        fs: Object exposing ``open(path, mode)`` that returns a context
            manager yielding a writable text handle.
        outfile: Destination path, passed to ``fs.open``.
        lines: Iterable of rows (each a sequence of fields); consumed once.
    """
    with fs.open(outfile, 'w') as handle:
        csv.writer(handle, quoting=csv.QUOTE_ALL).writerows(lines)

# def process(preprocessor, inpath, outpath, fi):
#     with open(os.path.join(inpath, fi)) as f:
#         lines = process_tsv(preprocessor, f)
#         write_out(os.path.join(outpath, fi), lines)    


# def main(inpath, outpath, preprocessor):
#     files = os.listdir(inpath)
#     for f in tqdm_notebook(files):
#         process(preprocessor, inpath, outpath, f)


def process(fs, preprocessor, outpath, fi):
    """Fetch one zipped TSV, preprocess it and store the result as CSV.

    Args:
        fs: Filesystem object passed on to ``write_out``.
        preprocessor: Callable applied to each row's text by ``process_tsv``.
        outpath: Directory/prefix under which the output is written.
        fi: URL of the zip archive to download.

    The output file is named after the first member of the archive and is
    placed under ``outpath``.
    """
    member_name, member_handle = unzip(fi)
    cleaned_rows = process_tsv(preprocessor, member_handle)
    destination = os.path.join(outpath, member_name)
    write_out(fs, destination, cleaned_rows)


from joblib import Parallel, delayed


def main(fs, preprocessor, outpath, links):
    """Run ``process`` for every link in parallel.

    Args:
        fs: Filesystem object handed to each ``process`` call.
        preprocessor: Callable handed to each ``process`` call.
        outpath: Output directory/prefix handed to each ``process`` call.
        links: Iterable of archive URLs, one job per URL.
    """
    # n_jobs=-1 asks joblib to use all available CPUs.
    runner = Parallel(n_jobs=-1)
    jobs = (delayed(process)(fs, preprocessor, outpath, url) for url in links)
    runner(jobs)


# main('./patent-descriptions/data', './patent-descriptions/processed', preprocessor)

In [6]:
import requests
from bs4 import BeautifulSoup
from io import BytesIO
from zipfile import ZipFile
from random import shuffle

# Scrape the PatentsView download page for the archive links that are later
# passed to main().
res = requests.get('http://www.patentsview.org/download/detail_desc_text.html')
# Stop on an HTTP error instead of parsing an error page into an empty list.
res.raise_for_status()
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# is installed, so results can differ between environments (and it warns).
soup = BeautifulSoup(res.content, 'html.parser')
# Select only anchors that carry an href; a bare <a> would raise KeyError.
links = [x.attrs['href'] for x in soup.select('table a[href]')]
# Randomise the processing order in place.
# NOTE(review): presumably to balance work across parallel jobs — confirm.
shuffle(links)

In [9]:
from gcsfs import GCSFileSystem
# Filesystem handle used by write_out() to open the output files.
fs = GCSFileSystem(project='labor-market-data')

# Bound `process` method of a Preprocessor built around readme_processor.
# NOTE(review): the meaning of the second argument (4) is not visible in this
# notebook — confirm against embed_software.preprocess.
preprocessor = Preprocessor(readme_processor, 4).process

# Download, preprocess and upload every archive found on the download page.
main(
    fs=fs,
    preprocessor=preprocessor,
    outpath='lmd-patent-predictions',
    links=links,
)