In [None]:
! pip install --quiet tqdm
! pip install --quiet gcsfs

In [2]:
import pandas as pd
import os
from lib.preprocess import *
from tqdm import tnrange, tqdm_notebook
from multiprocessing import Pool
import threading 

In [3]:
import gcsfs

class ParallelProcessor():
    """Run a string processor over one text column of every CSV under a GCS path.

    Each file listed under ``inpath`` is read into a DataFrame, the column
    named ``content_key`` is mapped through ``string_processor`` in a worker
    process, and the resulting ``(id, content)`` pairs are appended to a
    single local file at ``outpath``.

    Parameters
    ----------
    string_processor : callable
        Applied to every non-null value of ``content_key``. A null result
        causes the row to be dropped. It is sent to worker processes, so it
        has to be picklable.
    inpath : str
        GCS location (without the ``gs://`` prefix) that holds the input files.
    outpath : str
        Local path of the output file written by ``process_all``.
    content_key : str
        Name of the column holding the raw text.
    id_key : str
        Name of the column holding the row identifier.
    pandas_kwargs : dict, optional
        Extra keyword arguments forwarded to ``pandas.read_csv``.
    cores : int, optional
        Number of worker processes; ``None`` lets ``multiprocessing.Pool``
        choose.
    """

    def __init__(self,
                 string_processor,
                 inpath,
                 outpath,
                 content_key,
                 id_key='application_number_formatted',
                 pandas_kwargs=None,
                 cores=None):
        self.string_processor = string_processor
        self.inpath = inpath
        self.outpath = outpath
        self.id_key = id_key
        self.content_key = content_key
        # Copy instead of sharing a mutable default (or the caller's dict)
        # between instances.
        self.pandas_kwargs = dict(pandas_kwargs) if pandas_kwargs else {}
        self.cores = cores
        self.fs = gcsfs.GCSFileSystem(project='open-source-software')

    def _read(self, f):
        """Read the file named ``f`` under ``inpath`` into a DataFrame."""
        url = 'gs://{}/{}'.format(self.inpath, f)
        # The handle gets its own name so the filename argument is not shadowed.
        with self.fs.open(url) as handle:
            return pd.read_csv(handle, **self.pandas_kwargs)

    def _get_files(self):
        """Return the non-empty base names of everything listed under ``inpath``."""
        # Reuse the filesystem created in __init__ rather than building another.
        base_names = (path.split('/')[-1] for path in self.fs.ls(self.inpath))
        return [name for name in base_names if name]

    def process(self,
                f,
                compression='gzip'):
        """Read one file and return its rows with a processed ``content`` column.

        Rows whose raw text is null, whose processed text is null, or whose
        id is null are removed. The raw text column is dropped from the result.

        ``compression`` is accepted for backward compatibility but is not
        used; compression is controlled through ``pandas_kwargs``.
        """
        key = self.content_key
        df = self._read(f)
        df = df[df[key].notna()].reset_index(drop=True)

        df['content'] = df[key].map(self.string_processor)

        keep = df['content'].notna() & df[self.id_key].notna()
        df = df[keep].reset_index(drop=True)

        # When the raw column is itself called 'content' it has just been
        # overwritten with the processed text, so there is nothing to drop.
        if key != 'content':
            # Keyword form: a positional axis is rejected by pandas >= 2.0.
            df = df.drop(columns=key)

        return df

    def process_all(self):
        """Process every input file in a worker pool and write ``outpath``.

        Output lines have the form ``<id>,<content>`` with no quoting or
        escaping, so content containing commas or newlines is written verbatim.
        """
        files = self._get_files()
        pool = Pool(self.cores)
        try:
            with open(self.outpath, 'w', encoding='utf-8') as out:
                results = pool.imap(self.process, files)
                for df in tqdm_notebook(results, total=len(files)):
                    for row_id, content in zip(df[self.id_key], df['content']):
                        out.write('{},{}\n'.format(row_id, content))
        except BaseException:
            # Stop the workers right away on failure or interrupt instead of
            # leaving them running.
            pool.terminate()
            raise
        else:
            pool.close()
        finally:
            pool.join()
        

In [5]:
# Build the patent lookup file from the exported patent-description files.
# NOTE(review): ParallelProcessor.__init__ declares `content_key` as a required
# argument and it is not supplied below, so this call raises TypeError as
# written. Pass the name of the text column once it has been confirmed.
inpath = 'oss_bigquery_exports/patent-descriptions'
string_processor = Preprocessor(claims_processor, 4).process
p = ParallelProcessor(
    string_processor,
    inpath,
    'patent-lookup.csv',
    pandas_kwargs={'compression': 'gzip'},
    cores=24,
)
p.process_all()

HBox(children=(IntProgress(value=0, max=586), HTML(value='')))




In [None]:
# preprocess('claims_lookup.csv',  'oss_bigquery_exports/patent-descriptions', 
#                                                    process_claims, 
#                                                    'application_number_formatted', 'gzip')

In [None]:
# preprocess('readme_lookup.csv', 'oss_bigquery_exports/readmes', process_readmes, 'id')

HBox(children=(IntProgress(value=0, max=16), HTML(value='')))

### Treat owner like collaborative filtering? 

In [101]:
# One string per `repo_owner`: that owner's `content` values joined by ' \t '.
# NOTE(review): `df` is not defined in any cell visible here; it presumably
# comes from an earlier or deleted cell. Confirm before Restart & Run All.
by_owner = df.groupby('repo_owner').apply(
    lambda owner_rows: ' \t '.join(owner_rows.content)
)

In [102]:
# Dump the per-owner strings to disk, one owner per line.
with open('by_owner.txt', 'w') as out:
    out.writelines(text + '\n' for text in by_owner)