In [8]:
! pip install --quiet --upgrade scikit-learn
! pip install --quiet fuzzywuzzy
! pip install --quiet python-levenshtein
! pip install --quiet diskcache
! pip install --quiet lime
! pip install --quiet torch
! pip install --quiet gcsfs
! pip install --quiet xxhash
! pip install --upgrade numba
! pip install -e 'git://github.com/nandanrao/embed-software.git#egg=embed_software'

In [None]:
! pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz

In [278]:
! pip install --quiet spacy
! python -m spacy download en_core_web_lg

In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import re
import attr
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from validation.data import dot_train_data, get_soc_n, get_dictionary, indeed_test_data
from embed_software.preprocess import *
from embed_software.utils import get_embeddings, embed_docs
from classification.embedding import PreEmbeddedVectorizer
from validation.scoring import bubbleup_score, BubbleUpMixin

pd.set_option('max_colwidth',50)
pd.set_option('display.width', 700)

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [15, 10]

In [2]:
SAMPLE_SIZE = 100000
SOC_LEVEL = 6

In [79]:
X_train, y_train = dot_train_data(SOC_LEVEL, include_dot = False)

In [80]:
tasks = pd.read_csv('tasks.txt', sep='\t')

In [81]:
X_train.shape, tasks.shape

((19530,), (19530, 7))

In [5]:
class BubbleUpLogisticRegression(BubbleUpMixin, LogisticRegression):
    pass

model = Pipeline([
    ('sentencespace_200_patent_descriptions', PreEmbeddedVectorizer('./patent-descriptions/models/sentencespace-200', cache_dir='patent-description-cache-dir', chunk_size=1000)),
    ('lr', BubbleUpLogisticRegression(C=2., solver='lbfgs', class_weight='balanced', multi_class="multinomial", n_jobs=-1).set_bubbles(3))
])

In [None]:
model.fit(X_train, y_train)

In [None]:
mapping = pd.read_csv('./patent-descriptions/hbs_pat_ipc_mapping.csv', sep='\t')

In [7]:
patent_ids = set(mapping.patent.values)

In [37]:
import csv
from itertools import islice
import sys
from itertools import takewhile, islice, count
from validation.scoring import get_top_soc_n_preds, get_soc_n_preds, make_code_lookup

csv.field_size_limit(sys.maxsize)

def chunk(n, it):
    src = iter(it)
    return takewhile(bool, (list(islice(src, n)) for _ in count(0)))

def cast(i):
    try: 
        return int(i)
    except ValueError:
        return i

def format_preds(d, socs, probs):
    a,b = [(pd.DataFrame(df) 
            .assign(idx = d.id) 
            .melt(id_vars = ['idx'], value_name = key) 
            .drop(columns = 'variable')
            .sort_values('idx')
            .reset_index(drop=True)
            .set_index('idx'))
           for df,key in [(socs, 'soc'), (probs, 'prob')]]

    df = pd.concat([a,b], 1).reset_index().rename(columns = {'idx': 'patent'})
    return df

def make_predictions(model, mapping, lines):
    d = pd.DataFrame(lines, columns = ['id', 'description'])
    preds = model.predict_proba(d.description)
    preds_df = pd.DataFrame(preds)
    preds_df.columns = model.classes_    
    preds_socs, preds_probs = get_top_soc_n_preds(preds_df, 3, 5, True)    
    return mapping.merge(format_preds(d, preds_socs, preds_probs), how = 'inner', on = 'patent')



In [117]:
vectorizer = PreEmbeddedVectorizer('../abstracts-ss-100', cache_dir='patent-abstracts-cache-dir', chunk_size=500)

In [53]:
from joblib import Parallel, delayed

with open('patent-descriptions/processed/details.csv') as f:
    reader = csv.reader(f)
    lines = ([cast(l[0]), l[1]] for l in reader)
    lines = (l for l in lines if l[0] in patent_ids)
    lines = chunk(100000, lines)    
    pred_dfs = [make_predictions(model, mapping, list(c)) for c in lines]

    # pred_dfs = Parallel(n_jobs = -1)(delayed(make_predictions)(model, mapping, list(c)) for c in lines)

In [54]:
all_preds = pd.concat(pred_dfs)

In [55]:
all_preds.to_csv('patent-descriptions/patent-preds-all.csv', index = False)

# ABSTRACTS - NEAREST SENTENCE

In [5]:
import json
from itertools import islice
from json import JSONDecodeError

def loads(l):
    try:
        return json.loads(l)
    except JSONDecodeError:
        return { 'id': None }

with open('../abstracts') as f:
    dat = ((i, loads(l)) for i,l in enumerate(f) if i != 0)
    dat = ((d['id'], d.get('abstract')) for i,d in dat)
    dat = list(dat)

In [35]:
from joblib import Parallel, delayed
from embed_software.preprocess import claims_processor, readme_processor, Preprocessor, tokenizer


sentence = re.compile(r"\.\s+")

def tokenize(p, min_tokens):
    li_tokens = tokenizer(p)
    if len(li_tokens) < min_tokens:
        return None
    return ' '.join(li_tokens)  
    

def preprocessor(og, min_tokens = 3):
    sents = sentence.split(og)
    sents = [(s.strip(), claims_processor(s, numbers=True)) for s in sents]
    
    sents = [(s, tokenize(p, min_tokens)) for s,p in sents]
    sents = [(s, p) for s,p in sents if p is not None]
    originals, processed = zip(*sents)

    # each a list of strings.
    return originals, processed

def process(preprocessor, i, text):
    if text is None:
        return (i, text, text)

    originals, processed = preprocessor(text)
    return (i, originals, processed)

def process_abstracts_chunk(preprocessor, dat):
    processed = [process(preprocessor, i, t) for i,t in dat]
    return [(i,og,p) for i,og,p in processed if p]

def process_abstracts(preprocessor, dat, chunks = 32):
    chunks = np.array_split(dat, chunks)
    processed = Parallel(n_jobs=-1)(delayed(process_abstracts_chunk)(preprocessor, c) for c in chunks)
    return [y for x in processed for y in x]

In [None]:
# Filter dat by sample!

mapping = pd.read_csv('./patent-descriptions/hbs_pat_ipc_mapping.csv', sep='\t')
mapping['patent'] = mapping.patent.astype(str)

In [8]:
def count_sents(a):
    try: 
        return len(a.split('. '))
    except AttributeError:
        return 0

def is_cutoff(a):
    return a[-1] != '.' if a else True

patents = pd.DataFrame(dat, columns=['patent', 'abstract']) \
            .drop_duplicates('patent') \
            .merge(mapping, on='patent', how='inner') \
            .pipe(lambda df: df.assign(num_sents = df.abstract.map(count_sents))) \
            .pipe(lambda df: df.assign(cutoff = df.abstract.map(is_cutoff)))

In [9]:
sampled_patents = patents[(patents.appyear < 2007) & 
                          (patents.appyear > 1973) & 
                          (patents.num_sents <= 10) &
                          (patents.cutoff == False) ] \
                          .groupby(['class', 'appyear']) \
                          .apply(lambda df: df.sample(5) if df.shape[0] > 5 else None) \
                          .reset_index(drop = True)

In [48]:

# get original sentences
# limit to first 5 sentences

dat = [(r.patent, r.abstract) for i,r in sampled_patents.iterrows()]

In [49]:
%%time

lines = process_abstracts(preprocessor, dat, 32)
lines = [(i,ogg,pp) for i,og,p in lines for ogg,pp in zip(og, p)]

CPU times: user 312 ms, sys: 352 ms, total: 664 ms
Wall time: 709 ms


In [51]:
ids, ogs, sents = zip(*lines)
ids, ogs, sents = pd.Series(ids), pd.Series(ogs), pd.Series(sents)

In [52]:
%%time

from classification.embedding import WordEmbeddingVectorizer

vectorizer = WordEmbeddingVectorizer('../abstracts-ss-100.tsv', sep = '\t', cache_dir=None, chunk_size=1000, max_workers = 1)

embedded_tasks = vectorizer.fit_transform(X_train)

embedded_sents = vectorizer.fit_transform(sents)

CPU times: user 5.66 s, sys: 84 ms, total: 5.74 s
Wall time: 6.23 s


In [53]:
from numba import njit

@njit(parallel=True)
def get_nearest(embedded_tasks, embedded_sents, K = 1):
    N = embedded_sents.shape[0]
    A,D = np.zeros((N, K)), np.zeros((N, K))
    for i in np.arange(N):
        sim = embedded_tasks.dot(embedded_sents[i])
        top_idxs = np.argsort(sim)[-K:]
        for j,k in enumerate(top_idxs):
            A[i,j] = k
            D[i,j] = sim[k]
    return A,D

In [54]:
%%time

A,D = get_nearest(embedded_tasks, embedded_sents, 8)

CPU times: user 21min 59s, sys: 1h 1min 28s, total: 1h 23min 28s
Wall time: 2min 39s


In [55]:
close_idx = (D > .60)

close_idx.sum()

7495

In [56]:
far_idx = (D < .35)

far_idx.sum()

68714

In [57]:
all_idx = D <= 1.0

all_idx.sum()

363048

In [94]:
def make_results(idx, include_sents = True):
    ii = np.argwhere(idx)[:, 0].flatten()
    df = pd.DataFrame({
        'patent_id': ids.iloc[ii].values,
        'task_id': tasks.loc[A[idx]]['Task ID'].values,
        'task_soc': y_train[A[idx]].values,
        'distance': 1 - D[idx],
        'sent_id': sents.iloc[ii].index.values
    })
    if include_sents:
        df['abstract_sentence'] = ogs.iloc[ii].values
        df['task_sentence'] = tasks.loc[A[idx], 'Task'].values
    return df

In [189]:
make_results(close_idx).to_csv('patent-neighbors/week1/close_matches.csv', index=False)

In [190]:
make_results(far_idx).to_csv('patent-neighbors/week1/far_matches.csv', index=False)

In [191]:
make_results(all_idx, False).to_csv('patent-neighbors/week1/all_matches_no_sent.csv', index=False)

In [193]:
make_results(close_idx).sort_values('distance').head(5000).to_csv('patent-neighbors/week1/super_close_matches.csv', index=False)

In [196]:
make_results(far_idx).sort_values('distance', ascending=False).head(5000).to_csv('patent-neighbors/week1/super_far_matches.csv', index=False)

# Selecting for MTurk

In [161]:
%%time

from toolz import curry

@curry
def add_nested_local_id(nested, name, df):
    lookup = { s:i+1 for i,s in 
               enumerate(df[nested].unique()) }

    df[name] = df[nested].map(lookup)
    return df

@curry
def add_local_id(name, df, randomize=True):
    idx = np.arange(df.shape[0]) + 1
    if randomize:
        np.random.shuffle(idx)
    return df.assign(**{name: idx}).sort_values(name)

results = make_results(all_idx)




patents = pd.DataFrame(dat, columns = ['patent_id', 'abstract']) \
            .pipe(lambda df: df.assign(abstract = df.abstract.map(lambda a: ' '.join(a.split()).strip()))) \
            .merge(mapping, left_on='patent_id', right_on='patent')[['patent_id', 'abstract', 'appyear']] \
            .rename(columns = {'appyear': 'year'})

ided = results[['patent_id', 'sent_id', 'abstract_sentence', 
                'task_id', 'distance', 'task_sentence']] \
    .groupby(['patent_id']) \
    .apply(add_nested_local_id('sent_id', 'sent_index')) \
    .reset_index(drop=True)

# Take only first 5 sentences
ided = ided[ided.sent_index <= 5].shape

sentences = ided.drop_duplicates('sent_id') \
                .drop(columns = ['task_id', 'distance', 'task_sentence', 'sent_id'])

mturk_tasks = ided \
    .groupby(['patent_id', 'sent_index']) \
    .apply(add_local_id('task_index')) \
    .reset_index(drop=True) \
    .drop(columns = ['abstract_sentence', 'task_id', 'sent_id'])

AttributeError: 'tuple' object has no attribute 'drop_duplicates'

In [162]:
def assign_workers(patents, num_workers, per_worker):
    workers = [x for i in np.arange(num_workers) for x in [i+1]*per_worker]
    out = []
    vals = patents.values.tolist()
    curr_vals = vals.copy()
    for worker in workers:
        curr_vals = curr_vals if len(curr_vals) > 0 else vals.copy()
        out.append((worker, curr_vals.pop()))
    return pd.DataFrame(out, columns = ['worker_id', 'patent_id'])

In [163]:
small_sample_patents = patents.merge(mapping, left_on='patent_id', right_on='patent') \
                              .groupby('class') \
                              .apply(lambda df: df.sample(2)) \
                              .reset_index(drop=True) \
                              .sample(133).patent_id

workers = assign_workers(small_sample_patents, 40, 10)

In [166]:
patents.to_csv('mturk-data/abstracts.csv', index=False)
sentences.to_csv('mturk-data/sentences.csv', index=False)
mturk_tasks.to_csv('mturk-data/tasks.csv', index=False)
workers.to_csv('mturk-data/workers.csv', index=False)

In [197]:
import spacy
nlp = spacy.load('en_core_web_lg')

results = pd.read_csv('patent-neighbors/week0/abstracts-neighbor-results.csv')

In [19]:
i = 7

def get_subj(sent):
    subjs = [t for t in nlp(sent) 
             if t.dep_ == 'nsubj']
    words = [str(s) for s in subjs]
    subj = ','.join(words) if words else None
    return subj

results = results.assign(subj = [get_subj(sent) for sent in results.abstract_sentence])

In [21]:
results.to_csv('abstracts-neighbor-sample.csv', index=False)

In [None]:
model = Pipeline([('sentencespace_100_us', PreEmbeddedVectorizer('../abstracts-ss-100', 100, cache_dir='embed_cache')),
                  ('knn', KNeighborsClassifier(1, n_jobs=-1))])

model.fit(X_train, y_train)

In [8]:
def _get_soc_n(df, n):
    return (df.T
            .reset_index()
            .pipe(lambda df: df.assign(soc = df['index'].map(lambda i: str(i)[0:n])))
            .set_index('soc')
            .drop('index', 1)
            .groupby('soc').sum().T)


def get_pred(model, X):
    vals = model.predict_proba(X)
    df = pd.DataFrame(vals)
    df.columns = model.classes_
    n=3
    return _get_soc_n(df, n)

class UpscaleModel(LogisticRegression):
    def predict_soc_n(self, X, n):
        preds = self.predict_proba(X)
        df = pd.DataFrame(preds)
        df.columns = labels
        return self._get_soc_n(df, n)
    
    
def make_title_lookup(path, N):
    dot_codes = get_dictionary('', N).groupby('soc').first()
    d = dot_codes[f'desc_soc{N}'].to_dict()
    def lookup(code):
        try:
            return d[int(code)]
        except KeyError:
            return code
    return lookup

In [46]:
labels = np.unique(y_train)
lookup = make_title_lookup('', 3)

def get_soc_n(df, n):
    return (df.T
            .reset_index()
            .pipe(lambda df: df.assign(soc = df['index'].map(lambda i: str(i)[0:n])))
            .set_index('soc')
            .drop('index', 1)
            .groupby('soc')
            .sum().T
            .idxmax(1))

In [55]:
def print_preds(model, labels, target):
    preds = model.predict_proba(target)
    df = pd.DataFrame(preds)
    df.columns = labels

    res = [(lookup(cl),li) for cl,li in zip(get_soc_n(df, 3).values, target)]

    for title, desc in res:
        print(title)
        print(desc)
        print('\n')

In [104]:
print_preds(model, labels, lines[:25])

Material Moving Workers
flexible longitudinally continuous tape construction is disclosed for use in joining mating edges of juxtaposed members the tape having an like configuration transversely of its length to provide legs adapted to receive and be secured to the edges of the members to be joined	the tape is capable of serving as pliable hinge to permit articulation of the joined members or it may also serve simply as binding for joining members intended to be fixed relative to each other	the tape construction combines longitudinally continuous marginal web portions or carriers forming the extremities of the legs of the with longitudinally spaced strand or equivalent connector means running crosswise of and interconnecting pairs of marginal web portions	the connector means intersect and interlock forming the axis of the like configuration


Food Processing Workers
method of preserving perishable products	the method of packaging perishable products in container and insuring preservati

In [103]:
print_preds(model, labels, lines[1000:1025])

Textile, Apparel, and Furnishings Workers
method for production of hydroxyalkylglycol ethers	method for the production of hydroxyalkyl glycol ethers comprises reacting non terminal epoxides with ethylene glycol in the presence of an alkoxylation catalyst and saturated hydrocarbons in particular saturated aliphatic hydrocarbons as solvents


Other Installation, Maintenance, and Repair Occupations
removal and neutralisation of acid catalyst from products of cumene hydroperoxide cleavage	the mineral acid catalyst is removed from the products of cumene hydroperoxide cleavage by contact with an aqueous solution of an inorganic salt and an excess of an alkali metal hydroxide or phenate in first zone	suitably the aqueous solution contains sodium sulphate and sodium hydroxide or phenate	the aqueous layer is removed and the organic layer contacted in second zone with an aqueous solution comprising an inorganic salt and sufficient weak acid to decompose any phenate carried over from the first zo

In [105]:
print_preds(model, labels, lines[5000:5025])

Computer Occupations
use of hot spare drives to boost performance during nominal raid operation	method and apparatus for increasing performance in data processing system	the data processing system includes plurality of storage devices and backup storage device	the backup storage device is configured as log device	data is logged to the backup storage device after the backup storage device has been configured as log device	in response to failure of storage device within the plurality of storage devices the backup storage device is reconfigured to be used as replacement for the failed storage device


Computer Occupations
estimator program for estimating the availability of an application program that runs in cluster of at least two computers	an estimator program is disclosed which performs method steps for estimating the availability of an application program that runs on any computer in cluster of at least two computers	by the availability of an application program is herein meant the p

In [109]:
print_preds(model, labels, lines[5000:5025])

Other Production Occupations
thermo magnetic image recording methods and apparatus	magnetic image recording methods and apparatus employ magnetic recording medium susceptible to an image wise change of magnetization in response to thermal image patterns provided by image wise exposures of thermal image pattern generating device	the thermal device is subjected to repeated image wise exposures with each exposure including an exposure to an image part to which the thermal device is also exposed during another one of the exposures	the thermal device is moved relative to the recording medium between exposures and the image is magnetically recorded onto the recording medium with the aid of thermal image patterns generated by the exposures	in accordance with another aspect the mentioned thermal device is exposed at different spatial locations to each elemental area of the image	the exposed spatial locations as to each elemental image are brought into coincidence relative to the recording medi

In [111]:
print_preds(model, labels, lines[12000:12050])

Assemblers and Fabricators
battery of storage cells	the invention relates to semi open battery of storage cells each likely to evolve large volume of hydrogen more particularly during overcharge	battery according to the invention comprises an intermediate or dummy lid so located with respect to the regular lid of the battery to enable the inflammable gas flow generated in the cells to be isolated from the flow of fluid or gas serving for cooling of the electrical connections of the storage cells and thus avoiding or eliminating all danger of explosion


Material Moving Workers
blow molded gang type vent for multiple cell electric storage battery	dimensionally flexible one piece thermoplastic resinous hollow gang type vent with vent plugs for plurality of the filling wells of multiple cell battery is produced by blow molding technique


Vehicle and Mobile Equipment Mechanics, Installers, and Repairers
porous ceramic battery vent	fired porous ceramic battery vent having porosity and part

In [87]:
l = pd.concat([lines[:50], lines[1000:1050], lines[10000:10050], lines[-50:]])

In [None]:
from sklearn.neighbors import NearestNeighbors

a = embed_docs('../abstracts-ss-100', '\n'.join(l))

nn = NearestNeighbors()
nn.fit(a)

In [99]:
def print_neighbors(nn, a, i):
    _, idxs = nn.kneighbors(a[i].reshape(1,-1), n_neighbors = 5)
    for i in idxs:
        print(l.values[i])

In [108]:
print_neighbors(nn, a, 150)

['multi way cache expansion circuit architecture\tan expandable set tag cache circuit for use with data cache memory comprises tag memory divided into first set and second set for storing under single address location first and second tag fields representative of first and second data respectively\tthe tag memory also stores first and second signals representative of which of the sets is the least recently used\tcomparator is responsive to tag field of an address representative of requested data as well as to first tag field output from the tag memory for producing an output signal indicative of match therebetween\tsecond comparator is responsive to the same tag field of the address and to second tag field output from the tag memory for producing an output signal indicative of match therebetween\tfirst logic gate is responsive to the first and second comparators for producing an output signal indicative of the availability of the requested data in the data cache memory\tsecond logic ga