In [None]:
! pip install --quiet --upgrade scikit-learn
! pip install --quiet fuzzywuzzy
! pip install --quiet python-levenshtein
! pip install --quiet diskcache
! pip install --quiet lime
! pip install --quiet torch
! pip install --quiet gcsfs
! pip install --quiet xxhash
! pip install --upgrade numba
! pip install -e 'git://github.com/nandanrao/embed-software.git#egg=embed_software'

In [None]:
! pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz

In [278]:
! pip install --quiet spacy
! python -m spacy download en_core_web_lg

In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import re
import attr
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from validation.data import dot_train_data, get_soc_n, get_dictionary, indeed_test_data
from embed_software.preprocess import *
from embed_software.utils import get_embeddings, embed_docs
from classification.embedding import PreEmbeddedVectorizer
from validation.scoring import bubbleup_score, BubbleUpMixin

pd.set_option('max_colwidth',50)
pd.set_option('display.width', 700)

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [15, 10]



In [2]:
SAMPLE_SIZE = 100000
SOC_LEVEL = 6

In [3]:
X_train, y_train = dot_train_data(SOC_LEVEL, include_dot = False)

In [4]:
tasks = pd.read_csv('tasks.txt', sep='\t')

In [5]:
X_train.shape, tasks.shape

((19530,), (19530, 7))

In [5]:
# Logistic regression combined with the project's BubbleUpMixin; the class adds
# nothing beyond its two bases.
class BubbleUpLogisticRegression(BubbleUpMixin, LogisticRegression):
    pass

# Two-step pipeline: a PreEmbeddedVectorizer pointed at the sentencespace-200
# model directory, followed by the bubble-up logistic regression.
# NOTE(review): presumably set_bubbles(3) configures the SOC level the mixin
# aggregates to — confirm against validation.scoring.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases and the
# install cell upgrades scikit-learn — confirm this still constructs cleanly.
model = Pipeline([
    ('sentencespace_200_patent_descriptions', PreEmbeddedVectorizer('./patent-descriptions/models/sentencespace-200', cache_dir='patent-description-cache-dir', chunk_size=1000)),
    ('lr', BubbleUpLogisticRegression(C=2., solver='lbfgs', class_weight='balanced', multi_class="multinomial", n_jobs=-1).set_bubbles(3))
])

In [None]:
model.fit(X_train, y_train)

In [None]:
mapping = pd.read_csv('./patent-descriptions/hbs_pat_ipc_mapping.csv', sep='\t')

In [7]:
patent_ids = set(mapping.patent.values)

In [6]:
import csv
from itertools import islice
import sys
from itertools import takewhile, islice, count
from validation.scoring import get_top_soc_n_preds, get_soc_n_preds, make_code_lookup

csv.field_size_limit(sys.maxsize)

def chunk(n, it):
    """Lazily split an iterable into consecutive lists of at most ``n`` items.

    The returned object is an iterator; each step pulls the next ``n`` items
    from ``it``. Iteration ends as soon as a batch comes back empty.
    """
    source = iter(it)

    def next_batch():
        return list(islice(source, n))

    # Two-argument iter() keeps calling next_batch until it returns the
    # sentinel (an empty list), i.e. until the source is exhausted.
    return iter(next_batch, [])

def cast(i):
    """Return ``int(i)`` when that conversion succeeds, otherwise ``i`` unchanged.

    Only ``ValueError`` is handled, so values ``int`` rejects with another
    exception type (e.g. ``None``) still raise.
    """
    try:
        converted = int(i)
    except ValueError:
        converted = i
    return converted

def format_preds(d, socs, probs):
    """Reshape wide top-k predictions into one long frame keyed by patent.

    Parameters
    ----------
    d : DataFrame
        Must expose an ``id`` column; its values label the rows of ``socs``
        and ``probs`` (aligned on index).
    socs, probs : array-like or DataFrame
        Anything ``pd.DataFrame`` accepts. Presumably one row per document and
        one column per ranked prediction — verify against get_top_soc_n_preds.

    Returns
    -------
    DataFrame
        Columns ``patent``, ``soc`` and ``prob``, one row per (document, column)
        pair, ordered by patent id.
    """
    def to_long(values, key):
        # A stable sort keeps the original column order within each patent, so
        # the soc and prob rows line up and stay in rank order.
        return (pd.DataFrame(values)
                .assign(idx = d.id)
                .melt(id_vars = ['idx'], value_name = key)
                .drop(columns = 'variable')
                .sort_values('idx', kind = 'mergesort')
                .reset_index(drop=True)
                .set_index('idx'))

    soc_long = to_long(socs, 'soc')
    prob_long = to_long(probs, 'prob')

    # `axis` must be passed by keyword: pandas 2 made every concat argument
    # except `objs` keyword-only.
    df = pd.concat([soc_long, prob_long], axis=1).reset_index().rename(columns = {'idx': 'patent'})
    return df

def make_predictions(model, mapping, lines):
    """Score one batch of ``[id, description]`` rows and join the result onto ``mapping``.

    ``model`` must provide ``predict_proba`` and ``classes_``. The top
    predictions are selected by ``get_top_soc_n_preds(..., 3, 5, True)``.
    NOTE(review): presumably SOC level 3 and the 5 best classes — confirm
    against validation.scoring.

    Returns the inner join of ``mapping`` with the formatted predictions on ``patent``.
    """
    batch = pd.DataFrame(lines, columns = ['id', 'description'])

    probabilities = pd.DataFrame(model.predict_proba(batch.description))
    probabilities.columns = model.classes_

    top_socs, top_probs = get_top_soc_n_preds(probabilities, 3, 5, True)
    formatted = format_preds(batch, top_socs, top_probs)
    return mapping.merge(formatted, how = 'inner', on = 'patent')



In [117]:
vectorizer = PreEmbeddedVectorizer('../abstracts-ss-100', cache_dir='patent-abstracts-cache-dir', chunk_size=500)

In [53]:
from joblib import Parallel, delayed

# Stream the description file lazily, keep only rows whose id is in `patent_ids`,
# and score them in batches of 100000 rows; one prediction frame is collected per batch.
with open('patent-descriptions/processed/details.csv') as f:
    reader = csv.reader(f)
    # Column 0 is used as the patent id (cast to int when it parses as one),
    # column 1 as the text passed to the model.
    lines = ([cast(l[0]), l[1]] for l in reader)
    lines = (l for l in lines if l[0] in patent_ids)
    lines = chunk(100000, lines)  
    # Must run inside the `with`: the generators above read from the open file.
    pred_dfs = [make_predictions(model, mapping, list(c)) for c in lines]

    # pred_dfs = Parallel(n_jobs = -1)(delayed(make_predictions)(model, mapping, list(c)) for c in lines)

In [54]:
all_preds = pd.concat(pred_dfs)

In [55]:
all_preds.to_csv('patent-descriptions/patent-preds-all.csv', index = False)

# ABSTRACTS - NEAREST SENTENCE

In [7]:
import json
from itertools import islice
from json import JSONDecodeError

def loads(l):
    """Parse one JSON line; a line that fails to decode yields ``{'id': None}``."""
    try:
        parsed = json.loads(l)
    except JSONDecodeError:
        parsed = { 'id': None }
    return parsed

# Read the abstracts file line by line, skipping line 0 (presumably a header —
# TODO confirm), and keep an (id, abstract) pair per line. Lines that fail to
# decode come back from `loads` with id None and therefore no abstract.
with open('../patent-abstracts/abstracts') as f:
    dat = ((i, loads(l)) for i,l in enumerate(f) if i != 0)
    dat = ((d['id'], d.get('abstract')) for i,d in dat)
    # Materialise while the file is still open: the generators above are lazy.
    dat = list(dat)

In [8]:
from joblib import Parallel, delayed
from embed_software.preprocess import claims_processor, readme_processor, Preprocessor, tokenizer


sentence = re.compile(r"\.\s+")

def tokenize(p, min_tokens):
    """Tokenize ``p`` and re-join the tokens with single spaces.

    Returns ``None`` when the text yields fewer than ``min_tokens`` tokens.
    """
    tokens = tokenizer(p)
    return ' '.join(tokens) if len(tokens) >= min_tokens else None

def preprocessor(og, min_tokens = 3):
    """Split a text into sentences and pair each original with its cleaned form.

    Sentences are split on a period followed by whitespace, cleaned with
    ``claims_processor(..., numbers=True)`` and tokenized; sentences with fewer
    than ``min_tokens`` tokens are dropped.

    Returns two equally long tuples of strings: ``(originals, processed)``.
    Raises ``ValueError`` (from the unpacking) when no sentence survives.
    """
    cleaned = [(raw.strip(), claims_processor(raw, numbers=True))
               for raw in sentence.split(og)]

    pairs = []
    for original, text in cleaned:
        joined = tokenize(text, min_tokens)
        if joined is not None:
            pairs.append((original, joined))

    # Transpose [(original, processed), ...] into two parallel tuples.
    originals, processed = zip(*pairs)
    return originals, processed

def process(preprocessor, i, text):
    """Run ``preprocessor`` on one text and tag the outcome with its id ``i``.

    Returns ``(i, originals, processed)``. Both payload slots are ``None`` when
    ``text`` is ``None`` or when preprocessing (or unpacking its result) raises
    ``ValueError``.
    """
    if text is None:
        return (i, None, None)

    try:
        originals, processed = preprocessor(text)
    except ValueError:
        return (i, None, None)
    return (i, originals, processed)

def process_abstracts_chunk(preprocessor, dat):
    """Process a batch of ``(id, text)`` pairs, keeping only non-empty results.

    Returns a list of ``(id, originals, processed)`` tuples whose ``processed``
    slot is truthy.
    """
    kept = []
    for i, t in dat:
        outcome = process(preprocessor, i, t)
        if outcome[2]:
            kept.append(outcome)
    return kept

def process_abstracts(preprocessor, dat, chunks = 32):
    """Preprocess all ``(id, text)`` pairs in parallel and flatten the results.

    ``chunks`` is the number of items handed to each joblib task (the batch
    size passed to ``chunk``), and all available cores are used (``n_jobs=-1``).
    """
    batches = chunk(chunks, dat)
    jobs = (delayed(process_abstracts_chunk)(preprocessor, batch) for batch in batches)
    per_batch = Parallel(n_jobs=-1)(jobs)

    flattened = []
    for batch_result in per_batch:
        flattened.extend(batch_result)
    return flattened



In [23]:
from numba import njit
from classification.embedding import WordEmbeddingVectorizer

# For every row of `embedded_sents`, find the K rows of `embedded_tasks` with
# the largest dot product.
#
# Returns (A, D), both float arrays of shape (N, K) with N = embedded_sents.shape[0]:
#   A[i, j] -- row index into `embedded_tasks` (stored as a float because A is
#              allocated with np.zeros)
#   D[i, j] -- the dot product for that pair
# Within a row the matches are in ascending order of similarity, because
# np.argsort sorts ascending and the last K entries are taken.
# NOTE(review): the outer loop iterates np.arange rather than numba.prange, so
# parallel=True presumably does not parallelise it — confirm whether intended.
@njit(parallel=True)
def get_nearest(embedded_tasks, embedded_sents, K = 1):
    N = embedded_sents.shape[0]
    A,D = np.zeros((N, K)), np.zeros((N, K))
    for i in np.arange(N):
        # Similarity of sentence i against every task.
        sim = embedded_tasks.dot(embedded_sents[i])
        top_idxs = np.argsort(sim)[-K:]
        for j,k in enumerate(top_idxs):
            A[i,j] = k
            D[i,j] = sim[k]
    return A,D

def make_results(idx, A, D, tasks, y_train, ids, ogs, sents, include_sents = True):
    """Build one result row per selected (sentence, task) match.

    Parameters
    ----------
    idx : boolean array, same shape as ``A`` / ``D``
        Mask of the matches to keep.
    A, D : arrays
        Task row positions and similarity scores, as produced by ``get_nearest``.
    tasks : DataFrame
        Must contain ``'Task ID'`` and ``'Task'`` columns; rows are addressed by position.
    y_train : Series
        Label per task, addressed by position.
    ids, ogs, sents : Series
        Patent id, original sentence and processed sentence per sentence row.
    include_sents : bool
        When true, add the original abstract sentence and the task text.

    Returns
    -------
    DataFrame
        Columns ``patent_id``, ``task_id``, ``task_soc``, ``distance``
        (``1 - D``), ``sent_id`` and, optionally, the two sentence columns.
    """
    # Sentence row for every selected match (first coordinate of each True cell).
    rows = np.argwhere(idx)[:, 0]

    # A stores row positions as floats, so cast to int and index positionally.
    # Label-based lookup would only be right when the index happens to be 0..n-1.
    task_pos = A[idx].astype(int)
    matched_tasks = tasks.iloc[task_pos]

    df = pd.DataFrame({
        'patent_id': ids.iloc[rows].values,
        'task_id': matched_tasks['Task ID'].values,
        'task_soc': y_train.iloc[task_pos].values,
        'distance': 1 - D[idx],
        'sent_id': sents.iloc[rows].index.values
    })
    if include_sents:
        df['abstract_sentence'] = ogs.iloc[rows].values
        df['task_sentence'] = matched_tasks['Task'].values
    return df

def create_sentence_matches(X_train, y_train, tasks, patents):
    """Match every abstract sentence in ``patents`` to its 8 highest-scoring tasks.

    ``patents`` must have ``patent`` and ``abstract`` columns. The abstracts are
    split into sentences and preprocessed, the sentences and ``X_train`` are
    embedded with the same ``WordEmbeddingVectorizer``, and ``get_nearest``
    returns the 8 best tasks per sentence. The result frame comes from ``make_results``.
    """
    dat = [(r.patent, r.abstract) for i,r in patents.iterrows()]

    lines = process_abstracts(preprocessor, dat, 32)
    # Flatten from one entry per abstract to one entry per sentence, repeating
    # the patent id for each of its sentences.
    lines = [(i,ogg,pp) for i,og,p in lines for ogg,pp in zip(og, p)]

    ids, ogs, sents = zip(*lines)
    ids, ogs, sents = pd.Series(ids), pd.Series(ogs), pd.Series(sents)

    vectorizer = WordEmbeddingVectorizer('../patent-abstracts/abstracts-ss-100.tsv', sep = '\t', cache_dir=None, chunk_size=1000, max_workers = 1)

    # NOTE(review): fit_transform is called twice on the same vectorizer;
    # presumably fitting is stateless for fixed word embeddings — confirm.
    embedded_tasks = vectorizer.fit_transform(X_train)
    embedded_sents = vectorizer.fit_transform(sents)

    A, D = get_nearest(embedded_tasks, embedded_sents, 8)
    
    # Keep every match whose score is at most 1.0.
    all_idx = D <= 1.0

    return make_results(all_idx, A, D, tasks, y_train, ids, ogs, sents)

In [9]:
# Filter dat by sample!

mapping = pd.read_csv('./patent-descriptions/hbs_pat_ipc_mapping.csv', sep='\t')
mapping['patent'] = mapping.patent.astype(str)

  interactivity=interactivity, compiler=compiler, result=result)


In [10]:
def count_sents(a):
    """Count the pieces of ``a`` separated by ``'. '``; objects without ``split`` count as 0."""
    try:
        pieces = a.split('. ')
    except AttributeError:
        return 0
    return len(pieces)

def is_cutoff(a):
    """Return True when an abstract looks truncated, i.e. does not end in ``'.'``.

    Missing abstracts (``None``, ``''``) count as cut off. So do non-string
    values such as the float NaN that pandas uses for missing data; NaN is
    truthy, so indexing it would otherwise raise ``TypeError``.
    """
    if not isinstance(a, str) or not a:
        return True
    return a[-1] != '.'

# One row per patent id: de-duplicate the parsed abstracts, keep only patents
# present in `mapping` (inner join on `patent`), then add `num_sents` (pieces
# separated by '. ') and `cutoff` (abstract does not end in a period).
patents = pd.DataFrame(dat, columns=['patent', 'abstract']) \
            .drop_duplicates('patent') \
            .merge(mapping, on='patent', how='inner') \
            .pipe(lambda df: df.assign(num_sents = df.abstract.map(count_sents))) \
            .pipe(lambda df: df.assign(cutoff = df.abstract.map(is_cutoff)))

In [16]:
close_sents = pd.read_csv('./patent-neighbors/week1/super_close_matches.csv')
close_idxs = close_sents.patent_id.unique()

In [17]:
# Seed passed to every per-group sample below.
SEED = 1

# Patents with appyear strictly between 1973 and 2007 whose abstract is not
# flagged as cut off.
valid_patents = patents[(patents.appyear < 2007) & 
                        (patents.appyear > 1973) & 
                        (patents.cutoff == False) ]

# From the valid patents that are NOT in the close-match list, draw 5 per
# (class, appyear) group.
# NOTE(review): the `> 5` test skips groups with exactly 5 rows as well as
# smaller ones — confirm that `>= 5` was not intended.
sampled_patents = valid_patents \
    [~valid_patents.patent.isin(close_idxs)] \
    .groupby(['class', 'appyear']) \
    .apply(lambda df: df.sample(5, random_state=SEED) if df.shape[0] > 5 else None) \
    .reset_index(drop = True)

# Valid patents that DO appear in the close-match file.
close_patents = valid_patents[valid_patents.patent.isin(close_sents.patent_id)]

In [24]:
%%time

results = create_sentence_matches(X_train, y_train, tasks, sampled_patents)
close_results = create_sentence_matches(X_train, y_train, tasks, close_patents)

CPU times: user 33min 43s, sys: 1h 25min 56s, total: 1h 59min 39s
Wall time: 4min 5s


In [27]:
results.shape

(372016, 7)

In [26]:
close_results.shape

(133656, 7)

# Selecting for MTurk

In [111]:
from toolz import curry

@curry
def add_nested_local_id(nested, name, df):
    """Number the distinct values of ``df[nested]`` 1..k in order of first appearance.

    The numbering is written to a new column ``name``. Curried so it can be
    partially applied and handed to ``groupby(...).apply``.

    A copy is returned and the input frame is left untouched: pandas documents
    that functions passed to ``groupby.apply`` must not mutate the group they
    receive.
    """
    lookup = { value: position + 1
               for position, value in enumerate(df[nested].unique()) }

    numbered = df.copy()
    numbered[name] = df[nested].map(lookup)
    return numbered

@curry
def add_local_id(name, df, randomize=True):
    """Attach a 1-based id column ``name`` to ``df`` and sort the rows by it.

    With ``randomize`` the ids are shuffled in place using NumPy's global
    random state, so the returned row order is a random permutation.
    """
    local_ids = np.arange(df.shape[0]) + 1
    if randomize:
        np.random.shuffle(local_ids)
    labelled = df.assign(**{name: local_ids})
    return labelled.sort_values(name)


def split_out_results(results, patents, group):
    """Split sentence/task matches into the three tables exported for MTurk.

    Parameters
    ----------
    results : DataFrame
        Needs the columns ``patent_id``, ``sent_id``, ``abstract_sentence``,
        ``task_id``, ``distance`` and ``task_sentence``.
    patents : DataFrame
        Needs ``patent``, ``abstract``, ``appyear`` and ``class``.
    group : str
        Label stored in the ``classifier_group`` column of the patents table.

    Returns
    -------
    (patents, sentences, mturk_tasks)
        ``patents``: one row per patent with whitespace-normalised abstract.
        ``sentences``: one row per sentence with its per-patent ``sent_index``.
        ``mturk_tasks``: one row per (sentence, task) match with a shuffled
        per-sentence ``task_index``.
    """
    # Collapse every run of whitespace in the abstract to a single space, then
    # rename and select the columns for export.
    patents = patents \
        .pipe(lambda df: df.assign(abstract = df.abstract.map(lambda a: ' '.join(a.split()).strip()))) \
        .rename(columns = {'appyear': 'year', 'patent': 'patent_id'}) \
        [['patent_id', 'abstract', 'year', 'class']] \
        .assign(classifier_group = group)

    # Within each patent, number its sentences 1..k (column `sent_index`).
    ided = results[['patent_id', 'sent_id', 'abstract_sentence', 
                'task_id', 'distance', 'task_sentence']] \
                .groupby(['patent_id']) \
                .apply(add_nested_local_id('sent_id', 'sent_index')) \
                .reset_index(drop=True)

    # Take only first 5 sentences
    # ided = ided[ided.sent_index <= 5]

    # One row per sentence; the task-level columns are dropped.
    sentences = ided.drop_duplicates('sent_id') \
                    .drop(columns = ['task_id', 'distance', 'task_sentence', 'sent_id'])

    # Within each sentence, give its candidate tasks a randomised `task_index`.
    mturk_tasks = ided \
        .groupby(['patent_id', 'sent_index']) \
        .apply(add_local_id('task_index')) \
        .reset_index(drop=True) \
        .drop(columns = ['abstract_sentence', 'task_id', 'sent_id'])

    return patents, sentences, mturk_tasks

In [112]:
def reformat_close_sents(sents, patents):
    """Look up the abstract for every close-match sentence row.

    Returns ``(sents, dat)``: ``sents`` is passed through unchanged and ``dat``
    is a list of ``(patent_id, abstract)`` tuples, one per row of ``sents``
    whose patent has a non-null abstract in ``patents``.
    """
    abstracts = patents[['patent', 'abstract']]
    joined = sents.merge(abstracts, how='left', left_on='patent_id', right_on='patent')
    with_text = joined.loc[joined['abstract'].notna(), ['patent_id', 'abstract']]
    dat = list(with_text.itertuples(index=False, name=None))
    return sents, dat

In [113]:
def make_group_idxs(per_worker):
    """Return a 0/1 list of length ``per_worker`` marking which slots use group 1.

    Slots 3, 6, 9, 12, 15 and 21 are set to 1 and every other slot is 0.
    ``per_worker`` must be at least 22, otherwise an ``IndexError`` is raised.
    """
    group_one_slots = (3, 6, 9, 12, 15, 21)
    idxs = [0] * per_worker
    for slot in group_one_slots:
        idxs[slot] = 1
    return idxs

In [114]:
def assign_workers(patent_list, group_idxs, num_workers, per_worker):
    """Deal patents out to workers, drawing each slot from its designated group.

    ``patent_list`` holds one Series of patent ids per group, and
    ``group_idxs[j]`` names the group used for a worker's j-th slot. Patents
    are taken from the end of each group's pool, and an exhausted pool is
    refilled from the full group before the next draw.

    Returns a DataFrame with columns ``worker_id`` (1-based) and ``patent_id``.
    """
    pools = [series.values.tolist() for series in patent_list]
    remaining = [list(pool) for pool in pools]

    assignments = []
    for worker_id in np.arange(1, num_workers + 1):
        for slot in range(per_worker):
            group = group_idxs[slot]
            if not remaining[group]:
                # Pool used up: start again from the full group.
                remaining[group] = list(pools[group])
            assignments.append((worker_id, remaining[group].pop()))

    return pd.DataFrame(assignments, columns = ['worker_id', 'patent_id'])

In [115]:
%%time

group_a = split_out_results(results, sampled_patents, 'A')
group_b = split_out_results(close_results, close_patents, 'B')

CPU times: user 3min 26s, sys: 1.05 s, total: 3min 27s
Wall time: 3min 27s


In [116]:
%%time

from math import floor

# Number of distinct patents drawn for groups A and B. make_group_idxs(30)
# gives each of the 40 workers 24 group-A and 6 group-B slots.
# NOTE(review): the division by 3 presumably means each patent is shown to
# three workers — confirm against the MTurk design.
a_size = floor(24*40 / 3)
b_size = floor(6*40 / 3)

# Every draw is seeded with the notebook-wide SEED so the worker assignment can
# be regenerated exactly; unseeded, each run produced a different workers.csv.
group_a_patents = group_a[0] \
    .groupby('class') \
    .apply(lambda df: df.sample(5, random_state=SEED)) \
    .reset_index(drop=True) \
    .sample(a_size, random_state=SEED).patent_id

group_b_patents = group_b[0].sample(b_size, random_state=SEED).patent_id

# 40 workers with 30 slots each; slot-to-group layout from make_group_idxs.
workers = assign_workers([group_a_patents, group_b_patents], 
                         make_group_idxs(30), 
                         40, 
                         30)

CPU times: user 92 ms, sys: 0 ns, total: 92 ms
Wall time: 91.9 ms


In [117]:
patents, sentences, mturk_tasks = [pd.concat(t) for t in zip(*[group_a, group_b])]

In [118]:
patents.to_csv('mturk-data/abstracts.csv', index=False)
# sentences.to_csv('mturk-data/sentences.csv', index=False)
# mturk_tasks.to_csv('mturk-data/tasks.csv', index=False)
workers.to_csv('mturk-data/workers.csv', index=False)

In [197]:
import spacy
nlp = spacy.load('en_core_web_lg')

results = pd.read_csv('patent-neighbors/week0/abstracts-neighbor-results.csv')

In [19]:
i = 7

def get_subj(sent):
    """Return the nominal subjects of ``sent`` as a comma-joined string.

    Tokens come from the global spaCy pipeline ``nlp``, and those whose
    dependency label is ``'nsubj'`` are kept. Returns ``None`` when there are none.
    """
    words = [str(token) for token in nlp(sent) if token.dep_ == 'nsubj']
    return ','.join(words) if words else None

results = results.assign(subj = [get_subj(sent) for sent in results.abstract_sentence])

In [21]:
results.to_csv('abstracts-neighbor-sample.csv', index=False)

In [None]:
model = Pipeline([('sentencespace_100_us', PreEmbeddedVectorizer('../abstracts-ss-100', 100, cache_dir='embed_cache')),
                  ('knn', KNeighborsClassifier(1, n_jobs=-1))])

model.fit(X_train, y_train)

In [8]:
def _get_soc_n(df, n):
    return (df.T
            .reset_index()
            .pipe(lambda df: df.assign(soc = df['index'].map(lambda i: str(i)[0:n])))
            .set_index('soc')
            .drop('index', 1)
            .groupby('soc').sum().T)


def get_pred(model, X):
    """Predict class probabilities for ``X`` and sum them up to 3-character code prefixes."""
    n = 3
    probabilities = pd.DataFrame(model.predict_proba(X))
    probabilities.columns = model.classes_
    return _get_soc_n(probabilities, n)

class UpscaleModel(LogisticRegression):
    """Logistic regression that can report probabilities at a coarser code level."""

    def predict_soc_n(self, X, n):
        """Return probabilities summed over classes that share an ``n``-character prefix.

        Columns are labelled with the fitted ``classes_``, which is the column
        order of ``predict_proba``, instead of a notebook global. Aggregation
        uses the module-level ``_get_soc_n``; the class never defined a method
        of that name, so the previous ``self._get_soc_n`` call raised
        ``AttributeError``.
        """
        preds = self.predict_proba(X)
        df = pd.DataFrame(preds)
        df.columns = self.classes_
        return _get_soc_n(df, n)
    
    
def make_title_lookup(path, N):
    """Build a function that maps a level-``N`` SOC code to its title.

    ``path`` is forwarded to ``get_dictionary``. Previously it was ignored and
    ``''`` was always passed, which is what the existing call in this notebook
    supplies anyway.

    The returned ``lookup(code)`` gives the ``desc_soc{N}`` description for the
    code, or the code itself when it is unknown or cannot be read as an integer.
    """
    dot_codes = get_dictionary(path, N).groupby('soc').first()
    d = dot_codes[f'desc_soc{N}'].to_dict()

    def lookup(code):
        try:
            return d[int(code)]
        except (KeyError, ValueError, TypeError):
            # Unknown or non-numeric code: fall back to the raw value.
            return code
    return lookup

In [46]:
labels = np.unique(y_train)
lookup = make_title_lookup('', 3)

def get_soc_n(df, n):
    """Return, per row, the ``n``-character code prefix with the highest summed probability.

    ``df`` has one row per sample and one column per class code. Columns
    sharing a prefix are summed, and the label of the largest sum is returned
    for each row as a Series of strings.

    NOTE: this definition shadows the ``get_soc_n`` imported from
    ``validation.data`` at the top of the notebook.
    """
    return (df.T
            .reset_index()
            .pipe(lambda codes: codes.assign(soc = codes['index'].map(lambda i: str(i)[0:n])))
            .set_index('soc')
            # Keyword form: pandas 2 no longer accepts drop('index', 1).
            .drop(columns = 'index')
            .groupby('soc')
            .sum().T
            .idxmax(axis = 1))

In [55]:
def print_preds(model, labels, target):
    """Print the predicted 3-character SOC title followed by each input text.

    ``labels`` names the columns of ``model.predict_proba(target)``. Titles are
    resolved through the notebook-level ``lookup`` before anything is printed.
    """
    probabilities = pd.DataFrame(model.predict_proba(target))
    probabilities.columns = labels

    best_codes = get_soc_n(probabilities, 3).values
    titled = [(lookup(code), text) for code, text in zip(best_codes, target)]

    for title, desc in titled:
        print(title)
        print(desc)
        print('\n')

In [None]:
print_preds(model, labels, lines[:25])

In [None]:
print_preds(model, labels, lines[1000:1025])

In [None]:
print_preds(model, labels, lines[5000:5025])

In [None]:
print_preds(model, labels, lines[5000:5025])

In [None]:
print_preds(model, labels, lines[12000:12050])

In [87]:
l = pd.concat([lines[:50], lines[1000:1050], lines[10000:10050], lines[-50:]])

In [None]:
from sklearn.neighbors import NearestNeighbors

a = embed_docs('../abstracts-ss-100', '\n'.join(l))

nn = NearestNeighbors()
nn.fit(a)

In [99]:
def print_neighbors(nn, a, i):
    """Print the entries of the notebook-level ``l`` nearest to row ``i`` of ``a``.

    Queries ``nn.kneighbors`` for the 5 neighbours of ``a[i]`` (reshaped to a
    single-row 2-D array) and uses the returned indices to index ``l.values``.
    """
    _, idxs = nn.kneighbors(a[i].reshape(1,-1), n_neighbors = 5)
    # NOTE: the loop variable rebinds the parameter `i`. `idxs` presumably has
    # one row per query, so with a single query the body runs once with the
    # full array of neighbour indices — confirm.
    for i in idxs:
        print(l.values[i])