In [20]:
from __future__ import absolute_import, division, print_function, unicode_literals

import json
import os
import pickle
from collections import ChainMap

import numpy as np
import pandas as pd
import requests
from pandas.io.json import json_normalize

In [22]:
# Limit what pandas renders in cell outputs: at most 50 columns and 50 rows.
for option_name in ("display.max_columns", "display.max_rows"):
    pd.set_option(option_name, 50)

In [23]:
# URL the patent query is sent to (used by the `requests.get` call further down).
# NOTE(review): this is a plain-HTTP address - confirm whether the service
# offers an HTTPS endpoint before relying on it.
endpoint_url = "http://www.patentsview.org/api/patents/query"

In [24]:
# Long reference list of patent-query field names, grouped by prefix
# (appcit_, app_, assignee_, cited_patent_, citedby_patent_, cpc_, examiner_,
# forprior_, govint_, inventor_, ipc_, lawyer_, nber_, patent_, pct_,
# rawinventor_, uspc_, wipo_).
# Presumably this is every field the patents endpoint can return - verify
# against the API documentation. The cells visible here do not use it; the
# request is built from the shorter `patent_fields` list instead.
patent_fields_full = [
    "appcit_app_number",
    "appcit_category",
    "appcit_date",
    "appcit_kind",
    "appcit_sequence",
    "app_country",
    "app_date",
    "app_number",
    "app_type",
    "assignee_city",
    "assignee_country",
    "assignee_county",
    "assignee_county_fips",
    "assignee_first_name",
    "assignee_first_seen_date",
    "assignee_id",
    "assignee_last_name",
    "assignee_last_seen_date",
    "assignee_lastknown_city",
    "assignee_lastknown_country",
    "assignee_lastknown_latitude",
    "assignee_lastknown_location_id",
    "assignee_lastknown_longitude",
    "assignee_lastknown_state",
    "assignee_latitude",
    "assignee_location_id",
    "assignee_longitude",
    "assignee_organization",
    "assignee_sequence",
    "assignee_state",
    "assignee_state_fips",
    "assignee_total_num_inventors",
    "assignee_total_num_patents",
    "assignee_type",
    "cited_patent_category",
    "cited_patent_date",
    "cited_patent_kind",
    "cited_patent_number",
    "cited_patent_sequence",
    "cited_patent_title",
    "citedby_patent_category",
    "citedby_patent_date",
    "citedby_patent_kind",
    "citedby_patent_number",
    "citedby_patent_title",
    "cpc_category",
    "cpc_first_seen_date",
    "cpc_group_id",
    "cpc_group_title",
    "cpc_last_seen_date",
    "cpc_section_id",
    "cpc_sequence",
    "cpc_subgroup_id",
    "cpc_subgroup_title",
    "cpc_subsection_id",
    "cpc_subsection_title",
    "cpc_total_num_assignees",
    "cpc_total_num_inventors",
    "cpc_total_num_patents",
    "detail_desc_length",
    "examiner_first_name",
    "examiner_id",
    "examiner_last_name",
    "examiner_role",
    "examiner_group",
    "forprior_country",
    "forprior_date",
    "forprior_docnumber",
    "forprior_kind",
    "forprior_sequence",
    "govint_contract_award_number",
    "govint_org_id",
    "govint_org_level_one",
    "govint_org_level_two",
    "govint_org_level_three",
    "govint_org_name",
    "govint_raw_statement",
    "inventor_city",
    "inventor_country",
    "inventor_county",
    "inventor_county_fips",
    "inventor_first_name",
    "inventor_first_seen_date",
    "inventor_id",
    "inventor_last_name",
    "inventor_last_seen_date",
    "inventor_lastknown_city",
    "inventor_lastknown_country",
    "inventor_lastknown_latitude",
    "inventor_lastknown_location_id",
    "inventor_lastknown_longitude",
    "inventor_lastknown_state",
    "inventor_latitude",
    "inventor_location_id",
    "inventor_longitude",
    "inventor_sequence",
    "inventor_state",
    "inventor_state_fips",
    "inventor_total_num_patents",
    "ipc_action_date",
    "ipc_class",
    "ipc_classification_data_source",
    "ipc_classification_value",
    "ipc_first_seen_date",
    "ipc_last_seen_date",
    "ipc_main_group",
    "ipc_section",
    "ipc_sequence",
    "ipc_subclass",
    "ipc_subgroup",
    "ipc_symbol_position",
    "ipc_total_num_assignees",
    "ipc_total_num_inventors",
    "ipc_version_indicator",
    "lawyer_first_name",
    "lawyer_first_seen_date",
    "lawyer_id",
    "lawyer_last_name",
    "lawyer_last_seen_date",
    "lawyer_organization",
    "lawyer_sequence",
    "lawyer_total_num_assignees",
    "lawyer_total_num_inventors",
    "lawyer_total_num_patents",
    "nber_category_id",
    "nber_category_title",
    "nber_first_seen_date",
    "nber_last_seen_date",
    "nber_subcategory_id",
    "nber_subcategory_title",
    "nber_total_num_assignees",
    "nber_total_num_inventors",
    "nber_total_num_patents",
    "patent_abstract",
    "patent_average_processing_time",
    "patent_date",
    "patent_firstnamed_assignee_city",
    "patent_firstnamed_assignee_country",
    "patent_firstnamed_assignee_id",
    "patent_firstnamed_assignee_latitude",
    "patent_firstnamed_assignee_location_id",
    "patent_firstnamed_assignee_longitude",
    "patent_firstnamed_assignee_state",
    "patent_firstnamed_inventor_city",
    "patent_firstnamed_inventor_country",
    "patent_firstnamed_inventor_id",
    "patent_firstnamed_inventor_latitude",
    "patent_firstnamed_inventor_location_id",
    "patent_firstnamed_inventor_longitude",
    "patent_firstnamed_inventor_state",
    "patent_kind",
    "patent_num_cited_by_us_patents",
    "patent_num_claims",
    "patent_num_combined_citations",
    "patent_num_foreign_citations",
    "patent_num_us_application_citations",
    "patent_num_us_patent_citations",
    "patent_number",
    "patent_processing_time",
    "patent_title",
    "patent_type",
    "patent_year",
    "pct_102_date",
    "pct_371_date",
    "pct_date",
    "pct_docnumber",
    "pct_doctype",
    "pct_kind",
    "rawinventor_first_name",
    "rawinventor_last_name",
    "uspc_first_seen_date",
    "uspc_last_seen_date",
    "uspc_mainclass_id",
    "uspc_mainclass_title",
    "uspc_sequence",
    "uspc_subclass_id",
    "uspc_subclass_title",
    "uspc_total_num_assignees",
    "uspc_total_num_inventors",
    "uspc_total_num_patents",
    "wipo_field_id",
    "wipo_field_title",
    "wipo_sector_title",
    "wipo_sequence",
]

In [25]:
# Fields requested for each patent; passed to the API as the `f` parameter
# and therefore the columns of the resulting DataFrame.
patent_fields = [
    "patent_number",
    "patent_date",
    "patent_title",
    "patent_abstract",
    "patent_firstnamed_assignee_id",
    "patent_year",
    "patent_type",
    "patent_kind",
]

In [26]:
# Search criteria: a patent matches when "machine learning" is found in its
# title OR in its abstract (`_text_phrase` conditions combined with `_or`).
query = {
    "_or": [
        {"_text_phrase": {"patent_title": "machine learning"}},
        {"_text_phrase": {"patent_abstract": "machine learning"}},
    ]
}

# other queries - uncomment ONE to run it instead of the query above.
# These must stay above the `params` construction below: `params` holds an
# already-serialized copy of `query`, so reassigning `query` afterwards would
# not change the request.
# query={"_text_phrase":{"patent_abstract":"machine learning"}}
# query={"_text_all":{"patent_abstract":"machine learning"}}
# query={"_or":[{"_text_all":{"patent_title":"machine learning"}},{"_text_all":{"patent_abstract":"machine learning"}}]}

# Fields to return for every matching patent.
fields = patent_fields
# Page size. The notebook issues a single request, so matches beyond this
# number are not fetched.
options = {"per_page": 4000}
# Newest patents first.
sort = [{"patent_date": "desc"}]

# Each part of the request travels as a JSON-encoded query-string value.
params = {
    'q': json.dumps(query),
    'f': json.dumps(fields),
    'o': json.dumps(options),
    's': json.dumps(sort),
}

In [27]:
# request and results
# A timeout (seconds) keeps a stalled server from hanging the kernel forever;
# requests applies no timeout unless one is given.
resp = requests.get(endpoint_url, params=params, timeout=60)
# Fail here with a clear HTTPError on a 4xx/5xx reply rather than later with a
# JSON-decoding or missing-key error.
resp.raise_for_status()
results = resp.json()

In [28]:
# extract metadata from response
print("status code:", resp.status_code, ';', "reason:", resp.reason)
# Total number of patents matching the query.
total_patent_count = results["total_patent_count"]
# Number of patents actually contained in this response.
patents_per_page = results['count']
print("total_patent_count:", total_patent_count, ';', "patents_per_page:", patents_per_page)

# Only one page is requested, so if the response holds fewer patents than the
# query matched, everything built from it downstream would be incomplete.
# Say so explicitly instead of truncating silently.
if int(patents_per_page) < int(total_patent_count):
    print("WARNING: only", patents_per_page, "of", total_patent_count,
          "matching patents were returned; raise per_page or request further pages")

status code: 200 ; reason: OK
total_patent_count: 3147 ; patents_per_page: 3147


In [29]:
# extract data from response
# The 'patents' entry of the response becomes the rows of the DataFrame.
data = results["patents"]
df = pd.DataFrame(data=data)
# Preview the first five rows.
df.head(5)

Unnamed: 0,patent_number,patent_date,patent_title,patent_abstract,patent_firstnamed_assignee_id,patent_year,patent_type,patent_kind
0,10603498,2020-03-31,Systems and methods for closed-loop determinat...,A method or system for facilitating the determ...,org_5cFCcVidnLqkMwKWc9s4,2020,utility,B2
1,10603793,2020-03-31,Work assisting system including machine learni...,A work assisting system includes a sensor unit...,org_RQZono9Ir8KVdgNAaglV,2020,utility,B2
2,10603797,2020-03-31,"Machine learning device, robot system, and mac...",A machine learning device for learning a motio...,org_RQZono9Ir8KVdgNAaglV,2020,utility,B2
3,10605228,2020-03-31,Method for controlling operation of a wind tur...,A method for controlling operation of a wind t...,org_VIvs7w0sts1aCjlrKaiG,2020,utility,B2
4,10605702,2020-03-31,Fluid analysis and monitoring using optical sp...,"Systems, methods, and computer-program product...",org_aHdfa1XsbUURjnXmlGyp,2020,utility,B2


In [30]:
# Number of rows (patents) in the frame.
df.shape[0]

3147

In [31]:
# Show the column labels of the frame built from the response.
df.columns

Index(['patent_number', 'patent_date', 'patent_title', 'patent_abstract',
       'patent_firstnamed_assignee_id', 'patent_year', 'patent_type',
       'patent_kind'],
      dtype='object')

In [32]:
# Combine title and abstract into one text column.
# A missing title or abstract is treated as an empty string: with plain `+`, a
# single null operand would turn the whole combined value into NaN and discard
# the text that is present. The query matches on title OR abstract, so a
# patent may well lack one of the two.
title_text = df['patent_title'].fillna('')
abstract_text = df['patent_abstract'].fillna('')
# strip() removes the dangling separator left when one side is empty.
df['patent_title_and_abstract'] = (title_text + ' ' + abstract_text).str.strip()
df.patent_title_and_abstract.head(3)

0    Systems and methods for closed-loop determinat...
1    Work assisting system including machine learni...
2    Machine learning device, robot system, and mac...
Name: patent_title_and_abstract, dtype: object

In [33]:
# Persist the frame for later use.
# to_pickle does not create missing directories, so make sure the target
# folder exists first; otherwise a fresh checkout fails on this cell.
df_pickle_path = "data/df.pkl"
os.makedirs(os.path.dirname(df_pickle_path), exist_ok=True)
df.to_pickle(df_pickle_path)