## Extracting Patent Data on Climate Change

In [1]:
#!pip install flashtext
from flashtext import KeywordProcessor
import time
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

### Time of execution
start = time.time()

# Terms searched for in the patent abstracts.
monitoring_terms = ['climate','climate change','mitigation', 'climate change mitigation', 'emission reduction']

print('terms to use for monitoring: {}'.format(len(monitoring_terms)))

# PatentsView bulk-download snapshot 20190312 (zipped, tab-separated file).
#
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0; its
# replacement is `on_bad_lines`. 'warn' skips malformed rows and reports each
# one, which is what `error_bad_lines=False` did with the default
# `warn_bad_lines=True`. The old keyword is kept for pandas < 1.3.
# `verbose=False` was dropped because False is already the default.
pandas_major_minor = tuple(int(part) for part in pd.__version__.split('.')[:2])
if pandas_major_minor >= (1, 3):
    bad_line_options = {'on_bad_lines': 'warn'}
else:
    bad_line_options = {'error_bad_lines': False}

patents = pd.read_csv('http://data.patentsview.org/20190312/download/patent.tsv.zip',
                      sep='\t', **bad_line_options)

print('total patents database: {}'.format(len(patents)))

def extract(vec, dictionary, info=False):
    """Run a keyword extractor over every entry of ``vec``.

    Parameters
    ----------
    vec : iterable
        Texts to scan. Each entry is converted with ``str()`` before being
        searched. Missing entries (``None`` or a float NaN) are not searched,
        so their string forms ``'None'`` / ``'nan'`` can never match a keyword.
    dictionary : object
        Any object exposing ``extract_keywords(text, span_info=...)``; in this
        notebook a flashtext ``KeywordProcessor``.
    info : bool, default False
        Forwarded unchanged as the ``span_info`` argument.

    Returns
    -------
    list
        One element per entry of ``vec``, in the same order: whatever
        ``extract_keywords`` returned, or an empty list for a missing entry.
    """
    matrix = []
    for line in vec:
        # NaN is the only float that compares unequal to itself.
        is_missing = line is None or (isinstance(line, float) and line != line)
        if is_missing:
            matrix.append([])
            continue
        matrix.append(dictionary.extract_keywords(str(line), span_info=info))
    return matrix

# process
# NOTE: `data` is an alias of `patents`, not a copy, so the columns added
# below are added to `patents` as well.
data = patents
dictionary = KeywordProcessor()
dictionary.add_keywords_from_list(monitoring_terms)
extracted = extract(data.abstract, dictionary)

# Distinct keywords per abstract ('' when nothing matched). They are sorted so
# that the text written to `matches` is identical on every run; plain set
# iteration order for strings can change between interpreter sessions.
row = [sorted(set(i)) if len(i) > 0 else '' for i in extracted]

# Same layout as str(list) without the brackets, e.g. "'climate', 'mitigation'".
data['matches'] = [", ".join(repr(term) for term in terms) for terms in row]
data['count_matches'] = [len(i) for i in extracted]
data['count_unique_matches'] = [len(terms) for terms in row]

# Report the elapsed time since `start`.
end = time.time()
print('Elapsed time: {}'.format(time.strftime("%H:%M:%S", time.gmtime(end - start))))


terms to use for monitoring: 5


b'Skipping line 4505264: expected 11 fields, saw 12\n'
b'Skipping line 4540085: expected 11 fields, saw 12\nSkipping line 4570473: expected 11 fields, saw 12\n'
b'Skipping line 4610402: expected 11 fields, saw 12\nSkipping line 4652985: expected 11 fields, saw 12\n'
b'Skipping line 4662863: expected 11 fields, saw 12\n'


total patents database: 6957999
Elapsed time: 00:16:54


In [2]:
# Show the first three rows of `data` as the cell output.
data.head(3)

Unnamed: 0,id,type,number,country,date,abstract,title,kind,num_claims,filename,withdrawn,matches,count_matches,count_unique_matches
0,10000000,utility,10000000,US,2018-06-19,A frequency modulated (coherent) laser detecti...,Coherent LADAR using intra-pixel quadrature de...,B2,20.0,ipg180619.xml,,,0,0
1,10000001,utility,10000001,US,2018-06-19,The injection molding machine includes a fixed...,Injection molding machine and mold thickness c...,B2,12.0,ipg180619.xml,,,0,0
2,10000002,utility,10000002,US,2018-06-19,The present invention relates to: a method for...,Method for manufacturing polymer film and co-e...,B2,9.0,ipg180619.xml,,,0,0


In [None]:
start = time.time()

# Keep only the patents whose abstract matched at least one monitoring term.
# .copy() makes an independent frame, so the column assignments below do not
# write into a slice of `data` (pandas' SettingWithCopyWarning case).
data_matches = data[data['count_matches'] > 0].copy()

# Date without dashes (e.g. '2018-06-19' -> '20180619') for the `date` query
# parameter. str() keeps a missing date from raising on `.split`.
data_matches['date_str'] = [str(d).replace('-', '') for d in data_matches['date']]

# Build one Espacenet link per patent in a single pass over the columns
# instead of a per-row `.loc` lookup.
# NOTE(review): values are inserted into the query string as-is (not
# URL-encoded), same as before.
data_matches['link'] = [
    "https://worldwide.espacenet.com/publicationDetails/biblio?"
    + "CC=" + str(country)
    + "&NR=" + str(patent_id)
    + "&KC=" + str(kind)
    + "&date=" + str(date_str)
    + "&locale=en_EP"
    for country, patent_id, kind, date_str in zip(
        data_matches['country'],
        data_matches['id'],
        data_matches['kind'],
        data_matches['date_str'],
    )
]

# Share of matching patents; guarded so an empty `data` does not divide by zero.
match_share = round(100*len(data_matches)/len(data),5) if len(data) else 0.0
print('number of matches found: {}, {}%'.format(len(data_matches), match_share))

#export data
# Most-matched patents first; rows that share a link keep the first occurrence.
# to_excel needs an Excel writer engine (e.g. openpyxl) to be installed.
export = data_matches.sort_values('count_matches', ascending=False).drop_duplicates(subset ="link")
export.to_excel('climate_change_patents.xlsx')
print('Exported in climate_change_patents.xlsx')
end = time.time()
print('Elapsed time: {}'.format(time.strftime("%H:%M:%S", time.gmtime(end - start))))