In [1]:
import json
import pandas as pd 
from functools import reduce

In [2]:
def load_jsonl(path):
    """Load a JSON Lines file into a list of decoded objects.

    Parameters
    ----------
    path : str
        Path of a text file holding one JSON document per line.

    Returns
    -------
    list
        The decoded JSON value of every non-blank line, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # JSON text is UTF-8 by specification; relying on the locale default
    # encoding garbles non-ASCII text on platforms where it is not UTF-8.
    with open(path, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. trailing newlines) carry no record and would make
        # json.loads fail, so they are skipped.
        return [json.loads(line) for line in f if line.strip()]

def combine_dicts(list_of_dicts):
    """Merge a sequence of dictionaries into a single new dictionary.

    Keys keep the position of their first occurrence; when a key appears in
    several dictionaries the value from the last one wins.
    """
    merged = {}
    for mapping in list_of_dicts:
        for name, value in mapping.items():
            merged[name] = value
    return merged

In [3]:
# Read every record of the log and flatten them into one dictionary
# (later records override earlier ones on duplicate keys).
patent_data = combine_dicts(load_jsonl('data.log'))

In [4]:
# Deep search util functions


def deep_search(key, dictionary, child_key=None):
    """Lazily yield every value stored under ``key`` in a nested structure.

    ``dictionary`` may be a dict, a list, or anything else (other types yield
    nothing). Dicts and lists are walked depth-first in iteration order. A
    matching value is yielded first and then searched itself, so nested
    occurrences of ``key`` inside it are reported as well.

    When ``child_key`` is given, ``value[child_key]`` is yielded instead of
    the matching value; a missing ``child_key`` raises at that point.
    """
    if isinstance(dictionary, list):
        for element in dictionary:
            yield from deep_search(key, element, child_key)
    if isinstance(dictionary, dict):
        for name, value in dictionary.items():
            if name == key:
                yield value if child_key is None else value[child_key]
            # Containers are descended into whether or not they matched
            if isinstance(value, (dict, list)):
                yield from deep_search(key, value, child_key)
                        

def deep_search_condition(dictionary, condition):
    """Lazily yield every node of a nested structure that satisfies ``condition``.

    Parameters
    ----------
    dictionary : dict, list or any other value
        Root of the structure. Dicts and lists are walked depth-first.
    condition : callable
        Called as ``condition(key, value)``. ``key`` is the dictionary key the
        value is stored under, or ``None`` for the root and for list elements.
        An exception raised by ``condition`` counts as "no match".

    Yields
    ------
    Each matching node exactly once: the root first (if it matches), then
    matching descendants in traversal order. A matching container is still
    descended into.
    """

    def safe_cond(k, v):
        # Conditions are typically written for one node type (e.g. dicts) and
        # may raise on others; such failures mean "no match". Only Exception
        # is caught so KeyboardInterrupt/SystemExit still propagate.
        try:
            return condition(k, v)
        except Exception:
            return False

    def walk_children(node):
        # Tests and yields the children of ``node`` and recurses into them.
        # The node itself has already been tested by the caller, so a matching
        # dict value is not reported a second time.
        if isinstance(node, list):
            for item in node:
                if safe_cond(None, item):
                    yield item
                yield from walk_children(item)
        elif isinstance(node, dict):
            for k, v in node.items():
                if safe_cond(k, v):
                    yield v
                yield from walk_children(v)

    if safe_cond(None, dictionary):
        yield dictionary
    yield from walk_children(dictionary)

In [5]:
def convert_date(date):
    """Reformat a compact ``YYYYMMDD`` string as ``YYYY-MM-DD``.

    The input is split purely by position (4 / 2 / rest); no validation of
    the content is performed.
    """
    year, month, day = date[:4], date[4:6], date[6:]
    return '-'.join((year, month, day))

In [6]:




# Helpers that collapse search generators into values, and the function that builds a pandas DataFrame from them

def get_val(data):
    """Collapse an iterable of search results into a single value.

    Returns ``None`` when ``data`` is empty, the sole element when it holds
    exactly one item, and a list of all items otherwise.
    """
    items = list(data)
    if not items:
        return None
    if len(items) == 1:
        return items[0]
    return items

# Recursively search a dictionary's nested dictionaries and lists for entries that match a condition


def get_text(data, lang='pl'):
    """Pick the language-tagged entry(ies) for ``lang`` out of search results.

    Parameters
    ----------
    data : iterable
        Search results (typically a ``deep_search`` generator).
    lang : str
        Value of the ``'@lang'`` key to look for.

    Returns
    -------
    ``None`` when ``data`` is empty or holds no dict with an ``'@lang'`` key.
    Otherwise the dict(s) whose ``'@lang'`` equals ``lang``; if there is none,
    the dict(s) carrying any ``'@lang'`` are returned as a fallback. A single
    hit is returned as-is, several hits as a list (see ``get_val``).
    """
    dat_d = get_val(data)

    if dat_d is None:
        return None

    def has_requested_lang(_key, node):
        return isinstance(node, dict) and '@lang' in node and node['@lang'] == lang

    def has_any_lang(_key, node):
        return isinstance(node, dict) and '@lang' in node

    # deep_search_condition returns a generator, which is never None, so the
    # results must be materialised to find out whether anything matched.
    matches = list(deep_search_condition(dat_d, has_requested_lang))
    if not matches:
        matches = list(deep_search_condition(dat_d, has_any_lang))

    return get_val(matches)
    
    
def _extract_title(transaction):
    """Return the invention title text of ``transaction`` (Polish, else English), or None."""
    try:
        title = get_text(deep_search('invention-title', transaction), lang='pl')
        if title is None or "#text" not in title:
            title = get_text(deep_search('invention-title', transaction), lang='en')
        if isinstance(title, list):
            # Several entries matched: use the first one that actually has text
            # instead of failing on list indexing.
            title = next(
                (entry for entry in title if isinstance(entry, dict) and '#text' in entry),
                None,
            )
        if title is not None:
            title = title['#text']
    except KeyError:
        # Show the raw title data to help diagnose the unexpected shape
        print(get_val(deep_search('invention-title', transaction)))
        raise
    return title


def patent_data_to_df(data_json):
    """Flatten one institution's patent JSON into a DataFrame, one row per transaction.

    Parameters
    ----------
    data_json : dict
        Decoded JSON document; the transactions are read from
        ``data_json['root']['_embedded']['root']``.

    Returns
    -------
    pandas.DataFrame or None
        ``None`` when ``root`` / ``_embedded`` / the inner ``root`` is missing.
        Otherwise a frame with the columns ``id``, ``patent_id``, ``title``,
        ``decision``, ``applicant``, ``application_date``,
        ``ipc_classification``, ``ipc_classification_secondary``,
        ``date_from`` and ``date_to``. A field found several times in a
        transaction is stored as a list (applicant names are joined with
        ``', '``).

    Raises
    ------
    KeyError
        If ``data_json`` has no ``'root'`` key, or a selected title entry has
        no ``'#text'``.
    """
    root = data_json['root']

    if root is None or "_embedded" not in root:
        return None

    embedded = root['_embedded']

    if embedded is None or "root" not in embedded:
        return None

    transactions = embedded['root']

    rows = []
    for transaction in transactions:
        # Applicant name: taken from the applicant's address book entry
        applicant = get_val(deep_search('applicant', transaction))
        address_book = get_text(deep_search('addressbook', applicant), lang='pl')
        applicant_name = get_val(deep_search('name', address_book))
        if isinstance(applicant_name, list):
            applicant_name = ', '.join(applicant_name)

        # Application date: YYYYMMDD in the source, YYYY-MM-DD in the output
        application_reference = get_val(deep_search('application-reference', transaction))
        application_date = get_val(deep_search('date', application_reference))
        if isinstance(application_date, list):
            # get_val returns a list when several dates are present
            application_date = [convert_date(date) for date in application_date]
        elif application_date is not None:
            application_date = convert_date(application_date)

        classification_ipc = get_val(deep_search('classification-ipc', transaction))
        main_classification = get_val(deep_search('main-classification', classification_ipc))
        further_classification = get_val(deep_search('further-classification', classification_ipc))

        title = _extract_title(transaction)

        application_id = get_val(deep_search('extidappli', transaction))
        patent_id = get_val(deep_search('extidpatent', transaction))

        decision = get_val(deep_search('decision-name', transaction))

        date_from = get_val(deep_search('begin-date', transaction))
        date_to = get_val(deep_search('end-date', transaction))

        rows.append({
            'id': application_id,
            'patent_id': patent_id,
            'title': title,
            'decision': decision,
            'applicant': applicant_name,
            'application_date': application_date,
            'ipc_classification': main_classification,
            'ipc_classification_secondary': further_classification,
            'date_from': date_from,
            'date_to': date_to
        })

    return pd.DataFrame(rows)

In [7]:
# Build one DataFrame per institution file and tag its rows with the institution metadata
dfs = []
for inst_fname, inst_metadata in patent_data.items():

    # A context manager closes the handle promptly; JSON text is UTF-8 by
    # specification, so the encoding is stated instead of using the locale default.
    with open(inst_fname, 'r', encoding='utf-8') as inst_file:
        data = json.load(inst_file)

    df = patent_data_to_df(data)

    # patent_data_to_df returns None when the expected structure is missing
    if df is not None:
        df['institution_id'] = inst_metadata['id']
        df['institution'] = inst_metadata['name']
        df['file'] = inst_fname

        dfs.append(df)
    


In [8]:
# Stack the per-institution frames row-wise. Each frame keeps its own index,
# so index values repeat across institutions.
all_patents = pd.concat(dfs, axis=0, ignore_index=False)

In [9]:
# Keep only the rows that have a value in the 'id' column
all_patents = all_patents.loc[all_patents['id'].notna()]


In [15]:
# Preview the first 15 rows
all_patents.iloc[:15]

Unnamed: 0,id,patent_id,title,decision,applicant,application_date,ipc_classification,ipc_classification_secondary,date_from,date_to,institution_id,institution,file
0,P.428896,,Ekran dźwiękochłonny,,WYŻSZA SZKOŁA GOSPODARKI W BYDGOSZCZY,2019-03-06,E01F 8/00,E04B 1/86,2020-10-12,2020-10-12,bS0fnRPSDLEFsLF-zGr8CGQ,Wyższa Szkoła Gospodarki w Bydgoszczy,out/data/wyzsza-szkoa-gospodarki-w-bydgoszczy....
1,P.428889,,Sterowanie i konstrukcja stanowiska do badań t...,,WYŻSZA SZKOŁA GOSPODARKI W BYDGOSZCZY,2019-02-12,G01N 3/56,G01N 19/00,2021-02-04,2021-02-04,bS0fnRPSDLEFsLF-zGr8CGQ,Wyższa Szkoła Gospodarki w Bydgoszczy,out/data/wyzsza-szkoa-gospodarki-w-bydgoszczy....
2,P.428899,,Regulator przekształtnika energoelektroniczneg...,Decyzja o odmowie udzielenia patentu,WYŻSZA SZKOŁA GOSPODARKI W BYDGOSZCZY,2019-02-12,G05F 1/67,H02S 40/30,2022-03-21,2022-03-21,bS0fnRPSDLEFsLF-zGr8CGQ,Wyższa Szkoła Gospodarki w Bydgoszczy,out/data/wyzsza-szkoa-gospodarki-w-bydgoszczy....
3,P.433021,,Zintegrowany zespół narzędziowy do obróbki mat...,,WYŻSZA SZKOŁA GOSPODARKI W BYDGOSZCZY,2020-02-24,B23Q 15/00,,2021-08-30,2021-08-30,bS0fnRPSDLEFsLF-zGr8CGQ,Wyższa Szkoła Gospodarki w Bydgoszczy,out/data/wyzsza-szkoa-gospodarki-w-bydgoszczy....
4,P.433019,,Modułowy zestaw do ochrony przed hałasem,,WYŻSZA SZKOŁA GOSPODARKI W BYDGOSZCZY,2020-02-24,E01F 8/00,E04B 1/86,2021-08-30,2021-08-30,bS0fnRPSDLEFsLF-zGr8CGQ,Wyższa Szkoła Gospodarki w Bydgoszczy,out/data/wyzsza-szkoa-gospodarki-w-bydgoszczy....
5,P.433020,,Sposób obróbki skrawaniem materiałów trudnoobr...,,WYŻSZA SZKOŁA GOSPODARKI W BYDGOSZCZY,2020-02-24,B23Q 15/00,,2021-08-30,2021-08-30,bS0fnRPSDLEFsLF-zGr8CGQ,Wyższa Szkoła Gospodarki w Bydgoszczy,out/data/wyzsza-szkoa-gospodarki-w-bydgoszczy....
6,P.433045,,Parkownica dwustanowiskowa,,WYŻSZA SZKOŁA GOSPODARKI W BYDGOSZCZY,2020-02-25,E04H 6/02,E04H 6/06,2021-08-30,2021-08-30,bS0fnRPSDLEFsLF-zGr8CGQ,Wyższa Szkoła Gospodarki w Bydgoszczy,out/data/wyzsza-szkoa-gospodarki-w-bydgoszczy....
7,P.433022,,Stanowisko badawcze do pomiaru siły skrawania ...,,WYŻSZA SZKOŁA GOSPODARKI W BYDGOSZCZY,2020-02-24,G01N 3/58,"[G01N 3/31, B23P 9/04]",2021-08-31,2021-08-31,bS0fnRPSDLEFsLF-zGr8CGQ,Wyższa Szkoła Gospodarki w Bydgoszczy,out/data/wyzsza-szkoa-gospodarki-w-bydgoszczy....
8,P.433048,,"Ekologiczne, samowystarczalne energetycznie do...",,WYŻSZA SZKOŁA GOSPODARKI W BYDGOSZCZY,2020-02-25,E04H 1/00,E04H 14/00,2022-09-12,2022-09-12,bS0fnRPSDLEFsLF-zGr8CGQ,Wyższa Szkoła Gospodarki w Bydgoszczy,out/data/wyzsza-szkoa-gospodarki-w-bydgoszczy....
9,P.428902,,Sposób poprawy bezpieczeństwa samolotów podcza...,,WYŻSZA SZKOŁA GOSPODARKI W BYDGOSZCZY,2019-03-06,B64F 1/02,B64F 1/04,2022-09-13,2022-09-13,bS0fnRPSDLEFsLF-zGr8CGQ,Wyższa Szkoła Gospodarki w Bydgoszczy,out/data/wyzsza-szkoa-gospodarki-w-bydgoszczy....


In [11]:
def merge_institutions(df, institutions_df):
    """Left-join institution attributes onto the patents frame.

    Rows of ``df`` are matched on ``df['institution']`` ==
    ``institutions_df['Nazwa instytucji']``; every row of ``df`` is kept and
    the join column coming from ``institutions_df`` is removed from the result.
    """
    merged = pd.merge(
        df,
        institutions_df,
        how='left',
        left_on='institution',
        right_on='Nazwa instytucji',
    )
    return merged.drop(columns=['Nazwa instytucji'])


In [12]:
# Watch out! This is a many-to-many merge
#inst_patents_disciplines = merge_institutions(all_patents, institutions_df)


In [13]:
# Save to excel
import os

# Workbook that receives the patents table
out_file_patents = os.path.join('out', 'inst_patents.xlsx')

# Export of the merged patents/disciplines table (currently disabled):
#out_file_disciplines = os.path.join('out', 'inst_patents_disciplines.xlsx')
#if not os.path.exists(out_file_disciplines):
#    inst_patents_disciplines.to_excel(out_file_disciplines)

# The file is written unconditionally; the existence guard is disabled:
#if not os.path.exists(out_file_patents):
all_patents.to_excel(excel_writer=out_file_patents)