# Parsing IPC codes

The PATSTAT IPC codes come without descriptions. Here we match them with the [IPC definitions available from WIPO](https://www.wipo.int/classifications/ipc/ipcpub/?notion=scheme&version=20190101&symbol=none&menulang=en&lang=en&viewmode=f&fipcpc=no&showdeleted=yes&indexes=no&headings=yes&notes=yes&direction=o2n&initial=A&cwid=none&tree=no&searchmode=smart).


## 0. Preamble

In [None]:
%run notebook_preamble.ipy

In [None]:
from zipfile import ZipFile

from io import BytesIO

from bs4 import BeautifulSoup

import xmltodict

## 1. Load data

#### Load the IPC code lookup


We have a collection of text files that map detailed IPC codes to their names.

In [None]:
import os

In [None]:
# Names of the IPC title-list files to parse. os.listdir returns entries in
# arbitrary order, so sort them to keep the row order of the combined table
# reproducible, and skip hidden entries (e.g. .DS_Store) that are not title lists.
ipc_dicts = sorted(
    name
    for name in os.listdir('../data/external/EN_ipc_title_list_20190101/')
    if not name.startswith('.')
)

In [None]:
cont = []

# Crude parser for the IPC title lists: read every file named in ipc_dicts and
# prefix each entry's description with the title of the most recent 4-character
# code above it, so that detailed codes are interpretable on their own.

for d in ipc_dicts:

    # NOTE(review): skiprows=1 drops the first line of the file, and pandas then
    # treats the next line as the header row, which is overwritten just below.
    # The first two lines of each file therefore never appear as data - confirm
    # that this is intended.
    t = pd.read_table(f'../data/external/EN_ipc_title_list_20190101/{d}',delimiter='\t',skiprows=1)

    t.columns = ['code','description']

    # Title of the 4-character code currently in scope; empty until one is seen.
    descr = ''

    # New description column, built row by row in file order.
    labelled = []

    for code, description in zip(t['code'], t['description']):

        if len(code) == 4:
            # Remember a tidied title (sentence case, anything from the first
            # '(' onwards dropped) but leave this row's own description as is.
            descr = description.lower().capitalize().split('(')[0]
            labelled.append(description)

        else:
            # str() guards against non-string (e.g. missing) descriptions.
            labelled.append('__'.join([descr, str(description)]))

    # Assign the whole column in one go. Writing through chained indexing
    # (t.loc[i]['description'] = ...) may modify a temporary copy and silently
    # leave t unchanged.
    t['description'] = labelled

    cont.append(t)

all_ipc_codes = pd.concat(cont).reset_index(drop=True)

In [None]:
# Most PATSTAT codes are 10 characters long, so the matching key is the first
# 10 characters of each code (shorter codes are kept whole by the slice).
all_ipc_codes['ipc_match'] = [code[:10] for code in all_ipc_codes['code']]

In [None]:
# Preview the first rows to eyeball the code, description and ipc_match columns.
all_ipc_codes.head()

In [None]:
# Persist the lookup as JSON, mapping each ipc_match key to its description.
# A dict holds one value per key, so rows sharing a key collapse to one entry.
with open('../data/external/ipc_def_lookup.json', mode='w') as outfile:
    description_by_key = all_ipc_codes.set_index('ipc_match')['description'].to_dict()
    json.dump(description_by_key, outfile)