# Explore extracting references
In this notebook we explore how to extract structured fields (authors, title, DOI) from the raw citation strings (`npl_text`) in the Google Patents data.


In [1]:
import pandas as pd



In [2]:
# Load the raw sample; each row is keyed by the patent's publication number.
patents = pd.read_csv(
    "../data/rand_npl.csv",
    index_col="publication_number",
    encoding='utf-8',
)




In [3]:
# Peek at the first rows to see which columns the raw file provides
patents.head(n=5)

Unnamed: 0_level_0,country_code,type,npl_text,rand
publication_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
EP-1482288-B1,EP,,"HILARY E SNELL ET AL: ""Fourier Transform Fabry...",0.964252
EP-3134253-A4,EP,XPI,"ROBERT MACCURDY ET AL: ""Hybrid printing of pho...",0.102272
EP-1811026-B1,EP,,Technical Bulletin; pCI and pSI Mammalian Expr...,0.976539
EP-1901235-B1,EP,,"VERGEEST J ET AL: ""Free-form surface copy and ...",0.202909
EP-2097241-B1,EP,,"BAETEN F ET AL: ""Barium titanate characterizat...",0.497251


In [4]:
# `rand` is not used in the cells below, so leave it out of the working frame
patents_clean = patents.drop(columns='rand')

In [5]:
# Check which country codes occur and how often
country_counts = patents_clean['country_code'].value_counts()
country_counts


EP    10000
Name: country_code, dtype: int64

In [6]:
# Every row has the same country code, so the column carries no information.
# Derive the working frame from `patents` (instead of from `patents_clean`
# itself) so that re-running this cell does not raise a KeyError on the
# already-dropped column.
patents_clean = patents.drop(columns=['rand', 'country_code'])

In [7]:
# Frequency of each value in the `type` column (missing values are not counted)
type_counts = patents_clean['type'].value_counts()
type_counts

A       1209
Y        428
X        379
XI       266
I        217
T        112
XY        92
XA        64
XP        63
XYI       55
YA        54
XAI       41
AD        35
IY        29
AP        26
IA        26
YD        26
IP        21
XAY       12
ID         9
XPI        8
YP         6
XD         6
L          6
XAYI       6
XDI        4
XIY        4
P          3
IAY        3
XDYI       2
XIA        2
PX         2
DA         2
XPYI       1
DXY        1
YDA        1
YPA        1
XDY        1
XOP        1
DX         1
XPY        1
Name: type, dtype: int64

**TODO:** Figure out what these `type` codes mean.

Now let's try to generate a new column.

In [8]:
# Author part: the string must start with a word character; capture lazily
# up to (not including) the first colon. Rows without such a prefix get NaN.
author_pattern = r'^(\w.*?):'
patents_clean['authors'] = patents_clean['npl_text'].str.extract(author_pattern)
patents_clean.head(30)

Unnamed: 0_level_0,type,npl_text,authors
publication_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
EP-1482288-B1,,"HILARY E SNELL ET AL: ""Fourier Transform Fabry...",HILARY E SNELL ET AL
EP-3134253-A4,XPI,"ROBERT MACCURDY ET AL: ""Hybrid printing of pho...",ROBERT MACCURDY ET AL
EP-1811026-B1,,Technical Bulletin; pCI and pSI Mammalian Expr...,
EP-1901235-B1,,"VERGEEST J ET AL: ""Free-form surface copy and ...",VERGEEST J ET AL
EP-2097241-B1,,"BAETEN F ET AL: ""Barium titanate characterizat...",BAETEN F ET AL
EP-2280048-B1,,"ULRICH JAHN: ""Decandisäure"", RÖMPP ONLINE 4.0,...",ULRICH JAHN
EP-2304028-B1,,"BREVNOV MAXIM G ET AL: ""Developmental Validati...",BREVNOV MAXIM G ET AL
EP-2310520-B1,,"BING FENG ET AL: ""Purification, characterizati...",BING FENG ET AL
EP-2334021-B1,,"CARLOS MOSQUERA ET AL: ""Non-Data-Aided Symbol ...",CARLOS MOSQUERA ET AL
EP-2737071-B1,,"GALIBERT L ET AL: ""Baculovirus deleted for chi...",GALIBERT L ET AL


Now to extract the titles.


In [9]:
def extract_part(search_str, df=None, var='npl_text'):
    """Extract the capture group(s) of a regex from one text column.

    Parameters
    ----------
    search_str : str
        Regular expression containing at least one capture group.
    df : pandas.DataFrame, optional
        Frame that holds the text column. When omitted, the notebook-level
        ``patents_clean`` frame is used; it is looked up when the function
        is called, not when it is defined.
    var : str, default 'npl_text'
        Name of the column in ``df`` to search.

    Returns
    -------
    pandas.DataFrame
        One column per capture group, indexed like ``df``. Rows where the
        pattern does not match contain NaN.

    Raises
    ------
    ValueError
        Raised by pandas if ``search_str`` has no capture group.
    """
    if df is None:
        # Resolve the default lazily so the function always sees the current
        # `patents_clean`, even if that name was rebound after this definition.
        df = patents_clean
    return df[var].str.extract(search_str)


In [10]:
# Title: the first double-quoted span in the citation.
# A first attempt additionally required a colon right before the opening
# quote; that assumes an "AUTHOR:" prefix and misses citations that begin
# with the quoted title, so the pattern simply starts at the quotes.
# The pattern is a raw string so that regex escapes are never interpreted
# (or warned about) as Python string escapes.
patents_clean['title'] = extract_part(r'"+(.*?)"')

In [11]:
# Compare the raw citation with the extracted title for the first rows
patents_clean.loc[:, ['npl_text', 'title']].head()

Unnamed: 0_level_0,npl_text,title
publication_number,Unnamed: 1_level_1,Unnamed: 2_level_1
EP-1482288-B1,"HILARY E SNELL ET AL: ""Fourier Transform Fabry...",Fourier Transform Fabry-Perot Interferometer
EP-3134253-A4,"ROBERT MACCURDY ET AL: ""Hybrid printing of pho...",Hybrid printing of photopolymers and electrome...
EP-1811026-B1,Technical Bulletin; pCI and pSI Mammalian Expr...,
EP-1901235-B1,"VERGEEST J ET AL: ""Free-form surface copy and ...",Free-form surface copy and paste techniques fo...
EP-2097241-B1,"BAETEN F ET AL: ""Barium titanate characterizat...",Barium titanate characterization by differenti...


In [12]:
# Citations that start directly with a quoted title (no "AUTHOR:" prefix)
# also get a title; rows without a title are treated as non-matching.
starts_with_3rd = patents_clean['title'].str.startswith('3rd', na=False)
patents_clean[starts_with_3rd].head(5)

Unnamed: 0_level_0,type,npl_text,authors,title
publication_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
EP-3570521-A1,A,"""3rd Generation Partnership Project; Technical...",,3rd Generation Partnership Project; Technical ...
EP-3570521-A1,A,"""3rd Generation Partnership Project; Technical...",,3rd Generation Partnership Project; Technical ...
EP-3570521-A1,X,"""3rd Generation Partnership Project;Technical ...",,3rd Generation Partnership Project;Technical S...
EP-2908463-A1,XI,"""3rd Generation Partnership Project; Technical...",,3rd Generation Partnership Project; Technical ...
EP-2908463-A1,A,"""3rd Generation Partnership Project; Technical...",,3rd Generation Partnership Project; Technical ...


Now to extract the DOI.

In [13]:
# DOI pattern, slightly modified from
# https://stackoverflow.com/a/10324802/3149349
# This is the simplified intermediate version shown in that answer while the
# author was still developing the regex. If it works sufficiently well,
# let's keep it at that.
# Raw string: `\S` must reach the regex engine untouched; in a normal string
# literal it is an invalid escape sequence and triggers a warning.
patents_clean['doi'] = extract_part(r'(10[.][0-9]{4,}(?:[.][0-9]+)*/\S+)')

In [14]:
# Compare the raw citation with the extracted DOI for the first 30 rows
patents_clean.loc[:, ['npl_text', 'doi']].head(30)

Unnamed: 0_level_0,npl_text,doi
publication_number,Unnamed: 1_level_1,Unnamed: 2_level_1
EP-1482288-B1,"HILARY E SNELL ET AL: ""Fourier Transform Fabry...",10.1117/12.60609
EP-3134253-A4,"ROBERT MACCURDY ET AL: ""Hybrid printing of pho...",
EP-1811026-B1,Technical Bulletin; pCI and pSI Mammalian Expr...,
EP-1901235-B1,"VERGEEST J ET AL: ""Free-form surface copy and ...",
EP-2097241-B1,"BAETEN F ET AL: ""Barium titanate characterizat...",10.1016/J.JEURCERAMSOC.2005.06.029
EP-2280048-B1,"ULRICH JAHN: ""Decandisäure"", RÖMPP ONLINE 4.0,...",
EP-2304028-B1,"BREVNOV MAXIM G ET AL: ""Developmental Validati...",10.1111/J.1556-4029.2009.01013.X
EP-2310520-B1,"BING FENG ET AL: ""Purification, characterizati...",10.1007/S00253-007-1117-3
EP-2334021-B1,"CARLOS MOSQUERA ET AL: ""Non-Data-Aided Symbol ...",10.1109/TSP.2007.907888
EP-2737071-B1,"GALIBERT L ET AL: ""Baculovirus deleted for chi...",


**TODO:** Still to extract: ISSN, year, journal title, URL.