# This notebook provides the code to:
- evaluate GPT-3 and GPT-4's accuracy to classify the office action citations. 
- use GPT-4 to classify a sample of 5000 citations
- train an LLM to classify the full set of citations 
- deploy the model on the full set of citations 
- sample the set of citations to manually label and classify them.

If you have any questions on this notebook, please, feel free to contact me by email: scharfmann.emma@gmail.com

## Load packages 

In [4]:
import glob 
import pandas as pd
import random 
import openai
import time
from tqdm import tqdm
from collections import Counter
from sklearn.metrics import multilabel_confusion_matrix
from multiprocessing import Pool
from functools import partial

import warnings
warnings.filterwarnings("ignore")


path_base = "/home/fs01/spec1142/Emma/test/"

f = open(path_base + "openai_key.txt", "r")
openai.api_key = f.read()


In [5]:
# Below is the code for calcultating the accuracy, the TPR and the FPR.
def accuracy(cm):
    accuracy = (cm.ravel()[0]+cm.ravel()[3])/sum(cm.ravel())
    return accuracy

def TPR(cm):
    TPR = cm[1][1]/(cm[1][1]+cm[1][0])
    return TPR

def FPR(cm):
    FPR = cm[0][1]/(cm[0][1]+cm[0][0])
    return FPR

## Sample oa citations

In [2]:
## load oa citations and store the citations into a dictionary 

files= glob.glob(path_base + 'oa_data_v1/*')

dic_result = {}
count = 0

for k in range(len(files)):
    file= glob.glob(path_base + 'oa_data_v1/*')[k]
    
    with open(file) as lines:
        for line_ in lines: 
            
            dic_result[count] = line_.replace('\n','')
            count += 1

## count number of citations
print(len(dic_result))

In [11]:
## store data into a dataframe

table_oa_citations = pd.DataFrame()
table_oa_citations['citation'] = dic_result.values()


In [12]:
## sample citations (100 citations sample)

table_oa_citations.sample(n=100)

Unnamed: 0,citation
666891,Gao Et Al Us Publication No 2018/0293445
726424,Gb-2291949-B
619051,Jp-2012103941-A
636592,English Translation Of Kr 10-1328742
199107,Ep-1919136-A1
...,...
609844,Jp-08244048-A
151055,Walter De 102007050797 A1 – Translation Used ...
797624,"Oxford Dictionary, Https://En.Oxforddictionar..."
536083,Wo-2017200295-A1


## Evaluate GPT-3's accuracy

In [19]:
## load sample of ~300 citations manually classified by Kyle 

data1 = pd.read_excel(path_base + 'test_files/Copy of oa_300_sample_checked_kyle.xlsx')
data1 = data1[data1['manual check'] == 'y']
data1.head()

Unnamed: 0,manual check,GPT4 check,GPT3.5 check,Bib subcategory,npl_biblio,md5,language_is_reliable,language_code,npl_cat,npl_cat_score,npl_cat_language_flag,patcit_id
1,y,y,y,,Watanabe Et Al Us Patent Application Publicat...,2ca504f11c3b378ce7be4619e2ee843f,True,en,PATENT,0.51,False,2ca504f11c3b378ce7be4619e2ee843f
4,y,y,y,JOURNAL ARTICLE,"Dzulkafli Et Al., ""Effects Of Talc On Fire Re...",0685ae955c71d728f69046458ac1db0f,True,en,BIBLIOGRAPHICAL_REFERENCE,0.98,False,0685ae955c71d728f69046458ac1db0f
5,y,y,y,,Zdepski Pub No Us 2017-0201784\n,c6de38a1aa0a879105ced194459f343e,True,en,PATENT,0.33,False,c6de38a1aa0a879105ced194459f343e
7,y,y,y,JOURNAL ARTICLE,"Nobori Et Al. (Cancer Research, 1997, 51:3193...",29e27156420faa44ae01a9e1a6363781,True,en,BIBLIOGRAPHICAL_REFERENCE,0.6,False,29e27156420faa44ae01a9e1a6363781
9,y,y,y,JOURNAL ARTICLE,"Fach Et Al, Neonatal Ovine Pulmonary Dendriti...",d2bea0db1c51b5ff13072c66202ba3fe,True,en,BIBLIOGRAPHICAL_REFERENCE,0.98,False,d2bea0db1c51b5ff13072c66202ba3fe


In [21]:
## clean the labels 

data1['category'] = [ elem[0] if pd.isna(elem[0]) == False else elem[1] for elem in data1[['Bib subcategory','npl_cat']].to_numpy()]
data1 = data1[ ( data1['category'] != 'BIBLIOGRAPHICAL_REFERENCE' ) &  ( data1['category'] != '?' ) ]
data1['category'] = data1['category'].replace('JOURNAL ARTICLE', 'JOURNAL_ARTICLE')
data1['category'] = data1['category'].replace('CONFERENCE PROCEEDINGS', 'CONFERENCE_PROCEEDINGS')
data1['category'] = data1['category'].replace('PREPRINT/WORKING PAPER/TECHNICAL REPORT', 'PREPRINT/WORKING_PAPER/TECHNICAL_REPORT')


In [27]:
number = 10

## prompt for GPT 3
prompt = """I am going to give you""" + str(number) + """ cited documents that have been made in office actions by the US patent office. I want you to classify each cited document as being one of the following:
WEBPAGE: Website
PATENT: A patent or patent application
PREPRINT/WORKING_PAPER/TECHNICAL_REPORT: Any public, non-peer reviewed technical document. These can be published on preprint servers, institute/personal websites, or even governmental archives.
JOURNAL_ARTICLE: A peer reviewed article published in a journal.
CONFERENCE_PROCEEDINGS: An article published as part of conference proceedings. The peer review process for such proceedings varies significantly, and differs from journal article in that it is a one-off publication.
BOOK: A book or chapter in a book. Book chapters are a common outlet for academic research, but are often not peer reviewed by independent parties, and are usually less accessible than the average journal article.
THESIS: Thesis, sually archived by the degree-granting institution.
NORM_STANDARD: An industrial norm or standard
PRODUCT_DOCUMENTATION: documentation for a product, such as a user manual or catalogue
OFFICE_ACTION: A different office action sent by the patent office
WIKI: A wikipedia page (a subset of webpage)
DATABASE: A database, such as a genetic or corporate database
LITIGATION: A court case or formal opposition proceeding within the patent office
SEARCH_REPORT: A search report issued by a patent office
Only list the classes and the first word of the cited document. 
Be VERY carefull not to forget cited documents!
"""

true_labels = []
results = []

## ask GPT to classify the chunks of citations. Note that GPT tends to forget some citations. 
for k in tqdm(range(20)):

    citations = data1[['npl_biblio','category']].to_numpy()[number*k:number*(k+1)]
    texts = "; ".join([ str(k+1) + ": "  +citations[:,0][k] for k  in range(len(citations[:,0])) ] )
    
    completion = openai.ChatCompletion.create(
        model='gpt-3.5-turbo-0125',
        messages=[{"role": "system", "content": prompt},
                  {"role": "user", "content": texts}],
        temperature= 0.1)
    
    true_labels += list(citations[:,1])
    res = completion['choices'][0]['message']['content'].split('\n')
    
    results += res



100%|███████████████████████████████████████████| 20/20 [00:28<00:00,  1.44s/it]


In [38]:
## clean GPT's classification

labels = set(list(data1['category']))
predicted_labels = [ list(set(elem.replace('\r', '').replace(':','').replace('TECHNICAL_REPORT/WORKING_PAPER','PREPRINT/WORKING_PAPER/TECHNICAL_REPORT').split()) &  labels)[0] if list(set(elem.replace('\r', '').replace(':','').replace('TECHNICAL_REPORT/WORKING_PAPER','PREPRINT/WORKING_PAPER/TECHNICAL_REPORT').split()) &  labels) != [] else 'OTHER' for elem in results if elem != '']


In [39]:
## count citations in each class

df_counter = pd.DataFrame()
df_counter['class'] = Counter(true_labels).keys()
df_counter['number of elements'] = Counter(true_labels).values()
df_counter


Unnamed: 0,class,number of elements
0,PATENT,43
1,JOURNAL_ARTICLE,111
2,CONFERENCE_PROCEEDINGS,14
3,BOOK,4
4,PRODUCT_DOCUMENTATION,13
5,WEBPAGE,6
6,PREPRINT/WORKING_PAPER/TECHNICAL_REPORT,1
7,OFFICE_ACTION,1
8,THESIS,1
9,DATABASE,2


In [40]:
## evaluate GPT's accuracy 

labels = list(set(list(data1['category'])))
conf_matrix = multilabel_confusion_matrix(true_labels , predicted_labels, labels=labels)


df_metrics = pd.DataFrame()

missclassifications_rate = 100*sum( [ conf_matrix[k][0][1] for k in range(len(conf_matrix)) ]) / len(predicted_labels)
print('Overall accuracy: ', 100 - missclassifications_rate)


acc = []
tpr = []
fpr = []
for elem in conf_matrix:
    acc.append(accuracy(elem))
    tpr.append(TPR(elem))
    fpr.append(FPR(elem))


df_metrics['class'] = labels
df_metrics['accuracy'] = acc
df_metrics['TPR'] = tpr
df_metrics['FPR'] = fpr

df_metrics.merge(df_counter, on='class')
    

Overall accuracy:  92.5


Unnamed: 0,class,accuracy,TPR,FPR,number of elements
0,JOURNAL_ARTICLE,0.98,0.972973,0.011236,111
1,WEBPAGE,0.965,1.0,0.036082,6
2,BOOK,0.99,1.0,0.010204,4
3,SEARCH_REPORT,0.995,1.0,0.005025,1
4,OFFICE_ACTION,0.995,0.0,0.0,1
5,CONFERENCE_PROCEEDINGS,0.99,0.928571,0.005376,14
6,PREPRINT/WORKING_PAPER/TECHNICAL_REPORT,0.99,0.0,0.005025,1
7,PATENT,0.98,0.953488,0.012739,43
8,WIKI,0.995,0.5,0.0,2
9,PRODUCT_DOCUMENTATION,0.97,0.538462,0.0,13


## Evaluate GPT-4's accuracy

In [43]:
## load sample of ~300 citations manually classified by Kyle 

data1 = pd.read_excel('/home/fs01/spec1142/Emma/test/test_files/Copy of oa_300_sample_checked_kyle.xlsx')
data1 = data1[data1['manual check'] == 'y']
data1

Unnamed: 0,manual check,GPT4 check,GPT3.5 check,Bib subcategory,npl_biblio,md5,language_is_reliable,language_code,npl_cat,npl_cat_score,npl_cat_language_flag,patcit_id
1,y,y,y,,Watanabe Et Al Us Patent Application Publicat...,2ca504f11c3b378ce7be4619e2ee843f,True,en,PATENT,0.51,False,2ca504f11c3b378ce7be4619e2ee843f
4,y,y,y,JOURNAL ARTICLE,"Dzulkafli Et Al., ""Effects Of Talc On Fire Re...",0685ae955c71d728f69046458ac1db0f,True,en,BIBLIOGRAPHICAL_REFERENCE,0.98,False,0685ae955c71d728f69046458ac1db0f
5,y,y,y,,Zdepski Pub No Us 2017-0201784\n,c6de38a1aa0a879105ced194459f343e,True,en,PATENT,0.33,False,c6de38a1aa0a879105ced194459f343e
7,y,y,y,JOURNAL ARTICLE,"Nobori Et Al. (Cancer Research, 1997, 51:3193...",29e27156420faa44ae01a9e1a6363781,True,en,BIBLIOGRAPHICAL_REFERENCE,0.60,False,29e27156420faa44ae01a9e1a6363781
9,y,y,y,JOURNAL ARTICLE,"Fach Et Al, Neonatal Ovine Pulmonary Dendriti...",d2bea0db1c51b5ff13072c66202ba3fe,True,en,BIBLIOGRAPHICAL_REFERENCE,0.98,False,d2bea0db1c51b5ff13072c66202ba3fe
...,...,...,...,...,...,...,...,...,...,...,...,...
294,y,,,,Machine Translation Of Jp-2007224953 (Year: 2...,3e8f555055a762f69f1349d5df9887be,True,en,PATENT,0.34,False,3e8f555055a762f69f1349d5df9887be
295,y,,,,"Ibm, Translucent Drag Icons (Tdb Acc. No. Nn9...",02bf7bfcd8b0c278b2f170a4863b1963,True,en,BIBLIOGRAPHICAL_REFERENCE,0.65,False,02bf7bfcd8b0c278b2f170a4863b1963
296,y,,,JOURNAL ARTICLE,"Mastaloudis, A., Et Al., “Antioxidant Supplem...",e994b3f8571d466a279ba05b87eacf87,True,en,BIBLIOGRAPHICAL_REFERENCE,0.97,False,e994b3f8571d466a279ba05b87eacf87
297,y,,,CONFERENCE PROCEEDINGS,"Chang Et Al., Motion Registration And Correct...",6e88f4dabe30e52db23a420f8203a433,True,en,BIBLIOGRAPHICAL_REFERENCE,0.97,False,6e88f4dabe30e52db23a420f8203a433


In [46]:
## clean the labels 

data1['category'] = [ elem[0] if pd.isna(elem[0]) == False else elem[1] for elem in data1[['Bib subcategory','npl_cat']].to_numpy()]
data1 = data1[ ( data1['category'] != 'BIBLIOGRAPHICAL_REFERENCE' ) &  ( data1['category'] != '?' ) ]
data1['category'] = data1['category'].replace('JOURNAL ARTICLE', 'JOURNAL_ARTICLE')
data1['category'] = data1['category'].replace('CONFERENCE PROCEEDINGS', 'CONFERENCE_PROCEEDINGS')
data1['category'] = data1['category'].replace('PREPRINT/WORKING PAPER/TECHNICAL REPORT', 'PREPRINT/WORKING_PAPER/TECHNICAL_REPORT')


In [47]:
import openai
import time
from tqdm import tqdm

number = 10


## prompt for GPT 3
prompt = """I am going to give you""" + str(number) + """ cited documents that have been made in office actions by the US patent office. I want you to classify each cited document as being one of the following:
WEBPAGE: Website
PATENT: A patent or patent application
PREPRINT/WORKING_PAPER/TECHNICAL_REPORT: Any public, non-peer reviewed technical document. These can be published on preprint servers, institute/personal websites, or even governmental archives.
JOURNAL_ARTICLE: A peer reviewed article published in a journal.
CONFERENCE_PROCEEDINGS: An article published as part of conference proceedings. The peer review process for such proceedings varies significantly, and differs from journal article in that it is a one-off publication.
BOOK: A book or chapter in a book. Book chapters are a common outlet for academic research, but are often not peer reviewed by independent parties, and are usually less accessible than the average journal article.
THESIS: Thesis, sually archived by the degree-granting institution.
NORM_STANDARD: An industrial norm or standard
PRODUCT_DOCUMENTATION: documentation for a product, such as a user manual or catalogue
OFFICE_ACTION: A different office action sent by the patent office
WIKI: A wikipedia page (a subset of webpage)
DATABASE: A database, such as a genetic or corporate database
LITIGATION: A court case or formal opposition proceeding within the patent office
SEARCH_REPORT: A search report issued by a patent office
Only list the classes and the first word of the cited document. 
Be VERY carefull not to forget cited documents!
"""

true_labels = []
results = []

## ask GPT to classify the chunks of citations. Note that GPT tends to forget some citations. 
for k in tqdm(range(20)):

    citations = data1[['npl_biblio','category']].to_numpy()[number*k:number*(k+1)]
    texts = "; ".join([ str(k+1) + ": "  +citations[:,0][k] for k  in range(len(citations[:,0])) ] )
    
    completion = openai.ChatCompletion.create(
        #model="gpt-4-0125-preview", 
        model='gpt-4-0125-preview',
        messages=[{"role": "system", "content": prompt},
                  {"role": "user", "content": texts}],
        temperature= 0.1)
    
    true_labels += list(citations[:,1])
    res = completion['choices'][0]['message']['content'].split('\n')
    
    results += res



100%|███████████████████████████████████████████| 20/20 [00:58<00:00,  2.94s/it]


In [48]:
## clean GPT's classification

labels = set(list(data1['category']))
predicted_labels = [ list(set(elem.replace('\r', '').replace(':','').replace('TECHNICAL_REPORT/WORKING_PAPER','PREPRINT/WORKING_PAPER/TECHNICAL_REPORT').split()) &  labels)[0] if list(set(elem.replace('\r', '').replace(':','').replace('TECHNICAL_REPORT/WORKING_PAPER','PREPRINT/WORKING_PAPER/TECHNICAL_REPORT').split()) &  labels) != [] else 'OTHER' for elem in results if elem != '']


In [49]:
## count citations in each class

df_counter = pd.DataFrame()
df_counter['class'] = Counter(true_labels).keys()
df_counter['number of elements'] = Counter(true_labels).values()
df_counter

Unnamed: 0,class,number of elements
0,PATENT,43
1,JOURNAL_ARTICLE,111
2,CONFERENCE_PROCEEDINGS,14
3,BOOK,4
4,PRODUCT_DOCUMENTATION,13
5,WEBPAGE,6
6,PREPRINT/WORKING_PAPER/TECHNICAL_REPORT,1
7,OFFICE_ACTION,1
8,THESIS,1
9,DATABASE,2


In [50]:
## evaluate GPT's accuracy 

labels = list(set(list(data1['category'])))
conf_matrix = multilabel_confusion_matrix(true_labels , predicted_labels, labels=labels)

df_metrics = pd.DataFrame()


missclassifications_rate = 100*sum( [ conf_matrix[k][0][1] for k in range(len(conf_matrix)) ]) / len(predicted_labels)
print('Overall accuracy: ', 100 - missclassifications_rate)


acc = []
tpr = []
fpr = []
for elem in conf_matrix:
    acc.append(accuracy(elem))
    tpr.append(TPR(elem))
    fpr.append(FPR(elem))


df_metrics['class'] = labels
df_metrics['accuracy'] = acc
df_metrics['TPR'] = tpr
df_metrics['FPR'] = fpr

df_metrics.merge(df_counter, on='class')
    
    

Overall accuracy:  93.5


Unnamed: 0,class,accuracy,TPR,FPR,number of elements
0,JOURNAL_ARTICLE,0.985,0.972973,0.0,111
1,WEBPAGE,0.975,1.0,0.025773,6
2,BOOK,0.99,1.0,0.010204,4
3,SEARCH_REPORT,1.0,1.0,0.0,1
4,OFFICE_ACTION,0.995,0.0,0.0,1
5,CONFERENCE_PROCEEDINGS,0.985,0.857143,0.005376,14
6,PREPRINT/WORKING_PAPER/TECHNICAL_REPORT,0.98,1.0,0.020101,1
7,PATENT,0.995,1.0,0.006369,43
8,WIKI,1.0,1.0,0.0,2
9,PRODUCT_DOCUMENTATION,0.97,0.538462,0.0,13


## Use GPT-4 to classify a 5000 citations sample

In [51]:
dic_result = {}
count = 0
path = '/home/fs01/spec1142/Emma/test/for_grobid_all_v0/'
file = path + 'oa_crosswalk_without_sp_levi1_bq_citations0.txt'

with open(file) as lines:
    for line_ in lines: 
        
        dic_result[count] = line_.replace('\n','')
        count += 1

print(len(dic_result))

5000


In [52]:
df5000 = pd.DataFrame()
df5000['oa_citation'] = dic_result.values()
df5000.head()

Unnamed: 0,oa_citation
0,Myositis Association Retrieved From On-Line We...
1,Hirsh Et Al. Weekly Nab-Paclitaxel In Combina...
2,"Vaidyanathan Et Al. Bioconjugate Chem. 1990, ..."
3,Wiegert ` 259
4,"Trakadis, Y.J. ""Patient-Controlled Encrypted ..."


In [60]:
## function to classify a chunk of 50 citations using GPT-4

def classify_50_oa_citations(citations, openai_api_key):

    """
    This function uses the OpenAI API to classify a series of 50 citations made in office actions by the US patent office.

    Parameters:
    citations (list): A list of citations to be classified.
    openai.api_key (str): The API key for the OpenAI API.

    Note:
    - The function constructs a query string that includes the citations to be classified and a set of instructions for the OpenAI API.
    - The function returns a list of classification results, with each result including the class of the cited document and the number of the cited document.
    """
    
    number = 50
    results = []
    
    query = """I am going to give you a series of """ + str(number) + """ citations that have been made in office actions by the US patent office. I want you to classify each cited document as being one of the following:
    WEBPAGE: Website
    PATENT: A patent or patent application
    PREPRINT/WORKING_PAPER/TECHNICAL_REPORT: Any public, non-peer reviewed technical document. These can be published on preprint servers, institute/personal websites, or even governmental archives.
    JOURNAL_ARTICLE: A peer reviewed article published in a journal.
    CONFERENCE_PROCEEDINGS: An article published as part of conference proceedings. The peer review process for such proceedings varies significantly, and differs from journal article in that it is a one-off publication.
    BOOK: A book or chapter in a book. Book chapters are a common outlet for academic research, but are often not peer reviewed by independent parties, and are usually less accessible than the average journal article.
    THESIS: Thesis, sually archived by the degree-granting institution.
    NORM_STANDARD: An industrial norm or standard
    PRODUCT_DOCUMENTATION: documentation for a product, such as a user manual or catalogue
    OFFICE_ACTION: A different office action sent by the patent office
    WIKI: A wikipedia page (a subset of webpage)
    DATABASE: A database, such as a genetic or corporate database
    LITIGATION: A court case or formal opposition proceeding within the patent office
    SEARCH_REPORT: A search report issued by a patent office
    Only list the classes of the cited document and the number of the cited document."""
    
    
    texts = "; ".join([ str(k) + ': ' + citations[k] for k  in range(len(citations)) ] )
        
    completion = openai.ChatCompletion.create(
            model="gpt-4-0125-preview", 
            messages=[{"role": "system", "content": query},
                      {"role": "user", "content": texts}],
            temperature= 0.2)
        
    results += completion['choices'][0]['message']['content'].split('\n')
        

    return results 

In [61]:
## function 

def multi_gpt(openai_api_key,i):

    """
    This function uses the OpenAI API to classify a series of citations in a given dataframe, in batches of 50.

    Parameters:
    openai.api_key (str): The API key for the OpenAI API.
    i (int): The index of the dataframe to be classified.

    Note:
    - The function selects a subset of the dataframe `df5000` based on the given index `i`.
    - The function then divides the subset into smaller batches of 50 citations and uses the `classify_50_oa_citations` function to classify each batch.
    - The function returns a list of classification results for the entire subset of the dataframe.
    """
    
    result = []
    medium_df = df5000[200*i:200*(i+1)]
    for k in range(4):
        small_df = medium_df[50*k:50*(k+1)]
        citations = list(small_df['oa_citation'])
        res = classify_50_oa_citations(citations,openai_api_key)

        if len(res) == 50:
            result += res
        else:
            res = classify_50_oa_citations(citations,openai_api_key)
    return result

In [None]:
## classify the citations (uses 12 cpus)

openai_api_key = openai.api_key

p = Pool(processes=12)
func = partial(multi_gpt,openai_api_key)
results = p.map(func, [ i  for i in range(12)])
p.close()

In [102]:
## clean the results. Note that GPT-4 tends to forget some citations. 

labels = []
clean_list = []
count = 0 

for elem in results:
    k = 0 
    for line in elem:
        count += 1
        if line.split(':')[0] == str(k):
            k += 1
            if len(line.split(': ')) == 1:
                labels.append('None')
            else:
                labels.append(line.split(': ')[1]) 
            
            if k == 50:
                k = 0
        else:
            
            labels.append('None') 
            k += 1
            if len(line.split(': ')) == 1:
                labels.append('None')
            else:
                labels.append(line.split(': ')[1]) 
            
            if k == 50:
                k = 0
            k += 1
            if k == 50:
                k = 0
            
            


1450
2349
4748


In [119]:
## save the data classified by GPT-4 and flag the potential errors

df5000['labels'] = labels
df5000[df5000['labels'] == 'None']
df5000['flag'] = [ 1 if index in range(1400,1450) else 1 if  index in range(2300,2350) else 1 if index in range(4700,4750) else 0 for index in df5000.index]
df5000

Unnamed: 0,oa_citation,labels,flag
0,Myositis Association Retrieved From On-Line We...,WEBPAGE,0
1,Hirsh Et Al. Weekly Nab-Paclitaxel In Combina...,JOURNAL_ARTICLE,0
2,"Vaidyanathan Et Al. Bioconjugate Chem. 1990, ...",JOURNAL_ARTICLE,0
3,Wiegert ` 259,PATENT,0
4,"Trakadis, Y.J. ""Patient-Controlled Encrypted ...",JOURNAL_ARTICLE,0
...,...,...,...
4995,Us 0057553 A,PATENT,0
4996,De-102017004043-A1,PATENT,0
4997,Ting Et Al. (Cn 105151567) Machine Translatio...,PATENT,0
4998,Ca2829631,PATENT,0


In [121]:
## save the 5000 citations sample

df5000.to_csv('/home/fs01/spec1142/Emma/test/' + 'gpt4_5000sample.csv', index = False)

## Train our own model

### Load data

In [6]:
## load manually classified citations and clean the labels 

files = glob.glob('/home/fs01/spec1142/Emma/test/test_files/oa/*')
data = pd.concat( [ pd.read_excel(elem) for elem in files])
data = data[ ( data['manual check'] == 'y' ) |  ( data['manual_check'] == 'y')]

data['category'] = [  elem for elem in data['npl_cat']]

data = data[ ( data['category'] != 'BIBLIOGRAPHICAL_REFERENCE' ) &  ( data['category'] != '?' ) ]
data['category'] = data['category'].replace('JOURNAL ARTICLE', 'JOURNAL_ARTICLE')
data['category'] = data['category'].replace('CONFERENCE PROCEEDINGS', 'CONFERENCE_PROCEEDINGS')
data['category'] = data['category'].replace('PREPRINT/WORKING PAPER/TECHNICAL REPORT', 'PREPRINT/WORKING_PAPER/TECHNICAL_REPORT')


In [7]:
## load  citations classified by GPT-4 and clean the labels 

gpt_data = pd.read_csv('/home/fs01/spec1142/Emma/test/test_files/gpt4_5000sample.csv')
gpt_data = gpt_data[gpt_data['flag'] == 0]
gpt_data = gpt_data.rename(columns = { 'oa_citation':'npl_biblio' , 'labels' : 'category' })
gpt_data  = gpt_data[['npl_biblio','category']]


In [8]:
## merge the two files

data  = data[['npl_biblio','category']]
data = pd.concat([data,gpt_data])
data = data[(data['category'] != 'None Cited')&(data['category'] != 'GOVERNMENT_REPORT') ]

In [9]:
## sample the citations classified as patent, journal article and webpage to have a more balanced dataset.

data_patents = data[data['category'] == 'PATENT'].sample(frac=0.12)
data_articles = data[data['category'] == 'JOURNAL_ARTICLE'].sample(frac=0.4)
data_webpage = data[data['category'] == 'WEBPAGE'].sample(frac=0.3)

data_no_patents = data[(data['category'] != 'PATENT') & (data['category'] != 'JOURNAL_ARTICLE') & (data['category'] != 'WEBPAGE')]
data = pd.concat([data_patents,data_articles,data_webpage,data_no_patents]).sample(frac=1)

In [21]:
## keep classes with more than 20 datapoints

df = data.groupby('category').count()
data = data[data['category'].isin(list(df[df['npl_biblio'] > 20].index))]
data['labels'] = pd.factorize(data['category'], sort=True)[0]
data = data.sample(frac=1)
dic_labels = { elem[1] : elem[0] for elem in data[['category','labels']].drop_duplicates().to_numpy() } 


In [22]:
## print dataset by classes

print(len(data['category']))
data.groupby('category').count()

1908


Unnamed: 0_level_0,npl_biblio,labels
category,Unnamed: 1_level_1,Unnamed: 2_level_1
BOOK,86,86
CONFERENCE_PROCEEDINGS,229,229
DATABASE,141,141
JOURNAL_ARTICLE,399,399
LITIGATION,34,34
NORM_STANDARD,51,51
OFFICE_ACTION,103,103
PATENT,347,347
PREPRINT/WORKING_PAPER/TECHNICAL_REPORT,101,101
PRODUCT_DOCUMENTATION,103,103


### Train model

In [10]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# Dummy data (replace this with your dataset)

texts = list(data['npl_biblio'])
labels = list(data['labels'])

# Encoding labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Splitting the data
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, encoded_labels, test_size=0.2, random_state=42
)

# Custom Dataset class
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Load BERT model and tokenizer
model_name = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(set(encoded_labels)))

# Create Dataset instances
train_dataset = CustomDataset(train_texts, train_labels, tokenizer)
val_dataset = CustomDataset(val_texts, val_labels, tokenizer)

# DataLoader instances
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer and loss function
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# Training loop
num_epochs = 8
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    # Validation
    model.eval()
    val_predictions = []
    val_true_labels = []
    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc=f"Validation - Epoch {epoch + 1}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=1)
            val_predictions.extend(predictions.cpu().numpy())
            val_true_labels.extend(labels.cpu().numpy())

    # Calculate accuracy
    accuracy = accuracy_score(val_true_labels, val_predictions)
    print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}, Validation Accuracy: {accuracy:.4f}")

# Save the fine-tuned model
model.save_pretrained("fine_tuned_model")
tokenizer.save_pretrained("fine_tuned_model")


  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████████████████████████████| 24/24 [04:02<00:00, 10.08s/it]
Validation - Epoch 1: 100%|███████████████████████| 6/6 [00:08<00:00,  1.40s/it]


Epoch 1, Loss: 53.3975, Validation Accuracy: 0.5131


Epoch 2: 100%|██████████████████████████████████| 24/24 [02:19<00:00,  5.83s/it]
Validation - Epoch 2: 100%|███████████████████████| 6/6 [00:09<00:00,  1.57s/it]


Epoch 2, Loss: 38.5001, Validation Accuracy: 0.6047


Epoch 3: 100%|██████████████████████████████████| 24/24 [02:28<00:00,  6.19s/it]
Validation - Epoch 3: 100%|███████████████████████| 6/6 [00:08<00:00,  1.38s/it]


Epoch 3, Loss: 30.2190, Validation Accuracy: 0.6990


Epoch 4: 100%|██████████████████████████████████| 24/24 [02:33<00:00,  6.38s/it]
Validation - Epoch 4: 100%|███████████████████████| 6/6 [00:08<00:00,  1.36s/it]


Epoch 4, Loss: 23.6015, Validation Accuracy: 0.7592


Epoch 5: 100%|██████████████████████████████████| 24/24 [02:19<00:00,  5.83s/it]
Validation - Epoch 5: 100%|███████████████████████| 6/6 [00:09<00:00,  1.61s/it]


Epoch 5, Loss: 18.6547, Validation Accuracy: 0.7775


Epoch 6: 100%|██████████████████████████████████| 24/24 [02:37<00:00,  6.58s/it]
Validation - Epoch 6: 100%|███████████████████████| 6/6 [00:08<00:00,  1.38s/it]


Epoch 6, Loss: 15.1288, Validation Accuracy: 0.7592


Epoch 7: 100%|██████████████████████████████████| 24/24 [02:23<00:00,  5.97s/it]
Validation - Epoch 7: 100%|███████████████████████| 6/6 [00:08<00:00,  1.48s/it]


Epoch 7, Loss: 12.4134, Validation Accuracy: 0.7775


Epoch 8: 100%|██████████████████████████████████| 24/24 [02:22<00:00,  5.93s/it]
Validation - Epoch 8: 100%|███████████████████████| 6/6 [00:08<00:00,  1.45s/it]


Epoch 8, Loss: 10.0054, Validation Accuracy: 0.7984


('fine_tuned_model/tokenizer_config.json',
 'fine_tuned_model/special_tokens_map.json',
 'fine_tuned_model/vocab.txt',
 'fine_tuned_model/added_tokens.json')

### Evaluate the model

In [14]:
## load sample of ~300 citations manually classified by Kyle 

data1 = pd.read_excel('/home/fs01/spec1142/Emma/test/test_files/Copy of oa_300_sample_checked_kyle.xlsx')
data1 = data1[data1['manual check'] == 'y']
data1

Unnamed: 0,manual check,GPT4 check,GPT3.5 check,Bib subcategory,npl_biblio,md5,language_is_reliable,language_code,npl_cat,npl_cat_score,npl_cat_language_flag,patcit_id
1,y,y,y,,Watanabe Et Al Us Patent Application Publicat...,2ca504f11c3b378ce7be4619e2ee843f,True,en,PATENT,0.51,False,2ca504f11c3b378ce7be4619e2ee843f
4,y,y,y,JOURNAL ARTICLE,"Dzulkafli Et Al., ""Effects Of Talc On Fire Re...",0685ae955c71d728f69046458ac1db0f,True,en,BIBLIOGRAPHICAL_REFERENCE,0.98,False,0685ae955c71d728f69046458ac1db0f
5,y,y,y,,Zdepski Pub No Us 2017-0201784\n,c6de38a1aa0a879105ced194459f343e,True,en,PATENT,0.33,False,c6de38a1aa0a879105ced194459f343e
7,y,y,y,JOURNAL ARTICLE,"Nobori Et Al. (Cancer Research, 1997, 51:3193...",29e27156420faa44ae01a9e1a6363781,True,en,BIBLIOGRAPHICAL_REFERENCE,0.60,False,29e27156420faa44ae01a9e1a6363781
9,y,y,y,JOURNAL ARTICLE,"Fach Et Al, Neonatal Ovine Pulmonary Dendriti...",d2bea0db1c51b5ff13072c66202ba3fe,True,en,BIBLIOGRAPHICAL_REFERENCE,0.98,False,d2bea0db1c51b5ff13072c66202ba3fe
...,...,...,...,...,...,...,...,...,...,...,...,...
294,y,,,,Machine Translation Of Jp-2007224953 (Year: 2...,3e8f555055a762f69f1349d5df9887be,True,en,PATENT,0.34,False,3e8f555055a762f69f1349d5df9887be
295,y,,,,"Ibm, Translucent Drag Icons (Tdb Acc. No. Nn9...",02bf7bfcd8b0c278b2f170a4863b1963,True,en,BIBLIOGRAPHICAL_REFERENCE,0.65,False,02bf7bfcd8b0c278b2f170a4863b1963
296,y,,,JOURNAL ARTICLE,"Mastaloudis, A., Et Al., “Antioxidant Supplem...",e994b3f8571d466a279ba05b87eacf87,True,en,BIBLIOGRAPHICAL_REFERENCE,0.97,False,e994b3f8571d466a279ba05b87eacf87
297,y,,,CONFERENCE PROCEEDINGS,"Chang Et Al., Motion Registration And Correct...",6e88f4dabe30e52db23a420f8203a433,True,en,BIBLIOGRAPHICAL_REFERENCE,0.97,False,6e88f4dabe30e52db23a420f8203a433


In [None]:
## clean the labels 

data1['category'] = [ elem[0] if pd.isna(elem[0]) == False else elem[1] for elem in data1[['Bib subcategory','npl_cat']].to_numpy()]
data1 = data1[ ( data1['category'] != 'BIBLIOGRAPHICAL_REFERENCE' ) &  ( data1['category'] != '?' ) ]
data1['category'] = data1['category'].replace('JOURNAL ARTICLE', 'JOURNAL_ARTICLE')
data1['category'] = data1['category'].replace('CONFERENCE PROCEEDINGS', 'CONFERENCE_PROCEEDINGS')
data1['category'] = data1['category'].replace('PREPRINT/WORKING PAPER/TECHNICAL REPORT', 'PREPRINT/WORKING_PAPER/TECHNICAL_REPORT')



In [16]:
## load our own model 

from transformers import TextClassificationPipeline

model_name = 'fine_tuned_model'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(set(encoded_labels)))

pipe2 = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=False)


2024-05-20 18:07:01.923779: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-20 18:07:01.923908: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-20 18:07:02.599782: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-20 18:07:03.449559: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [32]:
## classify the testing set with our model

test = data1[data1['category'].isin(dic_labels.values())][['npl_biblio','category']].to_numpy()

start = time.time()
pred_label = []
true_label = []
pred_label_raw = pipe2(list(test[:,0]), batch_size = 8)

for k in tqdm(range(len(test))):
    pred_label.append(dic_labels[int(pred_label_raw[k]['label'][6:])])
    true_label.append(test[k][1])
    

end = time.time()
print(end - start)

labels = list(set(true_label))

In [36]:
## evaluate the model 

conf_matrix = multilabel_confusion_matrix(true_label , pred_label,labels=labels)#, labels=labels)

df_metrics = pd.DataFrame()


missclassifications_rate = 100*sum( [ conf_matrix[k][0][1] for k in range(len(conf_matrix)) ]) / len(pred_label)
print('Overall accuracy: ', 100 - missclassifications_rate)


acc = []
tpr = []
fpr = []
count_true = [] 
for elem in conf_matrix:
    acc.append(accuracy(elem))
    tpr.append(TPR(elem))
    fpr.append(FPR(elem))
    count_true.append(elem[1][1] + elem[1][0])


df_metrics['labels'] = [ k for k in labels]

df_metrics['accuracy'] = acc
df_metrics['TPR'] = tpr
df_metrics['FPR'] = fpr
df_metrics['true'] = count_true

df_metrics
    

Overall accuracy:  83.02752293577981


Unnamed: 0,labels,accuracy,TPR,FPR,true
0,WEBPAGE,0.940367,0.777778,0.052632,9
1,JOURNAL_ARTICLE,0.949541,0.905172,0.0,116
2,THESIS,0.995413,0.0,0.0,1
3,PRODUCT_DOCUMENTATION,0.954128,0.533333,0.014778,15
4,SEARCH_REPORT,0.995413,1.0,0.004608,1
5,BOOK,0.990826,0.833333,0.004717,6
6,DATABASE,0.995413,1.0,0.00463,2
7,NORM_STANDARD,0.995413,0.0,0.0,1
8,PREPRINT/WORKING_PAPER/TECHNICAL_REPORT,0.972477,0.5,0.023148,2
9,OFFICE_ACTION,0.995413,0.0,0.0,1


## Classify the citations with the model

In [None]:
# Load BERT model and tokenizer

from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import TextClassificationPipeline


## load classes names 
dic_labels = {7: 'PATENT',
 4: 'LITIGATION',
 2: 'DATABASE',
 13: 'WIKI',
 12: 'WEBPAGE',
 1: 'CONFERENCE_PROCEEDINGS',
 3: 'JOURNAL_ARTICLE',
 6: 'OFFICE_ACTION',
 10: 'SEARCH_REPORT',
 9: 'PRODUCT_DOCUMENTATION',
 5: 'NORM_STANDARD',
 0: 'BOOK',
 8: 'PREPRINT/WORKING_PAPER/TECHNICAL_REPORT',
 11: 'THESIS'}

encoded_labels = list(encoded_labels.values())


## load classification model 
model_name = 'fine_tuned_model'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(set(encoded_labels)))

pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=False,truncation=True)



## classify the citations and save the classes labels 
files = glob.glob(path_base + 'oa_data_v1/*')


for k in range(162,len(files)):

    ## load citations
    dic_result = {}
    count = 0
    file= files[k]
    
    with open(file) as lines:
        for line_ in lines: 
            
            dic_result[count] = line_.replace('\n','')
            count += 1

    print(len(dic_result))
    
    df = pd.DataFrame()
    df['oa_citation'] = dic_result.values()
    
    ## classify citations
    start = time.time()
    result = pipe(list(df['oa_citation']), batch_size = 128)
    end = time.time()
    print(end - start)
    
    list_pred = [] 
    for elem in result:
        list_pred.append(dic_labels[int(elem['label'][6:])])
    
    
    
    ## save classified citations
    df['label'] = list_pred
    df.to_csv(path_base + 'oa_data_v1_classified/' + file.split('/')[-1].split('.')[0] + '.tsv', sep = "\t", index = False)
    



## Sample classified citations

In [39]:
## load classified citations

table = pd.read_csv(path_base + "classificed_oa_data_v1.tsv", delimiter = "\t")
table.head(20)

Unnamed: 0,oa_citation,label
0,An English Machine Translation Of Александр Ви...,DATABASE
1,"Dash And Konkimalla, Poly-Є-Caprolactone Based...",JOURNAL_ARTICLE
2,2014045792 Wo A1 淳,PATENT
3,"А. А. Королев, Office Action For Russian Pate...",OFFICE_ACTION
4,"Xu Cn 104741552A, Cited In Ids Filed 6/29/18",PATENT
5,"Legagneur Et Al ""Limbo3 (M = Mn, Fe, Co): Syn...",JOURNAL_ARTICLE
6,Wang Cn 1037663314,PATENT
7,Haeley Wo 02/41801,PATENT
8,Skoglund Wo 2010/027317,PATENT
9,"Li Us Patent No 6,719,697",PATENT


In [12]:
## save a sample of citations frm each class

labels = list(set(list(table['label'])))
for label in labels:
    sm_table = table[table['label'] == label].sample(n=30)
    label = label.replace('/','')
    sm_table.to_excel(path_base + 'test_files/oa_v2/sample_30_cat_' + label + '.xlsx')

In [15]:
## save a random sample of 300 citations

sm_table = table.sample(n=300)
sm_table.to_excel(path_base + 'test_files/oa_v2/sample_300_all_cat.xlsx')