## Replace numbers with `<num_token>`

In [1]:
import json
import re

def preprocess_jsonl_file(input_file_path, output_file_path):
    """Copy a JSON Lines file, masking every run of digits in "string".

    Each non-blank input line is parsed as a JSON object, every maximal run
    of digits in its "string" field is replaced by the literal
    ``<num_token>``, and the object is re-serialised as one output line.
    Other fields are written back as parsed.

    The substitution works on digit runs, not on whole numbers: ``3.5``
    becomes ``<num_token>.<num_token>`` and ``p53`` becomes ``p<num_token>``.

    Args:
        input_file_path: Path of the UTF-8 JSON Lines file to read.
        output_file_path: Path of the UTF-8 JSON Lines file to write. It is
            opened in ``'w'`` mode (truncated), so it must not be the same
            file as ``input_file_path``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no "string" field.
    """
    # Compile once instead of looking the pattern up for every record.
    digit_run = re.compile(r'\d+')

    with open(input_file_path, 'r', encoding='utf-8') as input_file, \
         open(output_file_path, 'w', encoding='utf-8') as output_file:
        for line in input_file:
            # A blank line (e.g. a stray empty line at the end of the file)
            # holds no record and would make json.loads raise; skip it.
            if not line.strip():
                continue
            data = json.loads(line)
            data["string"] = digit_run.sub('<num_token>', data["string"])
            json.dump(data, output_file)
            output_file.write('\n')


In [2]:
# Mask digit runs in the training split and write the result next to it.
input_file_path, output_file_path = './train.jsonl', './nt_train.jsonl'
preprocess_jsonl_file(input_file_path, output_file_path)

In [3]:
# Mask digit runs in the test split and write the result next to it.
input_file_path, output_file_path = './test.jsonl', './nt_test.jsonl'
preprocess_jsonl_file(input_file_path, output_file_path)

In [4]:
# Mask digit runs in the dev split and write the result next to it.
input_file_path, output_file_path = './dev.jsonl', './nt_dev.jsonl'
preprocess_jsonl_file(input_file_path, output_file_path)

## Remove duplicates

In [6]:
import json
import re

def preprocess_jsonl_file_remove_duplicates(input_file_path, output_file_path):
    """Copy a JSON Lines file, keeping only the first record per "string".

    Records are compared on the exact value of their "string" field only;
    a later record whose "string" was already seen is dropped even if its
    other fields differ. Input order is preserved for the kept records.

    Args:
        input_file_path: Path of the UTF-8 JSON Lines file to read.
        output_file_path: Path of the UTF-8 JSON Lines file to write. It is
            opened in ``'w'`` mode (truncated), so it must not be the same
            file as ``input_file_path``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no "string" field.
    """
    seen_strings = set()

    with open(input_file_path, 'r', encoding='utf-8') as input_file, \
         open(output_file_path, 'w', encoding='utf-8') as output_file:
        for line in input_file:
            # A blank line holds no record and would make json.loads raise.
            if not line.strip():
                continue
            data = json.loads(line)
            string = data["string"]

            if string in seen_strings:
                continue
            seen_strings.add(string)

            json.dump(data, output_file)
            output_file.write('\n')

In [21]:
# Drop repeated "string" values from the training split.
input_file_path, output_file_path = './train.jsonl', './rd_train.jsonl'
preprocess_jsonl_file_remove_duplicates(input_file_path, output_file_path)

In [9]:
import pandas as pd
from tqdm import tqdm
import json
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
import spacy
from scipy.sparse import hstack
import numpy as np

In [22]:
def js_to_df(file_path):
    """Load the "string" and "label" fields of a JSON Lines file.

    Args:
        file_path: Path of a UTF-8 JSON Lines file whose records each have
            a "string" and a "label" key. Any other keys are ignored.

    Returns:
        pandas.DataFrame with exactly the columns ``string`` and ``label``,
        one row per non-blank input line, in file order. An empty file
        yields an empty frame that still has both columns.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks "string" or "label".
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            stripped = line.strip()
            # A blank line strips to '' and json.loads('') raises; skip it.
            if not stripped:
                continue
            obj = json.loads(stripped)
            data.append({'string': obj['string'], 'label': obj['label']})
    # Naming the columns keeps the schema stable when no records were read.
    return pd.DataFrame(data, columns=['string', 'label'])

# Load the original (unprocessed) training split.
train_df = js_to_df('./train.jsonl')

In [23]:
# Display the training frame; the rendered output elides the middle rows.
train_df

Unnamed: 0,string,label
0,"However, how frataxin interacts with the Fe-S ...",background
1,"In the study by Hickey et al. (2012), spikes w...",background
2,"The drug also reduces catecholamine secretion,...",background
3,By clustering with lowly aggressive close kin ...,background
4,Ophthalmic symptoms are rare manifestations of...,background
...,...,...
8238,"Importantly, the results of Pascalis et al. (2...",background
8239,"As suggested by Nguena et al, there is a need ...",background
8240,Skeletal muscle is also a primary site of dise...,background
8241,ACTIVATION OF TRANSCRIPTION FACTORS Roles for ...,method


In [24]:
# Load the de-duplicated training split written to ./rd_train.jsonl and
# display it, so its row index can be compared with the original frame's.
rd_train_df = js_to_df('./rd_train.jsonl')
rd_train_df

Unnamed: 0,string,label
0,"However, how frataxin interacts with the Fe-S ...",background
1,"In the study by Hickey et al. (2012), spikes w...",background
2,"The drug also reduces catecholamine secretion,...",background
3,By clustering with lowly aggressive close kin ...,background
4,Ophthalmic symptoms are rare manifestations of...,background
...,...,...
8237,"Importantly, the results of Pascalis et al. (2...",background
8238,"As suggested by Nguena et al, there is a need ...",background
8239,Skeletal muscle is also a primary site of dise...,background
8240,ACTIVATION OF TRANSCRIPTION FACTORS Roles for ...,method


In [25]:
# Drop repeated "string" values from the test split.
input_file_path, output_file_path = './test.jsonl', './rd_test.jsonl'
preprocess_jsonl_file_remove_duplicates(input_file_path, output_file_path)

In [26]:
# Drop repeated "string" values from the dev split.
input_file_path, output_file_path = './dev.jsonl', './rd_dev.jsonl'
preprocess_jsonl_file_remove_duplicates(input_file_path, output_file_path)

## Remove citations

In [14]:
def preprocess_jsonl_file_remove_citations(input_file_path, output_file_path):
    """Copy a JSON Lines file, deleting citation markers from "string".

    Two forms are removed from each record's "string" field:

    * bracketed numeric references such as ``[3]``, ``[1-4]`` or
      ``[1, 2-4,7]``;
    * parenthesised author-year references of the exact shape
      ``(Name et al., 2010)``, where ``Name`` is a single word.

    Narrative citations such as ``Name et al. (2010)`` are left untouched.
    The matched text is simply deleted, so whitespace that surrounded a
    citation is kept (two spaces may remain where one was removed).

    Args:
        input_file_path: Path of the UTF-8 JSON Lines file to read.
        output_file_path: Path of the UTF-8 JSON Lines file to write. It is
            opened in ``'w'`` mode (truncated), so it must not be the same
            file as ``input_file_path``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no "string" field.
    """
    # Compile once instead of looking the patterns up for every record.
    bracket_citation = re.compile(r'\[\d+(-\d+)?(,\s?\d+(-\d+)?)*\]')
    author_year_citation = re.compile(r'\(\w+ et al\., \d+\)')

    with open(input_file_path, 'r', encoding='utf-8') as input_file, \
         open(output_file_path, 'w', encoding='utf-8') as output_file:
        for line in input_file:
            # A blank line holds no record and would make json.loads raise.
            if not line.strip():
                continue
            data = json.loads(line)
            no_citations = bracket_citation.sub('', data["string"])
            no_citations = author_year_citation.sub('', no_citations)
            data["string"] = no_citations
            json.dump(data, output_file)
            output_file.write('\n')


In [18]:
# Strip citation markers from the training split.
input_file_path, output_file_path = './train.jsonl', './rc_train.jsonl'
preprocess_jsonl_file_remove_citations(input_file_path, output_file_path)

In [19]:
# Strip citation markers from the test split.
input_file_path, output_file_path = './test.jsonl', './rc_test.jsonl'
preprocess_jsonl_file_remove_citations(input_file_path, output_file_path)

In [20]:
# Strip citation markers from the dev split.
input_file_path, output_file_path = './dev.jsonl', './rc_dev.jsonl'
preprocess_jsonl_file_remove_citations(input_file_path, output_file_path)