## Import Libraries

In [4]:
import re
import datasets
from collections import Counter
from datasets import load_dataset, DatasetDict

## Reading CSV files

### Dataset Usage

In [2]:
# Directory holding the extracted CSV shards, relative to the working directory.
folder_path = "./1 Billion Citation Dataset/1 Billion Citation Dataset/"

# Only shards (1), (10) and (100) are read in this notebook.
train_files = [
    folder_path + "1 Billion Citation Dataset, v1 (" + shard + ").csv"
    for shard in ("1", "10", "100")
]

# All listed files are loaded into the single `dataset` object used below.
dataset = load_dataset("csv", data_files=train_files, split="train")
print(len(dataset))

13593417


In [3]:
# Shuffle once with a fixed seed so the split below is reproducible.
full_train_dataset = dataset.shuffle(seed=42)

# Validation and test each receive 5% of the rows (rounded down).
valid_size = int(len(full_train_dataset) * 0.05)
test_size = valid_size

# Slice the shuffled rows into three consecutive, non-overlapping index ranges:
#   [0, valid_size)                       -> validation
#   [valid_size, valid_size + test_size)  -> test
#   [valid_size + test_size, end)         -> train (remaining rows)
valid_dataset = full_train_dataset.select(range(0, valid_size))
test_dataset = full_train_dataset.select(
    range(valid_size, valid_size + test_size)
)
train_dataset = full_train_dataset.select(
    range(valid_size + test_size, len(full_train_dataset))
)

In [4]:
# Bundle the three splits under the keys "train", "valid" and "test".
# NOTE(review): this rebinds the name `datasets`, which the import cell bound to
# the `datasets` module; after this cell the module is no longer reachable under
# that name. Later cells rely on `datasets` being this DatasetDict.
datasets = DatasetDict(
    dict(
        train=train_dataset,
        valid=valid_dataset,
        test=test_dataset,
    )
)

# Print the split summary (features and row counts).
print(datasets)

DatasetDict({
    train: Dataset({
        features: ['doi', 'articleType', 'citationStyle', 'citationStringAnnotated', 'Unnamed: 4'],
        num_rows: 12234077
    })
    valid: Dataset({
        features: ['doi', 'articleType', 'citationStyle', 'citationStringAnnotated', 'Unnamed: 4'],
        num_rows: 679670
    })
    test: Dataset({
        features: ['doi', 'articleType', 'citationStyle', 'citationStringAnnotated', 'Unnamed: 4'],
        num_rows: 679670
    })
})


## Data Preparation

### Allowed labels

In [5]:
# Upper-cased tag names that are kept as labels; tag names are compared against
# this list after `.upper()`, and every other tag is stripped from the text.
ALLOWED_LABELS = [
    'AUTHOR',
    'YEAR',
    'TITLE',
    'CONTAINER-TITLE',
    'VOLUME',
    'ISSUE',
    'PAGE',
    'ISBN',
    'ISSN',
    'PUBLISHER',
    'DOI',
    'URL',
]

### Data filtering and parsing

In [6]:
# Matches one `<tag>content</tag>` pair:
#   tag     - word characters, whitespace or hyphens
#   content - shortest run of any characters (newlines included) up to the first
#             closing tag carrying the same name
TAG_PATTERN = re.compile(
    r'<(?P<tag>[\w\s-]+)>(?P<content>.*?)</(?P=tag)>',
    flags=re.DOTALL,
)

def delete_tag(tag_name, reference):
    """Remove every `<tag_name>` and `</tag_name>` marker from `reference`.

    Only the markers are removed; the text they enclosed stays in place.
    Returns the resulting string.
    """
    # Opening markers are stripped first, then closing ones.
    for prefix in ("<", "</"):
        reference = reference.replace(prefix + tag_name + ">", '')
    return reference


def clean_reference(reference):
    """Strip every tag whose name is not in ALLOWED_LABELS, keeping its text.

    Each `<tag>...</tag>` pair matched by TAG_PATTERN is rewritten in place:
    its content is cleaned recursively, and the surrounding markers are kept
    only when `tag.upper()` is in ALLOWED_LABELS. Recursing into the content
    of *every* match (not only allowed ones) ensures that disallowed tags
    nested below a disallowed wrapper are removed as well.

    Args:
        reference: Annotated citation string.

    Returns:
        The string with disallowed tag markers removed.
    """
    # Disallowed tag names seen at this level, in first-seen order.
    disallowed = {}

    def _rewrite(match):
        tag = match.group("tag")
        inner = clean_reference(match.group("content"))
        if tag.upper() in ALLOWED_LABELS:
            return "<" + tag + ">" + inner + "</" + tag + ">"
        disallowed[tag] = None
        return inner

    # Substituting by match position avoids touching unrelated text that merely
    # happens to equal some tag's content.
    cleaned = TAG_PATTERN.sub(_rewrite, reference)

    # Also drop any remaining markers of the disallowed names (for example the
    # leftover half of a same-name nested pair that the regex could not match).
    for tag in disallowed:
        cleaned = delete_tag(tag, cleaned)
    return cleaned

def clean_reference_batch(batch):
    """Apply clean_reference to each entry of `batch["citationStringAnnotated"]`.

    Returns a dict with the single key "cleaned_text" holding the cleaned
    strings in the same order as the input.
    """
    cleaned = []
    for ref in batch["citationStringAnnotated"]:
        cleaned.append(clean_reference(ref))
    return {"cleaned_text": cleaned}

# Example String
text = "This is a <title><this>Simple Example</this></title> with a <test><year>2024</year></test> tag."

# Detect tags and their content
print(clean_reference(text))

This is a <title>Simple Example</title> with a <year>2024</year> tag.


In [7]:
# Display the first record of the training split.
datasets['train'][0]

{'doi': '10.3748/wjg.v11.i31.4916',
 'articleType': 3,
 'citationStyle': 1033,
 'citationStringAnnotated': '<author><family>Chen</family> <given>K-S</given></author>. <publisher>Baishideng Publishing Group Inc.</publisher>, <issued><year>2005</year></issued>. <title>Expression of heparanase mRNA in anti-sense oligonucleotide-transfected human esophageal cancer EC9706 cells</title>[J]. <container-title>World Journal of Gastroenterology</container-title>, <issued><year>2005</year></issued>, <volume>11</volume>(<issue>31</issue>): <page>4916</page>.',
 'Unnamed: 4': None}

In [8]:
# Run clean_reference_batch over every split in batches; its "cleaned_text"
# result is read back from the dataset in the cells below.
datasets = datasets.map(clean_reference_batch, batched=True)

Map: 100%|██████████| 12234077/12234077 [10:37<00:00, 19203.91 examples/s]
Map: 100%|██████████| 679670/679670 [00:34<00:00, 19814.01 examples/s]
Map: 100%|██████████| 679670/679670 [00:34<00:00, 19584.03 examples/s]


In [9]:
# First five cleaned citations of the training split.
datasets['train'][:5]["cleaned_text"]

['<author>Chen K-S</author>. <publisher>Baishideng Publishing Group Inc.</publisher>, <year>2005</year>. <title>Expression of heparanase mRNA in anti-sense oligonucleotide-transfected human esophageal cancer EC9706 cells</title>[J]. <container-title>World Journal of Gastroenterology</container-title>, <year>2005</year>, <volume>11</volume>(<issue>31</issue>): <page>4916</page>.',
 '<author>McIntire, Roger W. and Colley, Thomas A.</author> (<year>1967</year>), <title>Social Reinforcement in the Dog</title>. <container-title>Psychological Reports</container-title>, <volume>20</volume>(<issue>3</issue>): <page>843–846</page>. available at <URL>http://dx.doi.org/10.2466/pr0.1967.20.3.843</URL> ',
 '<author>M. Maguire</author>, <title>“Does usability=attractiveness?,”</title> in <container-title>Design and Emotion</container-title>, <publisher>CRC Press</publisher>, <year>2003</year>, pp. <page>303–307</page>.',
 '<author>Walters, G.D.</author> (<year>2001</year>) <container-title>Journal o

In [10]:
# First five citations of the training split in their original annotated form.
datasets['train'][:5]["citationStringAnnotated"]

['<author><family>Chen</family> <given>K-S</given></author>. <publisher>Baishideng Publishing Group Inc.</publisher>, <issued><year>2005</year></issued>. <title>Expression of heparanase mRNA in anti-sense oligonucleotide-transfected human esophageal cancer EC9706 cells</title>[J]. <container-title>World Journal of Gastroenterology</container-title>, <issued><year>2005</year></issued>, <volume>11</volume>(<issue>31</issue>): <page>4916</page>.',
 '<author><family>McIntire</family>, <given>Roger W.</given> and <family>Colley</family>, <given>Thomas A.</given></author> (<issued><year>1967</year></issued>), <title>Social Reinforcement in the Dog</title>. <container-title>Psychological Reports</container-title>, <volume>20</volume>(<issue>3</issue>): <page>843–846</page>. available at <URL>http://dx.doi.org/10.2466/pr0.1967.20.3.843</URL> ',
 '<author><given>M.</given> <family>Maguire</family></author>, <title>“Does usability=attractiveness?,”</title> in <container-title>Design and Emotion<

### Tokenization and BIO tagging

In [29]:
# Raw tag names (original casing) seen by tokenize_and_tag so far, in first-seen
# order; the empty string stands for untagged text between tags.
UNIQUE_TAGS = []


def _split_word(word):
    """Split a whitespace-free chunk into (leading, core, trailing).

    `leading` and `trailing` are the runs of non-word characters at either end
    of `word`; `core` is what remains between them and is empty when `word`
    consists of punctuation only.
    """
    start = 0
    while start < len(word) and re.match(r'[^\w]', word[start]):
        start += 1
    end = len(word)
    while end > start and re.match(r'[^\w]', word[end - 1]):
        end -= 1
    return word[:start], word[start:end], word[end:]


def tokenize_and_tag(reference):
    """Turn a cleaned, tagged citation string into (token, tag) pairs.

    Tagging scheme:
      * "B-PUNC"    - a single punctuation character (also a '<' that does not
                      open a well-formed tag pair).
      * "O"         - a word of untagged text between tags.
      * "B-<LABEL>" - first word of a field whose tag is in ALLOWED_LABELS.
      * "I-<LABEL>" - every following word of the same field.

    Tags not in ALLOWED_LABELS are stripped from the remaining text and their
    content is processed as if the tag had never been there. Each raw tag name
    encountered is recorded once in the module-level UNIQUE_TAGS list.

    Punctuation-only words (e.g. "&" or a lone dash) produce only "B-PUNC"
    tokens: no empty-string token is emitted for them, and they do not use up
    the "B-" label, so the first real word of a field is always tagged "B-".

    Args:
        reference: Citation string, typically the output of clean_reference.

    Returns:
        List of (token, tag) tuples in reading order.
    """
    tokens_with_tags = []
    while len(reference) > 0:
        first = reference[0]

        # Whitespace between tokens carries no information.
        if first.isspace():
            reference = reference[1:]
            continue

        # Stand-alone punctuation outside of any tag.
        if first != "<" and re.match(r'[^\w\s]', first):
            tokens_with_tags.append((first, "B-PUNC"))
            reference = reference[1:]
            continue

        if first == "<":
            match = TAG_PATTERN.match(reference)

            # A '<' that does not start a complete tag pair is plain punctuation.
            if not match:
                tokens_with_tags.append((first, "B-PUNC"))
                reference = reference[1:]
                continue

            tag = match.group("tag")

            # Unknown tag: drop its markers and re-scan the same position.
            if tag.upper() not in ALLOWED_LABELS:
                reference = delete_tag(tag, reference)
                continue

            current_tag = "B-" + tag.upper()
            content = match.group("content")
            # The match starts at index 0, so its end is just past the closing tag.
            reference = reference[match.end():]
        else:
            # Untagged text runs up to the next '<' (or to the end of the string).
            next_tag = reference.find('<')
            if next_tag == -1:
                next_tag = len(reference)

            tag = ""
            content = reference[0:next_tag]
            current_tag = "O"
            reference = reference[next_tag:]

        if tag not in UNIQUE_TAGS:
            UNIQUE_TAGS.append(tag)

        for word in content.split():
            leading, core, trailing = _split_word(word)

            for punctuation in leading:
                tokens_with_tags.append((punctuation, "B-PUNC"))

            # Skip the word token entirely when nothing but punctuation was found.
            if core:
                tokens_with_tags.append((core, current_tag))
                # Only a real token consumes the "B-" label of a field.
                if current_tag[0] == "B":
                    current_tag = "I-" + tag.upper()

            for punctuation in trailing:
                tokens_with_tags.append((punctuation, "B-PUNC"))

    return tokens_with_tags

def tokenize_and_tag_batch(batch):
    """Apply tokenize_and_tag to each entry of `batch["cleaned_text"]`.

    Returns a dict with the single key "tokenized" holding one token/tag list
    per input string, in input order.
    """
    tokenized = []
    for ref in batch["cleaned_text"]:
        tokenized.append(tokenize_and_tag(ref))
    return {"tokenized": tokenized}

In [30]:
# Sanity check: the leading '<' and trailing '>' are not part of any tag pair,
# and the DOI tag sits in the middle of untagged URL text.
tokenize_and_tag("< https://doi.org/<DOI>10.1007/bf02163295</DOI> >")

[('<', 'B-PUNC'),
 ('https://doi.org', 'O'),
 ('/', 'B-PUNC'),
 ('10.1007/bf02163295', 'B-DOI'),
 ('>', 'B-PUNC')]

In [31]:
# Tag names recorded by tokenize_and_tag so far ('' stands for untagged text).
UNIQUE_TAGS

['', 'DOI']

In [32]:
# Run tokenize_and_tag_batch over every split in batches; its "tokenized"
# result is read back from the dataset in the cells below.
datasets = datasets.map(tokenize_and_tag_batch, batched=True)

Map: 100%|██████████| 12234077/12234077 [23:44<00:00, 8585.84 examples/s]
Map: 100%|██████████| 679670/679670 [01:12<00:00, 9316.05 examples/s] 
Map: 100%|██████████| 679670/679670 [01:12<00:00, 9310.76 examples/s] 


In [27]:
# Show the first training example at each stage, one per line:
# raw annotation, cleaned markup, tagged tokens.
print(
    *(
        datasets['train'][0][column]
        for column in ('citationStringAnnotated', 'cleaned_text', 'tokenized')
    ),
    sep="\n",
)

<author><family>Chen</family> <given>K-S</given></author>. <publisher>Baishideng Publishing Group Inc.</publisher>, <issued><year>2005</year></issued>. <title>Expression of heparanase mRNA in anti-sense oligonucleotide-transfected human esophageal cancer EC9706 cells</title>[J]. <container-title>World Journal of Gastroenterology</container-title>, <issued><year>2005</year></issued>, <volume>11</volume>(<issue>31</issue>): <page>4916</page>.
<author>Chen K-S</author>. <publisher>Baishideng Publishing Group Inc.</publisher>, <year>2005</year>. <title>Expression of heparanase mRNA in anti-sense oligonucleotide-transfected human esophageal cancer EC9706 cells</title>[J]. <container-title>World Journal of Gastroenterology</container-title>, <year>2005</year>, <volume>11</volume>(<issue>31</issue>): <page>4916</page>.
[['Chen', 'B-AUTHOR'], ['K-S', 'I-AUTHOR'], ['.', 'B-PUNC'], ['Baishideng', 'B-PUBLISHER'], ['Publishing', 'I-PUBLISHER'], ['Group', 'I-PUBLISHER'], ['Inc', 'I-PUBLISHER'], ['.

In [33]:
# Tag names recorded by tokenize_and_tag so far ('' stands for untagged text);
# names keep their original casing, so this shows how tags are spelled in the data.
UNIQUE_TAGS

['',
 'DOI',
 'author',
 'publisher',
 'year',
 'title',
 'container-title',
 'volume',
 'issue',
 'page',
 'URL',
 'ISSN',
 'ISBN']

In [34]:
# Persist the processed splits under the relative path "processed_dataset".
datasets.save_to_disk("processed_dataset")

Saving the dataset (42/42 shards): 100%|██████████| 12234077/12234077 [01:41<00:00, 120931.73 examples/s]
Saving the dataset (3/3 shards): 100%|██████████| 679670/679670 [00:06<00:00, 103784.23 examples/s]
Saving the dataset (3/3 shards): 100%|██████████| 679670/679670 [00:06<00:00, 107967.50 examples/s]
