In [None]:
!pip install transformers datasets pandas



## Splitting Hugging Face Data

 * Dataset Name : scientific_papers
    * Train Split : 1 - 2000
    * Val Split : 1 - 2000
    * Test Split : 1 - 120

 * Dataset Name : scillm/scientific_papers-archive
    * Train Split : 2001 - 2250
    * Val Split : 2001 - 2250
    * Test Split : 121 - 140

In [None]:
from datasets import load_dataset
import pandas as pd
import numpy as np
import random

In [None]:
# Load every available split of the dataset from the Hugging Face Hub.
papers_dataset = load_dataset(path="scillm/scientific_papers-archive")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Column that is dropped from every split.
feature_to_remove = "id"

# Only drop the column where it is still present, so re-running this cell
# after the column has already been removed is a no-op instead of an error.
for split, dataset in papers_dataset.items():
    if feature_to_remove in dataset.column_names:
        papers_dataset[split] = dataset.remove_columns(feature_to_remove)

In [None]:
# Old column name -> new column name, applied to every split.
column_renames = {"input": "article", "output": "abstract"}

# Rename only where the old name still exists, so re-running this cell after
# the columns have already been renamed is a no-op instead of an error.
for old_name, new_name in column_renames.items():
    for split, dataset in papers_dataset.items():
        if old_name in dataset.column_names:
            papers_dataset[split] = dataset.rename_column(old_name, new_name)

In [None]:
def truncate_dataset(dataset, fraction_to_keep, seed=None):
    """Return a random subset holding about ``fraction_to_keep`` of ``dataset``.

    Args:
        dataset: Object supporting ``len()`` and ``select(indices)``.
        fraction_to_keep: Fraction of rows to retain, within ``[0, 1]``. The
            number of rows kept is ``int(len(dataset) * fraction_to_keep)``,
            so it is rounded down.
        seed: Optional seed for a private random generator, which makes the
            selection reproducible. When ``None`` the module-level ``random``
            state is used, so existing two-argument calls behave as before.

    Returns:
        Whatever ``dataset.select`` returns for the sampled row indices. The
        indices are passed in sampled order, not sorted.

    Raises:
        ValueError: If ``fraction_to_keep`` is outside ``[0, 1]``.
    """
    if not 0 <= fraction_to_keep <= 1:
        raise ValueError(
            f"fraction_to_keep must be between 0 and 1, got {fraction_to_keep!r}"
        )

    num_samples = len(dataset)
    num_samples_to_keep = int(num_samples * fraction_to_keep)

    # A private generator keeps a seeded call from disturbing global random state.
    rng = random if seed is None else random.Random(seed)
    indices_to_keep = rng.sample(range(num_samples), num_samples_to_keep)
    return dataset.select(indices_to_keep)

# Seed the module-level generator so the same papers are sampled on every run.
random.seed(42)

# Fraction of each split that is kept; sized to leave roughly 25000 training papers.
fraction_to_keep = 0.00777294
for split in papers_dataset.keys():
    papers_dataset[split] = truncate_dataset(papers_dataset[split], fraction_to_keep)

In [None]:
# Remove records with a blank article or abstract from the train and
# validation splits.
# NOTE(review): the test split is not filtered here — confirm that is intended.
def _has_text(example):
    """Return True when neither the article nor the abstract is an empty string."""
    return example["article"] != "" and example["abstract"] != ""


# A single predicate-based filter per split replaces the index-collecting loops:
# each row is read once and no list-membership test is done per row.
for split in ("train", "validation"):
    papers_dataset[split] = papers_dataset[split].filter(_has_text)

print("Records with blank data removed")

Filter:   0%|          | 0/25103 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1015 [00:00<?, ? examples/s]

Records with blank data removed


In [None]:
# Show the resulting splits with their column names and row counts.
papers_dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'abstract'],
        num_rows: 25001
    })
    validation: Dataset({
        features: ['article', 'abstract'],
        num_rows: 1015
    })
    test: Dataset({
        features: ['article', 'abstract'],
        num_rows: 1018
    })
})

In [None]:
train_data = papers_dataset["train"]

# Number of parts to divide the dataset into
num_parts = 250

# Shuffle with a fixed seed so the same rows land in the same files on every run
train_data = train_data.shuffle(seed=42)

# Divide the dataset into parts
data_parts = [train_data.shard(num_parts, i) for i in range(num_parts)]

# Store each part as a CSV file; the files are numbered from 2001 upwards
for file_number, part in enumerate(data_parts, start=2001):
    part_df = part.to_pandas()
    part_df.to_csv(f"/content/drive/MyDrive/Submission/BE Project Group No 31/Data/training_data{file_number}.csv", index=False, escapechar="\\")

In [None]:
val_data = papers_dataset["validation"]

# Number of parts to divide the dataset into
num_parts = 250

# Shuffle with a fixed seed so the same rows land in the same files on every run
val_data = val_data.shuffle(seed=42)

# Divide the dataset into parts
data_parts = [val_data.shard(num_parts, i) for i in range(num_parts)]

# Store each part as a CSV file; the files are numbered from 2001 upwards.
# The path is absolute so the output location does not depend on the current
# working directory.
for file_number, part in enumerate(data_parts, start=2001):
    part_df = part.to_pandas()
    part_df.to_csv(f"/content/drive/MyDrive/Submission/BE Project Group No 31/Data/val_data{file_number}.csv", index=False, escapechar="\\")

In [None]:
test_data = papers_dataset["test"]

# Number of parts to divide the dataset into
num_parts = 20

# Shuffle with a fixed seed so the same rows land in the same files on every run
test_data = test_data.shuffle(seed=42)

# Divide the dataset into parts
data_parts = [test_data.shard(num_parts, i) for i in range(num_parts)]

# Store each part as a CSV file; the files are numbered from 121 upwards.
# The path is absolute so the output location does not depend on the current
# working directory.
for file_number, part in enumerate(data_parts, start=121):
    part_df = part.to_pandas()
    part_df.to_csv(f"/content/drive/MyDrive/Submission/BE Project Group No 31/Data/test_data{file_number}.csv", index=False, escapechar="\\")
