# Downloading summarization test set

## Downloading Summarization dataset.

In [None]:
# for train set
#!wget -O train.zip --no-check-certificate https://huggingface.co/datasets/ccdv/arxiv-summarization/resolve/main/train.zip?download=true

In [2]:
# download the test set. replace the link with the train set download link if you want to
!wget -O test.zip --no-check-certificate https://huggingface.co/datasets/ccdv/arxiv-summarization/resolve/main/test.zip?download=true

--2024-03-28 04:58:14--  https://huggingface.co/datasets/ccdv/arxiv-summarization/resolve/main/test.zip?download=true
Resolving huggingface.co (huggingface.co)... 18.172.134.124, 18.172.134.24, 18.172.134.4, ...
Connecting to huggingface.co (huggingface.co)|18.172.134.124|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.huggingface.co/datasets/ccdv/arxiv-summarization/8ee7cf45fde92768515e2f3170ecb1cf9bdae60169f2b4d4f9b60f1f628e862c?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27test.zip%3B+filename%3D%22test.zip%22%3B&response-content-type=application%2Fzip&Expires=1711861094&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcxMTg2MTA5NH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9kYXRhc2V0cy9jY2R2L2FyeGl2LXN1bW1hcml6YXRpb24vOGVlN2NmNDVmZGU5Mjc2ODUxNWUyZjMxNzBlY2IxY2Y5YmRhZTYwMTY5ZjJiNGQ0ZjliNjBmMWY2MjhlODYyYz9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29ud

In [3]:
# extract the downloaded file
import zipfile
with zipfile.ZipFile("test.zip", 'r') as zip_ref:
    zip_ref.extractall("./")

In [None]:
!head -1 test.txt

In [5]:
# load papers in json format from each line of the extracted file
papers = []
import json
article_ids = []
with open('test.txt', 'r') as file:
    for line in file:
        try:
            # Parse the JSON data from each line
            paper_data = json.loads(line)
            # Extract the arXiv ID and append to the list
            if 'article_id' in paper_data:
                article_ids.append(paper_data['article_id'])
            papers.append(paper_data)
        except json.JSONDecodeError as e:
            # Output an error message if a line is not valid JSON
            print(f"Error parsing JSON for line: {line}")
            print(str(e))

# Now `article_ids` contains all the article IDs from the file
# print(article_ids)


In [6]:
# counting the number of astro physics papers in the dataset
count = 0
for i in article_ids:
  if i.startswith("astro-ph"):
    count += 1
print(count)

1046


In [7]:
# total article ids
len(article_ids)

6440

In [None]:
# print the beautified paper which is in JSON format
json_formatted_str = json.dumps(papers[0], indent=2)
print(json_formatted_str)

## Skip to LSA Summarization

### Alternative Downloader (downloads data in a way that we dont desire, check it)

In this method, the huggingface dataset downloader will directly download the dataset. But it will not contain the fields we desire (The downloader script ignores the fields we desire when loading the dataset. For example, the data that is separated into sections in the field "sections" will not be loaded if we download the data this way. So we consider the manually downloaded the data as above. 

In [None]:
# Install the Hugging Face `datasets` library
!pip install datasets

# Import the library
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("ccdv/arxiv-summarization")

# The dataset is now loaded and can be used


In [None]:
dataset

In [None]:
print(dataset["test"][1])

In [None]:
print(dataset["validation"])

## Extractive Summarization

### LSA summarization

In [9]:
import nltk
import numpy as np
import re

nltk.download('stopwords')
nltk.download('punkt')
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = nltk.word_tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [11]:
from scipy.sparse.linalg import svds

def low_rank_svd(matrix, singular_count=2):
    u, s, vt = svds(matrix, k=singular_count)
    return u, s, vt

In [None]:
# Get top n sentences. Default value of n is 4. 
def get_sentences_top(sentences, num_sentences=4):
    norm_sentences = normalize_corpus(sentences)
    
    norm_sentences = list(filter(None, norm_sentences))
    
    tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)
    
    if len(norm_sentences) == 0:
        return np.array([])
    
    dt_matrix = tv.fit_transform(norm_sentences)
    dt_matrix = dt_matrix.toarray()
    vocab = tv.get_feature_names_out()
    td_matrix = dt_matrix.T
    #print(td_matrix.shape)
    pd.DataFrame(np.round(td_matrix, 2), index=vocab).head(10)
    num_sentences = min(num_sentences, len(sentences))
    #print("num_sentences", num_sentences)
    num_topics = 3
    if(len(norm_sentences) <= num_topics):
        return np.array(sentences)

    u, s, vt = low_rank_svd(td_matrix, singular_count=num_topics)
    #print(u.shape, s.shape, vt.shape)
    term_topic_mat, singular_values, topic_document_mat = u, s, vt

    sv_threshold = 0.5
    min_sigma_value = max(singular_values) * sv_threshold
    singular_values[singular_values < min_sigma_value] = 0
    salience_scores = np.sqrt(np.dot(np.square(singular_values),
                                 np.square(topic_document_mat)))
    top_sentence_indices = (-salience_scores).argsort()[:num_sentences]
    top_sentence_indices.sort()
    return np.array(sentences)[top_sentence_indices]

In [186]:
'''# Get top n sentences. Default value of n is 4. 
def get_sentences_top(sentences, num_sentences=4):
  norm_sentences = normalize_corpus(sentences)
  #norm_sentences[:3]
  
  print(norm_sentences)
  norm_sentences = list(filter(None, norm_sentences))
  
  print(norm_sentences)
  tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)
  #print(norm_sentences)
  if norm_sentences is None:
    return np.array([])
  dt_matrix = tv.fit_transform(norm_sentences)
  dt_matrix = dt_matrix.toarray()
  vocab = tv.get_feature_names_out()
  td_matrix = dt_matrix.T
  #print(td_matrix.shape)
  pd.DataFrame(np.round(td_matrix, 2), index=vocab).head(10)
  num_sentences = min(num_sentences, len(sentences))
  #print("num_sentences", num_sentences)
  num_topics = 3
  if(len(sentences) <= num_topics):
    return np.array(sentences)

  u, s, vt = low_rank_svd(td_matrix, singular_count=num_topics)
  #print(u.shape, s.shape, vt.shape)
  term_topic_mat, singular_values, topic_document_mat = u, s, vt

  sv_threshold = 0.5
  min_sigma_value = max(singular_values) * sv_threshold
  singular_values[singular_values < min_sigma_value] = 0
  salience_scores = np.sqrt(np.dot(np.square(singular_values),
                                 np.square(topic_document_mat)))
  top_sentence_indices = (-salience_scores).argsort()[:num_sentences]
  top_sentence_indices.sort()
  return np.array(sentences)[top_sentence_indices]'''

In [16]:
# Sample paper from the dataset
paper = papers[1]
sections = paper["sections"]
sentences = sections[0]
get_sentences_top(sentences, 5)

array(['it is believed that the direct detection of gravitational waves ( gws ) will bring the era of gravitational wave astronomy .',
       'the first direct detection of the gravitational waves might be achieved by ptas .',
       'we also discuss a method to separate the intensity ( @xmath3 mode ) and circular polarization ( @xmath2 mode ) of the sgwb .',
       'the basic framework is essentially a combination of the formalism of @xcite , and the polarization decomposition formula of the sgwb derived in @xcite . in section [ sec : the generalized overlap reduction function for circular polarization ] , we calculate the generalized orfs for the @xmath2 mode .',
       'the results for @xmath3 mode are consistent with the previous work  @xcite . in section [ sec : separation method ] , we give a method for separation between the @xmath3 mode and @xmath2 mode of the sgwb .'],
      dtype='<U349')

In [17]:
# Sample sectionwise summaries for a paper
paper = papers[1]
sections = paper["sections"]
extractive_summaries = []
for section in sections:
  extractive_summaries.append(get_sentences_top(section, 5))

In [53]:
# Define a method to perform extractive summarization)
def extractive(document_sections, num_sentences=4):
    overall_summary = ""
    for section in document_sections:
        summary = get_sentences_top(section, num_sentences)
        overall_summary += " ".join(summary) + " "       
    return overall_summary

### Pegasus test (Ignore)

In [None]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch

In [None]:
model_name = 'google/pegasus-xsum'
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

In [None]:
'''text = ""
for summary in extractive_summaries:
  text += " ".join(summary)
  text += "\n"  '''
text = []
for summary in extractive_summaries:
  text.append(" ".join(summary))

In [None]:
src_text = text
batch = tokenizer.prepare_seq2seq_batch(src_text, truncation=True, padding='longest',return_tensors='pt')

In [None]:
batch = batch.to('cuda')

In [None]:
translated = model.generate(**batch)

In [None]:
tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)

In [None]:
tgt_text

In [None]:
res = []
for t in text:
  src_text = t
  batch = tokenizer.prepare_seq2seq_batch(src_text, truncation=True, padding='longest',return_tensors='pt')
  batch = batch.to('cuda')
  translated = model.generate(**batch)
  tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
  res.append(tgt_text[0])

In [None]:
res

In [None]:
" ".join(res)

# Summarization Pre Processing

In [20]:
pip install transformers datasets evaluate rouge_score

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m-:--:--[0m
[?25hBuilding wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=f2c16f3e62ac6dbe78e58a826cf7adb0c6ad88dcfe2fe5b734275a7446ad6dad
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score, evaluate
Successfully installed evaluate-0.4.1 rouge_score-0.1.2
Note: you may need to restart the kernel to use updated packages.


In [21]:
from datasets import __version__ as ver 
print(ver)

2.1.0


In [22]:
# load the data as huggingface dataset object from papers list
from datasets import Dataset, load_dataset
papers_ds = Dataset.from_pandas(pd.DataFrame(data=papers))
papers_ds

Dataset({
    features: ['article_id', 'article_text', 'abstract_text', 'labels', 'section_names', 'sections'],
    num_rows: 6440
})

In [23]:
papers_split = papers_ds.train_test_split(test_size=0.2)
rpapers = papers_split
# access paper like this papers_split["train"][0]

In [24]:
from transformers import AutoTokenizer

checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [25]:
# skip this. 
# find the highest length of a sentence in the first paper
l = 0
for text in papers[0]["article_text"]:
    l = max(len(nltk.word_tokenize(text)), l)
print(l)

144


### Lengths analysis (takes time to execute, 77 seconds for 1000 papers)

In [None]:
lengths = []
sents = []
import time
l = 0
max_text = ""
start = time.time()
considered = papers[:1000]
for i in range(len(considered)):
    paper = considered[i]
    for text in paper["article_text"]:
        chk_len = len(nltk.word_tokenize(text))
        lengths.append(chk_len)
        sents.append(text)
        if chk_len > l:
            max_text = i
            l = chk_len
        
end = time.time()
print("Time taken:", (end-start), "seconds")
print((end-start)/len(considered), "seconds per example")

In [None]:
print(np.std(lengths))
print(np.average(lengths))
print(np.median(lengths))

### Finding the highest length sentence within a single paper

In [None]:
lengths = []
import time
l = 0
max_text = ""
start = time.time()
considered = papers[:1]
for i in range(len(considered)):
    paper = considered[i]
    for text in paper["article_text"]:
        chk_len = len(nltk.word_tokenize(text))
        lengths.append(l)
        if chk_len > l:
            max_text = i
            l = chk_len
        
end = time.time()
print("Time taken:", (end-start), "seconds")
print((end-start)/len(considered), "seconds per example")

In [None]:
print(l)

In [None]:
# print the sentence with the highest length
for text in considered[max_text]["article_text"]:
    chk_len = len(nltk.word_tokenize(text))
    if chk_len == l:
        print(text)

In [None]:
# ignore
"""import time
l = 0
max_text = ""
start = time.time()
considered = papers[:10]
indices = []
for i in range(len(considered)):
    paper = considered[i]
    for text in paper["article_text"]:
        if "\\" in text:
            indices.append(i)
            print(">> ", text)
            break
        
end = time.time()
print("Time taken:", (end-start), "seconds")
print((end-start)/len(considered), "seconds per example")"""

In [212]:
prefix = "summarize: "

#mine = [0]
def preprocess_function(batched_examples):
    #mine[0] = batched_examples
    inputs = [prefix + extractive(sections, num_sentences=4) for sections in batched_examples["sections"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
    check_outputs = [abstract_combiner(abstract) for abstract in batched_examples["abstract_text"]]
    labels = tokenizer(text_target=check_outputs, max_length=400, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [28]:
# combine sentences in abstract and remove the <s> and </s> tokens at the beginning and end of the sentence
def abstract_combiner(abstract):
    combined = ""
    for sentence in abstract:
        combined += (sentence[4:-4]) + " "
    return combined
#abstract_combiner(rpapers["train"][6]["abstract_text"])

In [29]:
rpapers = papers_split
rpapers

DatasetDict({
    train: Dataset({
        features: ['article_id', 'article_text', 'abstract_text', 'labels', 'section_names', 'sections'],
        num_rows: 5152
    })
    test: Dataset({
        features: ['article_id', 'article_text', 'abstract_text', 'labels', 'section_names', 'sections'],
        num_rows: 1288
    })
})

In [None]:
#x = mine[0]

In [None]:
tokenized_papers = rpapers.map(preprocess_function, batched=True,batch_size=2)

In [202]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

2024-03-28 06:35:08.613794: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-28 06:35:08.613915: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-28 06:35:08.784550: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [203]:
# ROUGE Score evaluation metric
import evaluate

rouge = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [206]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [207]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

### Paste this Weights and Biases token when prompted
"8f4032952d4c019bb9442b2297082babb75d0956"

In [208]:
tokenized_papers["train"]

Dataset({
    features: ['article_id', 'article_text', 'abstract_text', 'labels', 'section_names', 'sections', 'input_ids', 'attention_mask'],
    num_rows: 5152
})

In [209]:
training_args = Seq2SeqTrainingArguments(
    output_dir="my_awesome_billsum_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_papers["train"],
    eval_dataset=tokenized_papers["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


OutOfMemoryError: Caught OutOfMemoryError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py", line 85, in _worker
    output = module(*input, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py", line 1748, in forward
    decoder_outputs = self.decoder(
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py", line 1115, in forward
    layer_outputs = layer_module(
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py", line 725, in forward
    cross_attention_outputs = self.layer[1](
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py", line 636, in forward
    attention_output = self.EncDecAttention(
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py", line 562, in forward
    attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/functional.py", line 1856, in softmax
    ret = input.softmax(dim)
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 200.00 MiB. GPU 0 has a total capacty of 14.75 GiB of which 129.06 MiB is free. Process 2200 has 14.62 GiB memory in use. Of the allocated memory 14.33 GiB is allocated by PyTorch, and 102.96 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF


In [None]:
tokenized_papers["train"][0]["input_ids"]

In [None]:
text

In [None]:
my = "several periodicities were detected , but the periodicities about 155 days and from the interval of @xmath3 $ ] days ( @xmath4 $ ] years ) are mentioned most often . it was also found in proton flares during solar cycles 19 and 20 @xcite , but it was not found in the solar flares data during solar cycles 22 @xcite . the existence of the 156-day periodicity in sunspot data were confirmed by @xcite . the periodicities from the interval @xmath6 $ ] days ( @xmath4 $ ] years ) have been considered from 1968 . several periodicities were detected , but the periodicities about 155 days and from the interval of @xmath3 $ ] days ( @xmath4 $ ] years ) are mentioned most often . it was also found in proton flares during solar cycles 19 and 20 @xcite , but it was not found in the solar flares data during solar cycles 22 @xcite . the existence of the 156-day periodicity in sunspot data were confirmed by @xcite . the periodicities from the interval @xmath6 $ ] days ( @xmath4 $ ] years ) have been considered from 1968 . several periodicities were detected , but the periodicities about 155 days and from the interval of @xmath3 $ ] days ( @xmath4 $ ] years ) are mentioned most often . it was also found in proton flares during solar cycles 19 and 20 @xcite , but it was not found in the solar flares data during solar cycles 22 @xcite . the existence of the 156-day periodicity in sunspot data were confirmed by @xcite . the periodicities from the interval @xmath6 $ ] days ( @xmath4 $ ] years ) have been considered from 1968 . several periodicities were detected , but the periodicities about 155 days and from the interval of @xmath3 $ ] days ( @xmath4 $ ] years ) are mentioned most often . it was also found in proton flares during solar cycles 19 and 20 @xcite , but it was not found in the solar flares data during solar cycles 22 @xcite . the existence of the 156-day periodicity in sunspot data were confirmed by @xcite . the periodicities from the interval @xmath6 $ ] days ( @xmath4 $ ] years ) have been considered from 1968 ."

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_billsum_model")
inputs = tokenizer(my, return_tensors="pt").input_ids

In [None]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("stevhliu/my_awesome_billsum_model")
outputs = model.generate(inputs, max_new_tokens=100, do_sample=False)

In [None]:
tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
for paper in tokenized_papers["train"][:5]:
    print(paper)

In [None]:
tokenizer.decode(tokenized_papers["train"][0]["input_ids"], skip_special_tokens=True)

In [None]:
sizes = []
for paper in tokenized_papers["train"]:
    sizes.append(len(paper["input_ids"]))

In [None]:
print(np.average(np.asarray(sizes)))
print(np.median(np.asarray(sizes)))

In [None]:
abstract_sizes = []
for paper in tokenized_papers["train"]:
    abstract_sizes.append(len(paper["labels"]))

In [None]:
print(np.average(np.asarray(abstract_sizes)))
print(np.median(np.asarray(abstract_sizes)))

In [None]:
!wget -O dl.zip --no-check-certificate https://drive.usercontent.google.com/download?id=1g89WgFHMRbr4QrvA0ngh26PY081Nv3lx&export=download&authuser=0&confirm=t&uuid=78cc955c-3938-4c22-a41b-dd22276d33af&at=APZUnTX0OKRIoSs5dBh5AddabZCt%3A1711597810528