In [1]:
# !pip uninstall transformers -y
# !pip install transformers
# !pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q
# !pip install accelerate -U
# !pip install evaluate
# !pip install tensorrt
# !pip install tensorflow
# !pip install ipywidgets
# !pip install plotly
#!pip install nbformat

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"]="python"

In [2]:
import pandas as pd
import numpy as np 
import gc

# visualization libraries.
import plotly.graph_objects as go 

# Dataset packages.
from datasets import load_dataset
from datasets.dataset_dict import DatasetDict, Dataset
import datasets

# transformer packages.
from transformers.tokenization_utils import BatchEncoding
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM



In [3]:
# Configuring Pandas to exhibit larger columns.
pd.set_option('display.max_colwidth', 1000)

# Dataset explorations.

In [4]:
def count_box_plot(count_df: pd.DataFrame,
                   height: int,
                   width: int,
                   plot_name: str,
                   want_legend: bool = True,
                   ):
    """ 
    
    """
    for col_name in count_df.columns:
        data = go.Box(
            y = count_df[col_name],
            name = "Box Plot",
            boxmean=True,
            line_color= '#d500ff'
        )
        fig = go.Figure(data = data)

        # Update the layout.
        fig.update_layout(
            title={'text': f'<b>Word Count<br>{plot_name}<sup><i>&nbsp;&nbsp;&nbsp;&nbsp;{col_name}</i></sup></b>',
                   'x': .025, 'xanchor': 'left'},
            margin=dict(t=100),
            showlegend=want_legend,
            template = 'plotly_dark',
            height=height, width=width
        )
        fig.update_yaxes(title_text=f"<b>Words count</b>", showgrid=False)
        fig.update_xaxes(title_text=f"", showgrid=False)

        fig.show()

In [5]:

dataset = load_dataset("ccdv/arxiv-summarization")
dataset

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['article', 'abstract'],
        num_rows: 203037
    })
    validation: Dataset({
        features: ['article', 'abstract'],
        num_rows: 6436
    })
    test: Dataset({
        features: ['article', 'abstract'],
        num_rows: 6440
    })
})

In [10]:
# getting the variables.
categorical_columns = dataset["train"].column_names
#categorical_columns = ['article', 'abstract']

In [7]:
# find if there any null values.
data_train = pd.DataFrame(dataset["train"])
print(data_train.isna().sum())


# check the data type of the features.
print(data_train.dtypes)

data_train.head()

article     0
abstract    0
dtype: int64
article     object
abstract    object
dtype: object


Unnamed: 0,article,abstract
0,"additive models @xcite provide an important family of models for semiparametric regression or classification . some reasons for the success of additive models are their increased flexibility when compared to linear or generalized linear models and their increased interpretability when compared to fully nonparametric models . \n it is well - known that good estimators in additive models are in general less prone to the curse of high dimensionality than good estimators in fully nonparametric models . \n many examples of such estimators belong to the large class of regularized kernel based methods over a reproducing kernel hilbert space @xmath0 , see e.g. @xcite . in the last years \n many interesting results on learning rates of regularized kernel based models for additive models have been published when the focus is on sparsity and when the classical least squares loss function is used , see e.g. @xcite , @xcite , @xcite , @xcite , @xcite , @xcite and the references therein . of cou...","additive models play an important role in semiparametric statistics . \n this paper gives learning rates for regularized kernel based methods for additive models . \n these learning rates compare favourably in particular in high dimensions to recent results on optimal learning rates for purely nonparametric regularized kernel based quantile regression using the gaussian radial basis function kernel , provided the assumption of an additive model is valid . \n additionally , a concrete example is presented to show that a gaussian function depending only on one variable lies in a reproducing kernel hilbert space generated by an additive gaussian kernel , but does not belong to the reproducing kernel hilbert space generated by the multivariate gaussian kernel of the same variance . * \n key words and phrases . * additive model , kernel , quantile regression , semiparametric , rate of convergence , support vector machine ."
1,"the leptonic decays of a charged pseudoscalar meson @xmath7 are processes of the type @xmath8 , where @xmath9 , @xmath10 , or @xmath11 . because no strong interactions are present in the leptonic final state @xmath12 , such decays provide a clean way to probe the complex , strong interactions that bind the quark and antiquark within the initial - state meson . in these decays , strong interaction effects can be parametrized by a single quantity , @xmath13 , the pseudoscalar meson decay constant . \n the leptonic decay rate can be measured by experiment , and the decay constant can be determined by the equation ( ignoring radiative corrections ) @xmath14 where @xmath15 is the fermi coupling constant , @xmath16 is the cabibbo - kobayashi - maskawa ( ckm ) matrix @xcite element , @xmath17 is the mass of the meson , and @xmath18 is the mass of the charged lepton . \n the quantity @xmath13 describes the amplitude for the @xmath19 and @xmath20-quarks within the @xmath21 to have zero sep...","we have studied the leptonic decay @xmath0 , via the decay channel @xmath1 , using a sample of tagged @xmath2 decays collected near the @xmath3 peak production energy in @xmath4 collisions with the cleo - c detector . \n we obtain @xmath5 and determine the decay constant @xmath6 mev , where the first uncertainties are statistical and the second are systematic ."
2,"the transport properties of nonlinear non - equilibrium dynamical systems are far from well - understood@xcite . \n consider in particular so - called ratchet systems which are asymmetric periodic potentials where an ensemble of particles experience directed transport@xcite . \n the origins of the interest in this lie in considerations about extracting useful work from unbiased noisy fluctuations as seems to happen in biological systems@xcite . \n recently attention has been focused on the behavior of deterministic chaotic ratchets@xcite as well as hamiltonian ratchets@xcite . \n chaotic systems are defined as those which are sensitively dependent on initial conditions . whether chaotic or not , the behavior of nonlinear systems including the transition from regular to chaotic behavior is in general sensitively dependent on the parameters of the system . \n that is , the phase - space structure is usually relatively complicated , consisting of stability islands embedded in chaoti...","in 84 , 258 ( 2000 ) , mateos conjectured that current reversal in a classical deterministic ratchet is associated with bifurcations from chaotic to periodic regimes . \n this is based on the comparison of the current and the bifurcation diagram as a function of a given parameter for a periodic asymmetric potential . \n barbi and salerno , in 62 , 1988 ( 2000 ) , have further investigated this claim and argue that , contrary to mateos claim , current reversals can occur also in the absence of bifurcations . \n barbi and salerno s studies are based on the dynamics of one particle rather than the statistical mechanics of an ensemble of particles moving in the chaotic system . \n the behavior of ensembles can be quite different , depending upon their characteristics , which leaves their results open to question . in this paper we present results from studies showing how the current depends on the details of the ensemble \n used to generate it , as well as conditions for convergent beh..."
3,"studies of laser beams propagating through turbulent atmospheres are important for many applications such as remote sensing , tracking , and long - distance optical communications . \n howerver , fully coherent laser beams are very sensitive to fluctuations of the atmospheric refractive index . \n the initially coherent laser beam acquires some properties of gaussian statistics in course of its propagation through the turbulence . as a result , the noise / signal ratio approaches unity for long - distance propagation . \n ( see , for example , refs.@xcite-@xcite ) . \n this unfavourable effect limits the performance of communication channels . to mitigate this negative effect \n the use of partially ( spatially ) coherent beams was proposed . \n the coherent laser beam can be transformed into a partially coherent beam by means of a phase diffuser placed near the exit aperture . \n this diffuser introduces an additional phase ( randomly varying in space and time ) to the wave front ...","the effect of a random phase diffuser on fluctuations of laser light ( scintillations ) is studied . \n not only spatial but also temporal phase variations introduced by the phase diffuser are analyzed . \n the explicit dependence of the scintillation index on finite - time phase variations is obtained for long propagation paths . \n it is shown that for large amplitudes of phase fluctuations , a finite - time effect decreases the ability of phase diffuser to suppress the scintillations ."
4,"the so - called `` nucleon spin crisis '' raised by the european muon collaboration ( emc ) measurement in 1988 is one of the most outstanding findings in the field of hadron physics @xcite,@xcite . \n the renaissance of the physics of high energy deep inelastic scatterings is greatly indebted to this epoch - making finding . \n probably , one of the most outstanding progresses achieved recently in this field of physics is the discovery and the subsequent research of completely new observables called generalized parton distribution functions ( gpd ) . \n it has been revealed that the gpds , which can be measured through the so - called deeply - virtual compton scatterings ( dvcs ) or the deeply - virtual meson productions ( dvmp ) , contain surprisingly richer information than the standard parton distribution functions @xcite@xcite . \n roughly speaking , the gpds are generalization of ordinary parton distributions and the elastic form factors of the nucleon . \n the gpds in the mo...","with a special intention of clarifying the underlying spin contents of the nucleon , we investigate the generalized form factors of the nucleon , which are defined as the @xmath0-th @xmath1-moments of the generalized parton distribution functions , within the framework of the chiral quark soliton model . \n a particular emphasis is put on the pion mass dependence of final predictions , which we shall compare with the predictions of lattice qcd simulations carried out in the so - called heavy pion region around @xmath2 . \n we find that some observables are very sensitive to the variation of the pion mass . \n it will be argued that the negligible importance of the quark orbital angular momentum indicated by the lhpc and qcdsf lattice collaborations might be true in the unrealistic heavy pion world , but it is not necessarily the case in our real world close to the chiral limit ."


In [11]:
# Count the number of words in each content and headline.
words_count_df = pd.DataFrame()
for col_name in categorical_columns:
    words_count_df[col_name] = data_train[col_name].apply(lambda row: len(str(row).split(" ")))

words_count_df.head()

Unnamed: 0,article,abstract
0,4662,152
1,3500,63
2,5579,268
3,3925,84
4,9720,154


In [12]:
count_box_plot(words_count_df, 800, 800, plot_name="Train data words count")