In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy.stats import gaussian_kde
from sklearn.feature_extraction.text import TfidfVectorizer
import plotly.figure_factory as ff
from datasets import Dataset, load_metric
import shutil
import matplotlib.pyplot as plt
pd.set_option('display.max_colwidth', 1000)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import BartTokenizer, BartForConditionalGeneration      
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments         
from transformers import pipeline                                         
from transformers import DataCollatorForSeq2Seq                        
import torch                                                            
import evaluate                                                           
                                 
from sklearn.feature_extraction.text import TfidfVectorizer             
import re                                                                 
import nltk                                                               
import os 
nltk.download('punkt') 




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nguye\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [4]:
# Help Functions for describe and plot the data
def display_feature_list(features, feature_type):

    '''
    This function displays the features within each list for each type of data
    '''

    print(f"\n{feature_type} Features: ")
    print(', '.join(features) if features else 'None')

def describe_df(df):
    """
    This function prints some basic info on the dataset and
    sets global variables for feature lists.
    """

    global categorical_features, continuous_features, binary_features
    categorical_features = [col for col in df.columns if df[col].dtype == 'object']
    binary_features = [col for col in df.columns if df[col].nunique() <= 2 and df[col].dtype != 'object']
    continuous_features = [col for col in df.columns if df[col].dtype != 'object' and col not in binary_features]

    print(f"\n{type(df).__name__} shape: {df.shape}")
    print(f"\n{df.shape[0]:,.0f} samples")
    print(f"\n{df.shape[1]:,.0f} attributes")
    print(f'\nMissing Data: \n{df.isnull().sum()}')
    print(f'\nDuplicates: {df.duplicated().sum()}')
    print(f'\nData Types: \n{df.dtypes}')

    #negative_valued_features = [col for col in df.columns if (df[col] < 0).any()]
    #print(f'\nFeatures with Negative Values: {", ".join(negative_valued_features) if negative_valued_features else "None"}')

    display_feature_list(categorical_features, 'Categorical')
    display_feature_list(continuous_features, 'Continuous')
    display_feature_list(binary_features, 'Binary')

    print(f'\n{type(df).__name__} Head: \n')
    display(df.head(5))
    print(f'\n{type(df).__name__} Tail: \n')
    display(df.tail(5))

def histogram_boxplot(df,hist_color, box_color, height, width, legend, name):
    '''
    This function plots a Histogram and a Box Plot side by side

    Parameters:
    hist_color = The color of the histogram
    box_color = The color of the boxplots
    heigh and width = Image size
    legend = Either to display legend or not
    '''

    features = df.select_dtypes(include = [np.number]).columns.tolist()

    for feat in features:
        try:
            fig = make_subplots(
                rows=1,
                cols=2,
                subplot_titles=["Box Plot", "Histogram"],
                horizontal_spacing=0.2
            )

            density = gaussian_kde(df[feat])
            x_vals = np.linspace(min(df[feat]), max(df[feat]), 200)
            density_vals = density(x_vals)

            fig.add_trace(go.Scatter(x=x_vals, y = density_vals, mode = 'lines',
                                     fill = 'tozeroy', name="Density", line_color=hist_color), row=1, col=2)
            fig.add_trace(go.Box(y=df[feat], name="Box Plot", boxmean=True, line_color=box_color), row=1, col=1)

            fig.update_layout(title={'text': f'<b>{name} Word Count<br><sup><i>&nbsp;&nbsp;&nbsp;&nbsp;{feat}</i></sup></b>',
                                     'x': .025, 'xanchor': 'left'},
                             margin=dict(t=100),
                             showlegend=legend,
                             template = 'plotly_dark',
                             #plot_bgcolor=bg_color,paper_bgcolor=paper_color,
                             height=height, width=width
                            )

            fig.update_yaxes(title_text=f"<b>Words</b>", row=1, col=1, showgrid=False)
            fig.update_xaxes(title_text="", row=1, col=1, showgrid=False)

            fig.update_yaxes(title_text="<b>Frequency</b>", row=1, col=2,showgrid=False)
            fig.update_xaxes(title_text=f"<b>Words</b>", row=1, col=2, showgrid=False)

            fig.show()
            print('\n')
        except Exception as e:
            print(f"An error occurred: {e}")

def plot_correlation(df, title, subtitle, height, width, font_size):
    '''
    This function is resposible to plot a correlation map among features in the dataset.

    Parameters:
    height = Define height
    width = Define width
    font_size = Define the font size for the annotations
    '''
    corr = np.round(df.corr(numeric_only = True), 2)
    mask = np.triu(np.ones_like(corr, dtype = bool))
    c_mask = np.where(~mask, corr, 100)

    c = []
    for i in c_mask.tolist()[1:]:
        c.append([x for x in i if x != 100])



    fig = ff.create_annotated_heatmap(z=c[::-1],
                                      x=corr.index.tolist()[:-1],
                                      y=corr.columns.tolist()[1:][::-1],
                                      colorscale = 'cividis')

    fig.update_layout(title = {'text': f"<b>{title} Heatmap<br><sup>&nbsp;&nbsp;&nbsp;&nbsp;<i>{subtitle}</i></sup></b>",
                                'x': .025, 'xanchor': 'left', 'y': .95},
                    margin = dict(t=210, l = 110),
                    yaxis = dict(autorange = 'reversed', showgrid = False),
                    xaxis = dict(showgrid = False),
                    template = 'plotly_dark',
                    #plot_bgcolor=bg_color,paper_bgcolor=paper_color,
                    height = height, width = width)


    fig.add_trace(go.Heatmap(z = c[::-1],
                             colorscale = 'cividis',
                             showscale = True,
                             visible = False))
    fig.data[1].visible = True

    for i in range(len(fig.layout.annotations)):
        fig.layout.annotations[i].font.size = font_size

    fig.show()


In [6]:
# Help functions to clean the data
def clean_text(text):
    # Remove special characters except "."
    text = text.replace("\n", " ")
        # Use regular expression to remove spaces before dots
    text = re.sub(r'\s+\.', '.', text)
    return text

## **Load Data**

In [7]:
df_1 = pd.read_parquet("Train_1.parquet").sample(frac=0.05, random_state=42)
test = pd.read_parquet("Test.parquet").sample(frac=0.05, random_state=42)
valid = pd.read_parquet("Validation.parquet").sample(frac=0.05, random_state=42)

In [8]:
train_all = df_1.drop('id', axis=1)
test = test.drop('id', axis=1)
valid = valid.drop('id', axis=1)

## **Analyze Dataset**

**Train Data**

In [9]:
# Extracting info on the training Dataframe
describe_df(train_all)


DataFrame shape: (4785, 2)

4,785 samples

2 attributes

Missing Data: 
article       0
highlights    0
dtype: int64

Duplicates: 8

Data Types: 
article       object
highlights    object
dtype: object

Categorical Features: 
article, highlights

Continuous Features: 
None

Binary Features: 
None

DataFrame Head: 



Unnamed: 0,article,highlights
15056,"Washington (CNN) -- The Federal Emergency Management Agency has not attempted to recoup some $643 million in payments that were improperly given to 160,000 individuals for housing and other aid following Hurricanes Katrina and Rita, an independent government investigator says. In a letter to FEMA Administrator Craig Fugate, Inspector General Richard Skinner wrote that a federal court in 2008 ordered FEMA to change its process for recovering the money. But Monday, three years after that court ruling, ""These payments remain uncollected because your office has not given final approval of a new recoupment process,"" Skinner wrote. Following the back-to-back storms in 2005, FEMA disbursed more than $7 billion in assistance to survivors. At the time, the government placed a premium on distributing the money quickly because of the dire needs of residents of the Gulf Coast. The money was intended for rental assistance, home repairs, housing replacement, moving costs, medical costs and other...","FEMA disbursed more than $7 billion in aid after hurricanes Katrina and Rita hit in 2005 .\nLater, FEMA estimated that about $643 million were improper payments .\nFEMA cited human error and fraud as the causes of improper payments .\nAn inspector says FEMA ""has not given final approval"" on a process to recoup the money ."
77601,"An airstrike hits an open air market. An explosion rocks a school-turned-shelter. A booby-trapped tunnel explodes. These were among the scenes in Gaza in a more-than-three-week-old conflict that has left more than 1,300 people dead and even more wounded. The violence between Israel's military and Palestinian militants played out Wednesday against a backdrop of another failed humanitarian cease-fire attempt, with militants firing rockets from Gaza into Israel and Israelis responding with airstrikes. With more than 100 people, mostly civilians, reportedly killed Wednesday in the fighting, the United Nations and the United States demanded more be done to protect civilians. A large part of the criticism has been leveled at Israel and its airstrikes, which have bombarded Gaza. Israel in turn has accused Hamas of hiding weapons, including rockets, in schools and launching attacks from near shelters. More than 1,300 Palestinians have been killed since the conflict between Israel and Hamas...","""Shrapnel was falling like rain,"" says a woman at a shelter that was shelled .\nWhite House says Israel must do more to protect civilians .\nMore than 1,300 killed in Gaza, Palestinian officials say .\n3 more Israeli soldiers killed, bringing total to 56 plus 3 civilians in Israel ."
47314,"(Travel + Leisure) -- Fall is foliage season in Ann Arbor, but the colors that really get locals excited this time of year are blue and yellow -- for the University of Michigan's football team. Every Saturday, millions of fans across the U.S. stream into college football stadiums like Michigan's Big House to chant, cheer, and stomp for the home teams. While winning helps, the best places to watch college football are based on more than any record. The stadiums must be outstanding, whether for their history or sheer size. Toss in rousing fight songs, stunts, and postgame hangouts, and you've got an experience worth traveling for. The stadium attendance numbers speak for themselves. College football drew a combined 49.6 million fans in 2010 -- nearly three times as many as the NFL (17.4 million). There's also a sense -- despite a few recent scandals -- that college football is much ""purer"" than the pro game. And college football is peppered with larger-than-life historic figures like...","Bronco Stadium's bright blue playing field was the first non-green field in college football .\nUPenn's Franklin Field is the oldest college stadium still in use, dating back to 1895 .\nOver the past 20 years Florida has won nearly 90 percent of its home games at ""The Swamp"""
87020,"(CNN)Imagine flying over The Netherlands and seeing one of the fat-pixeled images from the gallery above. It would be hard not to smile, right? I mean, what is that alien thing? An oversized kaleidoscope? A rip in the Matrix? Some kind of freakish, town-sized cauliflower? When Mishka Henner, a 38-year-old artist and photographer, came across these ""blurred"" images of Dutch landscapes on Google Maps, he was similarly perplexed and amused. ""Well, I laughed,"" he said of the initial discovery. The hidden zones are ""not only bases, they're also royal palaces and fuel depots and ammunition depots and that sort of thing,"" Henner told me. The Dutch government ""used a pretty spectacular method for hiding these locations, which does everything but hide them, basically."" Henner, who lives in the UK, decided to turn this strangely beautiful form of censorship into art. His series of high-resolution Google Maps renderings is called ""Dutch Landscapes."" Created in 2011, the series has been on dis...","John Sutter talks with artist Mishka Henner about his ""Dutch Landscapes"" series .\nThe series focuses on an artful effort by the Dutch government to censor Google Maps ."
94838,"By . Tamara Cohen . PUBLISHED: . 18:08 EST, 15 April 2012 . | . UPDATED: . 02:27 EST, 18 April 2012 . More than 35million people are now living in the drought zone which is engulfing England. Another 17 counties in the South West and the Midlands will be officially designated as 'water-stressed' today - the largest area suffering a water shortage since 1976. Although the hosepipe ban currently in force for 20million people has not yet been extended, contingency plans are being drawn up as the Environment Agency confirmed the drought is likely to last until Christmas. Water-free zone: Angie Evans walks along a dried up river bed near Chichester, West Sussex, today. More than 35million people are now living in the drought zone which is engulfing England . Running near empty: Isla Stanton, five, paddles in the depleted Bewl Water Reservoir near Lamberhurst, Kent . Worst water shortage since 1976: The half-full Bewl reservoir is holding just 50 per cent of its capacity as opposed to th...",More than 35million people now living in drought zone .\nAnother 17 counties officially designated as 'water-stressed'\nThat area is largest suffering water shortage since 1976 .\nRiver Severn may completely dry up in places by the summer .\nCommunities .\nforced to pull out of Britain in Bloom contest .\nDrought could last until after Christmas in parts of England .



DataFrame Tail: 



Unnamed: 0,article,highlights
75755,"FIFA president Sepp Blatter has confirmed that world football's governing body will donate $250,000 to the families of those people killed in last week's football riots in Egypt. More than 70 people lost their lives after violence erupted at the end of a match between Al Masry and Al Ahly in Port Said. A bank account has been set up by Al Ahly to help support families who lost loved ones, with Blatter appealing to the world of football to show their support for the north African nation. Bradley - Egyptian players need time to heal . Blatter said in a statement: ""I stated after the tragedy that this day was a black day for football. ""I'm still very shocked by what happened. Many of the victims were so young and they were also a crucial support for their families, who now need help."" Blatter added: ""The football community, including FIFA, must assist its Egyptian brothers and sisters."" Security forces were criticized by many supporters for their apparent lack of urgency as the riots ...","FIFA president Sepp Blatter has called on the football community to rally around Egypt .\nBlatter also reveals FIFA will donate $250,000 to the families of recent football riot victims .\nOver 70 people were killed following the match between Al Masry and Al Ahly in Port Said ."
58839,"(CNN) -- OK, there's no way to write this article without name dropping. To cut to the chase, I know Jon Stewart and Stephen Colbert, and you don't. I used to work for ""The Daily Show"" and ""The Colbert Report"" as the audience warm-up act. Not a bad day job. But knowing the hosts didn't mean I got to parade on stage at Saturday's ""Rally to Restore Sanity and/or Fear."" I was in the audience with the thousands of fans, reporting on the events for CNN. But the audience was where the real spectacle unfolded this weekend. Stewart, Colbert preside over rally . In a lot of ways, the people congregating on the National Mall looked just like the people I used to perform for back at ""The Daily Show"" and ""The Colbert Report."" Black, white, Hispanic, old, young -- every race, every ethnicity and every age group was represented in the sea of fans. The unifying characteristic, however, was that they were overwhelmingly thoughtful and hilarious Americans. They appreciate good comedy and political...","CNN's Pete Dominick once worked with Jon Stewart and Stephen Colbert .\nDominick wasn't talking to his pals; he was out Saturday observing folks attending rally .\nDominick finds that a good sense of humor is what attendees had in common .\nFunny signs he spotted: ""God Hates Nags,"" ""Hitler was a total Nazi"""
94613,"By . Phil Vinter and Jill Reilly . PUBLISHED: . 04:25 EST, 14 June 2012 . | . UPDATED: . 04:35 EST, 14 June 2012 . Tragic: The killing of Wpc Yvonne Fletcher sparked an 11 day police siege of the Libyan embassy in St James's Square . Two detectives have visited Libya as part of the ongoing investigation into the murder of WPc Yvonne Fletcher. The killer of the police officer who was shot outside the Libyan embassy in London in 1984 was never . convicted, but now 28 years on there are fresh hopes that her family may . finally see justice. The officers, from the Metropolitan Police’s Counter Terrorism Command, flew to Tripoli where they met Libyan officials to discuss how the investigation can be taken forward in the future. In a statement the force said: 'We hope these preliminary discussions will pave the way for the MPS and Libyan authorities to work jointly to identify who was responsible for WPc Fletcher’s murder in 1984.' The officers - a detective superintendent and a detectiv...",The officers travelled to the nation’s capital on Monday and returned to the UK last night .\nWPc Fletcher shot dead controlling an anti-Gaddafi demonstration in London in 1984 .\nBelieved that she was hit by shot fired from embassy .
67044,"(CNN) -- Too few Americans are willing to talk about sexually transmitted infections, or STIs, but we simply cannot afford to avoid these discussions any longer. The Centers for Disease Control and Prevention recently released data showing just how common and costly STIs are in the United States, especially for America's youth. Each year, we have 20 million new STI cases, half among teens and young adults ages 15 to 24. Across the nation at any given time, there are more than 110 million total infections, including new and existing infections. While the number of new infections is roughly equal among young women and young men, the health consequences of untreated STIs can be much more serious for young women, including losing the ability to have children. Every year, about 24,000 women in the United States become infertile because of an STI they probably didn't even know they had, because most infections have no symptoms. Left untreated, common STIs such as chlamydia and gonorrhea ...","Gail Bolan: Each year, we have 20 million new STIs cases, half of them among young people .\nBolan: The health consequences of untreated STIs can be more serious for young women .\nShe says most STIs have no symptoms, so testing is the necessary first step to treatment .\nBolan: STIs are preventable and most are curable, but we need to raise get the word out ."
53086,"Charlotte, North Carolina (CNN) -- A smiling Ted Scott strides into the bustling caddy area at the Wells Fargo Championship in Charlotte, North Carolina, and sets down the golf bag marked ""Bubba Watson"" that defines his working life. The fresh-faced 37-year old is one of the lucky ones. Caddies for the top PGA Tour golfers can bring home as much as $250,000 a year in the modern game, traveling the world and taking a cut of everything their player earns along the way. Such rewards make for a fiercely competitive industry. These days, most caddies are college-educated and strong players in their own right -- a long way from the stereotype of disheveled, drunken hobos portrayed in movies like Caddyshack. ""It's changed a lot since I've been out here,"" says Scott, who took his first job in 2000, and counts 2008 U.S. Ryder Cup captain Paul Azinger among his former employers. ""There used to be guys in the parking lot at tournaments, and you could pick up a bag. Now I've got friends who ar...","Ted Scott is in elite group of caddies that travel world and earn lucrative salaries .\nAfter failing to make impact as professional golfer, Scott became caddy in 2000 .\nThe 37-year-old began assisting Bubba Watson in 2006 and has helped him win three PGA titles .\nScott and Watson share close bond, which has helped working relationship flourish ."


In [10]:
df_text_lenght = pd.DataFrame() # Creating an empty dataframe
for feat in categorical_features: # Iterating through features --> Dialogue & Summary
    df_text_lenght[feat] = train_all[feat].apply(lambda x: len(str(x).split())) #  Counting words for each feature

# Plotting histogram-boxplot
histogram_boxplot(df_text_lenght,'#89c2e0', '#d500ff', 600, 1000, True, 'Train Dataset')









In [11]:
vectorizer = TfidfVectorizer(max_features = 15,stop_words = 'english') # Top 15 terms
x = vectorizer.fit_transform(train_all['article'])
df_tfidfvect = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names_out())
plot_correlation(df_tfidfvect, 'Unigrams', 'Train - Article', 800, 800, 12)

In [12]:
vectorizer = TfidfVectorizer(max_features = 15,stop_words = 'english') # Top 15 terms
x = vectorizer.fit_transform(train_all['highlights'])
df_tfidfvect = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names_out())
plot_correlation(df_tfidfvect, 'Unigrams', 'Train - Highlights', 800, 800, 12)

In [13]:
vectorizer = TfidfVectorizer(max_features = 15,stop_words = 'english',ngram_range = (2,2)) # Top 15 terms
x = vectorizer.fit_transform(train_all['article'].fillna(''))
df_tfidfvect = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names_out())
plot_correlation(df_tfidfvect, 'Bigrams', 'Train - Article', 800, 800, 12)

In [14]:
vectorizer = TfidfVectorizer(max_features = 15,stop_words = 'english',ngram_range = (2,2)) # Top 15 terms
x = vectorizer.fit_transform(train_all['highlights'].fillna(''))
df_tfidfvect = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names_out())
plot_correlation(df_tfidfvect, 'Bigrams', 'Train - Hightlight', 800, 800, 12)

**Test Data**

In [15]:
# Extracting info on the training Dataframe
describe_df(test)


DataFrame shape: (574, 2)

574 samples

2 attributes

Missing Data: 
article       0
highlights    0
dtype: int64

Duplicates: 0

Data Types: 
article       object
highlights    object
dtype: object

Categorical Features: 
article, highlights

Continuous Features: 
None

Binary Features: 
None

DataFrame Head: 



Unnamed: 0,article,highlights
1516,"Down Augusta way they say the azaleas are in full bloom, which is more than can be said for England’s Justin Rose. A bruising Florida swing last month saw the Englishman fall outside the world’s top 10. For a player who has been virtually a fixture in the top five for the last three years it was certainly a dent to the ego, with the Masters now just around the corner. Rose’s solution to his miserable form — three missed cuts and a 55th-place finish at the Cadillac Championship in four PGA Tour starts — was the time-honoured one. For the past two weeks, the 34-year-old has spent long hours on the practice ground. Justin Rose hit 17 out of 18 greens in regulation and signed for a 69 at the Shell Houston Open . In the first round of the Shell Houston Open on Thursday there were encouraging signs his decline will prove temporary. Rose hit 17 out of 18 greens in regulation and signed for a 69, the same score as his playing partner, the ever- consistent Jordan Spieth. ‘It’s certainly a w...",Justin Rose bounced back from Florida misery by carding 69 in Houston .\nThree-time Masters champion Phil Mickelson enjoyed return to form .\nPaul Casey celebrated last-gasp Masters invitation with fine round of 68 .
1393,"There was no special treatment for Lewis Ferguson at Paul Nicholls’ yard on Thursday morning. The 18-year-old was mucking out the stables as usual, just a cut on the nose to show for the fall which has made him an internet sensation. Ferguson, whose double somersault horror fall from the favourite Merrion Square in the 4.20pm at Wincanton on Wednesday has been watched hundreds of thousands of times online. But he was back riding out and said he was undeterred from getting back in the sadal. ‘It was just a blur,’ he said, ‘I couldn’t work out what had happened until I got back to the weighing room and watched the replay. All the other jockeys asked me if I was alright and stuff, they all watched with me and looked away in horror. It’s about the most impressive fall I’ve seen. Jockey Lewis Ferguson, 18, who survived a spectacular fall from Merrion Square shows off the cut on his nose . Ferguson, whose double somersault horror fall has been watched by hundreds of thousands online . Th...",Lewis Ferguson fell from Merrion Square at Wincanton on Wednesday .\nDespite spectacular tumble he escaped with just a cut nose .\nFerguson has been mucking out stables as usual on Thursday morning .\nThe 18-year-old says incident that went viral was 'just a blur' to him .
10560,"When emergency crews received a call saying 'someone' had been knocked down by a car in Essex, they immediately sent two ambulances to the rescue. But the vehicles returned to base just moments later when the caller revealed the 'victim' was actually a squirrel that had been run over in Epping Forest. The report forms just part of a long list released by East of England ambulance service detailing their most bizarre, and irritating, call outs. Two ambulances were sent to Epping Forest after a caller told crews 'someone' had been run down, only for the 'victim' to turn out to be a squirrel. Another man said he had dropped his burger and it was 'bleeding' Another came from reveller in Basildon, Essex, who said 'I've dropped my burger and it is bleeding', while a third woman in Chelmsford, dialled 999 to report 'I've eaten too much takeaway food.' The service said it receives around 2,500 calls per day, and in the last two years has had to deal with 1,248 that did not need emergency h...","Woman reported 'someone' had been run over, but victim was a squirrel .\nAnother man dialled 999 to say he dropped a burger which was 'bleeding'\nEast of England ambulance service warned hoax calls can cost lives ."
11457,"A loving boyfriend has granted his girlfriend her birthday wish of having their wedding photographs taken - even though they can't officially get married yet because she is bedridden in hospital. Guo Kai and girlfriend Dong Hui, who turned 22 on Monday, had been planning to get married this month in Sichuan in southwest China. But Dong was suddenly diagnosed with serious bone cancer and admitted to hospital, meaning a formal ceremony had to be postponed, reported the People's Daily Online. Big day: Dong Hui, 22, was diagnosed with bone cancer last month but still wanted to have wedding photographs taken for her birthday . You may kiss the bride: The couple had planned to tie the knot this month, but had to make do with the photos for now . Besotted: Guo Kai (pictured right) has been at his girlfriend's bedside every day . Instead, family and friends helped Dong put on a beautiful wedding dress so that a team of professional photographers could take pictures for the besotted couple....","Guo Kai and girlfriend Dong Hui, 22, had planned to get married this month .\nBut ceremony had to be postponed after Dong was admitted to hospital .\nInstead Guo arranged for photographers to go to the ward on her birthday .\nFamily and friends helped Dong get into her dream wedding dress ."
647,"(CNN)Sunday's announcement that Corinthian Colleges Inc. would shut down all of its remaining 28 campuses is a positive development in a long struggle to hold for-profit colleges accountable. Corinthian, which once enrolled more than 70,000 students, is one of the worst of the ""predator colleges"" -- schools that offer dubious degrees, saddle students with high amounts of debt and gobble up tens of billions of dollars in federal money every year. Many of these schools are for-profit career colleges that operate mostly online. It's no wonder that Corinthian is doing this after the U.S. Department of Education curtailed its access to federal student aid last summer. There are about 1.3 million students enrolled in for-profit colleges, many of which have questionable track records, and their students need help transitioning into legitimate postsecondary schools. With Uncle Sam's student loan debt sheet topping $1 trillion, we literally can't afford to continue funding for-profit colleg...","David Wheeler: Corinthian, considered a ""predator"" school, will shut down campuses .\nWheeler: Students of for-profit colleges are hapless victims; their debts should be forgiven ."



DataFrame Tail: 



Unnamed: 0,article,highlights
7129,"A British tourist has drowned in a pool in a luxury villa in Spain, just hours after arriving on holiday. Brian Nicol, 32, dived into a swimming pool at a luxury holiday villa in the Costa del Sol, where he was staying with friends, but failed to resurface. His desperate friends dragged Mr Nicol, who was born in Glasgow, out of the water and tried to save him. Brian Nicol, 32 from Glasgow, Scotland, died while staying in a villa in Nueva Andalucia like the one pictured . Police and paramedics arrived shortly after the incident took place around 8.30am on Easter Sunday morning, but Mr Nicol had died. Officers said early evidence suggests his death was a tragic accident, but they will try to establish whether he suffered a blow to the head or a digestion problems which meant he couldn't swim. The group had been on a night out in Marbella after jetting to Spain on Saturday, before returning to the rented villa in Nueva Andalucia to carry on partying around the pool. A post-mortem was ...","Brian Nicol, 32 from Glasgow, dived into a pool but failed to resurface .\nFrantic friends dragged him out of the water but he could not be revived .\nGroup were partying around pool after night out in Marbella, says source .\nEarly investigations show death was a 'tragic accident' according to police ."
7028,"Chelsea Clinton is opening up about life in the public eye, being a new mother and whether or not she would like to see her own mother become president in a new interview. Clinton, whose mother Hillary is expected to formally announce her presidential campaign this Sunday, said it is time the United States had a female leader. 'One of our core values in this country is that we are the land of equal opportunity, but when equal hasn't yet included gender, there is a fundamental challenge there that, I believe, having our first woman president—whenever that is—will help resolve,' said Clinton. Scroll down for video . Chelsea Clinton is opening up about motherhood, life in the public eye and why the United States needs a female president in a new interview with Elle . Gucci\t Black Stretch Viscose Dress with Knot Detail . Get it now at Neiman's . Visit site . Chelsea Clinton has come a long way from being the adolescent that grew up in front of our eyes while her dad was POTUS. Yes, Ch...","Chelsea Clinton is opening up about motherhood, life in the public eye and why the United States needs a female president in a new interview with Elle .\nClinton says that though the US is the 'land of equal opportunity,' that is not true about gender, and a female president would change that .\nThis just two days before her mother Hillary is expected to announce her presidential campaign .\n'It is challenging to me that women comprising 20 percent of Congress is treated as a real success. Since when did 20 percent become the definition of equality?' says Clinton .\nClinton, who is described in the magazine as 'innately regal, also appears in a fashion spread in which she looks almost unrecognizable ."
11103,"The broccoli chemical sulforaphane is known to block the inflammation and damage to cartilage associated with arthritis (file picture) An artificial version of a compound in broccoli could hold the key to treating arthritis, say researchers. The broccoli chemical sulforaphane is known to block the inflammation and damage to cartilage associated with the condition. But patients would have to eat several pounds of the vegetable every day to derive any significant benefit. Sulforaphane in its natural form is also too unstable to turn into a medicine. But UK drug company Evgen Pharma has developed a stable synthetic version of the chemical that offers the potential of a pill treatment. A single dose of the drug, known as Sulforadex or SFX-01, is the equivalent of eating 5.5lb of broccoli in one day. In tests on mice affected by osteoarthritis, Sulforadex significantly improved bone architecture, gait balance and movement. Professor Andrew Pitsillides, who co-led the research at the Roy...",Sulforaphane known to block inflammation and damage to the cartilage .\nPeople would have to eat several pounds daily to derive significant benefit .\nDrug company Evgen Pharma has developed synthetic version of chemical .
8470,"A baby girl has been badly disfigured in a dog attack at home after her parents left her by herself while they worked in nearby fields. Ten-week-old Qingqing is currently in a critical condition in hospital in eastern China, following emergency surgery to repair her mauled face. Her mother told the People's Daily Online: ‘We left after our baby girl fell asleep. Who knew this would happen?' Recovering from surgery: Little Qingqing, who is just ten weeks old, has bandages around her damaged face . The woman, named only as Ms Li, said that on the day of the attack, she and her husband fed their daughter then went to work near their house in Haimen City, eastern China. But Ms Li could not stop thinking about her daughter so returned home after just ten minutes. When she arrived, a white dog with blood around its mouth came running towards her, she said. She dashed to the bedroom where she had left Qingqing - and discovered that the little girl had been dragged from her bed by the dog,...","Ten-week-old Qingqing is in a critical condition following surgery .\nThe tiny child was on her own in a house in Haimen City, China .\nHer mother returned from work to find bloodied daughter on floor ."
7207,"For U.S. moms, the typical time between pregnancies is about 2½ years but nearly a third of women space their children too close, a government study shows. Experts say mothers should wait at least 18 months to give their body time to recover and increase the chances the next child is full-term and healthy. The study released on Thursday by the Centers for Disease Control and Prevention found that about 30 percent of women who'd had a child became pregnant again within 18 months. Dangerous pregnancy: The study released on Thursday by the Centers for Disease Control and Prevention found that about 30 percent of women who'd had a child became pregnant again within 18 months . Second baby: For U.S. moms, the typical time between pregnancies is about 2½ years but nearly a third of women space their children too close, a government study shows . 'That is actually pretty high and very problematic,' said Heike Thiel de Bocanegra, a reproductive health researcher at the University of Califo...","For U.S. moms, the typical time between pregnancies is about 2½ years .\nExperts say mothers should wait at least 18 months to give their body time to recover and increase the chances the next child is full-term and healthy .\nThe study found that about 30 percent of women who'd had a child became pregnant again within 18 months ."


In [16]:
df_text_lenght = pd.DataFrame() # Creating an empty dataframe
for feat in categorical_features: # Iterating through features --> Dialogue & Summary
    df_text_lenght[feat] = test[feat].apply(lambda x: len(str(x).split())) #  Counting words for each feature

# Plotting histogram-boxplot
histogram_boxplot(df_text_lenght,'#89c2e0', '#d500ff', 600, 1000, True, 'Test Dataset')









In [17]:
vectorizer = TfidfVectorizer(max_features = 15,stop_words = 'english') # Top 15 terms
x = vectorizer.fit_transform(test['article'])
df_tfidfvect = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names_out())
plot_correlation(df_tfidfvect, 'Unigrams', 'Test - Article', 800, 800, 12)

In [18]:
vectorizer = TfidfVectorizer(max_features = 15,stop_words = 'english') # Top 15 terms
x = vectorizer.fit_transform(test['highlights'])
df_tfidfvect = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names_out())
plot_correlation(df_tfidfvect, 'Unigrams', 'Test - highlights', 800, 800, 12)

In [19]:
vectorizer = TfidfVectorizer(max_features = 15,stop_words = 'english',ngram_range = (2,2)) # Top 15 terms
x = vectorizer.fit_transform(test['article'].fillna(''))
df_tfidfvect = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names_out())
plot_correlation(df_tfidfvect, 'Bigrams', 'Test - Article', 800, 800, 12)

In [20]:
vectorizer = TfidfVectorizer(max_features = 15,stop_words = 'english',ngram_range = (2,2)) # Top 15 terms
x = vectorizer.fit_transform(test['highlights'].fillna(''))
df_tfidfvect = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names_out())
plot_correlation(df_tfidfvect, 'Bigrams', 'Test - Hightlight', 800, 800, 12)

## **Validation Dataset**

In [21]:
# Extracting info on the training Dataframe
describe_df(valid)


DataFrame shape: (668, 2)

668 samples

2 attributes

Missing Data: 
article       0
highlights    0
dtype: int64

Duplicates: 0

Data Types: 
article       object
highlights    object
dtype: object

Categorical Features: 
article, highlights

Continuous Features: 
None

Binary Features: 
None

DataFrame Head: 



Unnamed: 0,article,highlights
12822,"A woman has sparked uproar on social media after enthusiastically bearing her K-cup breasts on Google Street View. Karen Davis from Port Pirie in South Australia was captured streaking by the Google camera cars for the popular Google Maps app, which allows users to zoom in on certain streets and towns in cities all over the world with a 360-degree view. However the 38-year-old mother, who plans to skydive topless for her 40th birthday, has hit back at 'flat-tittie chicks' claiming they are not confident enough with their own bodies and should focus on how they look. Karen Davis (pictured) from Port Pirie has caused controversy after a picture appeared on Google Maps Street View showing her bearing her size-K breasts . In the image, Ms Davis can be seen holding her arms up in the air with her T-shirt hunched up around her neck bearing her breasts, as she follows the Google camera cars around the street. Her sons are playing in the background and an unknown man stands at the fence wa...",A woman has caused a social media storm after bearing her boobs .\nThe Port Pirie woman showed her size-K assets on Google Street View .\nShe has been called a 'bad mother' by people from her town .\nKaren Davis says they are 'not happy with their own bodies'\nShe plans to do a topless skydive for her 40th birthday next year .
2232,"Goal hero Christophe Berra said Scotland were rewarded for their patience with a late winner in the 1-0 victory against Northern Ireland. The Ipswich defender headed home in the 85th minute from a Matt Ritchie corner in the friendly at Hampden Park on Wednesday night. Scotland keeper Christophe Berra celebrates scoring the winner against Northern Ireland . Berra's goal finally broke the resistance of Northern Ireland and goalkeeper Michael McGovern, who had frustrated Scotland with a succession of saves. He said: 'It was a difficult game, they made it very difficult. When teams do that it's very hard to break them down. 'Sometimes games are won on set pieces and luckily enough I managed to get my head on the end of a good corner kick and it just sneaked in.' Scotland boss Gordon Strachan was happy with a win and a clean sheet as preparation for Sunday's Euro 2016 qualifier against Gibraltar. Christophe Berra gets above his marker to head the ball into the corner for a late winner f...",Christophe Berra scored late winner for Scotland in friendly on Wednesday .\nNorthern Ireland goalkeeper Michael McGovern had kept home side at bay .\nGordon Strachan's side face Gibraltar in Euro 2016 qualifier on Sunday .
6606,"The White House, the State Department and Democrats on Capitol Hill are side-stepping questions about Hillary Clinton as the controversy over her cyber hygiene escalates. While some of the presumed presidential candidate's party members have come to her defense, many lawmakers headed for the exits this week as reporters asked them for their opinion on the way Clinton handled her email while at the State Department. Democratic strategists were more willing discuss the scandal - but what they had to say, Clinton probably won't like. Democrats are running for cover this week as former Secretary of State Hillary Clinton takes fire from the press for over her cyber hygiene. Officials at 1600 Pennsylvania now claim they didn't know that Clinton was solely using her personal email address to conduct government business . Officials at 1600 Pennsylvania now claim they didn't know that Clinton was solely using her personal email address to conduct government business until Congress reopened ...","White House now claims it didn't know Clinton was solely using her personal email address to conduct government business .\nState Department wouldn't say it was 'confident' that Clinton had turned over all her emails from her tenure there .\nClinton's former Democratic colleagues in Congress were won't to discuss the issue at all .\nNew York Rep. Steve Israel blamed Republicans for the mess and accused them of 'trying to manufacture or amplify crisis or scandal,' however .\nFormer Obama adviser David Axelrod said Wednesday night that he thinks the Clinton is adding to her woes by keeping silent .\n'If this is not handled really well,' a Dem strategist said, Dem operatives will begin saying, ' ""Maybe we need another person in this race"" '"
2057,"Chelsea took to Instagram on Saturday to back up their manager and attack the pundits who criticised their behaviour during the Champions League exit at the hands of PSG in midweek. Graeme Souness and Sportsmail columnist Jamie Carragher branded Jose Mourinho's team 'a disgrace' after they surrounded the referee to get Zlatan Ibrahimovic sent off during the 2-2 draw. But afterwards, Mourinho accused the pair of having short memories, pointing out both men's own on-field reputations. Jose Mourinho spikily replied to comments made by Jamie Carragher and Graeme Souness . Chelsea players surround Bjorn Kuipers as the Dutch referee gives Zlatan Ibrahimovic his marching orders . And the club have put their weight behind his claims, publishing pictures of both Carragher and Souness confronting officials from years gone by. The post was captioned with the message 'For those with short memories...', and shows Carragher with his Liverpool team-mates protesting a decision in 2007, and Souness...",Jose Mourinho had accused pundits of forgetting what they did on pitch .\nChelsea post pictures of both men surrounding referees in the past .\nChelsea have come under fire for players reaction during PSG defeat .\nJamie Carragher blasted Mourinho's team 'a disgrace'
9825,"A detailed search for Indian Ocean rubbish on the south west coast of Tasmania might have indicated that the hunt for missing flight MH370 is in the wrong place. Volunteers have scoured beaches, rocks and shallow waters along a 60-mile stretch of coast in the lower corner of the island state, located to the south of the Australian continent, picking up no less than 80,000 pieces of rubbish. Some of the debris has drifted 6,000 miles across the Indian Ocean from as far away as Madagascar and South Africa but there has been nothing that appears to be from the Malaysia Airlines jet. Yet the official search area for the aircraft, some 1200 miles south west of Perth in Western Australia, lies virtually in the path that the rubbish from the other side of the ocean has drifted along. Doomed: A detailed search for Indian Ocean rubbish on the south west coast of Tasmania might have indicated that the hunt for missing Malaysia Airlines flight MH370 (pictured) is in the wrong place . If nothi...","Volunteers have been cleaning a 60-mile stretch of the Tasmanian coast .\nThey gathered 80,000 pieces of rubbish - some as small as a grain of rice .\nBut so far there has been no sign of any debris from doomed flight MH370 .\nVolunteers say this is impossible if jet crashed in the current search area .\nSuggests investigators could be looking for the aircraft in the wrong place ."



DataFrame Tail: 



Unnamed: 0,article,highlights
3941,"Real Madrid and Manchester United target Danilo was rushed to hospital on Tuesday night after a sickening collision with his own goalkeeper during Porto's 4-0 Champions League win against Basle. The Brazilian right back appeared to momentarily knocked unconscious after Fabiano came rushing out of his area to make a headed clearance. Danilo, with his eyes on the ball, failed to see the on-rushing keeper as the pair came together with Fabiano's shoulder making contact with the unfortunate full back's face. Danilo lays stricken on the turf following the collision with Porto goalkeeper Fabiano . Danilo collided with Fabiano as the Porto keeper cam rushing out of his goal to make a headed clearance . Fabiano headed the ball clear but made contact with Danilo's head with his shoulder . Danilo's head was snapped back by the force of the contact with the goalkeeper . Concerned team-mates and opposition players surround Danilo as he appeared unconscious . Medical staffed tend to the Brazil ...",Porto beat Basle 4-0 in the Champions League on Tuesday night .\nDanilo was rushed to hospital after being knocked out by keeper Fabiano .\nThe full back was given the all-clear after medical examination .\nDanilo has reportedly agreed to join Real Madrid at the end of the season .\nManchester United are also interested in the Brazil international .
4879,"Nico Rosberg has taken his preparations for the Malaysian Grand Prix to a whole new level after climbing to the top of one of the giant Petronas Towers in Kuala Lumpur. The Mercedes star, who finished second in the opening race of the season in Australia, filmed himself and some friends making their way up the 88-floor building on foot. Rosberg and his pals managed all 2,170 steps to reach the top of the 1,483 foot building, leaving them with amazing views over the Malaysian capital. VIDEO Scroll down to see Nico Rosberg climb one of the Petronas Towers in Kuala Lumpur . Nico Rosberg and his friends pose in front of the Petronas Towers in Kuala Lumpur, Malaysia . The Mercedes driver films himself and his friends climbing the 2,170 steps to the top of tower . Rosberg (left) makes his way towards the top of the building - which was the tallest in the world until 2004 . The German appears determined to get himself in the best possible shape for Sunday's Grand Prix after finishing behi...","Mercedes' Nico Rosberg will race in the Malaysian Grand Prix on Sunday .\nThe German warmed up by climbing one of the Petronas Towers .\nThe towers in Kula Lumpur have 88 floors and are 1,483 feet high .\nRosberg finished second in the opening race of the season ."
1491,"England forward Danny Welbeck is set for a 'great future' at Arsenal, but manager Arsene Wenger warned all his players their first duty is to serve the club - whether that is on the pitch or on the bench. Welbeck joined the Gunners on transfer deadline day from Manchester United, where he had become frustrated at a lack of opportunities in his preferred central striker role. Since the return to fitness and form of Olivier Giroud, who spent four months out with a broken leg, the 24-year-old has found himself deployed on the flank of a front three along with Alexis Sanchez, rather than down the middle. Danny Welbeck joined Arsenal from Manchester United last summer after growing frustrated at bit-part role . The striker was left on the bench for Wednesday's Premier League match against QPR . Welbeck was a late substitute for both of the previous two Barclays Premier League matches, but could face his former club in Monday night's FA Cup quarter-final at Old Trafford. Wenger is convin...",Danny Welbeck joined Arsenal from Manchester United last summer .\nStriker has had to make do with substitute appearances in recent games .\nArsene Wenger insists the England forward has a bright future at the club .\nArsenal have a lot of competition particularly in the forward area .
12297,"Tourists visiting Buckingham Palace when it opens for the summer season will be able to experience a taste of royal hospitality for the first time. Paying tourists will now be allowed to enter the State Rooms through the Grand Entrance - just like a visiting head of state - as they are given unique insight into the preparations that go into hosting opulent state banquets, investitures, garden parties and private audiences. As such, this year's annual summer opening exhibition is entitled, A Royal Welcome, as the Palace reveals just how it copes with the 62,000 guests it entertains each year. Forthcoming exhibition, A Royal Welcome, will reveal exactly how Buckingham Palace prepares for its opulent events, like state banquets . Anna Reynolds, the curator of the summer season exhibit, prepares a silver gilt centrepiece, which will be on display from June . In the ballroom, the horseshoe shaped table will be set for a state banquet, complete with silver gilt centrepieces, candelabra a...","When it opens for the summer season, tourists will get to see how the Palace copes with its 62,000 yearly guests .\nThis year's annual exhibition is entitled, A Royal Welcome, and will display preparations that go into state banquets .\nAlso on display will be: silver centrepieces, red leather seating planner and elaborate dresses worn by the Queen ."
8337,"BBC Director-General Tony Hall has said he will 'gather the facts' and make a decision 'based on a whale raft of things' regarding Jeremy Clarkson's future at the corporation. Clarkson was suspended from Top Gear last week following an alleged altercation with producer Oisin Tymon. In a Q&A session at the Enders Analysis conference in London today, the first two questions directed to Lord Hall related to Clarkson. Scroll down for video . Lord Hall, Director-General of the BBC, last week described himself as a 'fan' of Top Gear presenter Clarkson . He began: 'I thought for one glorious moment, you know ...' (that he might not get a Clarkson question), a joke that drew laughs. 'The most important thing is to gather the facts around you and not listen to all the speculation and let me tell you I will gather the facts and make my decision based on a whole raft of things.' Business Insider then asked him whether it was healthy to keep staff on board who might be punching their colleague...","Tony Hall will 'gather the facts' before deciding on Clarkson's future .\nPresenter suspended last week after alleged altercation with producer .\nLast week, Lord Hall described himself as a 'fan' of the Top Gear presenter .\nBBC has now launched an inquiry into what happened in Yorkshire ."


In [22]:
df_text_lenght = pd.DataFrame() # Creating an empty dataframe
for feat in categorical_features: # Iterating through features --> Dialogue & Summary
    df_text_lenght[feat] = valid[feat].apply(lambda x: len(str(x).split())) #  Counting words for each feature

# Plotting histogram-boxplot
histogram_boxplot(df_text_lenght,'#89c2e0', '#d500ff', 600, 1000, True, 'Validation Dataset')









In [23]:
vectorizer = TfidfVectorizer(max_features = 15,stop_words = 'english') # Top 15 terms
x = vectorizer.fit_transform(valid['article'])
df_tfidfvect = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names_out())
plot_correlation(df_tfidfvect, 'Unigrams', 'Validation - Article', 800, 800, 12)

In [24]:
vectorizer = TfidfVectorizer(max_features = 15,stop_words = 'english') # Top 15 terms
x = vectorizer.fit_transform(valid['highlights'])
df_tfidfvect = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names_out())
plot_correlation(df_tfidfvect, 'Unigrams', 'Validation - Highlight', 800, 800, 12)

In [25]:
vectorizer = TfidfVectorizer(max_features = 15,stop_words = 'english',ngram_range = (2,2)) # Top 15 terms
x = vectorizer.fit_transform(valid['article'].fillna(''))
df_tfidfvect = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names_out())
plot_correlation(df_tfidfvect, 'Bigrams', 'Validation - Article', 800, 800, 12)

In [26]:
vectorizer = TfidfVectorizer(max_features = 15,stop_words = 'english',ngram_range = (2,2)) # Top 15 terms
x = vectorizer.fit_transform(valid['highlights'].fillna(''))
df_tfidfvect = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names_out())
plot_correlation(df_tfidfvect, 'Bigrams', 'Validation - Hightlight', 800, 800, 12)

## **Preprocessing Data**

In [27]:
# Cleaning texts in all datasets
train_all['highlights']=train_all['highlights'].apply(clean_text)
test['highlights']=test['highlights'].apply(clean_text)
valid['highlights']=valid['highlights'].apply(clean_text)

In [28]:
# Transforming dataframes into datasets
train_ds = Dataset.from_pandas(train_all)
test_ds = Dataset.from_pandas(test)
val_ds = Dataset.from_pandas(valid)
#train_df1 = Dataset.from_pandas(df_1)

# Visualizing results
print(train_ds)
print('\n' * 2)
print(test_ds)
print('\n' * 2)
print(val_ds)

Dataset({
    features: ['article', 'highlights', '__index_level_0__'],
    num_rows: 4785
})



Dataset({
    features: ['article', 'highlights', '__index_level_0__'],
    num_rows: 574
})



Dataset({
    features: ['article', 'highlights', '__index_level_0__'],
    num_rows: 668
})


## **Bart Model**

In [30]:
checkpoint = 'facebook/bart-large-xsum' # Model
tokenizer = BartTokenizer.from_pretrained(checkpoint) # Loading Tokenizer

In [31]:
model = BartForConditionalGeneration.from_pretrained(checkpoint) 

In [32]:
max_pos_embeddings = model.config.max_position_embeddings
print("Maximum position embeddings:", max_pos_embeddings)

Maximum position embeddings: 1024


In [33]:
def preprocess_function(examples):
    inputs = [doc for doc in examples['article']]
    summaries = [summary for summary in examples['highlights']]
    # Increase max_length for inputs and max_length for the summary as needed
    model_inputs = tokenizer(inputs, max_length=max_pos_embeddings-1, truncation=True, padding="max_length")

    # Setup the tokenizer for targets with increased max_length
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(summaries, max_length=max_pos_embeddings-1, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [34]:
model.resize_token_embeddings(len(tokenizer))

Embedding(50265, 1024)

In [35]:
print(model)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 1024)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 1024)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerNorm((1024,), eps=1e-05, el

In [36]:
# Applying preprocess_function to the datasets
tokenized_train = train_ds.map(preprocess_function, batched=True,
                               remove_columns=['article', 'highlights']) # Removing features

tokenized_test = test_ds.map(preprocess_function, batched=True,
                               remove_columns=['article', 'highlights']) 

tokenized_val = val_ds.map(preprocess_function, batched=True,
                               remove_columns=['article', 'highlights']) 

# Printing results
print('\n' * 3)
print('Preprocessed Training Dataset:\n')
print(tokenized_train)
print('\n' * 2)
print('Preprocessed Test Dataset:\n')
print(tokenized_test)
print('\n' * 2)
print('Preprocessed Validation Dataset:\n')
print(tokenized_val)


`as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your labels by using the argument `text_target` of the regular `__call__` method (either in the same call as your input texts if you use the same keyword arguments, or in a separate call.

Map: 100%|██████████| 4785/4785 [00:17<00:00, 267.69 examples/s]
Map: 100%|██████████| 574/574 [00:02<00:00, 225.10 examples/s]
Map: 100%|██████████| 668/668 [00:02<00:00, 228.38 examples/s]





Preprocessed Training Dataset:

Dataset({
    features: ['__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 4785
})



Preprocessed Test Dataset:

Dataset({
    features: ['__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 574
})



Preprocessed Validation Dataset:

Dataset({
    features: ['__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 668
})





In [37]:
# Selecting a sample from the dataset
sample = tokenized_train[0]

# Printing its features
print("input_ids:")
print(sample['input_ids'])
print("\n")
print("attention_mask:")
print(sample['attention_mask'])
print("\n")
print("Label:")
print(sample['labels'])
print("\n")

input_ids:
[0, 22247, 36, 16256, 43, 480, 20, 1853, 6824, 1753, 3131, 34, 45, 3751, 7, 3872, 18615, 103, 68, 37853, 153, 11, 3081, 14, 58, 21559, 576, 7, 10572, 6, 151, 2172, 13, 2004, 8, 97, 2887, 511, 13697, 16470, 8, 18431, 6, 41, 2222, 168, 13660, 161, 4, 96, 10, 1601, 7, 19004, 19552, 5369, 34704, 877, 6, 12412, 1292, 2431, 27192, 875, 14, 10, 752, 461, 11, 2266, 2740, 19004, 7, 464, 63, 609, 13, 7979, 5, 418, 4, 125, 302, 6, 130, 107, 71, 14, 461, 2255, 6, 22, 4528, 3081, 1091, 542, 9119, 17970, 142, 110, 558, 34, 45, 576, 507, 2846, 9, 10, 92, 3872, 18615, 1757, 609, 60, 27192, 875, 4, 3515, 5, 124, 12, 560, 12, 1644, 7749, 11, 4013, 6, 19004, 2982, 428, 24417, 55, 87, 68, 406, 325, 11, 3485, 7, 7149, 4, 497, 5, 86, 6, 5, 168, 2325, 10, 4549, 15, 20566, 5, 418, 1335, 142, 9, 5, 10697, 782, 9, 1196, 9, 5, 4602, 2565, 4, 20, 418, 21, 3833, 13, 7582, 3485, 6, 184, 9534, 6, 2004, 5010, 6, 1375, 1042, 6, 1131, 1042, 8, 97, 1736, 3485, 4, 125, 11, 5, 2130, 18, 3874, 6, 19004, 2319, 14

In [38]:
# Instantiating Data Collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [39]:
metric = load_metric('rouge') 


load_metric is deprecated and will be removed in the next major version of datasets. Use 'evaluate.load' instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate


The repository for rouge contains custom code which must be executed to correctly load the metric. You can inspect the repository content at https://raw.githubusercontent.com/huggingface/datasets/2.18.0/metrics/rouge/rouge.py
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.



In [40]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred# Obtaining predictions and true labels
    
    # Decoding predictions
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    # Obtaining the true labels tokens, while eliminating any possible masked token (i.e., label = -100)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    
    # Computing rouge score
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()} # Extracting some results

    # Add mean-generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [41]:
# Defining parameters for training
training_args = Seq2SeqTrainingArguments(
    output_dir = 'bart_samsum',
    evaluation_strategy = "epoch",
    save_strategy = 'epoch',
    load_best_model_at_end = True,
    metric_for_best_model = 'eval_loss',
    seed = 42,
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=True,
    report_to="none"
)

In [42]:
# Defining Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [43]:
trainer.train()


1Torch was not compiled with flash attention. (Triggered internally at ..\aten\src\ATen\native\transformers\cuda\sdp_utils.cpp:263.)

 10%|█         | 500/4784 [1:21:16<11:37:35,  9.77s/it]

{'loss': 0.3177, 'grad_norm': 0.45230451226234436, 'learning_rate': 1.79180602006689e-05, 'epoch': 0.42}


 21%|██        | 1000/4784 [2:42:31<10:01:44,  9.54s/it]

{'loss': 0.1029, 'grad_norm': 0.5177956223487854, 'learning_rate': 1.5827759197324415e-05, 'epoch': 0.84}


                                                        
Non-default generation parameters: {'max_length': 62, 'min_length': 11, 'early_stopping': True, 'num_beams': 6, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}


{'eval_loss': 0.11954127252101898, 'eval_rouge1': 32.7171, 'eval_rouge2': 14.7024, 'eval_rougeL': 24.0704, 'eval_rougeLsum': 30.0849, 'eval_gen_len': 38.892, 'eval_runtime': 2528.9189, 'eval_samples_per_second': 0.227, 'eval_steps_per_second': 0.113, 'epoch': 1.0}


 31%|███▏      | 1500/4784 [4:46:37<8:54:03,  9.76s/it]   

{'loss': 0.087, 'grad_norm': 0.5146250128746033, 'learning_rate': 1.3737458193979934e-05, 'epoch': 1.25}


 42%|████▏     | 2000/4784 [6:08:00<7:33:29,  9.77s/it]

{'loss': 0.0768, 'grad_norm': 0.37484827637672424, 'learning_rate': 1.1647157190635453e-05, 'epoch': 1.67}


                                                       
Non-default generation parameters: {'max_length': 62, 'min_length': 11, 'early_stopping': True, 'num_beams': 6, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}


{'eval_loss': 0.12450303882360458, 'eval_rouge1': 30.5316, 'eval_rouge2': 13.4375, 'eval_rougeL': 22.783, 'eval_rougeLsum': 28.0523, 'eval_gen_len': 34.0122, 'eval_runtime': 2961.5514, 'eval_samples_per_second': 0.194, 'eval_steps_per_second': 0.097, 'epoch': 2.0}


 52%|█████▏    | 2500/4784 [8:21:10<6:53:23, 10.86s/it]   

{'loss': 0.0736, 'grad_norm': 0.37336501479148865, 'learning_rate': 9.55685618729097e-06, 'epoch': 2.09}


 63%|██████▎   | 3000/4784 [9:51:08<5:22:24, 10.84s/it]

{'loss': 0.0582, 'grad_norm': 0.41269451379776, 'learning_rate': 7.4665551839464886e-06, 'epoch': 2.51}


 73%|███████▎  | 3500/4784 [11:23:23<3:55:31, 11.01s/it]

{'loss': 0.0588, 'grad_norm': 0.4636141061782837, 'learning_rate': 5.376254180602007e-06, 'epoch': 2.93}


                                                        
Non-default generation parameters: {'max_length': 62, 'min_length': 11, 'early_stopping': True, 'num_beams': 6, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}


{'eval_loss': 0.13088922202587128, 'eval_rouge1': 33.8582, 'eval_rouge2': 14.7473, 'eval_rougeL': 24.0362, 'eval_rougeLsum': 31.0717, 'eval_gen_len': 41.8554, 'eval_runtime': 3836.3836, 'eval_samples_per_second': 0.15, 'eval_steps_per_second': 0.075, 'epoch': 3.0}


 84%|████████▎ | 4000/4784 [13:59:59<2:21:22, 10.82s/it]    

{'loss': 0.048, 'grad_norm': 0.48340409994125366, 'learning_rate': 3.2859531772575254e-06, 'epoch': 3.34}


 94%|█████████▍| 4500/4784 [15:31:29<52:20, 11.06s/it]  

{'loss': 0.0467, 'grad_norm': 0.415427029132843, 'learning_rate': 1.1998327759197326e-06, 'epoch': 3.76}


                                                      
Non-default generation parameters: {'max_length': 62, 'min_length': 11, 'early_stopping': True, 'num_beams': 6, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}


{'eval_loss': 0.13898888230323792, 'eval_rouge1': 35.8197, 'eval_rouge2': 15.1173, 'eval_rougeL': 24.7427, 'eval_rougeLsum': 32.8143, 'eval_gen_len': 47.2387, 'eval_runtime': 3855.8961, 'eval_samples_per_second': 0.149, 'eval_steps_per_second': 0.074, 'epoch': 4.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 4784/4784 [17:28:42<00:00, 13.15s/it]

{'train_runtime': 62922.4165, 'train_samples_per_second': 0.304, 'train_steps_per_second': 0.076, 'train_loss': 0.09360834009272598, 'epoch': 4.0}





TrainOutput(global_step=4784, training_loss=0.09360834009272598, metrics={'train_runtime': 62922.4165, 'train_samples_per_second': 0.304, 'train_steps_per_second': 0.076, 'train_loss': 0.09360834009272598, 'epoch': 4.0})

In [44]:
# Evaluating model performance on the tokenized validation dataset
validation = trainer.evaluate(eval_dataset = tokenized_val)
print(validation) # Printing results

100%|██████████| 334/334 [1:08:13<00:00, 12.26s/it]

{'eval_loss': 0.12741529941558838, 'eval_rouge1': 33.0058, 'eval_rouge2': 15.533, 'eval_rougeL': 24.1852, 'eval_rougeLsum': 30.3611, 'eval_gen_len': 39.1407, 'eval_runtime': 4103.2426, 'eval_samples_per_second': 0.163, 'eval_steps_per_second': 0.081, 'epoch': 4.0}





In [45]:

# Saving model to a custom directory
directory = "bart_finetuned_cnn"
trainer.save_model(directory)

# Saving model tokenizer
tokenizer.save_pretrained(directory)

Non-default generation parameters: {'max_length': 62, 'min_length': 11, 'early_stopping': True, 'num_beams': 6, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}


('bart_finetuned_cnn\\tokenizer_config.json',
 'bart_finetuned_cnn\\special_tokens_map.json',
 'bart_finetuned_cnn\\vocab.json',
 'bart_finetuned_cnn\\merges.txt',
 'bart_finetuned_cnn\\added_tokens.json')

In [47]:
# Saving model in .zip format
shutil.make_archive('bart_finetuned_cnn', 'zip', 'D:/THOWL/AML/Abgabe/bart_finetuned_cnn')
shutil.move('bart_finetuned_cnn.zip', 'D:/THOWL/AML/Abgabe/bart_finetuned_cnn.zip')

'D:/THOWL/AML/Abgabe/bart_finetuned_cnn.zip'