# BBC NEWS ARTICLE SUMMARIZATION

### Import libraries

In [18]:
import os
import pandas as pd
from transformers import T5ForConditionalGeneration, T5Tokenizer

### Load the text files from the directory

In [19]:
# Directory that holds the article text files (one article per ".txt" file)
files_directory = "data/entertainment"

# os.listdir() returns entries in arbitrary order, so sort the names to keep
# the row order (and any later positional slicing) reproducible between runs.
article_filenames = sorted(
    name for name in os.listdir(files_directory) if name.endswith(".txt")
)

# Collect one record per file and build the DataFrame once at the end,
# instead of creating a one-row DataFrame per file and concatenating them.
records = []
for filename in article_filenames:
    file_path = os.path.join(files_directory, filename)

    # Read the full text of the article
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()

    records.append({"Name": filename, "Text": text})

# Fail early with a clear message when the directory contains no text files
if not records:
    raise ValueError(f"No .txt files found in {files_directory!r}")

# One row per file: "Name" is the file name, "Text" is the file content
texts_data = pd.DataFrame(records, columns=["Name", "Text"])

# Display the first rows to explore the text content
texts_data.head(10)

Unnamed: 0,Name,Text
0,001.txt,Gallery unveils interactive tree\n\nA Christma...
1,002.txt,Jarre joins fairytale celebration\n\nFrench mu...
2,003.txt,Musical treatment for Capra film\n\nThe classi...
3,004.txt,Richard and Judy choose top books\n\nThe 10 au...
4,005.txt,Poppins musical gets flying start\n\nThe stage...
5,006.txt,Bennett play takes theatre prizes\n\nThe Histo...
6,007.txt,Levy tipped for Whitbread prize\n\nNovelist An...
7,008.txt,West End to honour finest shows\n\nThe West En...
8,009.txt,Da Vinci Code is 'lousy history'\n\nThe plot o...
9,010.txt,Uganda bans Vagina Monologues\n\nUganda's auth...


### Explore the data

In [20]:
# Show the full text of the article stored at index label 0
texts_data.loc[0, "Text"]

'Gallery unveils interactive tree\n\nA Christmas tree that can receive text messages has been unveiled at London\'s Tate Britain art gallery.\n\nThe spruce has an antenna which can receive Bluetooth texts sent by visitors to the Tate. The messages will be "unwrapped" by sculptor Richard Wentworth, who is responsible for decorating the tree with broken plates and light bulbs. It is the 17th year that the gallery has invited an artist to dress their Christmas tree. Artists who have decorated the Tate tree in previous years include Tracey Emin in 2002.\n\nThe plain green Norway spruce is displayed in the gallery\'s foyer. Its light bulb adornments are dimmed, ordinary domestic ones joined together with string. The plates decorating the branches will be auctioned off for the children\'s charity ArtWorks. Wentworth worked as an assistant to sculptor Henry Moore in the late 1960s. His reputation as a sculptor grew in the 1980s, while he has been one of the most influential teachers during th

In [21]:
# Summarize the DataFrame: number of rows, column names, non-null counts, dtypes and memory usage
texts_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 386 entries, 0 to 385
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    386 non-null    object
 1   Text    386 non-null    object
dtypes: object(2)
memory usage: 6.2+ KB


In [22]:
# Count the missing values in each column
texts_data.isna().sum()

Name    0
Text    0
dtype: int64

In [23]:
# Keep only the first 50 articles so tokenization and summarization stay quick.
# .copy() makes the subset an independent DataFrame; adding or updating columns
# on a bare iloc slice can trigger pandas' SettingWithCopyWarning.
texts_data = texts_data.iloc[0:50].copy()

# Preview the subset rather than dumping every row
texts_data.head(10)

Unnamed: 0,Name,Text
0,001.txt,Gallery unveils interactive tree\n\nA Christma...
1,002.txt,Jarre joins fairytale celebration\n\nFrench mu...
2,003.txt,Musical treatment for Capra film\n\nThe classi...
3,004.txt,Richard and Judy choose top books\n\nThe 10 au...
4,005.txt,Poppins musical gets flying start\n\nThe stage...
5,006.txt,Bennett play takes theatre prizes\n\nThe Histo...
6,007.txt,Levy tipped for Whitbread prize\n\nNovelist An...
7,008.txt,West End to honour finest shows\n\nThe West En...
8,009.txt,Da Vinci Code is 'lousy history'\n\nThe plot o...
9,010.txt,Uganda bans Vagina Monologues\n\nUganda's auth...


### Initialize the T5 model and tokenizer

In [24]:
# Name of the pretrained checkpoint shared by the tokenizer and the model
checkpoint_name = "t5-small"

# Load the tokenizer and the model from the same checkpoint so they stay in sync
tokenizer = T5Tokenizer.from_pretrained(checkpoint_name)
model = T5ForConditionalGeneration.from_pretrained(checkpoint_name)

### Add a column to store the summaries

In [25]:

# Add an empty "Summary" column that the summarization step fills in.
# assign() returns a new DataFrame instead of writing into the existing one,
# which was produced by an iloc slice; this avoids SettingWithCopyWarning.
texts_data = texts_data.assign(Summary="")

### Tokenize and summarize the text

In [26]:
# Switch the model to evaluation mode before generating summaries
model.eval()

# Generation settings applied to every article
generation_options = dict(
    max_length=150,
    min_length=30,
    length_penalty=2.0,
    num_beams=4,
    early_stopping=True,
)

# Walk over the "Text" column, summarize each article and store the result in its row
for row_label, article_text in texts_data["Text"].items():
    # The text is prefixed with "summarize: " and truncated to at most 1024 tokens.
    # NOTE(review): confirm 1024 is an appropriate input length for the t5-small checkpoint.
    encoded_input = tokenizer.encode(
        "summarize: " + article_text,
        return_tensors="pt",
        max_length=1024,
        truncation=True,
    )
    generated_ids = model.generate(encoded_input, **generation_options)

    # Decode the first generated sequence back to plain text and write it to the "Summary" column
    texts_data.at[row_label, "Summary"] = tokenizer.decode(
        generated_ids[0], skip_special_tokens=True
    )



### View Summary

In [27]:

# Preview the first 10 rows, now including the generated "Summary" column
texts_data.head(10)

Unnamed: 0,Name,Text,Summary
0,001.txt,Gallery unveils interactive tree\n\nA Christma...,the spruce has an antenna which can receive Bl...
1,002.txt,Jarre joins fairytale celebration\n\nFrench mu...,"""Christian Andersen's fairy tales are timeless..."
2,003.txt,Musical treatment for Capra film\n\nThe classi...,the musical is being turned into a musical by ...
3,004.txt,Richard and Judy choose top books\n\nThe 10 au...,the 10 authors shortlisted for a Richard and J...
4,005.txt,Poppins musical gets flying start\n\nThe stage...,the stage adaptation of children's film Mary P...
5,006.txt,Bennett play takes theatre prizes\n\nThe Histo...,The History Boys by Alan Bennett takes theatre...
6,007.txt,Levy tipped for Whitbread prize\n\nNovelist An...,Andrea Levy is favourite to win the main Whitb...
7,008.txt,West End to honour finest shows\n\nThe West En...,"the Producers, starring Nathan Lane and Lee Ev..."
8,009.txt,Da Vinci Code is 'lousy history'\n\nThe plot o...,the da Vinci Code claims Jesus was not crucifi...
9,010.txt,Uganda bans Vagina Monologues\n\nUganda's auth...,"the play is due to open in the capital, Kampal..."


### Save the DataFrame with summaries to a CSV file

In [29]:
# Write the articles and their summaries to disk, leaving out the row index
texts_data.to_csv(path_or_buf="summaries.csv", index=False)