In [1]:
import numpy as np
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [7]:
# Read the raw dataset from disk and preview its first five rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, presumably a row
# index that was saved into the CSV — confirm whether it should be loaded
# with index_col instead of being kept as data.
file_path = 'wikihowSep.csv'
df = pd.read_csv(filepath_or_buffer=file_path)
df.head(n=5)

Unnamed: 0,overview,headline,text,sectionLabel,title
0,So you're a new or aspiring artist and your c...,\nSell yourself first.,"Before doing anything else, stop and sum up y...",Steps,How to Sell Fine Art Online
1,"If you want to be well-read, then, in the wor...",\nRead the classics before 1600.,Reading the classics is the very first thing ...,Reading the Classics,How to Be Well Read
2,So you're a new or aspiring artist and your c...,\nJoin online artist communities.,Depending on what scale you intend to sell yo...,Steps,How to Sell Fine Art Online
3,So you're a new or aspiring artist and your c...,\nMake yourself public.,Get yourself out there as best as you can by ...,Steps,How to Sell Fine Art Online
4,So you're a new or aspiring artist and your c...,\nBlog about your artwork.,"Given the hundreds of free blogging websites,...",Steps,How to Sell Fine Art Online


In [10]:
# Download the NLTK resources used by this notebook: the 'punkt' tokenizer
# data and the 'stopwords' corpus. The last call's return value is what the
# cell displays.
# NOTE(review): recent NLTK releases may additionally require 'punkt_tab'
# for word_tokenize — confirm against the installed version.
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Nasir\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nasir\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [13]:
# Stop-word sets keyed by language. Filled lazily so the NLTK corpus is read
# once per kernel instead of once per value passed to preprocess_text.
_STOP_WORDS_CACHE = {}


def _get_stop_words(language='english'):
    """Return the NLTK stop-word list for `language` as a cached frozenset."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def preprocess_text(text):
    """Normalise one text value for downstream modelling.

    Steps: collapse all whitespace (including newlines) to single spaces,
    drop anything inside square brackets, tokenize with NLTK, then keep only
    alphanumeric tokens that are not English stop words.

    Parameters
    ----------
    text : str or scalar
        The value to clean. Non-string, non-missing scalars are converted
        with ``str()`` before cleaning.

    Returns
    -------
    str
        The surviving tokens joined by single spaces, or ``''`` when `text`
        is missing (``None`` / NaN).
    """
    # Missing values have nothing to clean; everything else is treated as text.
    if not isinstance(text, str):
        if pd.isnull(text):
            return ''
        text = str(text)

    # `\s+` already matches newlines, so one substitution covers both the
    # newline replacement and the whitespace collapse.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\[[^]]*\]', '', text)  # Remove bracketed text (e.g., citations)

    stop_words = _get_stop_words()
    # isalnum() discards punctuation tokens as well as tokens that contain
    # any non-alphanumeric character.
    tokens = [
        token
        for token in word_tokenize(text)
        if token.isalnum() and token.lower() not in stop_words
    ]

    return ' '.join(tokens)

In [14]:
# Run preprocess_text over every free-text column, overwriting each column
# with its cleaned version, then print the first rows as plain text.
text_columns = ['overview', 'headline', 'text', 'sectionLabel', 'title']
for column in text_columns:
    cleaned = df[column].apply(preprocess_text)
    df[column] = cleaned
print(df.head(n=5))

                                            overview  \
0  new aspiring artist creativity spawned somethi...   
1  want words William Faulkner Read read read Rea...   
2  new aspiring artist creativity spawned somethi...   
3  new aspiring artist creativity spawned somethi...   
4  new aspiring artist creativity spawned somethi...   

                         headline  \
0                      Sell first   
1              Read classics 1600   
2  Join online artist communities   
3                     Make public   
4                    Blog artwork   

                                                text      sectionLabel  \
0  anything else stop sum artist think translate ...             Steps   
1  Reading classics first thing want build solid ...  Reading Classics   
2  Depending scale intend sell art pieces may wan...             Steps   
3  Get best advertising Publish example pieces ar...             Steps   
4  Given hundreds free blogging websites lot choi...             Steps

In [15]:
# Show the first five rows of df using the notebook's rich table display.
df.head(n=5)

Unnamed: 0,overview,headline,text,sectionLabel,title
0,new aspiring artist creativity spawned somethi...,Sell first,anything else stop sum artist think translate ...,Steps,Sell Fine Art Online
1,want words William Faulkner Read read read Rea...,Read classics 1600,Reading classics first thing want build solid ...,Reading Classics,Well Read
2,new aspiring artist creativity spawned somethi...,Join online artist communities,Depending scale intend sell art pieces may wan...,Steps,Sell Fine Art Online
3,new aspiring artist creativity spawned somethi...,Make public,Get best advertising Publish example pieces ar...,Steps,Sell Fine Art Online
4,new aspiring artist creativity spawned somethi...,Blog artwork,Given hundreds free blogging websites lot choi...,Steps,Sell Fine Art Online


In [16]:
# Write df to disk; index=False keeps the row index out of the file.
df.to_csv(path_or_buf='wikihowPreprocess.csv', index=False)

In [2]:
# Read the preprocessed CSV back into its own frame and preview five rows.
file_path1 = 'wikihowPreprocess.csv'
df1 = pd.read_csv(filepath_or_buffer=file_path1)
df1.head(n=5)

Unnamed: 0,overview,headline,text,sectionLabel,title
0,new aspiring artist creativity spawned somethi...,Sell first,anything else stop sum artist think translate ...,Steps,Sell Fine Art Online
1,want words William Faulkner Read read read Rea...,Read classics 1600,Reading classics first thing want build solid ...,Reading Classics,Well Read
2,new aspiring artist creativity spawned somethi...,Join online artist communities,Depending scale intend sell art pieces may wan...,Steps,Sell Fine Art Online
3,new aspiring artist creativity spawned somethi...,Make public,Get best advertising Publish example pieces ar...,Steps,Sell Fine Art Online
4,new aspiring artist creativity spawned somethi...,Blog artwork,Given hundreds free blogging websites lot choi...,Steps,Sell Fine Art Online


In [3]:
from sklearn.model_selection import train_test_split

# Inputs come from the 'text' column and targets from the 'overview' column.
# Values that were saved as empty strings are read back from CSV as NaN, and a
# plain str() on NaN yields the literal text 'nan'. Map missing values back to
# '' here so no sample ends up containing a spurious 'nan' token.
X = df1['text'].fillna('').tolist()
y = df1['overview'].fillna('').tolist()

In [4]:
# Coerce every sample to str, then hold out 20% of the (input, target) pairs
# for testing with a fixed seed so the split is repeatable.
X = list(map(str, X))
y = list(map(str, y))
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
from transformers import BartTokenizer, BartForConditionalGeneration
from rouge import Rouge 
import torch

In [6]:
# Load the tokenizer and the conditional-generation model from the same
# pre-trained checkpoint so the two always match.
BART_CHECKPOINT = 'facebook/bart-large-cnn'
tokenizer = BartTokenizer.from_pretrained(BART_CHECKPOINT)
model = BartForConditionalGeneration.from_pretrained(BART_CHECKPOINT)

In [None]:
# Tokenize input text and target summaries
train_encodings = tokenizer(X_train, truncation=True, padding=True)
train_labels = tokenizer(y_train, truncation=True, padding=True)

test_encodings = tokenizer(X_test, truncation=True, padding=True)