# Connect to Drive

In [1]:
from google.colab import drive

drive.mount('/content/drive')

%cd /content/drive/MyDrive/deeplearning/projects/en-text-sum-fine-tuned-bart/

Mounted at /content/drive
/content/drive/MyDrive/deeplearning/projects/en-text-sum-fine-tuned-bart


# Overview original dataset

In [2]:
import pandas as pd

original_df = pd.read_csv('dataset/wikihow_all.csv')

original_df.head()

Unnamed: 0,headline,title,text
0,"\nKeep related supplies in the same area.,\nMa...",How to Be an Organized Artist1,"If you're a photographer, keep all the necess..."
1,\nCreate a sketch in the NeoPopRealist manner ...,How to Create a Neopoprealist Art Work,See the image for how this drawing develops s...
2,"\nGet a bachelor’s degree.,\nEnroll in a studi...",How to Be a Visual Effects Artist1,It is possible to become a VFX artist without...
3,\nStart with some experience or interest in ar...,How to Become an Art Investor,The best art investors do their research on t...
4,"\nKeep your reference materials, sketches, art...",How to Be an Organized Artist2,"As you start planning for a project or work, ..."


In [3]:
print(f'Shape of original dataset: {original_df.shape}')
print('Information of original dataset:')
original_df.info()

Shape of original dataset: (215365, 3)
Information of original dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215365 entries, 0 to 215364
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   headline  214547 non-null  object
 1   title     215364 non-null  object
 2   text      214294 non-null  object
dtypes: object(3)
memory usage: 4.9+ MB


# Get examples

In [4]:
for id in range(2):
  example = original_df.iloc[id]

  print(f'EXAMPLE {id}-TH:')
  print(f"Title: {example['title']}")
  print(f"Text: {example['text']}")
  print(f"Headline: {example['headline']}")

EXAMPLE 0-TH:
Title: How to Be an Organized Artist1
Text:  If you're a photographer, keep all the necessary lens, cords, and batteries in the same quadrant of your home or studio. Paints should be kept with brushes, cleaner, and canvas, print supplies should be by the ink, etc. Make broader groups and areas for your supplies to make finding them easier, limiting your search to a much smaller area. Some ideas include:


Essential supplies area -- the things you use every day.
Inspiration and reference area.
Dedicated work area .
Infrequent or secondary supplies area, tucked out of the way.;
, This doesn't mean cleaning the entire studio, it just means keeping the area immediately around the desk, easel, pottery wheel, etc. clean each night. Discard trash or unnecessary materials and wipe down dirty surfaces. Endeavor to leave the workspace in a way that you can sit down the next day and start working immediately, without having to do any work or tidying.


Even if the rest of your studi

# Remove null-values

In [5]:
print('Null-values statistics:')
original_df.isna().sum()

Null-values statistics:


Unnamed: 0,0
headline,818
title,1
text,1071


In [6]:
df_no_na = original_df.dropna().reset_index(drop=True)

print(f'Shape of dataset without null-values: {df_no_na.shape}')
print(f'Number of original records: {original_df.shape[0]}')
print(f'Number of records after drop null-values: {df_no_na.shape[0]}')
print(f'Number of droped records: {original_df.shape[0] - df_no_na.shape[0]}')
print('Information of dataset without null-values:')
df_no_na.info()

Shape of dataset without null-values: (214294, 3)
Number of original records: 215365
Number of records after drop null-values: 214294
Number of droped records: 1071
Information of dataset without null-values:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214294 entries, 0 to 214293
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   headline  214294 non-null  object
 1   title     214294 non-null  object
 2   text      214294 non-null  object
dtypes: object(3)
memory usage: 4.9+ MB


# Clean raw data

In [7]:
!pip install contractions

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (110 kB)
[2K

In [8]:
import re
import contractions

In [9]:
text = original_df.iloc[10]['text']
headline = original_df.iloc[10]['headline']

print('BEFORE CLEANING:')
print(f'Text: {text}')
print(f'Headline: {headline}')

BEFORE CLEANING:
Text: ;
, Professional quality recordings of your songs are always preferred.

, Record a music demo. The word demo is shortened from the word "demonstration". A demo is used to demonstrate your artistic abilities to the A&R representatives of record labels.

, "A demo submission" is the process where an artist will find a record label company and send out their best three to five songs (demo) in the hopes of "getting signed" (receiving a record deal after signing a recording contract).

, Talk intelligently about your music, goals and where you fit with the label.

,,,
Headline: 
Practice your material until you can perform each song perfectly.,
Choose whether you're going to record your music in your own home studio or a professional recording studio.,
Prove your talent.,
Pick your best three to five songs for a demo submission.,
Write a brief bio.,
Visit http://www.PhantomCityStudio.com/Demos for up-to-date information on how to properly submit your music demo.

,
C

In [10]:
def clean_source_feature(text: str) -> str:
    cleaned_text = re.sub(r"\n", " ", text)
    cleaned_text = re.sub(r"[\.;],", ".", cleaned_text)
    cleaned_text = re.sub(r"\.{2,}", ".", cleaned_text)
    cleaned_text = re.sub(r",{2,}", ",", cleaned_text)
    cleaned_text = re.sub(r"^[^\w]+|[^\w]+$", "", cleaned_text)
    cleaned_text = cleaned_text + "."

    return cleaned_text


def clean_target_feature(summary: str) -> str:
    cleaned_summary = re.sub(r"\n", " ", summary)
    cleaned_summary = re.sub(r"\.,", ". ", cleaned_summary)
    cleaned_summary = re.sub(r"^[^\w]+|[^\w]+$", "", cleaned_summary)
    cleaned_summary = cleaned_summary + "."

    return cleaned_summary

In [11]:
print('AFTER CLEANING:')
print(f'Text: {clean_source_feature(text)}')
print(f'Headline: {clean_target_feature(headline)}')

AFTER CLEANING:
Text: Professional quality recordings of your songs are always preferred.  , Record a music demo. The word demo is shortened from the word "demonstration". A demo is used to demonstrate your artistic abilities to the A&R representatives of record labels.  , "A demo submission" is the process where an artist will find a record label company and send out their best three to five songs (demo) in the hopes of "getting signed" (receiving a record deal after signing a recording contract).  , Talk intelligently about your music, goals and where you fit with the label.
Headline: Practice your material until you can perform each song perfectly.  Choose whether you're going to record your music in your own home studio or a professional recording studio.  Prove your talent.  Pick your best three to five songs for a demo submission.  Write a brief bio.  Visit http://www.PhantomCityStudio.com/Demos for up-to-date information on how to properly submit your music demo.  , Complete the

In [12]:
for idx in range(30, 35):
  text = original_df.iloc[idx]['text']
  headline = original_df.iloc[idx]['headline']

  print('AFTER CLEANING:')
  print(f'Text: {clean_source_feature(text)}')
  print(f'Headline: {clean_target_feature(headline)}')

AFTER CLEANING:
Text: Don't use materials like plastic. , The bigger the animal, the larger the container should be. Don't scrub them with anything abrasive. You want them to stay as pristine as possible. Scratches and abrasions will encourage mold and bacteria growth. It doesn't work, unless you are sending it through a commercial level dishwasher (which uses soap, anyway). Use whatever soap you use, and be sure to rinse it really well. Do it at least twice a day. If it is a warm day, they will be needing more water than usual, and no one wants to drink gross water. It helps to ensure that there are fewer germs and parasites for your pets to be exposed to. The tiny amount of hydrogen peroxide that they are exposed to is insignificant. There are more and more things in our water that we don't want in our bodies. If you drink filtered water, so should they.
Headline: Use stainless steel or ceramic for the water bowl container.  Make sure the container is over-sized.  Take care of the co

In [13]:
cleaned_df = df_no_na.copy()

cleaned_df.loc[:, 'text'] = df_no_na['text'].map(lambda text: clean_source_feature(text))
cleaned_df.loc[:, 'headline'] = df_no_na['headline'].map(lambda headline: clean_target_feature(headline))

cleaned_df.head()

Unnamed: 0,headline,title,text
0,Keep related supplies in the same area. Make ...,How to Be an Organized Artist1,"If you're a photographer, keep all the necessa..."
1,Create a sketch in the NeoPopRealist manner of...,How to Create a Neopoprealist Art Work,See the image for how this drawing develops st...
2,Get a bachelor’s degree. Enroll in a studio-b...,How to Be a Visual Effects Artist1,It is possible to become a VFX artist without ...
3,Start with some experience or interest in art....,How to Become an Art Investor,The best art investors do their research on th...
4,"Keep your reference materials, sketches, artic...",How to Be an Organized Artist2,"As you start planning for a project or work, y..."


# Remove URLs, remove HTML tags, and process punctuations

In [14]:
def remove_special_chars(text: str) -> str:
  return text.replace('\xa0', ' ')

def remove_urls(text: str) -> str:
  return re.sub(r"http[s]?:\/\/\S+|www\.\S+", "", text, flags=re.MULTILINE)


def remove_html_tags(text: str) -> str:
  return re.sub(r"<.*?>", "", text)


def process_punctuations(text: str) -> str:
  text = re.sub(r"[^a-zA-Z0-9\s\.,]", " ", text)
  text = re.sub(r"([\.,])", r" \1 ", text)
  text = re.sub(r"\s{2,}", " ", text)
  return text.strip()


def process_en_text(text: str) -> str:
  text = str(text).lower()
  text = contractions.fix(text)

  text = remove_urls(text)
  text = remove_html_tags(text)
  text = process_punctuations(text)
  text = remove_special_chars(text)

  return text

In [15]:
headline_20 = cleaned_df.iloc[20]['headline']

print('TESTING REMOVE URLS:')

print(f'Before: {headline_20}')
headline_20 = remove_urls(headline_20)
print(f'After: {headline_20}')

TESTING REMOVE URLS:
Before: Visit the following URL on your Internet browser: http://www.pokerhouse.co.uk/landbasedcasinos.html.  Click the link that applies to your location.  Determine the distance from your location to each casino. Choose the casino that is the shortest distance from your location.
After: Visit the following URL on your Internet browser:   Click the link that applies to your location.  Determine the distance from your location to each casino. Choose the casino that is the shortest distance from your location.


In [16]:
text_23 = cleaned_df.iloc[23]['text']

print('TESTING PROCESS PUNCTUATIONS:')

print(f'Before: {text_23}')
text_23 = process_punctuations(text_23)
print(f'After: {text_23}')

TESTING PROCESS PUNCTUATIONS:
Before: Adult humans should drink 2-3 litres(or roughly eight 8 oz glasses) of water per day whilst children should drink 1-2 litres ( or roughly five 8 oz glasses.That is in addition to things like tea and coffee. Water keeps bodies at the correct temperature and removes toxins that are the inevitable result of metabolism and industrial life. You'll be healthier automatically.   Water also clears your skin, helps your kidneys, helps to control your appetite, and keeps you energized.If being healthier, more wakeful, and having better skin isn't motivation, what is? It also keeps you from drinking unhealthy beverages like soda and juice, which are high in calories. The body barely registers the intake of these unhealthy drinks and yet you still feel thirsty hundreds of calories later. If you need the taste, splash some lemon, lime, or 100% juice into your water. However, be sure that you eat plenty of calcium-rich foods, such as broccoli, to support healthy

In [17]:
headline_27 = cleaned_df.iloc[27]['headline']

print('TESTING PROCESS TEXT:')

print(f'Before: {headline_27}')
headline_27 = process_en_text(headline_27)
print(f'After: {headline_27}')

TESTING PROCESS TEXT:
Before: Assess your pet's needs.  Adjust your pet's schedule.  Write detailed instructions.  Talk to your veterinarian.
After: assess your pet s needs . adjust your pet s schedule . write detailed instructions . talk to your veterinarian .


In [18]:
processed_df = cleaned_df.copy()

processed_df.loc[:, 'text'] = cleaned_df['text'].map(process_en_text)
processed_df.loc[:, 'headline'] = cleaned_df['headline'].map(process_en_text)

processed_df.head()

Unnamed: 0,headline,title,text
0,keep related supplies in the same area . make ...,How to Be an Organized Artist1,"if you are a photographer , keep all the neces..."
1,create a sketch in the neopoprealist manner of...,How to Create a Neopoprealist Art Work,see the image for how this drawing develops st...
2,get a bachelor s degree . enroll in a studio b...,How to Be a Visual Effects Artist1,it is possible to become a vfx artist without ...
3,start with some experience or interest in art ...,How to Become an Art Investor,the best art investors do their research on th...
4,"keep your reference materials , sketches , art...",How to Be an Organized Artist2,"as you start planning for a project or work , ..."


In [19]:
print(f'Shape of processed dataset: {processed_df.shape}')
print('Information of processed dataset:')
processed_df.info()

Shape of processed dataset: (214294, 3)
Information of processed dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214294 entries, 0 to 214293
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   headline  214294 non-null  object
 1   title     214294 non-null  object
 2   text      214294 non-null  object
dtypes: object(3)
memory usage: 4.9+ MB


# Remove rows with empty values

In [20]:
print('STATISTICS EMPTY VALUES:')
print(f"In text feature:\n{processed_df[processed_df['text'] == ''].count()}")
print(f"In headline feature:\n{processed_df[processed_df['headline'] == ''].count()}")

STATISTICS EMPTY VALUES:
In text feature:
headline    3
title       3
text        3
dtype: int64
In headline feature:
headline    0
title       0
text        0
dtype: int64


In [21]:
import numpy as np

df_no_empty = processed_df.copy()

print('Replace all empty values with NaN')
df_no_empty['text'] = df_no_empty['text'].replace('', np.nan)
df_no_empty['headline'] = df_no_empty['headline'].replace('', np.nan)

df_no_empty.isna().sum()

Replace all empty values with NaN


Unnamed: 0,0
headline,0
title,0
text,3


In [22]:
df_no_empty = df_no_empty.dropna().reset_index(drop=True)

print(f'Shape of dataset after drop empty values: {df_no_empty.shape}')
print('Information of dataset after drop empty values:')
df_no_empty.info()

Shape of dataset after drop empty values: (214291, 3)
Information of dataset after drop empty values:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214291 entries, 0 to 214290
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   headline  214291 non-null  object
 1   title     214291 non-null  object
 2   text      214291 non-null  object
dtypes: object(3)
memory usage: 4.9+ MB


# Drop duplicates

In [23]:
print(f'CHECKING NULL-VALUES:\n{df_no_empty.isna().sum()}')
print(f"Number of duplicated records: {df_no_empty[['text', 'headline']].duplicated().sum()}")

CHECKING NULL-VALUES:
headline    0
title       0
text        0
dtype: int64
Number of duplicated records: 113


In [24]:
df_no_duplicated = df_no_empty.copy()

df_no_duplicated = df_no_duplicated.drop(columns=['title'])
df_no_duplicated = df_no_duplicated.dropna().drop_duplicates().reset_index(drop=True)

print(f'Shape of dataset after drop null-values and drop duplicates: {df_no_duplicated.shape}')
print('Information of dataset after drop null-values and drop duplicates:')
df_no_duplicated.info()
print(f'CHECKING NULL-VALUES:\n{df_no_duplicated.isna().sum()}')
print(f"Number of duplicated records: {df_no_duplicated[['text', 'headline']].duplicated().sum()}")

Shape of dataset after drop null-values and drop duplicates: (214178, 2)
Information of dataset after drop null-values and drop duplicates:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214178 entries, 0 to 214177
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   headline  214178 non-null  object
 1   text      214178 non-null  object
dtypes: object(2)
memory usage: 3.3+ MB
CHECKING NULL-VALUES:
headline    0
text        0
dtype: int64
Number of duplicated records: 0


# Save processed data

In [25]:
df_no_duplicated.to_csv('dataset/wikihow_processed.csv', index=False)

# Check saved dataset

In [26]:
df = pd.read_csv('dataset/wikihow_processed.csv')
df.head()

Unnamed: 0,headline,text
0,keep related supplies in the same area . make ...,"if you are a photographer , keep all the neces..."
1,create a sketch in the neopoprealist manner of...,see the image for how this drawing develops st...
2,get a bachelor s degree . enroll in a studio b...,it is possible to become a vfx artist without ...
3,start with some experience or interest in art ...,the best art investors do their research on th...
4,"keep your reference materials , sketches , art...","as you start planning for a project or work , ..."


In [27]:
print(f'Shape of processed dataset: {df.shape}')
print('Information of processed dataset:')
df.info()

Shape of processed dataset: (214178, 2)
Information of processed dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214178 entries, 0 to 214177
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   headline  214178 non-null  object
 1   text      214178 non-null  object
dtypes: object(2)
memory usage: 3.3+ MB
