# Connect to Drive

In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive

drive.mount('/content/drive')

# Switch the working directory to the project folder on the mounted drive.
%cd /content/drive/MyDrive/deeplearning/projects/en-text-sum-fine-tuned-bart/

Mounted at /content/drive
/content/drive/MyDrive/deeplearning/projects/en-text-sum-fine-tuned-bart


# Load dataset

In [2]:
import pandas as pd

# Read the `wikihow_processed.csv` file (path is relative to the working directory).
df = pd.read_csv('dataset/wikihow_processed.csv')

# Report the frame's shape, then its column / dtype / non-null summary.
print(f'Shape of dataset: {df.shape}')
print('Information of dataset:')
df.info()

Shape of dataset: (214178, 2)
Information of dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214178 entries, 0 to 214177
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   headline  214178 non-null  object
 1   text      214178 non-null  object
dtypes: object(2)
memory usage: 3.3+ MB


In [3]:
# Punctuation tokens that the cleaning functions in this notebook keep or
# treat specially when they appear as stand-alone, space-separated tokens.
KEEPED_PUNCS = ['.', ',']

# Remove all single-character words from sentences (keeping `.` and `,`)

In [4]:
def remove_single_chars(text: str) -> str:
  """Drop one-character and empty tokens from a space-separated sentence.

  The text is split on single spaces. A token survives when it is longer
  than one character, or when it is one of the punctuation marks listed in
  ``KEEPED_PUNCS``; every other token is discarded.

  Args:
    text: Sentence whose tokens are separated by single spaces.

  Returns:
    The surviving tokens joined back together with single spaces.
  """
  kept_tokens = [
    token
    for token in text.split(' ')
    if len(token) > 1 or token in KEEPED_PUNCS
  ]
  return ' '.join(kept_tokens)

In [5]:
# Work on a copy so the loaded frame `df` is left untouched.
df_no_single_chars = df.copy()

# Clean the `text` column first, then the `headline` column.
for column in ('text', 'headline'):
  df_no_single_chars.loc[:, column] = df_no_single_chars[column].apply(remove_single_chars)

print(f'Shape of dataset without single characters in sentence: {df_no_single_chars.shape}')
print('Information of dataset without single characters in sentence')
df_no_single_chars.info()

Shape of dataset without single characters in sentence: (214178, 2)
Information of dataset without single characters in sentence
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214178 entries, 0 to 214177
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   headline  214178 non-null  object
 1   text      214178 non-null  object
dtypes: object(2)
memory usage: 3.3+ MB


# Replace each run of `.` / `,` tokens with only the first character encountered

In [6]:
def replace_special_chars(text: str) -> str:
  """Collapse each run of kept-punctuation tokens down to its first token.

  The text is split on single spaces. A token listed in ``KEEPED_PUNCS`` is
  dropped when the token immediately before it (in the original token list)
  is also listed in ``KEEPED_PUNCS``. All other tokens are kept.

  Args:
    text: Sentence whose tokens are separated by single spaces.

  Returns:
    The kept tokens joined back together with single spaces.
  """
  tokens = text.split(' ')
  kept_tokens = []

  for position, token in enumerate(tokens):
    # The predecessor is looked up in the unfiltered list, so every token
    # after the first one in a punctuation run is skipped.
    follows_punctuation = position > 0 and tokens[position - 1] in KEEPED_PUNCS
    if token in KEEPED_PUNCS and follows_punctuation:
      continue
    kept_tokens.append(token)

  return ' '.join(kept_tokens)

In [7]:
# Start from a copy of the previous stage so that stage stays unmodified.
df_valid_text = df_no_single_chars.copy()

# Collapse punctuation runs in `text` first, then in `headline`.
for column in ('text', 'headline'):
  df_valid_text.loc[:, column] = df_valid_text[column].apply(replace_special_chars)

print(f'Shape of dataset with valid text: {df_valid_text.shape}')
print('Information of dataset with valid text')
df_valid_text.info()

Shape of dataset with valid text: (214178, 2)
Information of dataset with valid text
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214178 entries, 0 to 214177
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   headline  214178 non-null  object
 1   text      214178 non-null  object
dtypes: object(2)
memory usage: 3.3+ MB


# Keep only the first word of each run of consecutive repeated words

In [8]:
def reduce_repeated_chars(text: str) -> str:
  """Collapse consecutive identical tokens into a single occurrence.

  The text is split on single spaces and each token is compared with the
  token directly before it; a token equal to its predecessor is dropped.
  Empty tokens (produced by consecutive spaces) take part in the comparison
  like any other token.

  Args:
    text: Sentence whose tokens are separated by single spaces.

  Returns:
    The remaining tokens joined back together with single spaces.
  """
  tokens = text.split(' ')
  # The first token has no predecessor and is always kept; every later token
  # is paired with the one before it.
  deduplicated = tokens[:1] + [
    current
    for previous, current in zip(tokens, tokens[1:])
    if current != previous
  ]
  return ' '.join(deduplicated)

In [9]:
# Start from a copy of the previous stage so that stage stays unmodified.
df_no_repeats = df_valid_text.copy()

# Drop consecutive repeated words in `text` first, then in `headline`.
for column in ('text', 'headline'):
  df_no_repeats.loc[:, column] = df_no_repeats[column].apply(reduce_repeated_chars)

# Label the report with this stage's own step (repeated words removed) so it
# cannot be confused with the report printed for `df_valid_text`.
print(f'Shape of dataset without consecutive repeated words: {df_no_repeats.shape}')
print('Information of dataset without consecutive repeated words')
df_no_repeats.info()

Shape of dataset with valid text: (214178, 2)
Information of dataset with valid text
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214178 entries, 0 to 214177
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   headline  214178 non-null  object
 1   text      214178 non-null  object
dtypes: object(2)
memory usage: 3.3+ MB


# Keep only the first number in each run of consecutive numeric tokens

In [10]:
def reduce_repeated_digits(text: str) -> str:
  """Keep only the first token of each run of consecutive all-digit tokens.

  The text is split on single spaces. A token for which ``str.isdigit()`` is
  true is dropped when the token directly before it is also all digits.

  Args:
    text: Sentence whose tokens are separated by single spaces.

  Returns:
    The remaining tokens joined back together with single spaces.
  """
  kept_tokens = []
  previous_is_digit = False  # the first token has no predecessor

  for token in text.split(' '):
    token_is_digit = token.isdigit()
    if not (token_is_digit and previous_is_digit):
      kept_tokens.append(token)
    # Remember the flag of the original token, whether or not it was kept.
    previous_is_digit = token_is_digit

  return ' '.join(kept_tokens)

In [11]:
# Start from a copy of the previous stage so that stage stays unmodified.
df_no_repeated_digits = df_no_repeats.copy()

# Collapse runs of numeric tokens in `text` first, then in `headline`.
for column in ('text', 'headline'):
  df_no_repeated_digits.loc[:, column] = df_no_repeated_digits[column].apply(reduce_repeated_digits)

print(f'Shape of dataset without consecutive digits in sentence: {df_no_repeated_digits.shape}')
print('Information of dataset without consecutive digits in sentence')
df_no_repeated_digits.info()

Shape of dataset without consecutive digits in sentence: (214178, 2)
Information of dataset without consecutive digits in sentence
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214178 entries, 0 to 214177
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   headline  214178 non-null  object
 1   text      214178 non-null  object
dtypes: object(2)
memory usage: 3.3+ MB


# Remove sequences of the characters `.`, `,` adjacent to numeric digits

In [12]:
def is_special_char_or_digit(char: str) -> bool:
  """Tell whether a token is a kept punctuation mark or is made of digits.

  Args:
    char: Token to classify.

  Returns:
    ``True`` when ``char.isdigit()`` is true or ``char`` is listed in
    ``KEEPED_PUNCS``; ``False`` otherwise.
  """
  if char.isdigit():
    return True
  return char in KEEPED_PUNCS

def remove_repeated_chars_digits(text: str) -> str:
  """Drop punctuation/digit tokens that directly follow another such token.

  The text is split on single spaces. A token is skipped when both it and
  the token immediately before it (in the original token list) satisfy
  ``is_special_char_or_digit``. The first token is always kept.

  Args:
    text: Sentence whose tokens are separated by single spaces.

  Returns:
    The kept tokens joined back together with single spaces.
  """
  tokens = text.split(' ')
  kept_tokens = []

  for position, token in enumerate(tokens):
    follows_special = position > 0 and is_special_char_or_digit(tokens[position - 1])
    if follows_special and is_special_char_or_digit(token):
      continue
    kept_tokens.append(token)

  return ' '.join(kept_tokens)

In [13]:
# Start from a copy of the previous stage so that stage stays unmodified.
df_no_repeated_char_digits = df_no_repeated_digits.copy()

# Process the `text` column first, then the `headline` column.
for column in ('text', 'headline'):
  df_no_repeated_char_digits.loc[:, column] = df_no_repeated_char_digits[column].apply(remove_repeated_chars_digits)

print(f'Shape of dataset without consecutively special characters and digits in sentence: {df_no_repeated_char_digits.shape}')
print('Information of dataset without consecutively special characters and digits in sentence')
df_no_repeated_char_digits.info()

Shape of dataset without consecutively special characters and digits in sentence: (214178, 2)
Information of dataset without consecutively special characters and digits in sentence
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214178 entries, 0 to 214177
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   headline  214178 non-null  object
 1   text      214178 non-null  object
dtypes: object(2)
memory usage: 3.3+ MB


# Remove words longer than 16 characters

In [14]:
def remove_long_words(text: str, limit: int = 16) -> str:
  """Drop every token whose length exceeds ``limit`` characters.

  Args:
    text: Sentence whose tokens are separated by single spaces.
    limit: Largest token length (inclusive) that is still kept.

  Returns:
    The tokens of length ``limit`` or less, joined with single spaces.
  """
  short_enough = (token for token in text.split(' ') if len(token) <= limit)
  return ' '.join(short_enough)

In [15]:
# Longest token length (in characters) that is kept.
LONG_LIMIT = 16

# Start from a copy of the previous stage so that stage stays unmodified.
df_no_long_words = df_no_repeated_char_digits.copy()

# Filter the `text` column first, then the `headline` column.
for column in ('text', 'headline'):
  df_no_long_words.loc[:, column] = df_no_long_words[column].apply(
    lambda text: remove_long_words(text, LONG_LIMIT)
  )

print(f'Shape of dataset without long words in sentence: {df_no_long_words.shape}')
print('Information of dataset without long words in sentence')
df_no_long_words.info()

Shape of dataset without long words in sentence: (214178, 2)
Information of dataset without long words in sentence
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214178 entries, 0 to 214177
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   headline  214178 non-null  object
 1   text      214178 non-null  object
dtypes: object(2)
memory usage: 3.3+ MB


# Remove rows with no alphabetic characters in a sentence

In [16]:
def is_no_alphabet(text: str, special_chars: list[str] = KEEPED_PUNCS) -> bool:
  """Tell whether ``text`` has at least one character of real content.

  Despite its name, this returns ``True`` when content IS present; the row
  filter in this notebook keeps the rows for which it returns ``True``.

  A character counts as content when it is neither whitespace nor listed in
  ``special_chars``. Whitespace is ignored so that blank strings and strings
  made only of punctuation separated by spaces are reported as having no
  content. Digits and any other symbols still count as content.

  Args:
    text: Sentence to inspect.
    special_chars: Characters treated as punctuation rather than content.
      Defaults to ``KEEPED_PUNCS``.

  Returns:
    ``True`` if at least one non-whitespace character of ``text`` is not in
    ``special_chars``; ``False`` otherwise (including for an empty string).
  """
  return any(
    not char.isspace() and char not in special_chars
    for char in text
  )

In [17]:
# Keep a row only when both of its columns pass the `is_no_alphabet` check,
# then renumber the surviving rows from zero.
text_passes = df_no_long_words['text'].apply(is_no_alphabet)
headline_passes = df_no_long_words['headline'].apply(is_no_alphabet)
df_alphabet = df_no_long_words[text_passes & headline_passes].reset_index(drop=True)

print(f'Shape of dataset with no sentence is without alphabet: {df_alphabet.shape}')
print('Information of dataset:')
df_alphabet.info()

Shape of dataset with no sentence is without alphabet: (209013, 2)
Information of dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209013 entries, 0 to 209012
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   headline  209013 non-null  object
 1   text      209013 non-null  object
dtypes: object(2)
memory usage: 3.2+ MB


# Calculate the sentence length of each feature

In [18]:
def compute_length(text: str) -> int:
  """Count the space-separated tokens of ``text``.

  The value is first converted with ``str()``. Splitting a string on a
  single space yields one more piece than there are spaces, so the count is
  the number of spaces plus one; empty pieces between consecutive spaces are
  therefore counted too.

  Args:
    text: Value to measure.

  Returns:
    Number of pieces obtained by splitting ``str(text)`` on single spaces.
  """
  as_string = str(text)
  return as_string.count(' ') + 1

# Add the token counts to a copy so `df_alphabet` keeps its two columns.
cleaned_df = df_alphabet.copy()

# `text_length` is added before `headline_length`.
for source_column, length_column in (('text', 'text_length'), ('headline', 'headline_length')):
  cleaned_df.loc[:, length_column] = cleaned_df[source_column].apply(compute_length)

In [19]:
# Columns summarised by the final `describe()` call, in display order.
length_columns = ['headline_length', 'text_length']

print('Information of dataset:')
cleaned_df.info()
print('Statistics of headline_length and text_length:')
# Last expression of the cell: rendered as the cell's output table.
cleaned_df[length_columns].describe()

Information of dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209013 entries, 0 to 209012
Data columns (total 4 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   headline         209013 non-null  object
 1   text             209013 non-null  object
 2   text_length      209013 non-null  int64 
 3   headline_length  209013 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 6.4+ MB
Statistics of headline_length and text_length:


Unnamed: 0,headline_length,text_length
count,209013.0,209013.0
mean,66.010277,487.692081
std,60.177224,524.622261
min,2.0,2.0
25%,29.0,162.0
50%,48.0,332.0
75%,85.0,594.0
max,4083.0,13319.0


# Remove rows with `text_length <= headline_length`

In [20]:
print('Number rows with text_length less than or equal headline_length:')
# Rows whose text has no more tokens than its headline.
not_longer_than_headline = cleaned_df['text_length'] <= cleaned_df['headline_length']
# Last expression of the cell: per-column non-null counts of those rows.
cleaned_df[not_longer_than_headline].count()

Number rows with text_length less than or equal headline_length:


Unnamed: 0,0
headline,22912
text,22912
text_length,22912
headline_length,22912


In [21]:
# Keep only the rows whose text has strictly more tokens than its headline,
# then renumber the surviving rows from zero.
longer_than_headline = cleaned_df['text_length'] > cleaned_df['headline_length']
valid_df = cleaned_df[longer_than_headline].reset_index(drop=True)

print(f'Shape of valid dataset: {valid_df.shape}')
print('Information of valid dataset:')
valid_df.info()
print('Statistic valid dataset')
# Last expression of the cell: rendered as the cell's output table.
valid_df.describe()

Shape of valid dataset: (186101, 4)
Information of valid dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 186101 entries, 0 to 186100
Data columns (total 4 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   headline         186101 non-null  object
 1   text             186101 non-null  object
 2   text_length      186101 non-null  int64 
 3   headline_length  186101 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 5.7+ MB
Statistic valid dataset


Unnamed: 0,text_length,headline_length
count,186101.0,186101.0
mean,541.198661,60.787148
std,531.61184,53.427598
min,6.0,2.0
25%,217.0,27.0
50%,374.0,44.0
75%,645.0,78.0
max,13319.0,4083.0


# Save cleaned data

In [22]:
# Columns written to the output file, in this order.
output_columns = ['text', 'headline']

# NOTE(review): this rebinds `cleaned_df`, which earlier held the frame that
# includes the two length columns. Re-running the length-based cells after
# this one would fail because those columns are absent here.
cleaned_df = valid_df[output_columns]

print(f'Shape of cleaned dataset: {cleaned_df.shape}')
print('Information of cleaned dataset:')
cleaned_df.info()

# Write the result without the row index.
cleaned_df.to_csv('dataset/wikihow_cleaned.csv', index=False)

Shape of cleaned dataset: (186101, 2)
Information of cleaned dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 186101 entries, 0 to 186100
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   text      186101 non-null  object
 1   headline  186101 non-null  object
dtypes: object(2)
memory usage: 2.8+ MB
