In [194]:
import pandas as pd
import contractions
import nltk

In [195]:
# Dataset
# Maps a dataset size label to the CSV holding that Gigaword sample.
gigawordPath = dict(
    small="../../dataset/generalization/gigaword_small.csv",
    medium="../../dataset/generalization/gigaword_medium.csv",
    large="../../dataset/generalization/gigaword_large.csv",
    xlarge="../../dataset/generalization/gigaword_xlarge.csv",
    xxlarge="../../dataset/generalization/gigaword_xxlarge.csv",
)

In [196]:
# Load the "small" sample; latin-1 maps every byte to a character, so decoding cannot fail.
small_split_csv = gigawordPath["small"]
gigawordDataset = pd.read_csv(small_split_csv, encoding='latin-1')

In [197]:
# Preview the first three raw rows.
gigawordDataset.head(3)

Unnamed: 0,document,summary
0,b'officials of the cabinet-level fair trade co...,b'fair trade commission investigating consumer...
1,"b""five people were killed , and a woman gravel...",b'colombian nightclub shootout leaves five dead'
2,b'preliminary dna testing on remains of ## red...,b'estonia provides red army soldiers dna sampl...


In [198]:
# 1. Renaming columns
# Rename the source column, then keep an independent snapshot of the frame
# (taken before any rows are dropped) so row counts can be compared at the end.
column_renames = {'document': 'text'}
gigawordDataset = gigawordDataset.rename(columns=column_renames)
gigawordDatasetCopy = gigawordDataset.copy(deep=True)
gigawordDataset.shape

(10000, 2)

In [199]:
# Preview again: the former 'document' column now shows up as 'text'.
gigawordDataset.head(3)

Unnamed: 0,text,summary
0,b'officials of the cabinet-level fair trade co...,b'fair trade commission investigating consumer...
1,"b""five people were killed , and a woman gravel...",b'colombian nightclub shootout leaves five dead'
2,b'preliminary dna testing on remains of ## red...,b'estonia provides red army soldiers dna sampl...


In [200]:
# 2. Drop NA values
# Discard every row that has a missing value in any column.
gigawordDataset = gigawordDataset.dropna(axis=0, how='any')
gigawordDataset.shape

(10000, 2)

In [201]:
# 3. Drop duplicate rows (rows sharing the same text)
# Rows are duplicates when their 'text' matches; the first occurrence is kept.
gigawordDataset = gigawordDataset.drop_duplicates(subset=["text"], keep="first")
gigawordDataset.shape

(9994, 2)

In [202]:
# 4. Converting to lowercase
# Lowercase both columns, 'text' first and then 'summary'.
for column in ('text', 'summary'):
    gigawordDataset[column] = gigawordDataset[column].apply(lambda value: value.lower())

In [203]:
# 4b. Remove the bytes-literal prefix (b' or b") seen at the start of the raw rows
import re
def remove_s(text):
    """Strip the bytes-literal wrapper (``b'...'`` or ``b"..."``) from ``text``.

    Only a wrapper at the very start (and its matching closing quote at the
    very end) is removed. Occurrences of ``b'`` or ``b"`` inside the text,
    such as in "bob's", are left alone; an unanchored substitution would
    silently delete those two characters.

    Parameters
    ----------
    text : str
        A cell value, possibly of the form ``b'...'`` or ``b"..."``.

    Returns
    -------
    str
        The unwrapped content, or ``text`` unchanged when no wrapper is present.
    """
    # Balanced wrapper: same quote character right after the 'b' and at the end.
    wrapped = re.fullmatch(r"""b(['"])(.*)\1""", text, flags=re.DOTALL)
    if wrapped:
        return wrapped.group(2)
    # Unbalanced wrapper (no matching closing quote): drop just the leading marker.
    return re.sub(r"""^b['"]""", "", text)

# Run the helper defined just above over both columns, 'text' first.
for column in ('text', 'summary'):
    gigawordDataset[column] = gigawordDataset[column].apply(remove_s)

In [204]:
# 5. Remove HTML tags
from bs4 import BeautifulSoup
# Parse each cell as HTML and keep only its text content.
for column in ('text', 'summary'):
    gigawordDataset[column] = gigawordDataset[column].apply(
        lambda markup: BeautifulSoup(markup, "html.parser").text
    )



In [205]:
# 6. Contraction Mapping [Expansion] eg:- "aren't" ==> "are not"
# Expand contractions token by token and re-join the tokens with single spaces.
for column in ('text', 'summary'):
    gigawordDataset[column] = gigawordDataset[column].apply(
        lambda sentence: ' '.join(contractions.fix(token) for token in sentence.split())
    )

In [206]:
# 7. Remove (‘s)
import re
def remove_s(text):
    """Remove possessive ``'s`` markers from ``text``.

    The match requires a word boundary after the "s", so only a standalone
    ``'s`` (as in "china's" or the tokenised "china 's") is removed. A quote
    that merely precedes a word starting with "s" (as in "'super'") is kept.

    NOTE: this reuses the name of the earlier bytes-prefix helper and replaces
    it from this cell onward.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        ``text`` with every standalone ``'s`` deleted.
    """
    return re.sub(r"'s\b", "", text)

# Run the helper defined just above over both columns, 'text' first.
for column in ('text', 'summary'):
    gigawordDataset[column] = gigawordDataset[column].apply(remove_s)

In [207]:
# 8. Remove any text inside any form of parenthesis ( ) [] {} < >
def remove_content_between_parenthsis(text):
    """Delete every bracketed span, brackets included, from ``text``.

    Handles all four bracket kinds: ``( )``, ``[ ]``, ``{ }`` and ``< >``.
    A span runs from an opening bracket to the first closing bracket of the
    same kind. An opening bracket with no matching closer is left untouched.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        ``text`` without the bracketed spans. Surrounding whitespace is not
        collapsed.
    """
    bracketed = r'\([^)]*\)|\[[^\]]*\]|\{[^}]*\}|<[^>]*>'
    return re.sub(bracketed, '', text)

# Strip bracketed content from both columns, 'text' first.
for column in ('text', 'summary'):
    gigawordDataset[column] = gigawordDataset[column].apply(remove_content_between_parenthsis)

In [208]:
# 9. Eliminate punctuations and special characters
import string
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted.

    Characters are removed, not replaced, so "cabinet-level" becomes
    "cabinetlevel".
    """
    # Mapping a code point to None tells str.translate to drop that character.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)

# Drop punctuation from both columns, 'text' first.
for column in ('text', 'summary'):
    gigawordDataset[column] = gigawordDataset[column].apply(remove_punctuation)

In [209]:
# 11. Remove stopwords
from nltk.corpus import stopwords
# A set gives constant-time membership checks for the per-token filter below.
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Drop every whitespace-separated token of ``text`` found in ``stop_words``.

    The remaining tokens are re-joined with single spaces.
    """
    kept = []
    for token in text.split():
        if token in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

# Filter stop words out of both columns, 'text' first.
for column in ('text', 'summary'):
    gigawordDataset[column] = gigawordDataset[column].apply(remove_stopwords)

In [210]:
# 12. Remove short words
def remove_shortwords(text):
    """Keep only the tokens of ``text`` that are at least three characters long.

    Tokens are split on whitespace and re-joined with single spaces.
    """
    long_enough = filter(lambda token: len(token) >= 3, text.split())
    return ' '.join(long_enough)

# Drop short tokens from both columns, 'text' first.
for column in ('text', 'summary'):
    gigawordDataset[column] = gigawordDataset[column].apply(remove_shortwords)

In [211]:
# 13. Remove the rows that have empty text or summary
def remove_empty_rows(text, summary):
    """Tell whether a row should be kept: both ``text`` and ``summary`` non-empty.

    Despite the name, nothing is removed here; the result is meant to be used
    as a keep-mask by the caller.
    """
    has_text = text != ''
    has_summary = summary != ''
    return has_text & has_summary

# Build a row-wise keep-mask, then keep only the rows where it is True.
keep_row = gigawordDataset.apply(
    lambda row: remove_empty_rows(row['text'], row['summary']),
    axis=1,
)
gigawordDataset = gigawordDataset[keep_row]
gigawordDataset.shape

(9994, 2)

In [212]:
# 14. remove extra lines and trim spaces
def remove_extra_lines(text):
    """Trim leading and trailing whitespace (spaces, tabs, newlines) from ``text``.

    Whitespace inside the string is left untouched.
    """
    trimmed = text.strip()
    return trimmed

# Trim surrounding whitespace in both columns, 'text' first.
for column in ('text', 'summary'):
    gigawordDataset[column] = gigawordDataset[column].apply(remove_extra_lines)

In [213]:
# 15. Removing non-ASCII characters (emojis included) from the text
import re
def remove_emojis(text):
    """Drop every non-ASCII character from ``text``.

    This removes emojis but also any accented letter or other character
    whose code point is above 127.
    """
    return ''.join(ch for ch in text if ord(ch) < 128)

# Strip non-ASCII characters from both columns, 'text' first.
for column in ('text', 'summary'):
    gigawordDataset[column] = gigawordDataset[column].apply(remove_emojis)

In [214]:
# 16. Removing URLs
import re
def remove_urls(text):
    """Delete every run of non-whitespace characters that starts with "http".

    The match is not anchored to a word start, so "http" appearing in the
    middle of a token removes the rest of that token as well.
    """
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', text)

# Strip URL-like tokens from both columns, 'text' first.
for column in ('text', 'summary'):
    gigawordDataset[column] = gigawordDataset[column].apply(remove_urls)

In [215]:
# Saving the cleaned data to a csv file
# index=False keeps the row index out of the written file.
gigawordDataset.to_csv(
    path_or_buf='../../dataset/generalization/cleaned_gigaword_small.csv',
    index=False,
)

In [216]:
# Row/column count of the cleaned frame.
gigawordDataset.shape

(9994, 2)

In [217]:
# Row/column count of the snapshot taken before any rows were dropped, for comparison.
gigawordDatasetCopy.shape

(10000, 2)