In [1]:
pip install contractions




[notice] A new release of pip available: 22.2.2 -> 22.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





In [72]:
import pandas as pd
import contractions
import nltk

In [73]:
# Dataset: CSV file for each available Gigaword subset, keyed by subset size
gigawordPath = dict(
    medium="../../dataset/generalization/gigaword_medium.csv",
    large="../../dataset/generalization/gigaword_large.csv",
    xlarge="../../dataset/generalization/gigaword_xlarge.csv",
    xxlarge="../../dataset/generalization/gigaword_xxlarge.csv",
)

In [74]:
# Load the "medium" subset; the file is decoded as latin-1
gigawordDataset = pd.read_csv(
    filepath_or_buffer=gigawordPath["medium"],
    encoding='latin-1',
)

In [75]:
# Preview the first three raw rows
gigawordDataset.head(n=3)

Unnamed: 0,document,summary
0,b'officials of the cabinet-level fair trade co...,b'fair trade commission investigating consumer...
1,"b""five people were killed , and a woman gravel...",b'colombian nightclub shootout leaves five dead'
2,b'preliminary dna testing on remains of ## red...,b'estonia provides red army soldiers dna sampl...


In [76]:
# 1. Rename the 'document' column to 'text' and keep an untouched snapshot
#    of the frame so the cleaned result can be compared against it later
gigawordDataset = gigawordDataset.rename({'document': 'text'}, axis='columns')
gigawordDatasetCopy = gigawordDataset.copy(deep=True)
gigawordDataset.shape

(15000, 2)

In [77]:
# Preview the first three rows after the rename
gigawordDataset.head(n=3)

Unnamed: 0,text,summary
0,b'officials of the cabinet-level fair trade co...,b'fair trade commission investigating consumer...
1,"b""five people were killed , and a woman gravel...",b'colombian nightclub shootout leaves five dead'
2,b'preliminary dna testing on remains of ## red...,b'estonia provides red army soldiers dna sampl...


In [78]:
# 2. Drop every row that has a missing value in any column
gigawordDataset = gigawordDataset.dropna(axis=0, how='any')
gigawordDataset.shape

(15000, 2)

In [79]:
# 3. Drop rows whose text duplicates an earlier row (the first occurrence is kept)
gigawordDataset = gigawordDataset.drop_duplicates(subset="text", keep="first")
gigawordDataset.shape

(14983, 2)

In [80]:
# 4. Convert both columns to lowercase.
# The vectorized .str accessor replaces a per-element Python lambda.
gigawordDataset['text'] = gigawordDataset['text'].str.lower()
gigawordDataset['summary'] = gigawordDataset['summary'].str.lower()

In [81]:
# 7. Remove the bytes-literal prefix (b' or b") that wraps every value in the CSV
import re
def remove_s(text):
    """Strip the bytes-literal wrapper (``b'...'`` or ``b"..."``) from a string.

    The values in this dataset look like the repr of a bytes object: they
    begin with ``b'`` or ``b"`` and end with the matching quote. Only that
    outer wrapper is removed; a ``b'`` sequence inside the text (as in
    ``bob's``) is left untouched.

    Args:
        text: The raw field value.

    Returns:
        The text without the wrapper. If the matching closing quote is
        missing, only the leading ``b'`` / ``b"`` is dropped. Text that does
        not start with the prefix is returned unchanged.
    """
    wrapped = re.fullmatch(r"""b(['"])(.*)\1""", text, flags=re.DOTALL)
    if wrapped:
        return wrapped.group(2)
    # No matching closing quote: strip just the leading prefix.
    return re.sub(r"""^b['"]""", "", text)

# Run remove_s over every value of both columns
gigawordDataset['text'] = gigawordDataset['text'].apply(remove_s)
gigawordDataset['summary'] = gigawordDataset['summary'].apply(remove_s)

In [82]:
# 5. Remove HTML tags
from bs4 import BeautifulSoup
# Parse each value as HTML and keep only its text content
gigawordDataset['text'] = gigawordDataset['text'].apply(
    lambda markup: BeautifulSoup(markup, "html.parser").get_text()
)
gigawordDataset['summary'] = gigawordDataset['summary'].apply(
    lambda markup: BeautifulSoup(markup, "html.parser").get_text()
)



In [83]:
# 6. Contraction mapping (expansion), e.g. "aren't" ==> "are not".
# Each whitespace-separated token is expanded on its own, then the tokens
# are joined back together with single spaces.
gigawordDataset['text'] = gigawordDataset['text'].apply(
    lambda sentence: ' '.join(contractions.fix(token) for token in sentence.split())
)

gigawordDataset['summary'] = gigawordDataset['summary'].apply(
    lambda sentence: ' '.join(contractions.fix(token) for token in sentence.split())
)

In [84]:
# 7. Remove (‘s)
import re
def remove_s(text):
    """Remove possessive ``'s`` suffixes from the text.

    The trailing word boundary restricts the match to an ``'s`` that ends a
    token (``john's``, or a detached ``'s`` as in ``world 's``). An
    apostrophe followed by a longer word starting with "s" (``o'shea``,
    ``'sanctions'``) is left intact.

    Args:
        text: The string to clean.

    Returns:
        The text with every possessive ``'s`` deleted. Surrounding whitespace
        is not collapsed.
    """
    return re.sub(r"'s\b", "", text)

# Run remove_s over every value of both columns
gigawordDataset['text'] = gigawordDataset['text'].apply(remove_s)
gigawordDataset['summary'] = gigawordDataset['summary'].apply(remove_s)

In [85]:
# 8. Remove any text inside any form of parenthesis ( ) [] {} < >
def remove_content_between_parenthsis(text):
    """Delete bracketed spans, brackets included, from the text.

    Handles all four bracket styles: ``( )``, ``[ ]``, ``{ }`` and ``< >``.
    A span runs from an opening bracket to the first closing bracket of the
    same kind, so nested brackets of one kind are not balanced. An opening
    bracket with no matching closer is left in place.

    Args:
        text: The string to clean.

    Returns:
        The text with every bracketed span removed. Surrounding whitespace
        is not collapsed.
    """
    bracketed = r'\([^)]*\)|\[[^\]]*\]|\{[^}]*\}|<[^>]*>'
    return re.sub(bracketed, '', text)

# Run remove_content_between_parenthsis over every value of both columns
gigawordDataset['text'] = gigawordDataset['text'].apply(remove_content_between_parenthsis)
gigawordDataset['summary'] = gigawordDataset['summary'].apply(remove_content_between_parenthsis)

In [86]:
# 9. Eliminate punctuations and special characters
import string
def remove_special_characters(text):
    """Remove every character that is not an ASCII letter, a digit or whitespace.

    Args:
        text: The string to clean.

    Returns:
        The text with punctuation and other special characters deleted.
    """
    # The upper-case range must be A-Z. The range A-z also spans the ASCII
    # characters between 'Z' and 'a' ([ \ ] ^ _ `), so those were never removed.
    pattern = r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

# Run remove_special_characters over every value of both columns
gigawordDataset['text'] = gigawordDataset['text'].apply(remove_special_characters)
gigawordDataset['summary'] = gigawordDataset['summary'].apply(remove_special_characters)

In [87]:
# 11. Remove stopwords
from nltk.corpus import stopwords
# Build the English stopword set. stopwords.words raises LookupError when the
# NLTK "stopwords" corpus is not installed, so fetch it once and retry.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return ``text`` without the whitespace-separated tokens found in ``stop_words``.

    The surviving tokens are joined back together with single spaces.
    """
    kept_tokens = []
    for token in text.split():
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Run remove_stopwords over every value of both columns
gigawordDataset['text'] = gigawordDataset['text'].apply(remove_stopwords)
gigawordDataset['summary'] = gigawordDataset['summary'].apply(remove_stopwords)

In [88]:
# 12. Remove short words
def remove_shortwords(text, min_length=3):
    """Drop whitespace-separated words shorter than ``min_length`` characters.

    Args:
        text: The string to clean.
        min_length: Minimum number of characters a word needs to be kept.
            The default of 3 removes words of one or two characters.

    Returns:
        The remaining words joined with single spaces.
    """
    return ' '.join(word for word in text.split() if len(word) >= min_length)

# Run remove_shortwords over every value of both columns
gigawordDataset['text'] = gigawordDataset['text'].apply(remove_shortwords)
gigawordDataset['summary'] = gigawordDataset['summary'].apply(remove_shortwords)

In [89]:
# 13. Remove the rows that have empty text or summary
def remove_empty_rows(text, summary):
    """Tell whether both ``text`` and ``summary`` are non-empty.

    Despite the name, nothing is removed here: the result is the keep-flag
    used to filter rows. The two comparisons are combined with ``&``.
    """
    has_text = text != ''
    has_summary = summary != ''
    return has_text & has_summary

# Keep only the rows whose text and summary are both non-empty.
# remove_empty_rows is given the whole columns, so the boolean mask is built
# in one vectorized pass instead of one Python call per row. This also stays
# well-defined when the frame has no rows.
gigawordDataset = gigawordDataset[remove_empty_rows(gigawordDataset['text'], gigawordDataset['summary'])]
gigawordDataset.shape

(14983, 2)

In [90]:
# 14. Trim leading and trailing whitespace (including newlines)
def remove_extra_lines(text):
    """Return ``text`` without leading and trailing whitespace (newlines included).

    Whitespace inside the text is left as it is.
    """
    trimmed = text.strip()
    return trimmed

# Run remove_extra_lines over every value of both columns
gigawordDataset['text'] = gigawordDataset['text'].apply(remove_extra_lines)
gigawordDataset['summary'] = gigawordDataset['summary'].apply(remove_extra_lines)

In [91]:
# 15. Remove all non-ASCII characters from the text (this includes emojis)
import re
def remove_emojis(text):
    """Drop every non-ASCII character from ``text``.

    Emojis are removed because they lie outside the ASCII range, and so is
    any other character above code point 0x7F (accented letters, for example).
    """
    return ''.join(char for char in text if ord(char) <= 0x7F)

# Run remove_emojis over every value of both columns
gigawordDataset['text'] = gigawordDataset['text'].apply(remove_emojis)
gigawordDataset['summary'] = gigawordDataset['summary'].apply(remove_emojis)

In [92]:
# 16. Removing URLs
import re
def remove_urls(text):
    """Delete every run of non-whitespace characters that starts at ``http``.

    The match begins wherever the letters ``http`` occur, even in the middle
    of a token, and extends to the next whitespace or the end of the string.
    """
    url_like = re.compile(r'http\S+')
    return url_like.sub('', text)

# Run remove_urls over every value of both columns
gigawordDataset['text'] = gigawordDataset['text'].apply(remove_urls)
gigawordDataset['summary'] = gigawordDataset['summary'].apply(remove_urls)

In [93]:
# # Saving the cleaned data to a csv file
# gigawordDataset.to_csv('../../dataset/news_summary/cleaned_news_summary.csv', index=False)

In [94]:
# Preview the first five cleaned rows
gigawordDataset.head(n=5)

Unnamed: 0,text,summary
0,officials cabinetlevel fair trade commission l...,fair trade commission investigating consumer p...
1,five people killed woman gravely wounded follo...,colombian nightclub shootout leaves five dead
2,preliminary dna testing remains red army soldi...,estonia provides red army soldiers dna samples...
3,transocean inc world largest offshore drilling...,transocean globalsantafe plan combine create n...
4,palestinian president mahmoud abbas make worki...,palestinian president visit malaysia may


In [95]:
# Preview the first five rows of the snapshot taken before cleaning, for comparison
gigawordDatasetCopy.head(n=5)

Unnamed: 0,text,summary
0,b'officials of the cabinet-level fair trade co...,b'fair trade commission investigating consumer...
1,"b""five people were killed , and a woman gravel...",b'colombian nightclub shootout leaves five dead'
2,b'preliminary dna testing on remains of ## red...,b'estonia provides red army soldiers dna sampl...
3,"b""transocean inc. , the world 's largest offsh...",b'transocean globalsantafe plan to combine to ...
4,b'palestinian president mahmoud abbas will mak...,b'palestinian president to visit malaysia on m...


In [None]:
# Report, per column, whether any uncleaned value contains a literal "#".
# `"#" in DataFrame` only tests the column labels, never the cell contents.
# na=False treats missing values as "no match".
gigawordDatasetCopy[['text', 'summary']].apply(
    lambda column: column.str.contains('#', regex=False, na=False).any()
)