In [1]:
import pandas as pd
import contractions
import nltk

In [3]:
# Dataset: CSV locations for the cnn_dailymail data, keyed by variant name.
cnn_dailymail = dict(
    normal="../../dataset/cnn_dailymail/cnn_dailymail.csv",
)

In [5]:
# Load the raw CSV into a DataFrame; the file is decoded as latin-1.
cnn_dailymailDataset = pd.read_csv(
    filepath_or_buffer=cnn_dailymail["normal"],
    encoding="latin-1",
)

In [6]:
# Preview the first three rows of the freshly loaded frame.
cnn_dailymailDataset.head(3)

Unnamed: 0,article,highlights
0,"b""By. Associated Press. PUBLISHED:. 14:11 EST,...","b'Bishop John Folda, of North Dakota, is takin..."
1,b'(CNN) -- Ralph Mata was an internal affairs ...,b'Criminal complaint: Cop used his role to hel...
2,"b""A drunk driver who killed a young woman in a...","b""Craig Eccleston-Todd, 27, had drunk at least..."


In [7]:
# 1. Renaming columns: article -> text, highlights -> summary.
#    A copy of the renamed frame is kept before any cleaning step touches it.
cnn_dailymailDataset = cnn_dailymailDataset.rename(
    columns={'article': 'text', 'highlights': 'summary'}
)
cnn_dailymailDatasetCopy = cnn_dailymailDataset.copy()
cnn_dailymailDataset.shape

(5000, 2)

In [8]:
# Preview the first three rows to confirm the new column names.
cnn_dailymailDataset.head(3)

Unnamed: 0,text,summary
0,"b""By. Associated Press. PUBLISHED:. 14:11 EST,...","b'Bishop John Folda, of North Dakota, is takin..."
1,b'(CNN) -- Ralph Mata was an internal affairs ...,b'Criminal complaint: Cop used his role to hel...
2,"b""A drunk driver who killed a young woman in a...","b""Craig Eccleston-Todd, 27, had drunk at least..."


In [9]:
# 2. Drop NA values: discard every row with a missing value in any column.
cnn_dailymailDataset = cnn_dailymailDataset.dropna(axis=0, how='any')
cnn_dailymailDataset.shape

(5000, 2)

In [10]:
# 3. Drop duplicate rows: rows sharing the same 'text' are collapsed,
#    keeping the first occurrence.
cnn_dailymailDataset = cnn_dailymailDataset.drop_duplicates(subset='text', keep='first')
cnn_dailymailDataset.shape

(5000, 2)

In [11]:
# 4. Converting to lowercase (both the article text and its summary).
cnn_dailymailDataset['text'] = cnn_dailymailDataset['text'].apply(str.lower)
cnn_dailymailDataset['summary'] = cnn_dailymailDataset['summary'].apply(str.lower)

In [12]:
# 7. Remove the bytes-literal prefix (b' or b") from each value
import re
def remove_s(text):
    """Strip a bytes-literal wrapper (``b'...'`` or ``b"..."``) from ``text``.

    Only a ``b'`` / ``b"`` prefix at the very start of the string is treated
    as a wrapper, so the same two characters occurring inside the content
    (for example in ``bob's``) are left untouched. The closing quote is
    removed as well when it matches the opening one.

    Args:
        text: The string to unwrap.

    Returns:
        ``text`` without the wrapper, or ``text`` unchanged when it does not
        start with a bytes-literal prefix.

    NOTE(review): escape sequences inside the literal (e.g. a backslash
    followed by ``n``) are not decoded here — confirm whether they should be.
    """
    if text[:2] not in ("b'", 'b"'):
        return text
    quote = text[1]
    body = text[2:]
    # Drop the trailing quote only when it pairs with the opening one.
    if body.endswith(quote):
        body = body[:-1]
    return body

# Run remove_s over every article and every summary.
cnn_dailymailDataset['text'] = cnn_dailymailDataset['text'].apply(remove_s)
cnn_dailymailDataset['summary'] = cnn_dailymailDataset['summary'].apply(remove_s)

In [13]:
# Preview the first three rows after lowercasing and prefix removal.
cnn_dailymailDataset.head(3)

Unnamed: 0,text,summary
0,"by. associated press. published:. 14:11 est, 2...","bishop john folda, of north dakota, is taking ..."
1,(cnn) -- ralph mata was an internal affairs li...,criminal complaint: cop used his role to help ...
2,a drunk driver who killed a young woman in a h...,"craig eccleston-todd, 27, had drunk at least t..."


In [14]:
# 5. Remove HTML tags
from bs4 import BeautifulSoup
# Parse each value with the "html.parser" backend and keep only its text.
cnn_dailymailDataset['text'] = cnn_dailymailDataset['text'].apply(
    lambda markup: BeautifulSoup(markup, "html.parser").text
)
cnn_dailymailDataset['summary'] = cnn_dailymailDataset['summary'].apply(
    lambda markup: BeautifulSoup(markup, "html.parser").text
)



In [15]:
# 6. Contraction Mapping [Expansion] eg:- "aren't" ==> "are not"
#    Every whitespace-separated token is expanded on its own and the tokens
#    are then re-joined with single spaces.
cnn_dailymailDataset['text'] = cnn_dailymailDataset['text'].apply(
    lambda doc: ' '.join(contractions.fix(token) for token in doc.split())
)

cnn_dailymailDataset['summary'] = cnn_dailymailDataset['summary'].apply(
    lambda doc: ' '.join(contractions.fix(token) for token in doc.split())
)

In [206]:
# # 7. Remove (‘s)
# import re
# def remove_s(text):
#     text = re.sub("'s", "", text)
#     return text

# gigawordDataset['text'] = gigawordDataset['text'].apply(lambda x: remove_s(x))
# gigawordDataset['summary'] = gigawordDataset['summary'].apply(lambda x: remove_s(x))

In [16]:
# 8. Remove any text inside round parentheses ( ). Other bracket forms
#    ([] {} < >) are only stripped when passed explicitly via `brackets`.
def remove_content_between_parenthsis(text, brackets=("()",)):
    """Delete bracketed spans (delimiters included) from ``text``.

    A span runs from an opening delimiter to the first matching closing
    delimiter; nested brackets are not balanced.

    Args:
        text: The string to clean.
        brackets: Iterable of two-character strings, each giving an opening
            and a closing delimiter. The default strips round parentheses
            only; pass e.g. ``("()", "[]", "{}", "<>")`` to strip the other
            forms too.

    Returns:
        ``text`` with every matching span removed. Surrounding whitespace is
        left as it was.
    """
    for pair in brackets:
        opener = re.escape(pair[0])
        closer = re.escape(pair[1])
        # opener, then anything that is not the closer, then the closer.
        text = re.sub(opener + '[^' + closer + ']*' + closer, '', text)
    return text

# Strip parenthesised spans from every article and every summary.
cnn_dailymailDataset['text'] = cnn_dailymailDataset['text'].apply(remove_content_between_parenthsis)
cnn_dailymailDataset['summary'] = cnn_dailymailDataset['summary'].apply(remove_content_between_parenthsis)

In [17]:
# 9. Eliminate punctuations and special characters
import string
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` removed.

    Characters are deleted, not replaced, so the pieces on either side of a
    punctuation mark are joined together.
    """
    unwanted = frozenset(string.punctuation)
    return ''.join(char for char in text if char not in unwanted)

# Strip punctuation from every article and every summary.
cnn_dailymailDataset['text'] = cnn_dailymailDataset['text'].apply(remove_punctuation)
cnn_dailymailDataset['summary'] = cnn_dailymailDataset['summary'].apply(remove_punctuation)

In [209]:
# 11. Remove stopwords
# NOTE(review): stopwords.words() presumably needs the NLTK "stopwords" corpus
# to be available locally (e.g. via nltk.download('stopwords')) — confirm.
from nltk.corpus import stopwords
# English stop words, held in a set for the membership test in remove_stopwords.
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return ``text`` without the tokens listed in ``stop_words``.

    The text is split on whitespace, tokens found in the module-level
    ``stop_words`` set are dropped, and the rest are re-joined with single
    spaces.
    """
    kept = []
    for token in text.split():
        if token in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

# Apply stop-word removal to the frame this notebook actually builds.
# (This step previously targeted a frame that is never defined here, which
# raises NameError on a fresh kernel.)
cnn_dailymailDataset['text'] = cnn_dailymailDataset['text'].apply(remove_stopwords)
cnn_dailymailDataset['summary'] = cnn_dailymailDataset['summary'].apply(remove_stopwords)

In [210]:
# # 12. Remove short words
# def remove_shortwords(text):
#     return ' '.join([word for word in text.split() if len(word) > 2])

# gigawordDataset['text'] = gigawordDataset['text'].apply(lambda x: remove_shortwords(x))
# gigawordDataset['summary'] = gigawordDataset['summary'].apply(lambda x: remove_shortwords(x))

In [18]:
# 13. Remove the rows that have empty text or summary
def remove_empty_rows(text, summary):
    """Tell whether both ``text`` and ``summary`` differ from the empty string.

    The two comparisons are combined with ``&``, so the function works on
    plain strings (returning a bool) as well as element-wise on objects that
    overload ``!=`` and ``&``.
    """
    has_text = text != ''
    has_summary = summary != ''
    return has_text & has_summary

# Build the keep-mask column-wise instead of row by row. Values are trimmed
# for the comparison only, so whitespace-only entries count as empty here
# rather than slipping through and becoming empty when they are trimmed later.
cnn_dailymailDataset = cnn_dailymailDataset[
    remove_empty_rows(
        cnn_dailymailDataset['text'].str.strip(),
        cnn_dailymailDataset['summary'].str.strip(),
    )
]
cnn_dailymailDataset.shape

(5000, 2)

In [19]:
# 14. Remove extra lines and trim spaces: collapse every run of whitespace
#     (spaces, tabs, newlines) into one space and strip both ends.
def remove_extra_lines(text):
    """Normalise the whitespace of ``text``.

    Leading and trailing whitespace is removed, and every interior run of
    whitespace characters, line breaks included, becomes a single space.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string; an empty or whitespace-only input gives ``''``.
    """
    # split() with no argument splits on whitespace runs and ignores the ends.
    return ' '.join(text.split())

# Run remove_extra_lines over every article and every summary.
cnn_dailymailDataset['text'] = cnn_dailymailDataset['text'].apply(remove_extra_lines)
cnn_dailymailDataset['summary'] = cnn_dailymailDataset['summary'].apply(remove_extra_lines)

In [20]:
# 15. Removing emojis (and every other non-ASCII character) from the text
import re
def remove_emojis(text):
    """Return ``text`` with every character outside the ASCII range removed.

    This drops emoji, and also any other character whose code point is above
    0x7F (accented letters, typographic quotes, ...).
    """
    return ''.join(char for char in text if ord(char) <= 0x7F)

# Run remove_emojis over every article and every summary.
cnn_dailymailDataset['text'] = cnn_dailymailDataset['text'].apply(remove_emojis)
cnn_dailymailDataset['summary'] = cnn_dailymailDataset['summary'].apply(remove_emojis)

In [21]:
# 16. Removing URLs
import re
def remove_urls(text):
    """Delete every ``http`` token from ``text``.

    A match is the literal ``http`` followed by at least one non-whitespace
    character and runs up to the next whitespace. Whitespace around the
    removed token is left in place.
    """
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', text)

# Run remove_urls over every article and every summary.
cnn_dailymailDataset['text'] = cnn_dailymailDataset['text'].apply(remove_urls)
cnn_dailymailDataset['summary'] = cnn_dailymailDataset['summary'].apply(remove_urls)

In [22]:
# Row/column count after the cleaning steps above.
cnn_dailymailDataset.shape

(5000, 2)

In [23]:
# Saving the cleaned data to a csv file (the DataFrame index is not written).
cnn_dailymailDataset.to_csv(
    path_or_buf='../../dataset/cnn_dailymail/cleaned_cnn_dailymail.csv',
    index=False,
)

In [24]:
# Shape of the frame that was just written to disk.
cnn_dailymailDataset.shape

(5000, 2)

In [25]:
# Same shape check as the previous cell; nothing modifies the frame in between.
cnn_dailymailDataset.shape

(5000, 2)