In [38]:
pip install contractions

Collecting contractions
  Downloading contractions-0.1.72-py2.py3-none-any.whl (8.3 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting pyahocorasick
  Downloading pyahocorasick-1.4.4-cp37-cp37m-win_amd64.whl (39 kB)
Collecting anyascii
  Downloading anyascii-0.3.1-py3-none-any.whl (287 kB)
     ------------------------------------ 287.5/287.5 kB 172.3 kB/s eta 0:00:00
Installing collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.1 contractions-0.1.72 pyahocorasick-1.4.4 textsearch-0.0.24
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.2.2 -> 22.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [148]:
import pandas as pd
import contractions
import nltk

In [149]:
# Dataset locations (relative to this notebook), keyed by size where several
# variants of the same corpus exist.
newsSummaryPath = "../../dataset/news_summary/news_summary.csv"
moviePath = {
    size: f"../../dataset/movie/movie_reviews_{size}.csv"
    for size in ("small", "medium")
}
gigawordPath = {
    size: f"../../dataset/generalization/gigaword_{size}.csv"
    for size in ("medium", "large", "xlarge", "xxlarge")
}

In [150]:
def _read_latin1_csv(path):
    """Load the CSV file at ``path`` into a DataFrame, decoding it as latin-1."""
    return pd.read_csv(path, encoding='latin-1')


# News summary corpus
newsSummaryDataset = _read_latin1_csv(newsSummaryPath)

# Movie review corpora
movieSmallDataset = _read_latin1_csv(moviePath["small"])
movieMediumDataset = _read_latin1_csv(moviePath["medium"])

# Gigaword corpora, smallest to largest
gigawordMediumDataset = _read_latin1_csv(gigawordPath["medium"])
gigawordLargeDataset = _read_latin1_csv(gigawordPath["large"])
gigawordXLargeDataset = _read_latin1_csv(gigawordPath["xlarge"])
gigawordXXLargeDataset = _read_latin1_csv(gigawordPath["xxlarge"])

### Text Preprocessing News Summary Dataset


In [151]:
# 1. Drop the metadata columns that the cleaning steps below do not use
newsSummaryDataset = newsSummaryDataset.drop(
    columns=['author', 'date', 'headlines', 'read_more'],
)
newsSummaryDataset

Unnamed: 0,text,ctext
0,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...
1,Malaika Arora slammed an Instagram user who tr...,"From her special numbers to TV?appearances, Bo..."
2,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...
3,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Hotels in Maharashtra will train their staff t...,Hotels in Mumbai and other Indian cities are t...
...,...,...
4509,Fruit juice concentrate maker Rasna is eyeing ...,"Mumbai, Feb 23 (PTI) Fruit juice concentrate m..."
4510,Former Indian cricketer Sachin Tendulkar atten...,Former cricketer Sachin Tendulkar was spotted ...
4511,"Aamir Khan, while talking about reality shows ...","Aamir Khan, whose last film Dangal told the st..."
4512,The Maharashtra government has initiated an in...,Maharahstra Power Minister Chandrashekhar Bawa...


In [152]:
# 2. Rename columns: 'text' becomes 'summary' and 'ctext' becomes 'text'
newsSummaryDataset = newsSummaryDataset.rename(
    columns=dict(text='summary', ctext='text'),
)
# Independent snapshot taken before any cleaning, kept for later comparison.
newsSummaryDatasetCopy = newsSummaryDataset.copy(deep=True)
newsSummaryDataset.shape

(4514, 2)

In [153]:
# 3. Discard every row that has a missing value in any column
newsSummaryDataset = newsSummaryDataset.dropna(axis=0, how='any')
newsSummaryDataset.shape

(4396, 2)

In [154]:
# 4. Drop rows whose 'text' repeats an earlier row (the first occurrence is kept)
newsSummaryDataset = newsSummaryDataset.drop_duplicates(subset="text", keep="first")
newsSummaryDataset.shape

(4341, 2)

In [155]:
# 5. Lowercase both the article text and the summary
newsSummaryDataset = newsSummaryDataset.assign(
    text=newsSummaryDataset['text'].apply(str.lower),
    summary=newsSummaryDataset['summary'].apply(str.lower),
)

In [156]:
# 6. Remove HTML tags
from bs4 import BeautifulSoup
def strip_html(markup):
    """Return the text content of ``markup`` as parsed by BeautifulSoup's "html.parser"."""
    return BeautifulSoup(markup, "html.parser").text


newsSummaryDataset = newsSummaryDataset.assign(
    text=newsSummaryDataset['text'].apply(strip_html),
    summary=newsSummaryDataset['summary'].apply(strip_html),
)

In [157]:
# 7. Contraction expansion, e.g. "aren't" ==> "are not"
def expand_contractions(text):
    """Apply ``contractions.fix`` to each whitespace-separated token of ``text``.

    The fixed tokens are re-joined with single spaces, so any run of
    whitespace in the input collapses to one space.
    """
    return ' '.join(contractions.fix(token) for token in text.split())


newsSummaryDataset = newsSummaryDataset.assign(
    text=newsSummaryDataset['text'].apply(expand_contractions),
    summary=newsSummaryDataset['summary'].apply(expand_contractions),
)

In [158]:
# 8. Remove possessive 's
import re
def remove_s(text):
    """Strip possessive ``'s`` endings from ``text``.

    Only an apostrophe followed by ``s`` at the end of a word is removed
    (``"john's book"`` -> ``"john book"``). An apostrophe-s that starts a
    longer word, such as the opening quote in ``'special'``, is left alone;
    matching ``'s`` anywhere would turn that into ``'pecial'``. Both the
    straight (') and the typographic (’) apostrophe are recognised.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with word-final ``'s`` / ``’s`` removed.
    """
    # \b after the "s" requires the "s" to be the last character of the word.
    return re.sub(r"['’]s\b", "", text)

# Run remove_s over every article text and every summary.
newsSummaryDataset = newsSummaryDataset.assign(
    text=newsSummaryDataset['text'].apply(remove_s),
    summary=newsSummaryDataset['summary'].apply(remove_s),
)

In [159]:
# 9. Remove any text inside any form of brackets: ( ), [ ], { }, < >
def remove_content_between_parenthsis(text):
    """Delete every bracketed span, brackets included, from ``text``.

    Handles all four bracket styles named in this step's description:
    ``( )``, ``[ ]``, ``{ }`` and ``< >``. Each span runs from an opening
    bracket to the first matching closing bracket, so nested brackets are
    not balanced. An opening bracket with no closing partner is left as is.

    NOTE(review): a stray ``<`` followed later by a stray ``>`` (for example
    two comparison signs) is treated as one span and removed with everything
    in between.

    Args:
        text: The string to clean.

    Returns:
        ``text`` without the bracketed spans. Surrounding whitespace is not
        collapsed.
    """
    bracketed_span = r'\([^)]*\)|\[[^\]]*\]|\{[^}]*\}|<[^>]*>'
    return re.sub(bracketed_span, '', text)

# Run remove_content_between_parenthsis over every article text and every summary.
newsSummaryDataset = newsSummaryDataset.assign(
    text=newsSummaryDataset['text'].apply(remove_content_between_parenthsis),
    summary=newsSummaryDataset['summary'].apply(remove_content_between_parenthsis),
)

In [160]:
# 10. Eliminate punctuations and special characters
import string
def remove_punctuation(text, replacement=''):
    """Remove, or substitute, every ``string.punctuation`` character in ``text``.

    With the default ``replacement=''`` each punctuation character is simply
    deleted. Deleting fuses tokens that were separated only by punctuation
    (``"tv?appearances"`` -> ``"tvappearances"``); pass ``replacement=' '``
    to keep such tokens apart.

    Args:
        text: The string to clean.
        replacement: String substituted for each punctuation character.
            Defaults to the empty string, which deletes the character.

    Returns:
        ``text`` with every punctuation character replaced by ``replacement``.
    """
    # A mapping value of '' makes str.translate drop the character.
    translator = str.maketrans(dict.fromkeys(string.punctuation, replacement))
    return text.translate(translator)

# Run remove_punctuation over every article text and every summary.
newsSummaryDataset = newsSummaryDataset.assign(
    text=newsSummaryDataset['text'].apply(remove_punctuation),
    summary=newsSummaryDataset['summary'].apply(remove_punctuation),
)


In [161]:
# 11. Remove stopwords
from nltk.corpus import stopwords
# English stop words as a set, for fast membership tests in remove_stopwords.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    # The stopwords corpus is not installed in this environment yet:
    # download it once, then retry so a fresh kernel can run this cell.
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return ``text`` without the tokens found in the module-level ``stop_words``.

    ``text`` is split on whitespace and the surviving tokens are re-joined
    with single spaces. The comparison is exact, so case matters.
    """
    kept_tokens = (token for token in text.split() if token not in stop_words)
    return ' '.join(kept_tokens)

# Run remove_stopwords over every article text and every summary.
newsSummaryDataset = newsSummaryDataset.assign(
    text=newsSummaryDataset['text'].apply(remove_stopwords),
    summary=newsSummaryDataset['summary'].apply(remove_stopwords),
)

In [162]:
# 12. Remove short words
def remove_shortwords(text, min_length=3):
    """Drop every whitespace-separated token shorter than ``min_length``.

    Args:
        text: The string to clean.
        min_length: Minimum number of characters a token needs to be kept.
            The default of 3 removes one- and two-character tokens.

    Returns:
        The surviving tokens joined with single spaces (an empty string when
        nothing survives).
    """
    return ' '.join(word for word in text.split() if len(word) >= min_length)

# Run remove_shortwords over every article text and every summary.
newsSummaryDataset = newsSummaryDataset.assign(
    text=newsSummaryDataset['text'].apply(remove_shortwords),
    summary=newsSummaryDataset['summary'].apply(remove_shortwords),
)

In [163]:
# 13. Remove the rows that have empty text or summary
def remove_empty_rows(text, summary):
    """Tell whether a row should be kept: neither ``text`` nor ``summary`` is ''.

    The result is built with ``!=`` and ``&`` only, so the arguments can be
    two plain strings or two objects that support those operators
    elementwise.
    """
    has_text = text != ''
    has_summary = summary != ''
    return has_text & has_summary

# Build the keep-mask in one vectorized call. remove_empty_rows only uses
# `!=` and `&`, which work elementwise on whole columns, so this selects the
# same rows as a row-by-row apply without a Python-level call per row.
# NOTE(review): steps 15 and 16 run after this filter and can still reduce a
# value to an empty string — consider moving this filter to the end.
keep_mask = remove_empty_rows(newsSummaryDataset['text'], newsSummaryDataset['summary'])
newsSummaryDataset = newsSummaryDataset[keep_mask]
newsSummaryDataset.shape

(4341, 2)

In [164]:
# 14. Trim leading and trailing whitespace (including blank lines at either end)
def remove_extra_lines(text):
    """Return ``text`` without leading or trailing whitespace.

    Only the two ends are trimmed; whitespace and line breaks inside the
    string are left untouched.
    """
    trimmed = text.strip()
    return trimmed

# Run remove_extra_lines over every article text and every summary.
newsSummaryDataset = newsSummaryDataset.assign(
    text=newsSummaryDataset['text'].apply(remove_extra_lines),
    summary=newsSummaryDataset['summary'].apply(remove_extra_lines),
)

In [165]:
# 15. Removing emojis (and every other non-ASCII character) from the text
import re
def remove_emojis(text):
    """Delete every run of non-ASCII characters from ``text``.

    The pattern matches anything outside ``\\x00``-``\\x7F``, so accented
    letters and other non-ASCII symbols are removed along with emojis.
    Nothing is inserted in their place.
    """
    non_ascii_run = re.compile(r'[^\x00-\x7F]+')
    return non_ascii_run.sub('', text)

# Run remove_emojis over every article text and every summary.
newsSummaryDataset = newsSummaryDataset.assign(
    text=newsSummaryDataset['text'].apply(remove_emojis),
    summary=newsSummaryDataset['summary'].apply(remove_emojis),
)

In [166]:
# 16. Removing URLs
import re
def remove_urls(text):
    """Delete every token that starts with ``http`` from ``text``.

    A match is the literal ``http`` followed by at least one non-whitespace
    character, running up to the next whitespace. Surrounding whitespace is
    left in place.
    """
    url_like = re.compile(r'http\S+')
    return url_like.sub('', text)

# Run remove_urls over every article text and every summary.
newsSummaryDataset = newsSummaryDataset.assign(
    text=newsSummaryDataset['text'].apply(remove_urls),
    summary=newsSummaryDataset['summary'].apply(remove_urls),
)

In [167]:
# Persist the cleaned frame as CSV; the index is not written to the file.
newsSummaryDataset.to_csv(
    path_or_buf='../../dataset/news_summary/cleaned_news_summary.csv',
    index=False,
)

In [168]:
# Display the dataset after all preprocessing steps.
newsSummaryDataset

Unnamed: 0,summary,text
0,administration union territory daman diu revok...,daman diu administration wednesday withdrew ci...
1,malaika arora slammed instagram user trolled d...,special numbers tvappearances bollywood actor ...
2,indira gandhi institute medical sciences patna...,indira gandhi institute medical sciences patna...
3,lashkaretaiba kashmir commander abu dujana kil...,lashkaretaiba kashmir commander abu dujana kil...
4,hotels maharashtra train staff spot signs sex ...,hotels mumbai indian cities train staff spot s...
...,...,...
4509,fruit juice concentrate maker rasna eyeing rev...,mumbai feb fruit juice concentrate maker rasna...
4510,former indian cricketer sachin tendulkar atten...,former cricketer sachin tendulkar spotted rajy...
4511,aamir khan talking reality shows television fe...,aamir khan whose last film dangal told story t...
4512,maharashtra government initiated inquiry 83yea...,maharahstra power minister chandrashekhar bawa...


In [169]:
# Display the snapshot taken right after the column rename (before any
# cleaning), for comparison with the cleaned frame.
newsSummaryDatasetCopy

Unnamed: 0,summary,text
0,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...
1,Malaika Arora slammed an Instagram user who tr...,"From her special numbers to TV?appearances, Bo..."
2,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...
3,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Hotels in Maharashtra will train their staff t...,Hotels in Mumbai and other Indian cities are t...
...,...,...
4509,Fruit juice concentrate maker Rasna is eyeing ...,"Mumbai, Feb 23 (PTI) Fruit juice concentrate m..."
4510,Former Indian cricketer Sachin Tendulkar atten...,Former cricketer Sachin Tendulkar was spotted ...
4511,"Aamir Khan, while talking about reality shows ...","Aamir Khan, whose last film Dangal told the st..."
4512,The Maharashtra government has initiated an in...,Maharahstra Power Minister Chandrashekhar Bawa...
