In [1]:
pip install contractions

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.2.2 -> 22.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import contractions
import nltk

In [41]:
# Locations of the raw movie-review CSV files, keyed by dataset size
moviePath = dict(
    small="../../dataset/movie/movie_reviews_small.csv",
    medium="../../dataset/movie/movie_reviews_medium.csv",
)

In [42]:
# Read the medium-sized review file; the CSV is decoded as latin-1
mediumCsvPath = moviePath["medium"]
movieDataset = pd.read_csv(mediumCsvPath, encoding="latin-1")
# Independent snapshot of the frame exactly as it was loaded
movieDatasetCopy = movieDataset.copy(deep=True)

In [43]:
# (rows, columns) of the freshly loaded frame
movieDataset.shape

(15000, 2)

In [44]:
# Preview the first summaries. Taking head(10) of the column avoids building a
# whole row Series on every iteration and does not raise an IndexError when the
# frame holds fewer than 10 rows.
for summary in movieDataset['summary'].head(10):
    print(summary)

 "There Is So Much Darkness Now ~ Come For The Miracle"

 Worthwhile and Important Story Hampered by Poor Script and Production

 This movie needed to be made.

 distantly based on a real tragedy

 "What's going on down in Juarez and shining a light on it"

 Pretty pointless fictionalization

 This is junk, stay away

 A  Rock N Roll History Lesson

 A  MUST-HAVE  video if you grew up in the 50's or 60's

 If You Like DooWop You Gotta Have This DVD



In [45]:
# 1. Discard every row that has a missing value in any column
movieDataset = movieDataset.dropna(axis=0, how='any')
movieDataset.shape

(15000, 2)

In [46]:
# 2. Drop duplicate rows: keep the first occurrence of each distinct "text"
movieDataset = movieDataset.drop_duplicates(subset=['text'], keep='first')
movieDataset.shape

(14400, 2)

In [47]:
# 3. Lowercase both columns
for column in ('text', 'summary'):
    movieDataset[column] = movieDataset[column].map(str.lower)

In [48]:
# 4. Remove HTML tags
from bs4 import BeautifulSoup
def _strip_html(markup):
    """Return the text content of ``markup`` as parsed by html.parser."""
    return BeautifulSoup(markup, "html.parser").text

for column in ('text', 'summary'):
    movieDataset[column] = movieDataset[column].apply(_strip_html)



In [49]:
# 5. Contraction Mapping [Expansion] eg:- "aren't" ==> "are not"
def _expand_contractions(sentence):
    """Expand contractions one whitespace-separated token at a time."""
    return ' '.join(contractions.fix(token) for token in sentence.split())

for column in ('text', 'summary'):
    movieDataset[column] = movieDataset[column].apply(_expand_contractions)

In [50]:
# 6. Remove the possessive suffix ('s)
import re
def remove_s(text):
    """Strip the possessive suffix ('s) from the words in ``text``.

    Only an apostrophe-s that directly follows a word character and ends the
    word is removed. An opening quote in front of a word that starts with "s"
    (e.g. "'so what'") is therefore left untouched instead of losing its
    first letter. Both the ASCII apostrophe and the typographic one (U+2019)
    are recognised.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with possessive suffixes removed.
    """
    # (?<=\w) requires a preceding word character; \b requires the "s" to end the word.
    return re.sub(r"(?<=\w)['\u2019]s\b", "", text)

# Run the possessive-stripping step over both columns
for column in ('text', 'summary'):
    movieDataset[column] = movieDataset[column].apply(remove_s)

In [51]:
# 7. Remove any text inside any form of parenthesis ( ) [] {} < >
def remove_content_between_parenthsis(text):
    """Remove bracketed segments, brackets included, from ``text``.

    Handles round ``( )``, square ``[ ]``, curly ``{ }`` and angle ``< >``
    brackets. Each match runs from an opening bracket to the first closing
    bracket of the same kind, so nesting is not balanced and an opening
    bracket with no closing partner is left in place.

    Args:
        text: The string to clean.

    Returns:
        ``text`` without the bracketed segments.
    """
    return re.sub(r'\([^)]*\)|\[[^\]]*\]|\{[^}]*\}|<[^>]*>', '', text)

# Run the bracket-removal step over both columns
for column in ('text', 'summary'):
    movieDataset[column] = movieDataset[column].apply(remove_content_between_parenthsis)

In [52]:
# 8. Eliminate punctuations and special characters
import string
def remove_punctuation(text):
    """Delete every character listed in ``string.punctuation`` from ``text``.

    Characters are removed, not replaced, so the words on either side of a
    punctuation mark are joined ("well-known" becomes "wellknown").
    """
    # A translation table that maps a code point to None deletes that character.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)

# Run the punctuation-removal step over both columns
for column in ('text', 'summary'):
    movieDataset[column] = movieDataset[column].apply(remove_punctuation)


In [53]:
# 9. Remove stopwords
from nltk.corpus import stopwords
# Build the English stop-word set. The corpus is not shipped with nltk itself,
# so on a machine where it was never downloaded the lookup raises LookupError;
# fetch it once and retry instead of failing.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return ``text`` without the tokens found in the global ``stop_words`` set.

    The text is split on whitespace and the surviving tokens are re-joined
    with single spaces.
    """
    kept = []
    for token in text.split():
        if token in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

# Run the stop-word filter over both columns
for column in ('text', 'summary'):
    movieDataset[column] = movieDataset[column].apply(remove_stopwords)

In [54]:
# 10. Remove short words
def remove_shortwords(text, min_length=3):
    """Drop whitespace-separated words shorter than ``min_length`` characters.

    Args:
        text: The string to filter.
        min_length: Smallest word length that is kept. The default of 3
            reproduces the former fixed rule of keeping words longer than
            two characters.

    Returns:
        The surviving words joined by single spaces.
    """
    return ' '.join(word for word in text.split() if len(word) >= min_length)

# Run the short-word filter over both columns
for column in ('text', 'summary'):
    movieDataset[column] = movieDataset[column].apply(remove_shortwords)

In [55]:
# 11. Remove the rows that have empty text or summary
def remove_empty_rows(text, summary):
    """Tell whether a row should be kept: neither value is the empty string.

    The comparison and the ``&`` combination are element-wise operators, so
    the result follows whatever ``!=`` and ``&`` yield for the given inputs.
    """
    has_text = text != ''
    has_summary = summary != ''
    return has_text & has_summary

# Keep only rows whose text and summary are both non-empty. The mask is built
# from whole columns rather than a per-row apply(axis=1), and .copy() makes the
# result an independent frame so that assigning to its columns afterwards does
# not write to a slice of the previous frame (SettingWithCopyWarning).
keep_mask = remove_empty_rows(movieDataset['text'], movieDataset['summary'])
movieDataset = movieDataset[keep_mask].copy()
movieDataset.shape

(14316, 2)

In [56]:
# 12. remove extra lines and trim spaces
def remove_extra_lines(text):
    """Collapse line breaks and repeated whitespace, and trim both ends.

    Every run of whitespace inside ``text`` (spaces, tabs, newlines) becomes a
    single space, and leading and trailing whitespace is dropped. The former
    implementation only trimmed the ends, so line breaks inside the text
    survived.

    Args:
        text: The string to normalise.

    Returns:
        ``text`` on one line with single spaces between words.
    """
    # split() with no argument discards leading/trailing whitespace and splits on runs.
    return ' '.join(text.split())

# Run the whitespace clean-up over both columns
for column in ('text', 'summary'):
    movieDataset[column] = movieDataset[column].apply(remove_extra_lines)

In [57]:
# 13. Removing emojis (together with every other non-ASCII character) from the text
import re
def remove_emojis(text):
    """Drop every character outside the ASCII range from ``text``.

    This removes emojis, but also accented letters and any other non-ASCII
    character ("café" becomes "caf").
    """
    # Encoding with errors='ignore' silently discards what ASCII cannot represent.
    ascii_bytes = text.encode('ascii', errors='ignore')
    return ascii_bytes.decode('ascii')

# Run the non-ASCII filter over both columns
for column in ('text', 'summary'):
    movieDataset[column] = movieDataset[column].apply(remove_emojis)

In [58]:
# 14. Removing URLs
import re
def remove_urls(text):
    """Delete every run of non-whitespace characters that begins with "http".

    At least one character must follow "http" for a match, so the bare word
    "http" is kept.
    """
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', text)

# Run the URL-removal step over both columns
for column in ('text', 'summary'):
    movieDataset[column] = movieDataset[column].apply(remove_urls)

In [59]:
# Persist the cleaned frame as CSV, leaving the index out of the file
cleanedCsvPath = '../../dataset/movie/cleaned_movie_reviews_medium.csv'
movieDataset.to_csv(cleanedCsvPath, index=False)