In [4]:
import pandas as pd
import numpy as np

# Load Dataset

In [9]:
# Read the review dataset from a path relative to the current working directory.
# NOTE(review): the preview below shows an "Unnamed: 0" column, which suggests the
# CSV was written with its index; passing index_col=0 would drop it — confirm first.
# NOTE(review): the filename is spelled "imbd" — verify it matches the file on disk.
df = pd.read_csv('Dataset/imbd_dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Text Preprocessing

### 1. Lowercasing

In [19]:
# Preview lowercasing on a single review (row label 3) before applying it to
# the whole column. A single .loc lookup avoids chained indexing.
df.loc[3, 'review'].lower()

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.<br /><br />ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

In [27]:
# Lowercase every review with the vectorised .str accessor.
# The 'review' column is overwritten, so the original casing is no longer held in df.
df['review'] = df['review'].str.lower()
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


### 2. Remove HTML Tags (using regular expressions)

In [31]:
import re

# Compiled once at definition time instead of on every call.
# First alternative: HTML comments (may span lines, hence DOTALL).
# Second alternative: opening/closing/declaration tags. The tag must start with a
# letter right after "<" (optionally preceded by "/" or "!"), so plain text such as
# "i <3 this film, 10 > 9" is left alone. [^<>]* also matches newlines, so tags
# whose attributes wrap onto another line are removed too.
_HTML_TAG_RE = re.compile(r'<!--.*?-->|<[/!]?[a-zA-Z][^<>]*>', re.DOTALL)


def remove_html_tags(text):
    """Return ``text`` with HTML tags and HTML comments removed.

    Parameters
    ----------
    text : str
        Raw text that may contain markup such as ``<br />`` or ``<p>...</p>``.

    Returns
    -------
    str
        The text with each tag/comment replaced by the empty string. The content
        between an opening and a closing tag is kept.
    """
    return _HTML_TAG_RE.sub('', text)

In [35]:
# For example:
text = "<p>This is a paragraph of text explaining a concept.</p>"

# Call the function to strip the HTML tags
remove_html_tags(text)


'This is a paragraph of text explaining a concept.'

In [43]:
# Now apply it to every review in the dataset.
# The 'review' column is overwritten with the tag-free text.
df['review'] = df['review'].apply(remove_html_tags)
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


### 3. Remove URLs

In [46]:
# Compiled once at definition time instead of on every call.
# Matches either an http:// / https:// link, or a bare link starting with "www."
# The dot after "www" is escaped so it matches a literal "."; \b stops the pattern
# from firing inside a longer word such as "awww.".
_URL_RE = re.compile(r'https?://\S+|\bwww\.\S+')


def remove_url(text):
    """Return ``text`` with URLs removed.

    Parameters
    ----------
    text : str
        Text that may contain ``http://``, ``https://`` or ``www.`` links.

    Returns
    -------
    str
        The text with every matched URL (up to the next whitespace) replaced by
        the empty string. Surrounding whitespace is left untouched.
    """
    return _URL_RE.sub('', text)

In [50]:
# For example:
text1 = "Check out the link: https://www.google.com"

# Call the function
remove_url(text1)

'Check out the link: '

### 4. Remove Punctuation

In [55]:
import string, time
# Display the ASCII punctuation characters; these are the ones removed below.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [57]:
# Characters to strip: the ASCII punctuation set from the standard library.
exclude = string.punctuation


def remove_punc(text):
    """Return ``text`` with every character listed in ``exclude`` dropped."""
    kept = [symbol for symbol in text if symbol not in exclude]
    return ''.join(kept)

In [59]:
# for example:
txt = 'How are you? wOW! nice job.'
remove_punc(txt)

'How are you wOW nice job'

In [65]:
# remove_punc above is too slow for a large dataset,
# so a faster version based on str.translate is given below.

# Deletion table mapping every ASCII punctuation character to None.
# Built once here rather than on every call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def remove_punct(txt):
    """Return ``txt`` with all ASCII punctuation characters removed.

    Parameters
    ----------
    txt : str
        Text to clean.

    Returns
    -------
    str
        ``txt`` without any character from ``string.punctuation``; all other
        characters, including whitespace, are kept as they are.
    """
    return txt.translate(_PUNCT_TABLE)


text = "How are you> woW! nice job."
remove_punct(text)

'How are you woW nice job'

In [71]:
# Apply to the dataset.
# URLs are stripped first: once punctuation is removed, "https://..." and "www."
# lose their "://" and "." markers and remove_url can no longer recognise them.
df['review'] = df['review'].apply(remove_url)
df['review'] = df['review'].apply(remove_punct)
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive
