In [35]:
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords

In [36]:
df_raw = pd.read_csv('.\IMDB Dataset.csv')
df_raw.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [37]:
df_raw['sentiment'].replace(['positive', 'negative'], [1, 0], inplace = True)
df_raw.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [38]:
def drop_duplicates(df):
    duplicates = df.duplicated().sum()
    print(f'Number of duplicates: {duplicates}')
    if duplicates > 0:
        df.drop_duplicates(inplace=True)
    
    
drop_duplicates(df_raw)

Number of duplicates: 418


In [39]:
len(df_raw)

49582

## Preprocessing

+ lowercase
+ remove html strips, square brackets
+ remove special characters
+ remove stopwords
+ stemming/lemmatization

In [40]:
def remove_html(text):
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()

df_raw['review'] = df_raw['review'].apply(remove_html)

  soup = BeautifulSoup(text, 'html.parser')


In [41]:
def special_characters(text):
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text

df_raw['review'] = df_raw['review'].apply(special_characters)

In [42]:
df_raw['review'] = df_raw['review'].str.lower()

In [43]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dduqu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [44]:
stop_words = set(stopwords.words('english'))

In [45]:
pattern = r'''(?x)                  # Flag para iniciar el modo verbose
              (?:[A-Z]\.)+            # Hace match con abreviaciones como U.S.A.
              | \w+(?:-\w+)*         # Hace match con palabras que pueden tener un guión interno
              | \$?\d+(?:\.\d+)?%?  # Hace match con dinero o porcentajes como $15.5 o 100%
              | \.\.\.              # Hace match con puntos suspensivos
              | [][.,;"'?():-_`]    # Hace match con signos de puntuación
'''

def remove_stopwords(text):
    tokens = nltk.regexp_tokenize(text, pattern)
    filtered_tokens = []
    for token in tokens:
        if token not in stop_words and len(token) > 2:
            filtered_tokens.append(token)
    return filtered_tokens
    
df_raw['review'] = df_raw['review'].apply(remove_stopwords)

In [47]:
df_raw.head()

Unnamed: 0,review,sentiment
0,"[one, reviewers, mentioned, watching, episode,...",1
1,"[wonderful, little, production, filming, techn...",1
2,"[thought, wonderful, way, spend, time, hot, su...",1
3,"[basically, theres, family, little, boy, jake,...",0
4,"[petter, matteis, love, time, money, visually,...",1


In [48]:
df_raw.to_csv('IMDB_reviews_cleaned.csv', index=False)