In [1]:
from warnings import filterwarnings
filterwarnings('ignore')

import re
import string
import pandas as pd
import spacy
import tqdm

# Import the notebook progress bar from the public `tqdm.notebook` module.
# `tqdm._tqdm_notebook` is a private, deprecated alias whose deprecation
# warning is hidden here by the blanket `filterwarnings('ignore')` above.
# The `tqdm_notebook` name is kept so later cells work unchanged.
from tqdm.notebook import tqdm as tqdm_notebook

# Register `progress_apply` on pandas objects (used by the cleaning cells).
tqdm_notebook.pandas()
# spaCy pipeline used by `preprocess_text` for lemmatisation.
nlp = spacy.load('en_core_web_sm')

In [2]:
# Read the raw dataset from disk and preview its last rows.
df = pd.read_csv(filepath_or_buffer='../data/Dataset.csv')
df.tail(n=5)

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [3]:
# Count how many rows carry each sentiment label.
df.loc[:, 'sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [4]:
def clean_text(text: str) -> str:
    """Strip markup, punctuation and digit-bearing words from a review.

    Steps, applied in order:
      1. every ``<...>`` tag (e.g. ``<br />``) becomes a single space, so the
         words on either side of the tag stay separate;
      2. every character in ``string.punctuation`` is deleted;
      3. every word that contains a digit is deleted;
      4. whitespace runs left behind by the steps above are collapsed to one
         space and leading/trailing whitespace is trimmed.

    Args:
        text: Raw review text.

    Returns:
        The cleaned text with words separated by single spaces.
    """
    text = re.sub(r'<[^>]*>', ' ', text)  # remove <br /> tags
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)  # remove punctuation
    text = re.sub(r'\w*\d\w*', '', text)  # remove words containing numbers
    # Removed tags/words leave gaps; without this, the tokenizer downstream
    # sees whitespace-only tokens that end up in the lemmatised output.
    return re.sub(r'\s+', ' ', text).strip()

def preprocess_text(text: str):
    """Lemmatise `text` with the module-level spaCy pipeline `nlp`.

    Returns the lemma of every token in the parsed document, joined into one
    string with single spaces.
    """
    return ' '.join(token.lemma_ for token in nlp(text))

In [5]:
# Clean every raw review (with a progress bar) into a new column.
df['cleaned_review'] = df['review'].progress_apply(clean_text)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=50000.0), HTML(value='')))




In [6]:
# Lemmatise the cleaned reviews in place (with a progress bar), then preview.
df['cleaned_review'] = df['cleaned_review'].progress_apply(preprocess_text)

df.tail(n=5)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=50000.0), HTML(value='')))




Unnamed: 0,review,sentiment,cleaned_review
49995,I thought this movie did a down right good job...,positive,-PRON- think this movie do a down right good j...
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,bad plot bad dialogue bad act idiotic direct t...
49997,I am a Catholic taught in parochial elementary...,negative,-PRON- be a Catholic teach in parochial elemen...
49998,I'm going to have to disagree with the previou...,negative,-PRON- be go to have to disagree with the prev...
49999,No one expects the Star Trek movies to be high...,negative,no one expect the Star Trek movie to be high a...


In [7]:
# Drop the '-PRON-' placeholder that lemmatisation emitted in place of
# pronouns (visible in the previous cell's output).
df['cleaned_review'] = (
    df['cleaned_review']
    # Literal match: do not rely on the default of `regex`, which differs
    # between pandas versions.
    .str.replace('-PRON-', '', regex=False)
    # Removing the placeholder leaves leading/double spaces; normalise them
    # so the exported text has single-space separated words.
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)
df.tail()

Unnamed: 0,review,sentiment,cleaned_review
49995,I thought this movie did a down right good job...,positive,think this movie do a down right good job be...
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,bad plot bad dialogue bad act idiotic direct t...
49997,I am a Catholic taught in parochial elementary...,negative,be a Catholic teach in parochial elementary s...
49998,I'm going to have to disagree with the previou...,negative,be go to have to disagree with the previous c...
49999,No one expects the Star Trek movies to be high...,negative,no one expect the Star Trek movie to be high a...


In [16]:
# Keep only the label and the processed text; the raw review is no longer needed.
final_df = df.drop(columns=['review'])

In [17]:
# Preview the last rows of the frame that will be exported.
final_df.tail(n=5)

Unnamed: 0,sentiment,cleaned_review
49995,positive,think this movie do a down right good job be...
49996,negative,bad plot bad dialogue bad act idiotic direct t...
49997,negative,be a Catholic teach in parochial elementary s...
49998,negative,be go to have to disagree with the previous c...
49999,negative,no one expect the Star Trek movie to be high a...


In [18]:
# Write the processed dataset to disk without the index column.
final_df.to_csv(path_or_buf='../data/final_data.csv', index=False)