In [1]:
import importlib
import os
import sys
from pathlib import Path

# Must run before the project-local import below so that `utils` is importable.
sys.path.insert(0, str(Path().resolve().parents[1]))

import pandas as pd
from utils.data_preprocessing import text_preprocessors

In [2]:
# Dataset folder name; shared by the input path and the output path so that
# switching datasets only requires changing this one value.
DATA_SET = 'imdb_review'
DATA_DIR = Path('data/')
# Derived from DATA_DIR and DATA_SET instead of repeating the dataset name.
# Evaluates to the same path as before: data/preprocessed/imdb_review
OUT_DIR = DATA_DIR / 'preprocessed' / DATA_SET

# Column holding the raw review text, and the column that will hold the
# preprocessed text.
REVIEW_COL = 'Review'
PREPROCESSED_COL = f'Processed {REVIEW_COL}'

# Two directory levels above the kernel's working directory.
# NOTE(review): presumably the project root - confirm against the repo layout.
CWD = Path(os.getcwd()).parent.parent

# Reload so that edits to the preprocessing module are picked up without
# restarting the kernel, then grab its lookup of preprocessing steps.
importlib.reload(text_preprocessors)
PREPROCESSOR = text_preprocessors.PREPROCESSOR

# Enable Preprocessing Steps

In [3]:
# Switchboard of preprocessing steps: (step name, enabled?) pairs.
# ORDER MATTERS - the mapping is iterated in this order when the steps are
# applied, so each enabled step receives the output of the previous one.
PREPROCESSOR_STATE = dict([
    ('LOWER_CASE', True),
    ('CONVERT_EMOTE', True),
    ('REMOVE_URLs', True),
    ('REMOVE_XML', True),
    ('REMOVE_PUNCTUATION', True),
    ('REMOVE_STOPWORDS', True),
    ('STEM', True),
    ('REMOVE_MOST_FREQ', True),
])

In [4]:
# Anchor the configured directories at CWD so the paths do not depend on
# where the kernel was started.
DATA_DIR, OUT_DIR = CWD / DATA_DIR, CWD / OUT_DIR

# Read the train and test splits of the selected dataset.
train_data = pd.read_csv(DATA_DIR.joinpath(DATA_SET, 'train.csv'))
test_data = pd.read_csv(DATA_DIR.joinpath(DATA_SET, 'test.csv'))

In [5]:
def preprocess_reviews(data: pd.DataFrame) -> None:
    """Run the enabled preprocessing steps on the review column of ``data``.

    Every step whose flag in ``PREPROCESSOR_STATE`` is truthy is looked up in
    ``PREPROCESSOR`` and applied, in the mapping's iteration order, to the
    ``REVIEW_COL`` series; each step receives the previous step's output.
    The result is stored in ``PREPROCESSED_COL``, placed directly after
    ``REVIEW_COL`` the first time the function runs on a frame.

    The function is safe to call repeatedly on the same frame (e.g. when a
    notebook cell is re-run): an existing ``PREPROCESSED_COL`` is overwritten
    in place instead of triggering a duplicate-column error.

    Args:
        data: Frame containing ``REVIEW_COL``. Modified in place.

    Returns:
        None. ``data`` itself gains (or has refreshed) ``PREPROCESSED_COL``.

    Raises:
        KeyError: If ``REVIEW_COL`` is missing from ``data``, or an enabled
            step has no entry in ``PREPROCESSOR``.
    """
    reviews: pd.Series = data[REVIEW_COL]

    for key, enabled in PREPROCESSOR_STATE.items():
        if not enabled:
            continue
        # Only the lookup is guarded, so a KeyError raised inside a step
        # itself is not mislabelled as a configuration problem.
        try:
            step = PREPROCESSOR[key]
        except KeyError as err:
            raise KeyError(
                f"No preprocessor registered for enabled step {key!r}"
            ) from err
        reviews = step(reviews)

    if PREPROCESSED_COL in data.columns:
        # Re-run: DataFrame.insert would raise ValueError on the existing
        # column, so refresh its values and keep its position.
        data[PREPROCESSED_COL] = reviews
    else:
        data.insert(
            data.columns.get_loc(REVIEW_COL) + 1,
            PREPROCESSED_COL,
            reviews,
        )

In [6]:
# Add the processed-review column to the training frame (modified in place),
# then preview the first rows to sanity-check the result.
preprocess_reviews(train_data)
train_data.head()

  lambda s: BeautifulSoup(s, 'lxml').text


N Most Freq Words: {'movi': 47423, 'film': 45019, 'one': 25421, 'like': 21719, 'time': 14672, 'good': 14449, 'make': 14436, 'get': 13942, 'charact': 13759, 'see': 13434}


Unnamed: 0,ID,Movie Id,Rating,Review,Processed Review,Sentiment
0,1821,114057,4,Working with one of the best Shakespeare sourc...,work best shakespear sourc manag credit sourc ...,-1
1,10402,334541,1,"Well...tremors I, the original started off in ...",welltremor origin start 1990 found quit enjoy ...,-1
2,1062,337640,4,Ouch! This one was a bit painful to sit throug...,ouch bit pain sit cute amus premis goe hell ma...,-1
3,9056,219400,1,"I've seen some crappy movies in my life, but t...",ive seen crappi life must among worst defin bo...,-1
4,5392,806203,3,"""Carriers"" follows the exploits of two guys an...",carrier follow exploit two guy two gal stolen ...,-1


In [7]:
# Add the processed-review column to the test frame (modified in place),
# then preview the first rows to sanity-check the result.
preprocess_reviews(test_data)
test_data.head()

  lambda s: BeautifulSoup(s, 'lxml').text


N Most Freq Words: {'movi': 47200, 'film': 44459, 'one': 25386, 'like': 21412, 'time': 14297, 'make': 14010, 'good': 13947, 'charact': 13722, 'get': 13612, 'see': 13429}


Unnamed: 0,ID,Movie Id,Rating,Review,Processed Review,Sentiment
0,1821,138541,4,Alan Rickman & Emma Thompson give good perform...,alan rickman emma thompson give perform southe...,-1
1,9487,202521,1,I have seen this movie and I did not care for ...,seen care anyhow would think go pari countri n...,-1
2,4604,417658,4,"In Los Angeles, the alcoholic and lazy Hank Ch...",lo angel alcohol lazi hank chinaski matt dillo...,-1
3,2828,66105,2,"This film is bundled along with ""Gli fumavano ...",bundl along gli fumavano le colt lo chiamavano...,-1
4,10890,787505,1,I only comment on really very good films and o...,comment realli utter rubbish aim help peopl wa...,-1


In [9]:
# Create the output directory first: to_csv does not create missing parent
# directories and would otherwise fail on a fresh checkout.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Persist both preprocessed splits; index=False keeps the row index out of
# the written files.
train_data.to_csv(OUT_DIR / 'train.csv', index=False)
test_data.to_csv(OUT_DIR / 'test.csv', index=False)