In [1]:
import os
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import tarfile  # this is to extract the data from that .tgz file
from IPython.display import display
from ydata_profiling import ProfileReport
import numpy as np
import multiprocessing as mp
import string
import spacy 
from nltk.tokenize import word_tokenize
from sklearn.base import TransformerMixin, BaseEstimator
from unicodedata import normalize
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
BASE_DIR = os.getcwd()  # or use pathlib.Path().resolve()
_resources_path = os.path.join(BASE_DIR, 'resources')
_profiling_path = os.path.join(_resources_path, 'profiling')
_data_path = os.path.join(_resources_path, 'data')

In [3]:
tgz_path = os.path.join(_resources_path, 'amazon_review_polarity_csv.tgz')

if not os.path.isfile(tgz_path):
    raise FileNotFoundError(f"Archive not found: {tgz_path}")

if not os.path.isdir(_data_path):
    with tarfile.open(tgz_path, 'r:gz') as amazon_reviews:
        amazon_reviews.extractall(_data_path)
        print(f"Extracted contents to: {_data_path}")

In [4]:
train_df = pd.read_csv(os.path.join(_data_path, 'amazon_review_polarity_csv', 'train.csv'), header=None)
test_df = pd.read_csv(os.path.join(_data_path, 'amazon_review_polarity_csv', 'test.csv'), header=None)

In [5]:
train_df.head()

Unnamed: 0,0,1,2
0,2,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...
1,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
2,2,Amazing!,This soundtrack is my favorite music of all ti...
3,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
4,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."


In [6]:
test_df.head()

Unnamed: 0,0,1,2
0,2,Great CD,My lovely Pat has one of the GREAT voices of h...
1,2,One of the best game music soundtracks - for a...,Despite the fact that I have only played a sma...
2,1,Batteries died within a year ...,I bought this charger in Jul 2003 and it worke...
3,2,"works fine, but Maha Energy is better",Check out Maha Energy's website. Their Powerex...
4,2,Great for the non-audiophile,Reviewed quite a bit of the combo players and ...


In [7]:
print("Train Data Size: ", train_df.shape, "\nTest Data Size: ", test_df.shape)

Train Data Size:  (3600000, 3) 
Test Data Size:  (400000, 3)


In [8]:
# profile_test_df = ProfileReport(test_df, title="Train EDA Report", explorative=True)

In [9]:
# profile_test_df.to_file(os.path.join(_profiling_path, "test_eda_report.html"))

In [None]:
nlp = spacy.load('en_core_web_sm')

class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, variety="BrE", user_abbrevs={}, n_jobs=1):
        """
        Text preprocessing transformer includes steps:
        1. Text normalization
        2. Punctuation removal
        3. Stop words removal
        4. Lemmatization
        """
        self.variety = variety
        self.user_abbrevs = user_abbrevs
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        return self

    def transform(self, X, *_):
        X_copy = X.copy()

        partitions = 1
        cores = mp.cpu_count()
        if self.n_jobs <= -1:
            partitions = cores
        elif self.n_jobs <= 0:
            return X_copy.apply(self._preprocess_text)
        else:
            partitions = min(self.n_jobs, cores)

        data_split = np.array_split(X_copy, partitions)
        pool = mp.Pool(cores)
        data = pd.concat(pool.map(self._preprocess_part, data_split))
        pool.close()
        pool.join()

        return data

    def _preprocess_part(self, part):
        return pd.Series(self._preprocess_texts(part), index=part.index)

    def _preprocess_texts(self, texts):
        results = []
        for doc in nlp.pipe(texts, batch_size=1000, n_process=1):  # Keep `n_process=1` on Kaggle
            doc = self._remove_punct(doc)
            doc = self._remove_stop_words(doc)
            results.append(self._lemmatize(doc))
        return results

    def _normalize(self, text: str) -> str:
        try:
            text = normalize('NFKC', text)
            text = text.lower()
            text = re.sub(r'\s+', ' ', text)
            return text.strip()
        except Exception as e:
            return text


    def _remove_punct(self, doc):
        return [t for t in doc if t.text not in string.punctuation]

    def _remove_stop_words(self, doc):
        return [t for t in doc if not t.is_stop]

    def _lemmatize(self, doc):
        return ' '.join([t.lemma_ for t in doc])

In [None]:
test_df_processed = TextPreprocessor(n_jobs=-1).transform(test_df[2])

  return bound(*args, **kwds)
