### Get dataset

In [10]:
# Exploratory cell: print the documentation of the standard-library tarfile module.
# The import lives here because this cell sits above the notebook's main import
# cell; without it a fresh-kernel "Run All" fails with NameError on `tarfile`.
import tarfile

help(tarfile)

Help on module tarfile:

NAME
    tarfile - Read from and write to tar format archives.

MODULE REFERENCE
    https://docs.python.org/3.9/library/tarfile
    
    The following documentation is automatically generated from the Python
    source files.  It may be incomplete, incorrect or include features that
    are considered implementation detail and may vary between Python
    implementations.  When in doubt, consult the module reference at the
    location listed above.

CLASSES
    builtins.Exception(builtins.BaseException)
        TarError
            CompressionError
            ExtractError
            HeaderError
            ReadError
            StreamError
    builtins.object
        TarFile
        TarInfo
    
    class CompressionError(TarError)
     |  Exception for unavailable compression methods.
     |  
     |  Method resolution order:
     |      CompressionError
     |      TarError
     |      builtins.Exception
     |      builtins.BaseException
     |      built

In [4]:
import os
import urllib.request
import tarfile
import pandas as pd

source = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
target = '/tmp/aclImdb_v1.tar.gz'

# Directory the archive is unpacked into, and the dataset folder the rest of
# the notebook reads from (/tmp/aclImdb).
extract_dir = os.path.dirname(target)
dataset_dir = os.path.join(extract_dir, 'aclImdb')

# Download only when neither the extracted dataset nor the archive is present.
if not os.path.isdir(dataset_dir) and not os.path.isfile(target):
    urllib.request.urlretrieve(source, target)

# Unpack next to the archive. Without an explicit path, extractall() writes
# into the current working directory, so the dataset would not end up in
# dataset_dir unless the kernel happened to be started in /tmp.
if not os.path.isdir(dataset_dir):
    with tarfile.open(target, 'r:gz') as tar:
        tar.extractall(path=extract_dir)

In [5]:
# Build dataframe

basepath = '/tmp/aclImdb'

# Sub-directory name -> numeric sentiment label.
labels = {'pos': 1, 'neg': 0}

# Collect all reviews in a plain list and build the frame once at the end.
# DataFrame.append copied the whole frame on every call (quadratic over the
# number of review files) and was removed in pandas 2.0.
rows = []
for split in ('test', 'train'):
    for label_name in ('pos', 'neg'):
        path = os.path.join(basepath, split, label_name)
        # sorted() keeps the row order independent of the directory listing order.
        for file in sorted(os.listdir(path)):
            with open(os.path.join(path, file),
                      'r', encoding='utf-8') as infile:
                rows.append((infile.read(), labels[label_name]))

df = pd.DataFrame(rows, columns=['review', 'sentiment'])

In [None]:
# Shuffle the rows of the dataset in a reproducible way

import numpy as np

# Fixed seed so the permutation drawn below is the same on every fresh run.
np.random.seed(0)
shuffled_index = np.random.permutation(df.index)

# Reorder the rows to follow the permuted index labels, then preview the result.
df = df.reindex(shuffled_index)
df.head(3)

### Cleaning text with regex

In [None]:
import re
def preprocessor(text):
    """Clean a raw review string for bag-of-words style processing.

    Steps:
      1. Remove HTML-like tags (anything between ``<`` and ``>``).
      2. Collect emoticons such as ``:)``, ``;-(``, ``=D`` or ``:P`` before
         punctuation is stripped.
      3. Lower-case the text and collapse every run of non-word characters
         into a single space.
      4. Append the collected emoticons (with the ``-`` nose removed),
         separated by single spaces, to the end of the cleaned text.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML markup.

    Returns
    -------
    str
        The cleaned, lower-cased text followed by the emoticons found in it.
    """
    # Raw strings keep the regex escapes (\), \(, \W) from being treated as
    # invalid Python string escapes.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    emoticon_text = ' '.join(emoticons).replace('-', '')
    # When the cleaned text ends in a word character, add a separator so the
    # first emoticon is not glued onto the last word (e.g. 'it:)').
    if emoticon_text and not cleaned.endswith(' '):
        cleaned += ' '
    return cleaned + emoticon_text

In [None]:
# Demo: a snippet with <br /> tags and punctuation but no emoticons.
preprocessor('is seven.<br /><br />Title (Brazil): Not Available')

In [None]:
# Demo: a snippet with a stray closing tag and three emoticons (one with a '-' nose).
preprocessor('</a>This :) is :( a test :-)!')

In [None]:
# Replace every raw review with its cleaned version (overwrites the 'review' column).
df['review'] = df['review'].apply(preprocessor)

In [None]:
# Preview the first three rows of the frame.
df.head(3)