### Get dataset

In [1]:
import os
import urllib.request
import tarfile
import pandas as pd

# Remote location of the IMDb review archive, the local file name it is saved
# under, and the directory whose presence marks the data as already unpacked.
source = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
target = 'aclImdb_v1.tar.gz'
extracted_dir = 'aclImdb'

# Download only when neither the unpacked data nor the archive is present.
# The transfer goes to a temporary name and is renamed on success, so an
# interrupted download never leaves a truncated file under `target` that a
# later run would mistake for a complete archive.
if not os.path.isdir(extracted_dir) and not os.path.isfile(target):
    partial_target = target + '.part'
    urllib.request.urlretrieve(source, partial_target)
    os.replace(partial_target, target)

# Unpack once; later runs reuse the extracted directory.
if not os.path.isdir(extracted_dir):
    with tarfile.open(target, 'r:gz') as tar:
        # The archive comes over plain HTTP, so prefer the 'data' extraction
        # filter (rejects absolute paths, parent-directory escapes, etc.)
        # whenever the running Python provides it; older versions without
        # extraction filters fall back to the original call.
        if hasattr(tarfile, 'data_filter'):
            tar.extractall(filter='data')
        else:
            tar.extractall()

In [2]:
# Build dataframe

basepath = 'aclImdb'

# Directory name -> numeric sentiment label.
labels = {'pos': 1, 'neg': 0}

# Collect one [text, label] record per review file and construct the frame in
# a single call.  Growing a DataFrame row by row copies the whole frame on
# every iteration, and DataFrame.append no longer exists in pandas >= 2.0.
records = []
for split in ('test', 'train'):
    for label_name in ('pos', 'neg'):
        path = os.path.join(basepath, split, label_name)
        # sorted() keeps the row order deterministic across file systems,
        # which the seeded shuffle further down relies on.
        for file_name in sorted(os.listdir(path)):
            with open(os.path.join(path, file_name),
                      'r', encoding='utf-8') as infile:
                records.append([infile.read(), labels[label_name]])

df = pd.DataFrame(records, columns=['review', 'sentiment'])

In [3]:
# Shuffling the dataset

import numpy as np

# Fix the global NumPy seed so the shuffled order is the same on every run.
np.random.seed(0)

# Draw a random ordering of the existing index labels, then rearrange the
# rows to follow it (each row keeps its original label).
shuffled_index = np.random.permutation(df.index)
df = df.reindex(index=shuffled_index)

# Show the first three rows of the shuffled frame.
df.head(3)

Unnamed: 0,review,sentiment
11841,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
19602,OK... so... I really like Kris Kristofferson a...,0
45519,"***SPOILER*** Do not read this, if you think a...",0


### Cleaning text with regex

In [4]:
import re
def preprocessor(text):
    """Normalize a raw review string for bag-of-words style processing.

    Steps:
      1. Remove anything that looks like an HTML tag (``<...>``).
      2. Collect emoticons made of eyes (``:``, ``;``, ``=``), an optional
         ``-`` nose and a mouth (``)``, ``(``, ``D``, ``P``).
      3. Lowercase the text and collapse every run of non-word characters
         into a single space.
      4. Append the collected emoticons, space-separated and with the nose
         removed, to the end of the cleaned text.

    Parameters
    ----------
    text : str
        Raw text, possibly containing HTML markup.

    Returns
    -------
    str
        The cleaned text followed by any emoticons that were found.
    """
    # Raw strings keep the regex escapes (\W, \), \() from being treated as
    # invalid string escape sequences by the Python parser.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)

    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    emoticon_suffix = ' '.join(emoticons).replace('-', '')

    # When the cleaned text ends in a word character, plain concatenation
    # would glue the first emoticon onto the last word ('movie:)').  Insert
    # a separator only in that case so other outputs stay unchanged.
    if emoticon_suffix and not cleaned.endswith(' '):
        cleaned += ' '
    return cleaned + emoticon_suffix

In [7]:
# Sanity check: the <br /> tags and punctuation are dropped and the text is lowercased.
preprocessor('is seven.<br /><br />Title (Brazil): Not Available')

'is seven title brazil not available'

In [8]:
# Sanity check: emoticons are moved to the end of the string and lose their '-' nose.
preprocessor('</a>This :) is :( a test :-)!')

'this is a test :) :( :)'

In [9]:
# Clean every review; this overwrites the raw text stored in df['review'].
df['review'] = df['review'].apply(preprocessor)

In [10]:
# Preview the first three rows after cleaning.
df.head(3)

Unnamed: 0,review,sentiment
11841,in 1974 the teenager martha moxley maggie grac...,1
19602,ok so i really like kris kristofferson and his...,0
45519,spoiler do not read this if you think about w...,0
