In [1]:
%pip install pyprind pandas scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [3]:
import tarfile

# Unpack the gzip-compressed IMDb archive into the current working directory.
with tarfile.open('aclImdb_v1.tar.gz', 'r:gz') as tar:
    # Prefer the 'data' extraction filter, which rejects unsafe members
    # (absolute paths, links escaping the destination, special files).
    # Interpreters without extraction-filter support fall back to the
    # legacy call; `tarfile.data_filter` is the documented feature probe.
    if hasattr(tarfile, 'data_filter'):
        tar.extractall(filter='data')
    else:
        tar.extractall()

In [2]:
import pyprind
import pandas as pd
import os
import sys

In [3]:

# Root directory of the extracted dataset.
basepath = 'aclImdb'

# Map each class sub-directory name to its integer sentiment label.
labels = dict(pos=1, neg=0)

# Progress bar sized for 50000 iterations, reporting to stdout.
pbar = pyprind.ProgBar(50000, stream=sys.stdout)



In [5]:
# Walk the {test, train} x {pos, neg} directory tree, read every review file
# and collect [text, label] rows, ticking the progress bar once per file.
data = []
for split in ('test', 'train'):
    for sentiment in ('pos', 'neg'):
        review_dir = os.path.join(basepath, split, sentiment)
        # Sorted listing keeps the row order independent of the OS's
        # directory enumeration order.
        for fname in sorted(os.listdir(review_dir)):
            review_path = os.path.join(review_dir, fname)
            with open(review_path, 'r', encoding='utf-8') as infile:
                review_text = infile.read()
            data.append([review_text, labels[sentiment]])
            pbar.update()
df = pd.DataFrame(data, columns=['review', 'sentiment'])

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:25


In [6]:
import numpy as np

# Shuffle the rows with a fixed seed so the resulting order is reproducible,
# then persist the shuffled reviews to CSV without writing the index column.
np.random.seed(0)
shuffled_index = np.random.permutation(df.index)
df = df.reindex(shuffled_index)
df.to_csv('movie_data.csv', index=False, encoding='utf-8')

In [7]:
# Reload the shuffled reviews from disk.
df = pd.read_csv('movie_data.csv', encoding='utf-8')

# On some computers the header comes back as "0"/"1"; map those names to the
# expected ones. When the columns are already named correctly this is a no-op.
column_names = {"0": "review", "1": "sentiment"}
df = df.rename(columns=column_names)

df.head(3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


In [8]:
# Display the (rows, columns) dimensions of the reloaded DataFrame.
df.shape

(50000, 2)

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus for the bag-of-words demonstration.
# NOTE(review): 'sub us' in the third sentence looks like a typo for
# 'sun is'; it is kept verbatim because the recorded outputs of the
# following cells were produced with this exact text.
sentences = [
    'The sun is shining',
    'The weather is sweet',
    'The sub us shining, the weather is sweet',
    'and one and one is two',
]
docs = np.array(sentences)

# Learn the vocabulary and build the sparse document-term count matrix.
count = CountVectorizer()
bag = count.fit_transform(docs)

In [10]:
# Show the fitted vocabulary: a dict mapping each token to its integer feature index.
print(count.vocabulary_)

{'the': 7, 'sun': 5, 'is': 1, 'shining': 3, 'weather': 10, 'sweet': 6, 'sub': 4, 'us': 9, 'and': 0, 'one': 2, 'two': 8}


In [11]:
# Convert the sparse count matrix to a dense array for display
# (one row per document, one column per vocabulary entry).
print(bag.toarray())

[[0 1 0 1 0 1 0 1 0 0 0]
 [0 1 0 0 0 0 1 1 0 0 1]
 [0 1 0 1 1 0 1 2 0 1 1]
 [2 1 2 0 0 0 0 0 1 0 0]]


In [13]:
from sklearn.feature_extraction.text import TfidfTransformer

# Limit printed floats to two decimals (a global NumPy print setting).
np.set_printoptions(precision=2)

# tf-idf weighting with smoothed idf and L2 row normalisation.
tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)

# Re-count the toy documents, reweight the raw counts and show the dense result.
term_counts = count.fit_transform(docs)
tfidf_matrix = tfidf.fit_transform(term_counts)
print(tfidf_matrix.toarray())

[[0.   0.34 0.   0.52 0.   0.66 0.   0.42 0.   0.   0.  ]
 [0.   0.38 0.   0.   0.   0.   0.57 0.46 0.   0.   0.57]
 [0.   0.22 0.   0.33 0.42 0.   0.33 0.53 0.   0.42 0.33]
 [0.66 0.17 0.66 0.   0.   0.   0.   0.   0.33 0.   0.  ]]


In [14]:
# Peek at the last 50 characters of the review at index label 0; the text
# still contains raw HTML markup that needs cleaning.
df.loc[0, 'review'][-50:]

'is seven.<br /><br />Title (Brazil): Not Available'

In [15]:
import re
def preprocessor(text):
    """Clean a raw review string for bag-of-words processing.

    Steps:
      1. Strip HTML tags (anything between ``<`` and ``>``).
      2. Collect emoticons such as ``:)``, ``;-(``, ``=D`` or ``:P`` from the
         tag-free text before any other characters are touched.
      3. Lowercase the text and collapse every run of non-word characters
         into a single space.
      4. Append the emoticons (with the ``-`` "nose" removed), separated
         from the cleaned text and from each other by single spaces.

    Parameters
    ----------
    text : str
        Raw document text, possibly containing HTML markup.

    Returns
    -------
    str
        The cleaned, lowercased text followed by the emoticons found.
        Emoticons keep their original letter case (e.g. ``:D``).
    """
    # Raw strings: '\W', '\)' and '\(' are regex escapes, not Python string
    # escapes, and non-raw literals trigger invalid-escape warnings.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    emoticon_text = ' '.join(emoticons).replace('-', '')
    # When the cleaned text ends in a word character (e.g. "great :) movie"
    # or "fun :D"), insert a separator so the first emoticon does not fuse
    # with the last word.
    if emoticon_text and not cleaned.endswith(' '):
        cleaned += ' '
    return cleaned + emoticon_text

In [None]:
def tokenizer(text):
    """Break ``text`` into tokens at runs of whitespace and return them as a list."""
    tokens = text.split()
    return tokens