In [122]:
import re
import os
import string
from collections import Counter
from nltk.corpus import stopwords

# load a particular document in memory
def load_doc(filename):
    """Read an entire text file and return its contents as one string.

    Args:
        filename: Path of the file to read; it is opened in text mode ("r").

    Returns:
        The full contents of the file as a ``str``.
    """
    # The context manager closes the handle even when read() raises.
    with open(filename, "r") as file:
        return file.read()

In [123]:
def clean_doc(doc):
    """Turn raw document text into a list of filtered word tokens.

    The text is split on whitespace. Each piece then has every character
    from ``string.punctuation`` removed and is kept only if it is purely
    alphabetic, is not an NLTK English stop word, and is longer than one
    character. Token order is preserved.

    Note: the stop-word comparison is case-sensitive and happens after
    punctuation has been stripped, so stop-word entries that contain an
    apostrophe can no longer match their stripped form.

    Args:
        doc: The document text as a single string.

    Returns:
        A list of the surviving tokens, in their original order.
    """
    # character class matching any single punctuation mark
    punct_pattern = re.compile("[%s]" % re.escape(string.punctuation))
    english_stops = set(stopwords.words('english'))

    cleaned = []
    for raw in doc.split():
        word = punct_pattern.sub('', raw)

        # drop anything that is not purely alphabetic (also drops '')
        if not word.isalpha():
            continue

        # drop stop words
        if word in english_stops:
            continue

        # drop single-character tokens
        if len(word) <= 1:
            continue

        cleaned.append(word)

    return cleaned

In [124]:
# Try the cleaning pipeline on a single review from the "neg" directory:
# load the raw text, clean it into tokens, and print the resulting list.
filename = "datasets/txt_sentoken/neg/cv023_13847.txt"
text = load_doc(filename)
tokens = clean_doc(text)
print(tokens)

['snake', 'eyes', 'aggravating', 'kind', 'movie', 'kind', 'shows', 'much', 'potential', 'becomes', 'unbelievably', 'disappointing', 'brian', 'depalma', 'film', 'since', 'hes', 'great', 'director', 'one', 'whos', 'films', 'always', 'greeted', 'least', 'fanfare', 'even', 'film', 'starring', 'nicolas', 'cage', 'since', 'gives', 'brauvara', 'performance', 'film', 'hardly', 'worth', 'talents', 'worse', 'aggravating', 'sole', 'reason', 'story', 'could', 'much', 'could', 'totally', 'intelligent', 'opens', 'absolutely', 'subtlety', 'handled', 'complexly', 'intensely', 'one', 'point', 'movie', 'makes', 'wrong', 'turn', 'leads', 'hall', 'fame', 'halfassedness', 'deservedly', 'hall', 'fame', 'eighthassedness', 'certain', 'circles', 'snake', 'eyes', 'advertised', 'kind', 'modern', 'day', 'version', 'kurosawas', 'classic', 'rashomon', 'crime', 'told', 'four', 'different', 'mean', 'different', 'perspectives', 'looks', 'though', 'may', 'actually', 'like', 'opening', 'might', 'add', 'superb', 'one', '

In [125]:
# load the doc and then add to vocab

def add_doc_to_vocab(filename, vocab:Counter):
    """Count the cleaned tokens of one document into ``vocab``.

    Args:
        filename: Path of the document to read with ``load_doc``.
        vocab: Counter that is updated in place with the token counts
            produced by ``clean_doc`` for this document.
    """
    # read -> clean -> count, composed in a single expression
    vocab.update(clean_doc(load_doc(filename)))

In [126]:
# load all docs in a directory
def process_docs(dir, vocab):
    """Add every ``.txt`` document found directly in ``dir`` to ``vocab``.

    Args:
        dir: Directory whose entries are scanned (not recursive).
        vocab: Counter passed through to ``add_doc_to_vocab`` and updated
            in place.
    """
    for entry in os.listdir(dir):
        # only plain-text review files are of interest
        if not entry.endswith(".txt"):
            continue
        add_doc_to_vocab(os.path.join(dir, entry), vocab)


In [127]:
def save_list(lines, filename):
    """Write a sequence of strings to a text file, one item per line.

    The items are joined with a newline character, so no newline is written
    after the last item. The file is opened in mode "w", which replaces any
    existing content.

    Args:
        lines: Iterable of strings to write.
        filename: Path of the file to create or overwrite.
    """
    data = "\n".join(lines)
    # The context manager closes (and flushes) the handle even when
    # write() raises.
    with open(filename, mode="w") as file:
        file.write(data)

In [128]:
# Start from an empty Counter; rebinding `vocab` here means re-running this
# cell rebuilds the counts from scratch instead of adding to earlier ones.
vocab = Counter()

# Count the tokens of every .txt review in the "neg" and "pos" directories
# into the same Counter.
process_docs(dir=r"datasets/txt_sentoken/neg", vocab=vocab)
process_docs(dir=r"datasets/txt_sentoken/pos", vocab=vocab)

# Number of distinct tokens seen across both directories.
print(len(vocab))

46557


In [129]:
# Print the 50 most frequent tokens in the vocab with their counts.
print(vocab.most_common(50))

[('film', 8860), ('one', 5521), ('movie', 5440), ('like', 3553), ('even', 2555), ('good', 2320), ('time', 2283), ('story', 2118), ('films', 2102), ('would', 2042), ('much', 2024), ('also', 1965), ('characters', 1947), ('get', 1921), ('character', 1906), ('two', 1825), ('first', 1768), ('see', 1730), ('well', 1694), ('way', 1668), ('make', 1590), ('really', 1563), ('little', 1491), ('life', 1472), ('plot', 1451), ('people', 1420), ('movies', 1416), ('could', 1395), ('bad', 1374), ('scene', 1373), ('never', 1364), ('best', 1301), ('new', 1277), ('many', 1268), ('doesnt', 1267), ('man', 1266), ('scenes', 1265), ('dont', 1210), ('know', 1207), ('hes', 1150), ('great', 1141), ('another', 1111), ('love', 1089), ('action', 1078), ('go', 1075), ('us', 1065), ('director', 1056), ('something', 1048), ('end', 1047), ('still', 1038)]


In [130]:
# Keep only tokens that occur at least `min_occurance` times (>= 2 here).
# Note: this rebinds `tokens`, which earlier held the token list of a single
# review, to the list of vocabulary words that pass the threshold.
min_occurance = 2
tokens = [k for k,c in vocab.items() if c >= min_occurance]
print(len(tokens))

27139


In [131]:
# Save the filtered vocabulary to 'vocab.txt', one token per line.
save_list(tokens, 'vocab.txt')

In [132]:
# prepare the reviews
# load doc into memory
def load_doc(filename):
    """Read an entire text file and return its contents as one string.

    This re-defines the ``load_doc`` from the first cell with the same
    behaviour.

    Args:
        filename: Path of the file to read; it is opened in text mode ('r').

    Returns:
        The full contents of the file as a ``str``.
    """
    # The context manager closes the handle even when read() raises.
    with open(filename, 'r') as file:
        return file.read()

In [133]:
# Load the vocab: read the saved vocabulary file, split it on whitespace into
# tokens, and keep them as a set for membership tests.
# Note: this rebinds `vocab`, which held the Counter built earlier.
vocab_filename = r'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)

In [134]:
# load the doc, clean and return line of tokens
def doc_to_line(filename, vocab):
    """Load one document and reduce it to a single line of in-vocab tokens.

    Args:
        filename: Path of the document to read with ``load_doc``.
        vocab: Container of allowed tokens; only ``w in vocab`` is used.

    Returns:
        The tokens produced by ``clean_doc`` that are present in ``vocab``,
        in their original order, joined by single spaces. The result is an
        empty string when no token survives.
    """
    # load the doc
    doc = load_doc(filename)

    # clean the doc
    tokens = clean_doc(doc)

    # filter by vocab
    tokens = [w for w in tokens if w in vocab]

    # no trailing backslash here: the return statement ends on this line
    return ' '.join(tokens)

In [135]:
# load all docs in a directory
def process_docs_new(dir, vocab):
    """Convert every ``.txt`` document directly in ``dir`` to a token line.

    Args:
        dir: Directory whose entries are scanned (not recursive).
        vocab: Container of allowed tokens, passed to ``doc_to_line``.

    Returns:
        A list with one string per ``.txt`` file, as returned by
        ``doc_to_line``, ordered by file name.
    """
    lines = []

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # order of the returned lines reproducible across runs and platforms.
    for filename in sorted(os.listdir(dir)):
        if filename.endswith(".txt"):
            path = os.path.join(dir, filename)

            # load and clean the doc, then add it to the list
            lines.append(doc_to_line(path, vocab))

    return lines

In [136]:
# Reload the saved vocabulary as a set of tokens. This repeats the steps of
# the earlier vocab-loading cell, so the cell also works on its own.
vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)

# prepare negative reviews: one vocab-filtered line per review file,
# written to 'negative.txt'
negative_lines = process_docs_new(dir=r"datasets/txt_sentoken/neg", vocab=vocab)
save_list(negative_lines, 'negative.txt')

# prepare positive reviews the same way, written to 'positive.txt'
positive_lines = process_docs_new(dir="datasets/txt_sentoken/pos", vocab=vocab)
save_list(lines=positive_lines, filename='positive.txt')