In [1]:
from sklearn.model_selection import train_test_split
import os
import unicodedata
import re
import nltk
from collections import Counter

In [2]:
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')

In [3]:
def remove_html_markup(s):
    tag = False
    quote = False
    out = ""
    for c in s:
        if c == '<' and not quote:
            tag = True
        elif c == '>' and not quote:
            tag = False
        elif (c == '"' or c == "'") and tag:
            quote = not quote
        elif not tag:
            out = out + c
    return out

In [4]:
def preprocess_sentence(w):
    w = remove_html_markup(w)
    w = w.lower().strip()
    if w == '':
        return 0
    else:
        w = unicode_to_ascii(w)
        w = re.sub(r"[^a-z]+", " ", w)
        w = w.strip()
        w = re.sub(r'\s+', ' ', w)
    w = w.split(' ')
    w = [i for i in w if i != '']
    stopwords = set(nltk.corpus.stopwords.words('english'))
    w = [i for i in w if i not in stopwords]
    stemmer = nltk.stem.porter.PorterStemmer()
    w = [stemmer.stem(i) for i in w]
    return w

In [5]:
def lines_to_text(lines, sep):
    text = ''
    for i in range(len(lines)):
        if i == len(lines) - 1:
            text += str(lines[i])
        else:
            text += str(lines[i]) + sep
    return text

In [6]:
def retrieve_frequent_words(unique_words):
    new_unique_words = ['<unk>']
    for i in list(unique_words.keys()):
        if unique_words[i] >= 5:
            new_unique_words.append(i)
    return new_unique_words

In [7]:
def create_vocabulary(pos_lines, neg_lines):
    lines = pos_lines + neg_lines
    text = lines_to_text(lines, ' ')
    unique_words = Counter(text.split(' '))
    print('No. of unique words in training dataset: ', len(unique_words.keys()))
    print()
    unique_words = retrieve_frequent_words(unique_words)
    print('New no. of unique words in training dataset: ', len(unique_words))
    print()
    word_index = {i: unique_words.index(i) for i in unique_words}
    return word_index

In [8]:
def create_dataset(lines):
    new_lines = []
    for i in lines:
        s = preprocess_sentence(i)
        if s != 0:
            new_lines.append(s)
    return new_lines

In [9]:
def text_retrieve(file_names):
    text_files = []
    for i in file_names:
        with open(i, 'r') as f:
            text_files.append(f.read())
        f.close()
    return text_files

In [10]:
def find_files():
    path = '/Users/preethamganesh/Downloads/aclImdb/'
    pos_files_train = [os.path.join(path + 'train/pos/', f) for f in os.listdir(path + 'train/pos/') if
                       os.path.isfile(os.path.join(path + 'train/pos/', f))]
    pos_files_test = [os.path.join(path + 'test/pos/', f) for f in os.listdir(path + 'test/pos/') if
                      os.path.isfile(os.path.join(path + 'test/pos/', f))]
    pos_files = pos_files_train + pos_files_test
    neg_files_train = [os.path.join(path + 'train/neg/', f) for f in os.listdir(path + 'train/neg/') if
                       os.path.isfile(os.path.join(path + 'train/neg/', f))]
    neg_files_test = [os.path.join(path + 'test/neg/', f) for f in os.listdir(path + 'test/neg/') if
                      os.path.isfile(os.path.join(path + 'test/neg/', f))]
    neg_files = neg_files_train + neg_files_test
    print('No. of positive reviews: ', len(pos_files))
    print('No. of negative reviews: ', len(neg_files))
    print()
    return pos_files, neg_files

In [24]:
def remove_unk_words(lines, vocabulary):
    new_lines = []
    for i in lines:
        new_line = []
        for j in i:
            if j in vocabulary:
                new_line.append(j)
            else:
                new_line.append('<unk>')
        new_lines.append(new_line)
    return new_lines

In [23]:
pos_files, neg_files = find_files()
pos_text = text_retrieve(pos_files)
del pos_files
neg_text = text_retrieve(neg_files)
del neg_files
pos_lines = create_dataset(pos_text)
del pos_text
print('New no. of positive reviews after data preprocessing: ', len(pos_lines))
neg_lines = create_dataset(neg_text)
del neg_text
print('New no. of negative reviews after data preprocessing: ', len(neg_lines))
print()

No. of positive reviews:  25000
No. of negative reviews:  25000

New no. of positive reviews after data preprocessing:  25000
New no. of negative reviews after data preprocessing:  25000



In [25]:
train_pos, val_pos = train_test_split(pos_lines, test_size=0.2)
val_pos, test_pos = train_test_split(val_pos, test_size=0.5)
train_neg, val_neg = train_test_split(neg_lines, test_size=0.2)
val_neg, test_neg = train_test_split(val_neg, test_size=0.5)
del pos_lines, neg_lines
print('No. of positive reviews in training set: ', len(train_pos))
print('No. of negative reviews in training set: ', len(train_neg))
print('No. of positive reviews in validation set: ', len(val_pos))
print('No. of negative reviews in validation set: ', len(val_neg))
print('No. of positive reviews in testing set: ', len(test_pos))
print('No. of negative reviews in testing set: ', len(test_neg))
print()

No. of positive reviews in training set:  20000
No. of negative reviews in training set:  20000
No. of positive reviews in validation set:  2500
No. of negative reviews in validation set:  2500
No. of positive reviews in testing set:  2500
No. of negative reviews in testing set:  2500



In [26]:
vocabulary = create_vocabulary(train_pos, train_neg)

No. of unique words in training dataset:  75495

New no. of unique words in training dataset:  26533



In [15]:
val_pos = remove_unk_words(val_pos, vocabulary)

In [27]:
print(val_neg[0])

['woman', 'find', 'caught', 'appar', 'inexplic', 'rash', 'suicid', 'realli', 'suicid', 'seem', 'though', 'ghost', 'loos', 'guess', 'ghost', 'long', 'black', 'hair', 'hide', 'face', 'move', 'head', 'arm', 'twist', 'way', 'make', 'weird', 'look', 'specter', 'recurr', 'theme', 'music', 'play', 'unexpect', 'locat', 'announc', 'arriv', 'ghost', 'bizarr', 'sound', 'effect', 'resembl', 'sound', 'small', 'twig', 'snap', 'alway', 'accompani', 'hapless', 'spirit', 'sound', 'familiar', 'well', 'seen', 'asian', 'horror', 'movi', 'last', 'year', 'borrow', 'heavili', 'unabashedli', 'ring', 'one', 'last', 'call', 'recent', 'horror', 'hit', 'film', 'tri', 'carri', 'uninterest', 'worn', 'plot', 'climax', 'seen', 'avoid', 'unless', 'desper', 'seen', 'asian', 'horror', 'movi', 'last', 'year']


In [28]:
val_neg = remove_unk_words(val_neg, vocabulary)

In [30]:
print(val_neg[0])

['<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>']


In [31]:
print(vocabulary)

{'<unk>': 0, "['probabl',": 1, "'one',": 2, "'best',": 3, "'thriller',": 4, "'ever',": 5, "'seen',": 6, "'action',": 7, "'bullet',": 8, "'fli',": 9, "'good',": 10, "'guy',": 11, "'bad',": 12, "'van',": 13, "'damm',": 14, "'stallon',": 15, "'quick',": 16, "'realist',": 17, "'nervou',": 18, "'plot',": 19, "'caus',": 20, "'till',": 21, "'end',": 22, "'movi',": 23, "'know',": 24, "'gonna',": 25, "'charact',": 26, "'aidan',": 27, "'quinn',": 28, "'donald',": 29, "'sutherland',": 30, "'ben',": 31, "'kingsley',": 32, "'perfect',": 33, "'suspens',": 34, "'let',": 35, "'go',": 36, "'away',": 37, "'though',": 38, "'find',": 39, "'difficult',": 40, "'keep',": 41, "'attent',": 42, "'stori',": 43, "'basic',": 44, "'tom',": 45, "'clanci',": 46, "'simpl',": 47, "'raw',": 48, "'great',": 49, "'act',": 50, "'watch',": 51, "'promis',": 52, "'end']": 53, "['would',": 54, "'rate',": 55, "'higher',": 56, "'serious',": 57, "'uneven',": 58, "'irish',": 59, "'accent',": 60, "'barbara',": 61, "'hershey',": 62,