# Stats for Human-Provided Rationales

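In this notebook, we experiment with different regular expressions for tokenizing the documents and the human-provided rationales, and then list the rationales that never occur in the training documents of the corresponding class.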

In [1]:
import csv
import operator
import os
import re
from glob import glob

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
def load_imdb_data(path_to_imdb):
    """Load the IMDB review texts and their labels from disk.

    The directory ``path_to_imdb`` must contain the sub-folders
    ``train/neg``, ``train/pos``, ``test/neg`` and ``test/pos``; every
    ``*.txt`` file inside them is read as one document.

    Parameters
    ----------
    path_to_imdb : str
        Root folder of the data set.

    Returns
    -------
    train_corpus : list of str
        Training documents, negative ones first, then positive ones.
    y_train : list of int
        Label per training document: 0 for ``neg``, 1 for ``pos``.
    test_corpus : list of str
        Test documents, in the same neg-then-pos order.
    y_test : list of int
        Label per test document: 0 for ``neg``, 1 for ``pos``.
    """

    def _read_split(split):
        # Read one split ("train" or "test") and return (documents, labels).
        corpus = []
        labels = []
        for label, folder in ((0, "neg"), (1, "pos")):
            pattern = path_to_imdb + "/" + split + "/" + folder + "/*.txt"
            # glob() returns files in arbitrary, OS-dependent order; sorting
            # makes the document order identical from run to run.
            for file_path in sorted(glob(pattern)):
                # Decode explicitly as UTF-8 instead of the platform default
                # (e.g. cp1252 on Windows); undecodable bytes are still
                # replaced rather than raising.
                # NOTE(review): assumes the review files are UTF-8 - confirm.
                with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                    corpus.append(f.read())
                labels.append(label)
        return corpus, labels

    print("Loading the imdb reviews data")
    train_corpus, y_train = _read_split("train")
    test_corpus, y_test = _read_split("test")
    print("Data loaded.")
    return train_corpus, y_train, test_corpus, y_test

In [3]:
def parse_human_phrases(phrase_file, to_lower=False):
    """Count how often each phrase occurs in a tab-separated phrase file.

    Every non-empty field of every row is treated as one phrase.

    Parameters
    ----------
    phrase_file : str
        Path to the tab-separated file.
    to_lower : bool, default False
        If True, phrases are lower-cased before counting, so phrases that
        differ only in case are counted together.

    Returns
    -------
    dict
        Maps each phrase to the number of times it appeared in the file.
    """
    phrases = {}  # key: phrase, value: number of times it appeared in the file
    # newline='' lets the csv module do its own newline handling, as its
    # documentation requires for file objects.
    with open(phrase_file, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter='\t')
        for row in reader:
            for phrase in row:
                # Compare by value: `is not ""` tests object identity and only
                # works by accident of string interning.
                if phrase == "":
                    continue
                if to_lower:
                    phrase = phrase.lower()
                phrases[phrase] = phrases.get(phrase, 0) + 1
    return phrases

In [4]:
# Root folder of the IMDB data set (must contain train/ and test/).
# Set the IMDB_DATA_DIR environment variable to point somewhere else; the
# fallback is the original author's local path.
path_to_imdb = os.environ.get("IMDB_DATA_DIR", "C:/Users/mbilgic/Desktop/aclImdb")

In [5]:
# Read the train and test reviews together with their labels (0 = neg folder, 1 = pos folder).
train_corpus, y_train, test_corpus, y_test = load_imdb_data(path_to_imdb)

Loading the imdb reviews data
Data loaded.


In [6]:
# Convert to NumPy arrays so the corpora can be filtered with boolean label
# masks (e.g. train_corpus[y_train == 0]).
# dtype=object stores references to the original Python strings. Without it
# NumPy builds a fixed-width unicode array in which every review is padded to
# the length of the longest one, which wastes a lot of memory when the texts
# differ widely in length.
train_corpus = np.array(train_corpus, dtype=object)
test_corpus = np.array(test_corpus, dtype=object)
y_train = np.array(y_train)
y_test = np.array(y_test)

In [9]:
# Sample text used to compare the token patterns in the cells below.
# Cases the patterns are checked against:
#   - single-character tokens
#   - the punctuation marks . , ? ! ;  (note: the sample itself contains no ';')
#   - the characters ' / < > "
test_case = "Here is a sentence. I'm here! Are you?? Here, there. No space between.dot. Give it a 3/10. \"Quote\" Many.<br /><br"

In [12]:
# The default token pattern: only tokens of two or more word characters are
# kept, so single-character tokens are dropped from the sample.
tokenizer = re.compile(r"(?u)\b\w\w+\b")
" ".join(match.group(0) for match in tokenizer.finditer(test_case))

'Here is sentence here Are you Here there No space between dot Give it 10 Quote Many br br'

In [13]:
# One or more word characters: single-character tokens are kept, but any
# non-word character (apostrophe, slash, ...) still splits a token.
token_pattern = r"(?u)\b\w+\b"
tokenizer = re.compile(token_pattern)
" ".join(match.group(0) for match in tokenizer.finditer(test_case))

'Here is a sentence I m here Are you Here there No space between dot Give it a 3 10 Quote Many br br'

In [14]:
# Runs of non-whitespace characters between word boundaries: apostrophes and
# slashes stay inside tokens, but so does any other punctuation that is not
# followed by a space.
token_pattern = r"(?u)\b\S+\b"
tokenizer = re.compile(token_pattern)
" ".join(match.group(0) for match in tokenizer.finditer(test_case))

"Here is a sentence I'm here Are you Here there No space between.dot Give it a 3/10 Quote Many.<br br"

In [26]:
# Word characters plus apostrophe and slash. This is the pattern (and the
# compiled `tokenizer`) that the rest of the notebook uses.
token_pattern = r"(?u)\b[\w\'/]+\b"
tokenizer = re.compile(token_pattern)
" ".join(match.group(0) for match in tokenizer.finditer(test_case))

"Here is a sentence I'm here Are you Here there No space between dot Give it a 3/10 Quote Many br br"

In [27]:
# Count the lower-cased human-provided phrases listed in Negative.tsv.
neg_phrases_dict = parse_human_phrases('Negative.tsv', to_lower=True)

In [28]:
# Normalise every phrase with the same tokenizer that is applied to the
# documents (tokens re-joined by single spaces) and drop phrases that end up
# with no tokens at all. Using a set removes duplicates created by normalisation.
neg_set = {
    normalized
    for normalized in (" ".join(tokenizer.findall(raw_phrase)) for raw_phrase in neg_phrases_dict)
    if normalized != ''
}

In [29]:
# Materialise the phrases as a list; its order is whatever order the set iterates in.
neg_list = list(neg_set)

In [66]:
# Use the phrases themselves as a fixed vocabulary and mark, for every negative
# training review, which of them occur in it (binary=True: presence, not
# frequency). ngram_range=(1,20) means phrases of more than 20 tokens can
# never be matched.
vectorizer = CountVectorizer(vocabulary=neg_list, lowercase=True, ngram_range=(1,20), binary=True, token_pattern=token_pattern)
X = vectorizer.fit_transform(train_corpus[y_train==0])
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement when available and keep `vocab` a plain list.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()

In [67]:
# Column sums of the binary document-term matrix: for each phrase, the number
# of documents that contain it.
all_counts = X.sum(axis=0)
# Flatten the 1 x n_phrases result into a plain 1-D array.
all_counts_array = np.asarray(all_counts).ravel()
# Phrase indices ordered from least to most frequent.
all_counts_sorted_indices = all_counts_array.argsort()

In [103]:
# Number of phrases that do not occur in any of the documents.
(all_counts_array == 0).sum()

22

In [104]:
# Print the phrases that were never found. The indices are sorted by count, so
# all zero-count phrases come first and the loop can stop at the first hit.
for phrase_idx in all_counts_sorted_indices:
    if all_counts_array[phrase_idx] > 0:
        break
    phrase = vocab[phrase_idx]
    # Phrases longer than 20 tokens exceed the n-gram range and could never
    # have matched, so they are not reported.
    if len(tokenizer.findall(phrase)) > 20:
        continue
    print(phrase)

his little pile of garbage
cenes that are so bad you can have a good laugh at them
ails to distinguish himself in the title role
his is awkward bad
as placed on the video nasties list
ne loses hope for it early on
adly dubbed poorly acted and slow
ast is wasted
oo many non native english speakers play parts of native english speakers
ever got comfortable with her
big emotional moments and climaxes and character relationships come completely out of no where
ome gross close ups of the cannibals mouth


In [105]:
# Count the lower-cased human-provided phrases listed in Positive.tsv.
pos_phrases_dict = parse_human_phrases('Positive.tsv', to_lower=True)

In [106]:
# Normalise every phrase with the same tokenizer that is applied to the
# documents (tokens re-joined by single spaces) and drop phrases that end up
# with no tokens at all. Using a set removes duplicates created by normalisation.
pos_set = {
    normalized
    for normalized in (" ".join(tokenizer.findall(raw_phrase)) for raw_phrase in pos_phrases_dict)
    if normalized != ''
}

In [107]:
# Materialise the phrases as a list; its order is whatever order the set iterates in.
pos_list = list(pos_set)

In [108]:
# Use the phrases themselves as a fixed vocabulary and mark, for every positive
# training review, which of them occur in it (binary=True: presence, not
# frequency). ngram_range=(1,20) means phrases of more than 20 tokens can
# never be matched.
vectorizer = CountVectorizer(vocabulary=pos_list, lowercase=True, ngram_range=(1,20), binary=True, token_pattern=token_pattern)
X = vectorizer.fit_transform(train_corpus[y_train==1])
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement when available and keep `vocab` a plain list.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()

In [109]:
# Column sums of the binary document-term matrix: for each phrase, the number
# of documents that contain it.
all_counts = X.sum(axis=0)
# Flatten the 1 x n_phrases result into a plain 1-D array.
all_counts_array = np.asarray(all_counts).ravel()
# Phrase indices ordered from least to most frequent.
all_counts_sorted_indices = all_counts_array.argsort()

In [110]:
# Number of phrases that do not occur in any of the documents.
(all_counts_array == 0).sum()

7

In [111]:
# Print the phrases that were never found. The indices are sorted by count, so
# all zero-count phrases come first and the loop can stop at the first hit.
for phrase_idx in all_counts_sorted_indices:
    if all_counts_array[phrase_idx] > 0:
        break
    phrase = vocab[phrase_idx]
    # Phrases longer than 20 tokens exceed the n-gram range and could never
    # have matched, so they are not reported.
    if len(tokenizer.findall(phrase)) > 20:
        continue
    print(phrase)

mazing in both its irony and its technical complexity
eal winner
his is an aggressive unsettling glorious deeply emotional wildly imaginative piece of storytelling that you'll never forget
