In [None]:
import csv
import os
import re

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import nltk
import pandas as pd
import seaborn as sns

from IPython.display import display

# Apply seaborn's theme and let matplotlib's one-letter colour codes follow its palette.
sns.set(color_codes=True)
from matplotlib import rcParams
# Default figure size for all plots, given as (width, height).
rcParams['figure.figsize'] = 15, 10

# The relative paths used below (e.g. "../data") are resolved from a directory whose
# name ends in "notebooks". If the kernel was started somewhere else, try to enter a
# "notebooks" sub-directory; the assert stops the notebook if that did not work.
if not os.getcwd().endswith("notebooks"):
    os.chdir("notebooks")
assert os.getcwd().endswith("notebooks")

In [None]:
# Input CSV locations, relative to the "notebooks" working directory.
data_dir = os.path.join("..", "data")
train_file = os.path.join(data_dir, "stories.train.csv")
eval_file = os.path.join(data_dir, "stories.eval.csv")
# test_file = ""

# Special tokens: UNK is what skip_ner substitutes for named-entity tokens,
# PAD and UNK take vocabulary ids 0 and 1 in vocab_train.
UNK = "<unk>"
PAD = "<pad>"

# Captures any single character that is not an ASCII letter, a digit or a space.
# NOTE(review): not referenced in the visible part of the notebook; confirm it is still needed.
nonalphanumeric_pattern = re.compile('([^0-9a-zA-Z ])')

def uncommon_filter(df):
    """Count the words that occur in at most one story and print how many there are.

    For every row, the text of all columns whose name starts with ``sentence`` or
    ``ending`` is split on whitespace. Each distinct word is counted once per row,
    so the tally is a per-word story frequency rather than a token frequency.

    Args:
        df: DataFrame with one story per row. Every ``sentence*`` / ``ending*``
            cell must be a ``str``.

    Returns:
        ``df`` itself, unmodified. Replacing the uncommon words is not implemented
        yet; the function only reports their number on stdout.

    Raises:
        AssertionError: if a ``sentence*`` / ``ending*`` cell is not a string.
    """
    story_counts = {}
    for _, row in df.iterrows():
        row_vocab = set()
        for col in row.keys():
            if not col.startswith("sentence") and not col.startswith("ending"):
                continue
            sentence = row[col]
            assert isinstance(sentence, str)
            row_vocab.update(sentence.split())
        for word in row_vocab:
            # The first sighting counts as 1. Starting at 0 made the `<= 1` test
            # below also flag words that appear in two different stories.
            story_counts[word] = story_counts.get(word, 0) + 1

    removed = [word for word, count in story_counts.items() if count <= 1]
    print(len(removed))
    # TODO: decide whether the words in `removed` should be replaced (e.g. by the
    # unknown-word token) and rewrite the affected cells of `df` accordingly.
    return df

def skip_ner(tokens_cased):
    """Replace every token that NLTK chunks as a named entity with ``UNK``.

    Args:
        tokens_cased: list of tokens with their original capitalization.

    Returns:
        A new list of the same length in which named-entity tokens are replaced
        by ``UNK`` and all other tokens are kept unchanged.
    """
    tagged = nltk.pos_tag(tokens_cased)
    chunked = nltk.ne_chunk(tagged, binary=True)

    # Walk the chunk tree and record the token positions covered by entity subtrees.
    ne_positions = set()
    position = 0
    for node in chunked:
        if isinstance(node, nltk.tree.Tree):
            span = len(node.leaves())
            ne_positions.update(range(position, position + span))
            position += span
        else:
            position += 1

    # TODO: Mark entities for attention (e.g. "NE_" + token) instead of dropping them.
    # TODO: Consider finding all occurrences of a token that was marked as NE in the
    #       surrounding sentences and marking them as NE as well, if capitalization matches.
    return [
        UNK if i in ne_positions else token
        for i, token in enumerate(tokens_cased)
    ]

def normalize_sentence(sentence, NER=False, uncommon_words=False):
    """Tokenize a sentence, optionally mask named entities, and lower-case it.

    Args:
        sentence: raw sentence text.
        NER: when true, the tokens are passed through ``skip_ner`` before lower-casing.
        uncommon_words: accepted but not used by this function.

    Returns:
        The lower-cased tokens joined by single spaces.
    """
    words = nltk.word_tokenize(sentence)
    if NER:
        words = skip_ner(words)
    # Joining on a single space also normalizes the whitespace between tokens.
    return " ".join(word.lower() for word in words)

# Pre-processing switches read by read_train / read_eval:
#   FLAG_NER      - forwarded as NER= to normalize_sentence.
#   FLAG_UNCOMMON - when true, uncommon_filter is run on the normalized frame.
FLAG_NER = False
FLAG_UNCOMMON = True

def read_train(file, limit=200):
    """Load the training stories as a raw frame and a normalized frame.

    The CSV is parsed once. Its ``storytitle`` column is dropped, ``sentence5`` is
    renamed to ``ending`` and the index labels are converted to strings.

    Args:
        file: path of the training CSV. It must contain ``storytitle`` and the
            columns ``sentence1`` .. ``sentence5``.
        limit: maximum number of leading rows to keep. Defaults to 200, the
            debugging cap that used to be hard-coded; pass ``None`` to load
            the whole file.

    Returns:
        ``(char_df, word_df)``. ``char_df`` holds the text as read. ``word_df`` is
        an independent copy whose sentence columns went through
        ``normalize_sentence`` and, if ``FLAG_UNCOMMON`` is set, ``uncommon_filter``.
    """
    # storyid, sentence1, sentence2, sentence3, sentence4, ending
    # TODO: change the default of `limit` to None once debugging is finished.
    char_df = pd.read_csv(file)[:limit].drop(columns=['storytitle'])
    char_df = char_df.rename(index=str, columns={"sentence5": "ending"})

    # Copy instead of parsing the file a second time.
    word_df = char_df.copy()
    sentence_selector = ['sentence1', 'sentence2', 'sentence3', 'sentence4', 'ending']
    for column in sentence_selector:
        # Series.map is used because DataFrame.applymap is deprecated in recent pandas.
        word_df[column] = word_df[column].map(
            lambda sent: normalize_sentence(sent, NER=FLAG_NER)
        )
    if FLAG_UNCOMMON:
        word_df = uncommon_filter(word_df)

    return char_df, word_df

def read_eval(file, limit=200):
    """Load the evaluation stories as a raw frame and a normalized frame.

    The CSV is parsed once. Its columns are renamed to the names used for the
    training data (``storyid``, ``sentence1`` .. ``sentence4``), the two candidate
    endings become ``ending1`` / ``ending2`` and ``AnswerRightEnding`` becomes
    ``label``. The index labels are converted to strings.

    Args:
        file: path of the evaluation CSV.
        limit: maximum number of leading rows to keep. Defaults to 200, the
            debugging cap that used to be hard-coded; pass ``None`` to load
            the whole file.

    Returns:
        ``(char_df, word_df)``. ``char_df`` holds the text as read. ``word_df`` is
        an independent copy whose sentence and ending columns went through
        ``normalize_sentence`` and, if ``FLAG_UNCOMMON`` is set, ``uncommon_filter``.
    """
    # storyid, sentence1, sentence2, sentence3, sentence4, ending1, ending2, answer
    # TODO: change the default of `limit` to None once debugging is finished.
    char_df = pd.read_csv(file)[:limit]
    char_df = char_df.rename(index=str, columns={"InputStoryid": "storyid", "InputSentence1": "sentence1", "InputSentence2": "sentence2", "InputSentence3": "sentence3", "InputSentence4": "sentence4", "RandomFifthSentenceQuiz1": "ending1", "RandomFifthSentenceQuiz2": "ending2", "AnswerRightEnding": "label"})

    # Copy instead of parsing the file a second time.
    word_df = char_df.copy()
    sentence_selector = ['sentence1', 'sentence2', 'sentence3', 'sentence4', 'ending1', 'ending2']
    for column in sentence_selector:
        # Series.map is used because DataFrame.applymap is deprecated in recent pandas.
        word_df[column] = word_df[column].map(
            lambda sent: normalize_sentence(sent, NER=FLAG_NER)
        )
    if FLAG_UNCOMMON:
        word_df = uncommon_filter(word_df)

    return char_df, word_df

# Build the raw ("char") and normalized ("word") frames for both splits.
train_char_df, train_word_df = read_train(train_file)
eval_char_df, eval_word_df = read_eval(eval_file)

# Spot-check one normalized training story (positional row 190, so at least
# 191 rows must have been loaded).
display(train_word_df.iloc[190])

In [None]:
# Show every normalized first sentence of the evaluation split as a plain list.
list(eval_word_df['sentence1'].values)

In [None]:
# Persist all four frames without the index column and with every field quoted.
# NOTE(review): nothing in this cell creates ../outputs, so it presumably has to exist already.
train_word_df.to_csv('../outputs/words_train.csv', index=False, quotechar='"', quoting=csv.QUOTE_ALL)
train_char_df.to_csv('../outputs/chars_train.csv', index=False, quotechar='"', quoting=csv.QUOTE_ALL)
eval_word_df.to_csv('../outputs/words_eval.csv', index=False, quotechar='"', quoting=csv.QUOTE_ALL)
eval_char_df.to_csv('../outputs/chars_eval.csv', index=False, quotechar='"', quoting=csv.QUOTE_ALL)

## TODO
* How many uncommon words are in the last sentence?
* Link NEs to label sentence, same for eval
* Find words that only occur in <= C stories in dataset (C=1), and make them UNK?

## Train vocab

In [None]:
def vocab_train(df):
    """Build a word-to-id vocabulary from the text columns of ``df``.

    Ids are handed out in order of first appearance, scanning the rows top to
    bottom and the text columns left to right. ``PAD`` is always 0 and ``UNK`` is
    always 1, so the first real word receives id 2.

    Text columns are selected by name (``sentence*`` / ``ending*``) rather than by
    position. The previous positional unpacking expected a title column after the
    story id; on a frame without one it skipped ``sentence1``, so the words of
    every first sentence were missing from the vocabulary.

    Args:
        df: DataFrame whose ``sentence*`` / ``ending*`` columns hold
            whitespace-separated, already normalized text.

    Returns:
        dict mapping each word to its integer id.
    """
    text_columns = [
        col for col in df.columns
        if str(col).startswith("sentence") or str(col).startswith("ending")
    ]
    d = {PAD: 0, UNK: 1}
    for sentences in df[text_columns].itertuples(index=False, name=None):
        for sentence in sentences:
            # split() without an argument ignores repeated or leading/trailing
            # whitespace, so no empty-string token can enter the vocabulary.
            for word in sentence.split():
                if word not in d:
                    # Ids are dense, so the next free id is the current size.
                    d[word] = len(d)
    return d

In [None]:
# Word -> id mapping built from the normalized training stories.
train_voc = vocab_train(train_word_df)

In [None]:
print("train vocab", len(train_voc))
# Probe a single word. .get() prints None instead of raising KeyError when the
# word is not part of the loaded sample, so the notebook still runs end to end.
print(train_voc.get("youtube"))

## Output
* Use CRLF line breaks
* End the file with an empty line
* Name the file `answer.txt`

In [None]:
# Locations of the answer files, one per split, below ../outputs.
output_folder = os.path.join("..", "outputs")
eval_out_file = os.path.join(output_folder, "eval", "answer.txt")
test_out_file = os.path.join(output_folder, "test", "answer.txt")

def output_official(results, out_file):
    """Placeholder for the "official" output writer; nothing is written yet.

    Only selects the ``InputStoryId`` and ``AnswerRightEnding`` columns of
    ``results``, which raises ``KeyError`` if either is missing. ``out_file``
    is not used.

    Returns:
        None.
    """
    wanted_columns = ["InputStoryId", "AnswerRightEnding"]
    out_df = results[wanted_columns]
    return None

def output_nlu(results, out_file):
    """Placeholder for the "nlu" output writer; does nothing and returns None."""
    return None