In [127]:
import math
import operator
import os
import re

import nltk
from nltk.tag import StanfordNERTagger
from nltk.tag import StanfordPOSTagger


# Pass `-Xmx3024m` (the JVM maximum-heap option) to the Java process NLTK launches
# for the Stanford tools below.
nltk.internals.config_java(options='-Xmx3024m')
# Module-level taggers shared by the feature functions in later cells:
# `st` is used for part-of-speech tagging, `ner` for named-entity tagging.
# NOTE(review): model and jar names are bare relative paths — presumably resolved
# against the working directory or NLTK's Stanford environment variables; confirm.
st = StanfordPOSTagger('english-left3words-distsim.tagger', path_to_jar='stanford-postagger.jar')
ner = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz', 'stanford-ner.jar')

def clean_and_tokenize_corpus(directories):
    """Read and tokenize every file found directly inside the given directories.

    Args:
        directories: iterable of directory paths. A trailing path separator is
            optional because paths are built with ``os.path.join``.

    Returns:
        A 2-tuple ``(file_list, uncleaned_list)`` of parallel lists with one
        entry per file:
          * ``file_list[i]``: tokens of the lower-cased text, keeping only
            tokens that begin with a word character (drops pure punctuation).
          * ``uncleaned_list[i]``: tokens of the original-case text, unfiltered.

    Files named ``.DS_Store`` are skipped. Files appear in ``os.listdir`` order,
    which is not guaranteed to be sorted.
    """
    file_list = []
    uncleaned_list = []
    for directory in directories:
        for file_name in os.listdir(directory):
            if file_name == '.DS_Store':
                continue
            # `with` guarantees the handle is released even if reading fails.
            with open(os.path.join(directory, file_name)) as corpus_file:
                raw_text = corpus_file.read()
            lowered_tokens = nltk.word_tokenize(raw_text.lower())
            # re.match anchors at the start, so this keeps tokens whose first
            # character is a word character.
            file_list.append([tok for tok in lowered_tokens if re.match(r'\w+', tok)])
            uncleaned_list.append(nltk.word_tokenize(raw_text))
    return (file_list, uncleaned_list)

def count_total_vocab(file_wordfreqs):
    """Tally how many times each item occurs across all documents.

    Args:
        file_wordfreqs: iterable of documents, each an iterable of hashable
            items (e.g. token strings).

    Returns:
        A plain dict mapping each item to its total occurrence count. Keys are
        inserted in first-seen order.
    """
    totals = {}
    for document in file_wordfreqs:
        for token in document:
            totals[token] = totals.get(token, 0) + 1
    return totals

def get_corpus_vocab(corpus, unknown_threshold):
    """Return the most frequent items of a corpus, most frequent first.

    Args:
        corpus: iterable of documents, each an iterable of hashable items.
        unknown_threshold: how many of the top-ranked items to keep (used as a
            slice bound on the ranked list).

    Returns:
        A list of at most ``unknown_threshold`` items ordered by descending
        corpus-wide count. Items with equal counts keep first-seen order
        because the sort is stable.
    """
    counts = count_total_vocab(corpus)
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    kept = ranked[:unknown_threshold]
    return [item for item, _count in kept]

def get_idf_dict(vocab, corpus):
    """Compute a smoothed inverse document frequency for every vocabulary item.

    For each item the value is ``log(N / (1 + df))`` where ``N`` is the number
    of documents and ``df`` the number of documents containing the item. The
    ``+1`` means an item present in every document gets a slightly negative
    value, and an item present in none gets ``log(N)``.

    Args:
        vocab: iterable of hashable items to score.
        corpus: sequence of documents, each an iterable of hashable items.

    Returns:
        Dict mapping each vocabulary item to its IDF value (natural log).

    Raises:
        ValueError: if ``corpus`` is empty while ``vocab`` is not, because
            ``log(0)`` is undefined.
    """
    num_docs = len(corpus)
    # Build each document's membership set once; testing `word in list` for
    # every (word, document) pair rescans the whole document each time.
    doc_members = [set(doc) for doc in corpus]
    idf_dict = {}
    for word in vocab:
        df_count = 1 + sum(1 for members in doc_members if word in members)
        idf_dict[word] = math.log(num_docs / df_count)
    return idf_dict

def _record_entity(words, tag, entities, seen):
    """Append the entity spelled by `words` to `entities` unless it is empty or already recorded."""
    if not words:
        return
    entry = (' '.join(words), tag)
    if entry not in seen:
        seen.add(entry)
        entities.append(entry)


def get_named_entities(novel):
    """Extract the distinct named entities from a tokenized text.

    Consecutive tokens that share the same non-``'O'`` tag are merged into one
    space-joined entity.

    Args:
        novel: list of tokens, passed to the module-level ``ner`` tagger.

    Returns:
        List of unique ``(entity_text, tag)`` tuples in order of first
        appearance. An entity that runs up to the final token is included
        (previously it was dropped because nothing flushed the pending span
        after the loop).
    """
    tagged_tokens = ner.tag(novel)
    entities = []
    seen = set()  # constant-time duplicate check; `entities` preserves order
    cur_entity = []
    last_tag = 'O'
    for token, tag in tagged_tokens:
        if tag != last_tag:
            # Tag changed: close whatever span was being built. The pending
            # span is always empty while last_tag is 'O'.
            _record_entity(cur_entity, last_tag, entities, seen)
            cur_entity = []
            last_tag = tag
        if tag != 'O':
            cur_entity.append(token)
    # Flush a span that is still open when the text ends.
    _record_entity(cur_entity, last_tag, entities, seen)
    return entities

def get_ne_corpus(corpus):
    """Run named-entity extraction on every document and return the per-document results as a list."""
    return list(map(get_named_entities, corpus))
    

# `corpus` is the (filtered lower-case tokens, unfiltered original-case tokens)
# pair returned by clean_and_tokenize_corpus, one entry per file.
corpus = clean_and_tokenize_corpus(['data/JaneAusten/', 'data/CarrollLewis/', 'data/CharlesDickens/'])
ugram_corpus = corpus[0]
# Unigram vocabulary: the 500 most frequent tokens, plus their IDF weights.
vocab = get_corpus_vocab(ugram_corpus, 500)
idf_dict = get_idf_dict(vocab, ugram_corpus)
# Named entities are extracted from the original-case tokens (corpus[1]).
ne_corpus = get_ne_corpus(corpus[1])
# Entity vocabulary: the 100 most frequent (entity, tag) pairs, plus their IDF weights.
ne_vocab = get_corpus_vocab(ne_corpus, 100)
ne_idf_dict = get_idf_dict(ne_vocab, ne_corpus)

In [45]:
from gensim.models import Word2Vec
import os

def train_word2vec(corpus):
    """Build a Word2Vec model from tokenized documents and train it for 50 more epochs.

    Args:
        corpus: sequence of token lists; its length is passed as
            ``total_examples``.

    Returns:
        The trained ``Word2Vec`` model, constructed with ``size=100``.

    NOTE(review): the ``size`` keyword matches older gensim releases — confirm
    the installed version. The constructor is also given ``corpus``, so it may
    already train once before the explicit ``train`` call; verify that the
    extra pass is intended.
    """
    embedding_model = Word2Vec(corpus, size=100)
    example_count = len(corpus)
    embedding_model.train(corpus, total_examples=example_count, epochs=50)
    return embedding_model

# Module-level embedding model; read as a global by get_avg_word2vec and
# get_pairwise_similarity. Requires `ugram_corpus` from the corpus-building cell.
word2vec = train_word2vec(ugram_corpus)


In [129]:
import nltk
import os
import numpy
from nltk import word_tokenize
from numpy import array
from collections import Counter
import math
import readability
from sklearn.metrics.pairwise import cosine_similarity

def clean_and_tokenize_file(file_name):
    """Read one text file and return several tokenized views of it.

    Args:
        file_name: path of the file to read.

    Returns:
        A 4-tuple ``(caseless_tokens, sentence_list, raw_sentence_tokens, case_tokens)``:
          * ``caseless_tokens``: word tokens of the lower-cased text (unfiltered).
          * ``sentence_list``: one token list per sentence of the lower-cased
            text, keeping only tokens that begin with a word character.
          * ``raw_sentence_tokens``: sentence strings of the original-case text.
          * ``case_tokens``: word tokens of the original-case text.
    """
    # `with` closes the handle; the original opened the file and never closed it.
    with open(file_name) as text_file:
        raw_text = text_file.read()
    raw_text_lower = raw_text.lower()
    caseless_tokens = nltk.word_tokenize(raw_text_lower)
    raw_sentence_tokens = nltk.sent_tokenize(raw_text)
    case_tokens = nltk.word_tokenize(raw_text)
    sentence_list = [
        # re.match anchors at the start: keeps tokens whose first character
        # is a word character.
        [tok for tok in nltk.word_tokenize(sentence) if re.match(r'\w+', tok)]
        for sentence in nltk.sent_tokenize(raw_text_lower)
    ]
    return (caseless_tokens, sentence_list, raw_sentence_tokens, case_tokens)


def tf_idf(vocab, document, idf_dict):
    """Compute a TF-IDF vector for one document over a fixed vocabulary.

    Term frequency is the item's count divided by the document length; it is
    multiplied by the item's precomputed IDF.

    Args:
        vocab: ordered iterable of hashable items defining the vector layout.
        document: sequence of items (tokens, or entity tuples).
        idf_dict: mapping from every vocabulary item to its IDF weight.

    Returns:
        List of floats, one per vocabulary item, in ``vocab`` order. An empty
        document yields all zeros instead of dividing by zero.

    Raises:
        KeyError: if a vocabulary item is missing from ``idf_dict`` (only
            reached for non-empty documents).
    """
    doc_len = len(document)
    if doc_len == 0:
        # No terms at all: every term frequency is zero.
        return [0.0 for _ in vocab]
    counts = Counter(document)
    return [(counts[word] / doc_len) * idf_dict[word] for word in vocab]

def get_avg_word2vec(novel):
    """Average the embedding vectors of every token known to the global ``word2vec`` model.

    Args:
        novel: iterable of tokens; tokens for which ``token in word2vec`` is
            false are ignored.

    Returns:
        The element-wise mean of the collected vectors, converted with
        ``tolist()``.

    NOTE(review): if no token is known the mean is taken over an empty array —
    presumably that does not yield a list; confirm callers never hit this.
    """
    known_vectors = [word2vec[token] for token in novel if token in word2vec]
    stacked = numpy.array(known_vectors)
    return numpy.mean(stacked, axis=0).tolist()

def get_pairwise_similarity(sentences):
    """Return the mean cosine similarity between averaged sentence embeddings.

    Each sentence is represented by the mean of the global ``word2vec`` vectors
    of its known tokens; sentences with no known token are skipped. The result
    is the mean over the full similarity matrix of those sentence vectors
    compared with themselves, so every entry of that matrix (including each
    sentence against itself) contributes.

    Args:
        sentences: iterable of token lists.

    Returns:
        The value of ``numpy.mean`` over the similarity matrix.
    """
    sentence_vectors = []
    for sentence in sentences:
        known = [word2vec[token] for token in sentence if token in word2vec]
        if not known:
            continue
        sentence_vectors.append(numpy.mean(numpy.array(known), axis=0))
    similarity_matrix = cosine_similarity(sentence_vectors, sentence_vectors)
    return numpy.mean(similarity_matrix)

def get_pos_vector(novel, pos_tags):
    """Compute the relative frequency of each part-of-speech tag in a text.

    Args:
        novel: list of tokens, tagged with the module-level ``st`` tagger.
        pos_tags: ordered iterable of tag names defining the vector layout.

    Returns:
        List with one float per entry of ``pos_tags``: the number of tokens
        carrying that tag divided by ``len(novel)``. An empty ``novel`` yields
        all zeros without invoking the tagger.
    """
    num_tokens = len(novel)
    if num_tokens == 0:
        return [0.0 for _ in pos_tags]
    # Count all tags in a single pass; calling list.count() once per tag
    # rescans the whole text for every entry of pos_tags. A Counter returns 0
    # for tags that never occur, matching list.count().
    tag_counts = Counter(tag for (_word, tag) in st.tag(novel))
    return [tag_counts[pos_tag] / num_tokens for pos_tag in pos_tags]

def get_readability_score(sentences):
    """Return the ``FleschReadingEase`` measure reported by ``readability`` for the given sentences.

    Newlines inside each sentence are replaced by spaces and the sentences are
    then joined with newlines, so each sentence occupies exactly one line of
    the text handed to ``readability.getmeasures``.

    Args:
        sentences: iterable of sentence strings.

    Returns:
        The value stored under ``['readability grades']['FleschReadingEase']``
        in the result of ``readability.getmeasures``.
    """
    newline_pattern = re.compile(r'\n')
    one_line_sentences = [newline_pattern.sub(' ', sentence) for sentence in sentences]
    measures = readability.getmeasures('\n'.join(one_line_sentences), lang='en')
    return measures['readability grades']['FleschReadingEase']

def feature_vector(novel, vocab, idf_dict, pos_tags):
    """Assemble the base feature list for one novel.

    Layout of the returned list, in order:
      1. TF-IDF weights over ``vocab`` (from ``novel[0]``).
      2. POS-tag frequencies over ``pos_tags`` (from ``novel[3]``).
      3. Averaged word2vec vector of ``novel[0]``.
      4. Ratio of distinct tokens to total tokens in ``novel[0]``.
      5. Mean sentence-to-sentence cosine similarity (from ``novel[1]``).

    Args:
        novel: tuple as returned by ``clean_and_tokenize_file``.
        vocab: unigram vocabulary.
        idf_dict: IDF weights for ``vocab``.
        pos_tags: POS tag names.

    Returns:
        A flat Python list of feature values.
    """
    tokens, token_sents, case_tokens = novel[0], novel[1], novel[3]

    features = []
    features.extend(tf_idf(vocab, tokens, idf_dict))
    features.extend(get_pos_vector(case_tokens, pos_tags))
    features.extend(get_avg_word2vec(tokens))

    # Lexical diversity: distinct tokens over total tokens.
    distinct_tokens = count_total_vocab([tokens])
    features.append(len(distinct_tokens) / len(tokens))

    features.append(get_pairwise_similarity(token_sents))
    return features

def feature_vector_time_period(novel, vocab, idf_dict, pos_tags, ne_vocab, ne_novel, ne_idf_dict):
    """Extend the base feature list with a readability score and named-entity TF-IDF weights.

    Args:
        novel: tuple as returned by ``clean_and_tokenize_file``; ``novel[2]``
            supplies the sentences scored for readability.
        vocab, idf_dict, pos_tags: forwarded to ``feature_vector``.
        ne_vocab: named-entity vocabulary.
        ne_novel: the novel's named entities.
        ne_idf_dict: IDF weights for ``ne_vocab``.

    Returns:
        A new list: the base features, then the readability score, then one
        TF-IDF weight per entry of ``ne_vocab``.
    """
    base_features = feature_vector(novel, vocab, idf_dict, pos_tags)
    base_features.append(get_readability_score(novel[2]))
    entity_weights = tf_idf(ne_vocab, ne_novel, ne_idf_dict)
    return base_features + entity_weights

def create_vector_arrays(training_data, corpus_dict, vocab, idf_dict, pos_tags, is_time, ner_vocab, ner_corpus, ner_idf_dict):
    """Build the feature matrix and target vector for a set of training files.

    Args:
        training_data: mapping from file name to numeric target value.
        corpus_dict: directory containing the files (a path string, despite
            the name). A trailing separator is optional.
        vocab, idf_dict, pos_tags: forwarded to the feature functions.
        is_time: when true, use ``feature_vector_time_period`` (adds the
            readability score and named-entity weights); otherwise use
            ``feature_vector``.
        ner_vocab, ner_corpus, ner_idf_dict: named-entity inputs, only read
            when ``is_time`` is true.

    Returns:
        ``(vector_array, results_array)``: a 2-D array with one feature row per
        file (in ``training_data`` iteration order) and a 1-D array of targets.
    """
    # Row width = vocab weights + POS frequencies + word2vec dimensions
    # + scalar features (vocabulary ratio, sentence similarity, and for the
    # time-period variant the readability score) + named-entity weights.
    word2vec_dims = 100  # must match the `size` used when training word2vec
    scalar_features = 3 if is_time else 2
    len_feature_vector = len(vocab) + len(pos_tags) + word2vec_dims + scalar_features
    if is_time:
        len_feature_vector += len(ner_vocab)

    vector_array = numpy.zeros((len(training_data), len_feature_vector))
    results_array = numpy.zeros(len(training_data))
    for index, (data, age) in enumerate(training_data.items()):
        results_array[index] = age
        # os.path.join works whether or not corpus_dict ends with a separator.
        novel = clean_and_tokenize_file(os.path.join(corpus_dict, data))
        if is_time:
            # NOTE(review): ner_corpus is indexed by position in training_data,
            # so the caller must supply it in the same order as training_data —
            # confirm this matches how ner_corpus was built.
            vector_array[index] = feature_vector_time_period(novel, vocab, idf_dict, pos_tags, ner_vocab, ner_corpus[index], ner_idf_dict)
        else:
            vector_array[index] = feature_vector(novel, vocab, idf_dict, pos_tags)
    return (vector_array, results_array)
    
    
# Maps a file name (relative to corpus_dict) to its numeric target value;
# create_vector_arrays refers to that value as `age`.
training_data = {'121-0.txt': 28}

# Directory prefix that create_vector_arrays combines with each file name.
corpus_dict = 'data/JaneAusten/'    
# POS tag names whose relative frequencies become features, in this order.
tagset = ['CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB']

# The result is not bound to a name here; it is only displayed as the cell output.
# NOTE(review): the training cell reads `features_vector_array`, which is not
# assigned anywhere in the visible notebook — confirm where it comes from.
create_vector_arrays(training_data, corpus_dict, vocab, idf_dict, tagset, True, ne_vocab, ne_corpus, ne_idf_dict)



(array([[ 4.11301321e-03,  2.98351949e-03,  2.89683440e-03,
          3.04691604e-03,  1.98987553e-03,  1.65736707e-03,
          1.63278473e-03,  1.39989943e-03,  1.44000746e-03,
          2.01445787e-03,  1.03763340e-03,  7.03831137e-04,
          1.16054509e-03,  6.79248800e-04,  1.41930654e-03,
          8.84964150e-04,  9.09546488e-04,  1.34167811e-03,
          8.56500391e-04,  1.02857675e-03,  9.35422633e-04,
          6.62429306e-04,  7.62052463e-04,  6.35259354e-04,
          6.88305450e-04,  5.29167160e-04,  4.47657304e-04,
          3.58384605e-04,  2.32885303e-04,  6.11970823e-04,
          4.43775882e-04,  6.14558438e-04,  2.62642869e-04,
          4.02374051e-04,  4.29544003e-04,  3.77791713e-04,
          4.46363497e-04,  5.54071709e-04,  4.89059136e-04,
          4.94234365e-04,  3.73910292e-04,  3.99786436e-04,
          3.79085521e-04,  4.70945834e-04,  3.44152725e-04,
          3.23451809e-04,  2.62642869e-04,  2.65230484e-04,
          2.30297688e-04,  3.16982773e-0

In [None]:
from numpy import array
from torch import tensor, Size
from torch import nn
from torch import optim
from torch import functional
from torch import from_numpy
from torch.utils.data import TensorDataset

# Training inputs and targets as float32 tensors.
# NOTE(review): `features_vector_array` is not defined in the visible notebook;
# presumably it is a (features, targets) pair like the one create_vector_arrays
# returns — confirm.
input_array = array(features_vector_array[0], dtype='float32')
input_vector = from_numpy(input_array)

output_array = array(features_vector_array[1], dtype='float32')
target_vector = from_numpy(output_array)

# Pairs each feature row with its target; train() iterates this sample by sample.
input_and_target = TensorDataset(input_vector, target_vector)

# Held-out inputs to predict on.
# NOTE(review): `test_vector` and `dickens_test_array` are not defined in the
# visible notebook — confirm where they are built.
predict_array = array(test_vector, dtype='float32')
predict_vector = from_numpy(predict_array)

dickens_predict_array = array(dickens_test_array, dtype='float32')
dickens_predict_vector = from_numpy(dickens_predict_array)

# Single linear layer mapping 115 input features to one output value.
# NOTE(review): 115 is hard-coded and must equal the width of the feature rows — verify.
linear_model = nn.Linear(115, 1)
loss_func = nn.functional.mse_loss 
optimize = optim.SGD(linear_model.parameters(), lr=0.001)
# Loss of the untrained model over the whole training set; this module-level
# `loss` is not read by train(), which assigns its own local `loss`.
# NOTE(review): the model output has a trailing dimension of 1 while
# target_vector is built from a 1-D array — confirm the shapes agree as intended.
loss = loss_func(linear_model(input_vector), target_vector)

def train(num_epochs, model, loss_func, optimizer, dataset=None):
    """Train ``model`` with one optimizer step per sample.

    Args:
        num_epochs: number of full passes over the dataset.
        model: callable mapping an input sample to a prediction.
        loss_func: callable ``loss_func(prediction, target)`` returning a
            tensor that supports ``backward()``.
        optimizer: optimizer holding the model's parameters.
        dataset: iterable of ``(input, target)`` pairs. Defaults to the
            module-level ``input_and_target`` dataset, which was the only
            option before this parameter existed.

    Returns:
        None. The loss of the last processed sample is printed; ``None`` is
        printed if no sample was processed (zero epochs or an empty dataset),
        where the original raised ``UnboundLocalError``.
    """
    if dataset is None:
        dataset = input_and_target

    loss = None
    for _ in range(num_epochs):
        for xb, yb in dataset:
            # Predictions
            pred = model(xb)
            loss = loss_func(pred, yb)

            # Stochastic gradient descent: backpropagate, update, then clear
            # the gradients so they do not accumulate into the next step.
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

    print(loss)

# Run 500 passes over the training dataset, one optimizer step per sample.
train(500, linear_model, loss_func, optimize)

# Predict on the first held-out input with the trained model and show the raw output.
pred = linear_model(predict_vector)
print(pred)

def goodness(predicted, actual, range_size):
    """Return the signed prediction error as a fraction of ``range_size``.

    Computes ``float(predicted - actual) / float(range_size)``; the sign is
    positive when the prediction is above the actual value.
    """
    signed_error = float(predicted - actual)
    span = float(range_size)
    return signed_error / span

# Report the first prediction. goodness() is called with 39 as the actual value
# and 15 as the range size; the absolute value of the result is printed.
print("Jane Austen")
print("Age predicted for Mansfield Park: %s" % pred)
print("Goodness Metric: %s\n" % abs(goodness(pred, 39, 15)))

# Same report for the second held-out input (actual value 25, range size 10).
# `pred` is rebound here, so the earlier prediction is no longer available afterwards.
pred = linear_model(dickens_predict_vector)
print("Charles Dickens")
print("Age predicted for Oliver Twist: %s" % pred)
print("Goodness Metric: %s" % abs(goodness(pred, 25, 10)))