In [44]:
import os
import nltk
import re

def clean_and_tokenize_corpus(directories):
    word_file_list = []
    sent_file_list = []
    for directory in directories:
        for file_name in os.listdir(directory):
            if file_name != '.DS_Store':
                file = open(directory + file_name)
                raw_text = file.read().lower()
                tokens = nltk.word_tokenize(raw_text)
                no_punct_tokens = [x for x in tokens if re.match('\w+', x)]
                file_list.append(no_punct_tokens)
                file.close()
    return file_list

corpus = clean_and_tokenize_corpus(['data/JaneAusten/', 'data/CarrollLewis/', 'data/CharlesDickens/'])

In [45]:
from gensim.models import Word2Vec
import os

def train_word2vec(corpus):
    model = Word2Vec(corpus, size=100)
    model.train(corpus, total_examples=len(corpus), epochs=50)
    return model

word2vec = train_word2vec(corpus)


In [66]:
import nltk
import os
import numpy
from nltk import word_tokenize
from numpy import array
from collections import Counter
import math
from sklearn.metrics.pairwise import cosine_similarity

def clean_and_tokenize_file(file_name):
    file = open(file_name)
    raw_text = file.read().lower()
    tokens = nltk.word_tokenize(raw_text)
    sentences = nltk.sent_tokenize(raw_text)
    sentence_list = []
    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        sentence_list.append([x for x in words if re.match('\w+', x)])
    return (tokens, sentence_list)

def count_total_vocab(file_wordfreqs):
    vocab = {}
    for words in file_wordfreqs:
        for word in words:
            if word in vocab:
                vocab[word] += 1
            else:
                vocab[word] = 1
    return vocab

def get_corpus_vocab(corpus, unknown_threshold):
    initial_dict = count_total_vocab(corpus)
    filtered_dict = [word for word, size in initial_dict.items()][:unknown_threshold]
    return filtered_dict

def get_idf_dict(vocab, corpus):
    idf_dict = {}
    num_docs = len(corpus)
    for word in vocab:
        df_count = 0
        for doc in corpus:
            if word in doc:
                df_count += 1
        idf_dict[word] = math.log(num_docs / df_count)
    return idf_dict

def tf_idf(vocab, document, idf_dict):
    counts = Counter(document)
    doc_len = len(document)
    doc_num = len(corpus)
    tf_idfs = []
    for word in vocab:
        doc_counter = 0
        tf = counts[word] / len(document)
        idf = idf_dict[word]
        tf_idfs.append(tf*idf)
    return tf_idfs

def get_avg_word2vec(novel, word_2_vec):
    vec_list = []
    for word in novel:
        if word in word2vec:
            vec_list.append(word2vec[word])
    vector_array = numpy.array(vec_list)
    vector_array = numpy.mean(vector_array, axis=0)
    return vector_array.tolist()

def get_pairwise_similarity(sentences, word2vec):
    sentence_vectors = []
    for sentence in sentences:
        vec_list = []
        for word in sentence:
            if word in word2vec:
                vec_list.append(word2vec[word])
        if len(vec_list) > 0:
            vector_array = numpy.array(vec_list)
            vector_array = numpy.mean(vector_array, axis=0)
            sentence_vectors.append(vector_array)
    cs_sim = cosine_similarity(sentence_vectors, sentence_vectors)
    return numpy.mean(cs_sim)

def get_pos_vector(novel, pos_tags):
    tagged_tokens = nltk.pos_tag(novel)
    tags = [tag for (word, tag) in tagged_tokens]
    num_tokens = len(novel)
    pos_vector = [tags.count(pos_tag) / num_tokens for pos_tag in pos_tags]
    return pos_vector

def feature_vector(novel, vocab, idf_dict, pos_tags, word_2_vec):
    tokenized_novel = clean_and_tokenize_file(novel)
    tokens = tokenized_novel[0]
    sents = tokenized_novel[1]
    unigram_vector = tf_idf(vocab, tokens, idf_dict)
    pos_vector = get_pos_vector(tokens, pos_tags)
    word_2_vec_vector = get_avg_word2vec(tokens, word_2_vec)
    word_vector = unigram_vector + pos_vector + word_2_vec_vector
    
    # gets ratio of unique words to total # of words
    novel_vocab = count_total_vocab([tokens])
    vocab_size = len(novel_vocab) / len(tokens)
    word_vector.append(vocab_size)
    
    cs_similarity = get_pairwise_similarity(sents, word_2_vec)
    word_vector.append(cs_similarity)
    
    return word_vector


def create_vector_arrays(training_data, corpus_dict, vocab, idf_dict, pos_tags, word_2_vec):
    len_feature_vector = len(vocab) + len(pos_tags) + 102
    print(len_feature_vector)
    vector_array = numpy.zeros((len(training_data), len_feature_vector))
    results_array = numpy.zeros(len(training_data))
    index = 0
    for data, age in training_data.items():
        results_array[index] = age
        vector_array[index] = feature_vector(corpus_dict + data, vocab, idf_dict, pos_tags, word_2_vec)
        index += 1
    return (vector_array, results_array)
    
    
training_data = {'121-0.txt': 28, '158-0.txt':40}
vocab = get_corpus_vocab(corpus, 200)
idf_dict = get_idf_dict(vocab, corpus)
corpus_dict = 'data/JaneAusten/'    

tagset = ['CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB']

create_vector_arrays(training_data, corpus_dict, vocab, idf_dict, tagset, word2vec)

337




(array([[ 1.14299160e-03,  5.34683929e-04,  9.51302116e-04,
          5.40968941e-06,  1.78155942e-05,  5.90456348e-05,
          1.65217869e-05,  6.36671459e-03,  3.17497667e-05,
          4.48413777e-03,  5.84800459e-04,  2.02276871e-04,
          3.20438608e-05,  2.22905106e-03,  4.00850869e-05,
          2.52745952e-03,  3.20438608e-05,  4.61832143e-03,
          2.48454442e-05,  1.44798196e-03,  2.48454442e-05,
          7.24073304e-05,  2.16696608e-03,  2.50225933e-05,
          4.71645576e-03,  3.08021612e-03,  4.82715536e-05,
          1.52208339e-04,  1.65217869e-05,  7.61041693e-05,
          3.20438608e-05,  3.31272589e-05,  6.90945748e-04,
          1.78732809e-05,  5.18729738e-05,  1.86254941e-04,
          3.16433125e-04,  5.78792235e-04,  9.93817766e-05,
          1.60219304e-05,  1.60619852e-03,  3.44471503e-04,
          3.00411195e-04,  2.76378299e-04,  9.52493000e-05,
          1.08193788e-05,  5.32729185e-04,  1.08949127e-03,
          3.24444090e-04,  2.07684273e-0

In [None]:
from numpy import array
from torch import tensor, Size
from torch import nn
from torch import optim
from torch import functional
from torch import from_numpy
from torch.utils.data import TensorDataset

input_array = array(features_vector_array[0], dtype='float32')
input_vector = from_numpy(input_array)

output_array = array(features_vector_array[1], dtype='float32')
target_vector = from_numpy(output_array)

input_and_target = TensorDataset(input_vector, target_vector)

predict_array = array(test_vector, dtype='float32')
predict_vector = from_numpy(predict_array)

dickens_predict_array = array(dickens_test_array, dtype='float32')
dickens_predict_vector = from_numpy(dickens_predict_array)

linear_model = nn.Linear(115, 1)
loss_func = nn.functional.mse_loss 
optimize = optim.SGD(linear_model.parameters(), lr=0.001)
loss = loss_func(linear_model(input_vector), target_vector)

def train(num_epochs, model, loss_func, optimizer):
    for epoch in range(num_epochs):
        for xb,yb in input_and_target:
            
            # Predictions
            pred = model(xb)
            loss = loss_func(pred, yb)
            
            # Stochastic radient descent
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            
    print(loss)

train(500, linear_model, loss_func, optimize)

pred = linear_model(predict_vector)
print(pred)

def goodness(predicted, actual, range_size):
    return float((predicted - actual)) / float(range_size)

print("Jane Austen")
print("Age predicted for Mansfield Park: %s" % pred)
print("Goodness Metric: %s\n" % abs(goodness(pred, 39, 15)))

pred = linear_model(dickens_predict_vector)
print("Charles Dickens")
print("Age predicted for Oliver Twist: %s" % pred)
print("Goodness Metric: %s" % abs(goodness(pred, 25, 10)))