In [30]:
import nltk
import os
import numpy
from nltk import word_tokenize
from numpy import array

def open_and_tokenize_file(file_name):
    """Read a UTF-8 text file and split its contents into word tokens.

    Args:
        file_name: Path of the text file to read.

    Returns:
        The result of ``word_tokenize`` applied to the file's full text.
    """
    # The context manager closes the handle as soon as the text is read,
    # even if reading raises, so repeated calls do not leak file handles.
    with open(file_name, encoding="utf8") as file:
        raw_text = file.read()
    return word_tokenize(raw_text)

def most_frequent_unigrams(num_unigrams, corpus_path='data/full_training_corpus.txt'):
    """Return the most common tokens of a corpus file with their counts.

    Args:
        num_unigrams: How many of the most frequent tokens to return.
        corpus_path: Text file to count tokens in. Defaults to the full
            training corpus, so existing one-argument calls are unchanged.

    Returns:
        A list of ``(token, count)`` pairs as produced by
        ``nltk.FreqDist.most_common``.
    """
    tokens = open_and_tokenize_file(corpus_path)
    fdist = nltk.FreqDist(tokens)
    return fdist.most_common(num_unigrams)
    
def get_vector(file_name, unigrams, pos_tags):
    """Build the feature vector of relative frequencies for one text file.

    Args:
        file_name: Path of the text file to read and tokenize.
        unigrams: Sequence of items whose first element is the token to
            measure (e.g. the ``(token, count)`` pairs returned by
            ``most_frequent_unigrams``); only element ``[0]`` is used.
        pos_tags: Sequence of part-of-speech tag names to measure.

    Returns:
        A list of floats: one relative frequency per unigram followed by one
        relative frequency per tag, each being ``count / number_of_tokens``.

    Raises:
        ValueError: If the file contains no tokens, since relative
            frequencies are undefined for an empty text.
    """
    from collections import Counter

    tokens = open_and_tokenize_file(file_name)
    num_tokens = len(tokens)
    if num_tokens == 0:
        raise ValueError("No tokens found in %r; cannot compute frequencies" % (file_name,))

    # Count every token and tag once up front instead of rescanning the whole
    # token list for each feature. Missing keys read as 0 from a Counter.
    token_counts = Counter(tokens)
    tag_counts = Counter(tag for (_, tag) in nltk.pos_tag(tokens))

    unigram_vector = [token_counts[unigram[0]] / num_tokens for unigram in unigrams]
    pos_vector = [tag_counts[pos_tag] / num_tokens for pos_tag in pos_tags]
    return unigram_vector + pos_vector

def create_vector_arrays(training_data, corpus_dict, unigrams, pos_tags):
    """Build the feature matrix and target array for a set of training texts.

    Args:
        training_data: Mapping of file name to the numeric target ("age")
            for that text.
        corpus_dict: Path prefix prepended (by plain string concatenation)
            to each file name.
        unigrams: Unigram features, passed through to ``get_vector``.
        pos_tags: Part-of-speech tag features, passed through to
            ``get_vector``.

    Returns:
        A tuple ``(vector_array, results_array)``: a 2-D array with one
        feature row per text, and a 1-D array of the matching targets, both
        in the iteration order of ``training_data``.
    """
    # One column per unigram plus one per tag, matching what get_vector
    # returns, rather than a fixed width that only suits 80 unigrams + 35 tags.
    num_features = len(unigrams) + len(pos_tags)
    vector_array = numpy.zeros((len(training_data), num_features))
    results_array = numpy.zeros(len(training_data))
    for index, (data, age) in enumerate(training_data.items()):
        results_array[index] = age
        vector_array[index] = get_vector(corpus_dict + data, unigrams, pos_tags)
    return (vector_array, results_array)
    
    
# Training texts: file name (resolved against corpus_dict) -> numeric target
# that create_vector_arrays treats as the "age" for that text.
training_data = {
    '121-0.txt': 28,
    '158-0.txt': 40,
    '161.txt': 36,
    '1342-0.txt': 38,
    'pg946.txt': 19,
    'pg105.txt': 42,
    '1212-0.txt': 15,
    '1400-0.txt': 48,
    '98-0.txt': 47,
    '786-0.txt': 42,
}

# Held-out text and the path prefixes the file names are appended to.
test_data = ['141-0.txt']
corpus_dict = 'data/JaneAusten/'
dickens_corpus_dict = 'data/CharlesDickens/'

# Part-of-speech tag names whose relative frequencies become features.
tagset = [
    'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS',
    'MD', 'NN', 'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB',
    'RBR', 'RBS', 'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP',
    'VBZ', 'WDT', 'WP', 'WP$', 'WRB',
]

# The 80 most frequent corpus tokens supply the unigram features.
unigrams = most_frequent_unigrams(80)

# (feature matrix, target array) for the training texts.
features_vector_array = create_vector_arrays(
    training_data, corpus_dict, unigrams, tagset
)

# Feature vectors for the two texts predicted on later.
test_vector = get_vector(corpus_dict + test_data[0], unigrams, tagset)
dickens_test_array = get_vector(
    dickens_corpus_dict + 'pg730.txt', unigrams, tagset
)
print("gets here")

gets here


In [32]:
from numpy import array
from torch import tensor, Size
from torch import nn
from torch import optim
from torch import functional
from torch import from_numpy
from torch.utils.data import TensorDataset

# Training features: the first element of features_vector_array is the 2-D
# feature matrix (one row per text). Cast to float32 so it matches the
# default parameter dtype of nn.Linear.
input_array = array(features_vector_array[0], dtype='float32')
input_vector = from_numpy(input_array)

# Training targets: the second element is the 1-D target array. The linear
# model emits shape (N, 1), so give the targets a trailing dimension as well.
# With shape (N,) mse_loss would broadcast (N, 1) against (N,) into an (N, N)
# comparison and report a wrong loss.
output_array = array(features_vector_array[1], dtype='float32')
target_vector = from_numpy(output_array).unsqueeze(1)

# Pairs each feature row with its target for per-sample iteration.
input_and_target = TensorDataset(input_vector, target_vector)

# Feature vectors of the two texts to predict on.
predict_array = array(test_vector, dtype='float32')
predict_vector = from_numpy(predict_array)

dickens_predict_array = array(dickens_test_array, dtype='float32')
dickens_predict_vector = from_numpy(dickens_predict_array)

# Size the model from the data so it follows the number of unigram and tag
# features instead of a hard-coded width.
linear_model = nn.Linear(input_vector.shape[1], 1)
loss_func = nn.functional.mse_loss
optimize = optim.SGD(linear_model.parameters(), lr=0.001)

# Loss of the untrained model over the whole training set.
loss = loss_func(linear_model(input_vector), target_vector)

def train(num_epochs, model, loss_func, optimizer, dataset=None):
    """Fit ``model`` with per-sample stochastic gradient descent.

    Args:
        num_epochs: Number of full passes to make over the dataset.
        model: Callable mapping an input tensor to a prediction tensor.
        loss_func: Callable ``loss_func(prediction, target)`` returning a
            scalar tensor that supports ``backward()``.
        optimizer: Optimizer that updates ``model``'s parameters.
        dataset: Iterable of ``(input, target)`` pairs. When omitted, the
            module-level ``input_and_target`` dataset is used, which keeps
            existing four-argument calls working.

    Prints the loss of the last sample processed. Prints nothing when no
    sample was processed (zero epochs or an empty dataset).
    """
    if dataset is None:
        dataset = input_and_target

    loss = None
    for _ in range(num_epochs):
        for xb, yb in dataset:
            # Start each step from clean gradients so anything accumulated
            # by an earlier backward() call cannot leak into this update.
            optimizer.zero_grad()

            # Predictions
            pred = model(xb)
            loss = loss_func(pred, yb)

            # Stochastic gradient descent
            loss.backward()
            optimizer.step()

    if loss is not None:
        print(loss)

# Fit the linear model with 500 passes over the training pairs.
train(
    num_epochs=500,
    model=linear_model,
    loss_func=loss_func,
    optimizer=optimize,
)

# Run the fitted model on the held-out feature vector and show the raw output.
pred = linear_model(predict_vector)
print(pred)

def goodness(predicted, actual, range_size):
    """Return the signed prediction error as a fraction of ``range_size``.

    Args:
        predicted: Predicted value; ``predicted - actual`` must be
            convertible with ``float()``.
        actual: Reference value to compare against.
        range_size: Width of the range the error is scaled by.

    Returns:
        ``(predicted - actual) / range_size`` as a float. The sign is kept;
        callers that want a magnitude apply ``abs`` themselves.
    """
    signed_error = float(predicted - actual)
    span = float(range_size)
    return signed_error / span

# Report for the first text. `pred` here is still the model output for
# predict_vector computed just after training.
print("Jane Austen")
print("Age predicted for Mansfield Park: %s" % pred)
# 39 is passed as `actual` and 15 as `range_size`; presumably the author's
# real age for this book and the width of the age range -- TODO confirm.
print("Goodness Metric: %s\n" % abs(goodness(pred, 39, 15)))

# `pred` is rebound to the model output for the second text, so the lines
# below must stay after the report above.
pred = linear_model(dickens_predict_vector)
print("Charles Dickens")
print("Age predicted for Oliver Twist: %s" % pred)
# 25 is passed as `actual` and 10 as `range_size` -- same caveat as above.
print("Goodness Metric: %s" % abs(goodness(pred, 25, 10)))


tensor(40.9405, grad_fn=<MseLossBackward>)
tensor([35.5401], grad_fn=<AddBackward0>)
Jane Austen
Age predicted for Mansfield Park: tensor([35.5401], grad_fn=<AddBackward0>)
Goodness Metric: 0.2306610107421875

Charles Dickens
Age predicted for Oliver Twist: tensor([35.5576], grad_fn=<AddBackward0>)
Goodness Metric: 1.0557559967041015
