In [12]:
import nltk
import os
import numpy
from nltk import word_tokenize
from numpy import array
from collections import Counter
import math


def clean_and_tokenize_file(file_name):
    """Read a text file and split it into whitespace-delimited tokens.

    Despite the name, no cleaning is applied: the tokens are exactly what
    ``str.split()`` yields, so case and punctuation are kept as they appear
    in the file.

    Args:
        file_name: Path of the text file to read.

    Returns:
        list[str]: The file's tokens in reading order (empty for an empty
        or whitespace-only file).

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises; the
    # previous version never closed it.
    with open(file_name) as file:
        raw_text = file.read()
    return raw_text.split()

def clean_and_tokenize_corpus(directories):
    """Tokenize every file found directly inside the given directories.

    Each file is read in full and split on whitespace; no other cleaning is
    applied. Entries named ``.DS_Store`` are skipped.

    Args:
        directories: Iterable of directory paths. A trailing path separator
            is optional.

    Returns:
        list[list[str]]: One token list per file, ordered by directory (as
        given) and then by file name.

    Raises:
        OSError: If a directory cannot be listed or an entry cannot be read
            as a file.
    """
    file_list = []
    for directory in directories:
        # os.listdir() returns entries in arbitrary order; sorting makes the
        # corpus order (and anything derived from it) reproducible.
        for file_name in sorted(os.listdir(directory)):
            if file_name == '.DS_Store':
                continue
            # os.path.join works with or without a trailing separator on
            # `directory`, unlike plain string concatenation.
            path = os.path.join(directory, file_name)
            # `with` guarantees the handle is closed even if read() fails.
            with open(path) as file:
                file_list.append(file.read().split())
    return file_list

def count_total_vocab(file_wordfreqs):
    """Tally how many times each word occurs across a collection of word lists.

    Args:
        file_wordfreqs: Iterable whose items are themselves iterables of
            words (for example one token list per file).

    Returns:
        dict: Maps each word to its total number of occurrences. Keys appear
        in the order the words were first encountered.
    """
    totals = {}
    for token_list in file_wordfreqs:
        for token in token_list:
            # Unseen words start from zero.
            totals[token] = totals.get(token, 0) + 1
    return totals

def get_corpus_vocab(corpus, unknown_threshold):
    """Return the first ``unknown_threshold`` distinct words of the corpus.

    The words come back in the key order of the dictionary produced by
    ``count_total_vocab``; the counts themselves are not consulted.

    NOTE(review): the parameter name suggests a frequency cut-off, but the
    code only truncates the word list -- confirm this is the intended
    selection rule.

    Args:
        corpus: Collection of token lists, passed to ``count_total_vocab``.
        unknown_threshold: Maximum number of words to keep (used as a slice
            bound).

    Returns:
        list: At most ``unknown_threshold`` distinct words.
    """
    word_counts = count_total_vocab(corpus)
    distinct_words = list(word_counts.keys())
    return distinct_words[:unknown_threshold]

def feature_vector(novel, vocab, corpus, pos_tags):
    """Build the numeric feature vector for one novel.

    The vector is the concatenation of:
      1. one TF-IDF weight per word in ``vocab`` (from ``tf_idf``),
      2. one relative frequency per tag in ``pos_tags`` (occurrences of the
         tag divided by the number of tokens),
      3. the ratio of distinct tokens to total tokens.

    Args:
        novel: Path of the text file to featurize.
        vocab: Words to compute TF-IDF weights for.
        corpus: Tokenized reference documents used for the IDF part.
        pos_tags: Part-of-speech tags whose relative frequencies are wanted.

    Returns:
        list[float]: ``len(vocab) + len(pos_tags) + 1`` values.

    Raises:
        ZeroDivisionError: If the novel contains no tokens.
    """
    tokenized_novel = clean_and_tokenize_file(novel)
    num_tokens = len(tokenized_novel)

    word_vector = tf_idf(vocab, tokenized_novel, corpus)

    # Count every tag in a single pass; calling list.count() once per tag
    # would rescan the whole novel for each entry of pos_tags. A Counter
    # returns 0 for tags that never occur, matching list.count().
    tag_counts = Counter(tag for (word, tag) in nltk.pos_tag(tokenized_novel))
    pos_vector = [tag_counts[pos_tag] / num_tokens for pos_tag in pos_tags]
    word_vector = word_vector + pos_vector

    # Ratio of unique words to total number of words; a set gives the
    # distinct-token count without building a frequency table.
    vocab_size = len(set(tokenized_novel)) / num_tokens
    word_vector.append(vocab_size)

    return word_vector

def tf_idf(vocab, document, corpus):
    """Compute a TF-IDF weight for every vocabulary word in one document.

    For each word: ``tf = count_in_document / len(document)`` and
    ``idf = log(len(corpus) / number_of_corpus_documents_containing_word)``.

    Args:
        vocab: Iterable of words to score; the result follows its order.
        document: Token list of the document being scored.
        corpus: Iterable of token collections (hashable tokens) used to
            compute document frequencies.

    Returns:
        list[float]: One ``tf * idf`` value per vocabulary word. The weight
        is 0.0 when the document is empty or when the word occurs in no
        corpus document (both cases previously raised ZeroDivisionError).
    """
    counts = Counter(document)
    doc_len = len(document)
    doc_num = len(corpus)
    # Build each corpus document's word set once, so membership tests do not
    # rescan the raw token lists for every vocabulary word.
    corpus_word_sets = [set(doc) for doc in corpus]

    tf_idfs = []
    for word in vocab:
        doc_counter = sum(1 for words in corpus_word_sets if word in words)
        if doc_len == 0 or doc_counter == 0:
            # Either ratio would divide by zero; give such terms no weight.
            tf_idfs.append(0.0)
            continue
        tf = counts[word] / doc_len
        idf = math.log(doc_num / doc_counter)
        tf_idfs.append(tf * idf)
    return tf_idfs

def create_vector_arrays(training_data, corpus_dict, vocab, corpus, pos_tags):
    """Assemble the feature matrix and the target vector for a training set.

    Prints the feature-vector length before building the arrays.

    Args:
        training_data: Mapping of file name -> age.
        corpus_dict: String prefix (a directory path) prepended to each
            file name to form the path handed to ``feature_vector``.
        vocab: Vocabulary words forwarded to ``feature_vector``.
        corpus: Tokenized corpus forwarded to ``feature_vector``.
        pos_tags: Part-of-speech tags forwarded to ``feature_vector``.

    Returns:
        tuple: ``(vector_array, results_array)``. ``vector_array`` has one
        row per training item and ``len(vocab) + len(pos_tags) + 1``
        columns; ``results_array`` holds the matching ages.
    """
    sample_count = len(training_data)
    len_feature_vector = len(vocab) + len(pos_tags) + 1
    print(len_feature_vector)

    vector_array = numpy.zeros((sample_count, len_feature_vector))
    results_array = numpy.zeros(sample_count)

    # Rows follow the iteration order of the training_data mapping.
    for row, (file_name, age) in enumerate(training_data.items()):
        results_array[row] = age
        vector_array[row] = feature_vector(
            corpus_dict + file_name, vocab, corpus, pos_tags
        )

    return (vector_array, results_array)
    
    
# File name -> age; the ages become the regression targets built by
# create_vector_arrays().
training_data = {'121-0.txt': 28, '158-0.txt':40}
corpus = clean_and_tokenize_corpus(['data/JaneAusten/', 'data/CarrollLewis/', 'data/CharlesDickens/'])
vocab = get_corpus_vocab(corpus, 200)
# Directory prefix prepended to every key of training_data.
corpus_dict = 'data/JaneAusten/'    

# Part-of-speech tags whose relative frequencies become features.
tagset = ['CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB']

# Keep the (features, ages) pair under the name the model cell reads, so
# the notebook runs top to bottom on a fresh kernel instead of relying on
# leftover kernel state.
features_vector_array = create_vector_arrays(training_data, corpus_dict, vocab, corpus, tagset)
# Bare last expression: still renders the arrays as the cell output.
features_vector_array

236


(array([[7.49373681e-05, 7.49373681e-05, 1.05411408e-03, 7.61964020e-06,
         1.05123114e-05, 3.74686841e-05, 3.74686841e-05, 1.94977690e-05,
         1.05123114e-05, 3.74686841e-05, 8.98545755e-06, 1.22433156e-05,
         2.24531026e-04, 2.36348449e-05, 2.55729021e-03, 3.15369343e-05,
         2.83854487e-03, 6.87301289e-03, 1.18174224e-05, 3.74686841e-05,
         4.83096229e-03, 3.79668607e-05, 1.57408067e-03, 2.93207225e-05,
         3.74686841e-05, 2.59983294e-04, 2.53112405e-05, 5.49982840e-03,
         5.13348831e-03, 3.46959523e-03, 2.84832265e-05, 1.62371384e-03,
         1.72534368e-04, 3.74686841e-05, 3.78157518e-05, 2.12713604e-05,
         3.58364386e-05, 6.16869451e-04, 2.69563726e-05, 4.27248398e-05,
         2.10350119e-04, 3.59249642e-04, 6.66502625e-04, 1.43430363e-04,
         2.84832265e-05, 6.38140811e-05, 3.92338425e-04, 3.74686841e-05,
         3.52159189e-04, 2.95435561e-04, 1.12406052e-04, 1.52392804e-05,
         4.79787351e-04, 1.01157136e-03, 3.16706921

In [None]:
from numpy import array
from torch import tensor, Size
from torch import nn
from torch import optim
from torch import functional
from torch import from_numpy
from torch.utils.data import TensorDataset

# features_vector_array is indexed as a pair: [0] is the feature matrix
# (one row per sample) and [1] the matching ages.
input_array = array(features_vector_array[0], dtype='float32')
input_vector = from_numpy(input_array)

output_array = array(features_vector_array[1], dtype='float32')
# Shape the targets as (n_samples, 1) so they line up with the output of
# nn.Linear(..., 1). With flat (n_samples,) targets, mse_loss broadcasts
# the two operands against each other instead of comparing element-wise.
target_vector = from_numpy(output_array).reshape(-1, 1)

input_and_target = TensorDataset(input_vector, target_vector)

# NOTE(review): test_vector and dickens_test_array are not defined in the
# visible cells; they must already exist in the kernel -- confirm where
# they are built.
predict_array = array(test_vector, dtype='float32')
predict_vector = from_numpy(predict_array)

dickens_predict_array = array(dickens_test_array, dtype='float32')
dickens_predict_vector = from_numpy(dickens_predict_array)

# Size the input layer from the data rather than hard-coding a feature
# count that silently goes stale when the feature set changes.
linear_model = nn.Linear(input_vector.shape[1], 1)
loss_func = nn.functional.mse_loss 
optimize = optim.SGD(linear_model.parameters(), lr=0.001)
# Loss of the untrained model over the whole training set.
loss = loss_func(linear_model(input_vector), target_vector)

def train(num_epochs, model, loss_func, optimizer, dataset=None):
    """Fit ``model`` with per-sample stochastic gradient descent.

    Every epoch visits each ``(input, target)`` pair of the dataset once and
    takes one optimizer step per pair. The loss of the final step is printed
    when at least one step was taken.

    Args:
        num_epochs: Number of passes over the dataset.
        model: Callable model whose parameters ``optimizer`` updates.
        loss_func: Callable ``(prediction, target) -> loss`` whose result
            supports ``.backward()``.
        optimizer: Optimizer providing ``zero_grad()`` and ``step()``.
        dataset: Iterable of ``(input, target)`` pairs. Defaults to the
            notebook-level ``input_and_target``, which earlier versions of
            this function always read implicitly.

    Returns:
        None
    """
    if dataset is None:
        dataset = input_and_target

    # Stays None when no step runs (zero epochs or an empty dataset), so the
    # final print cannot hit an unbound local.
    loss = None
    for _ in range(num_epochs):
        for xb, yb in dataset:
            # Clear gradients first so nothing stale is accumulated into
            # this step.
            optimizer.zero_grad()

            # Predictions
            pred = model(xb)
            loss = loss_func(pred, yb)

            # Stochastic gradient descent
            loss.backward()
            optimizer.step()

    if loss is not None:
        print(loss)

# Fit the linear model on the training pairs.
train(
    num_epochs=500,
    model=linear_model,
    loss_func=loss_func,
    optimizer=optimize,
)

# Run the fitted model on the first held-out feature vector and show the
# raw model output.
pred = linear_model(predict_vector)
print(pred)

def goodness(predicted, actual, range_size):
    """Return the signed prediction error as a fraction of ``range_size``.

    Args:
        predicted: Predicted value.
        actual: True value.
        range_size: Width of the range used to scale the error.

    Returns:
        float: ``(predicted - actual) / range_size``. Negative when the
        prediction is below the true value.
    """
    error = float(predicted - actual)
    scale = float(range_size)
    return error / scale

def _report_prediction(author, title, prediction, actual_age, range_size):
    """Print one author's predicted age together with its goodness metric.

    Args:
        author: Heading printed above the report.
        title: Title of the work the prediction was made for.
        prediction: Single-element tensor produced by the model.
        actual_age: True age the prediction is compared against.
        range_size: Range width passed through to ``goodness``.
    """
    # .item() unwraps the single-element tensor into a plain Python number,
    # so the report shows the predicted age instead of a tensor repr.
    predicted_age = prediction.item()
    print(author)
    print("Age predicted for %s: %s" % (title, predicted_age))
    print("Goodness Metric: %s" % abs(goodness(predicted_age, actual_age, range_size)))


_report_prediction("Jane Austen", "Mansfield Park", pred, 39, 15)
print()  # blank line between the two reports

pred = linear_model(dickens_predict_vector)
_report_prediction("Charles Dickens", "Oliver Twist", pred, 25, 10)