In [1]:
import nltk
import os
import numpy
from nltk import word_tokenize
from numpy import array


def open_and_tokenize_file(file_name):
    """Read a text file and split its contents into word tokens.

    Args:
        file_name: Path of the text file to read.

    Returns:
        The token list produced by ``word_tokenize`` for the file's
        entire contents.
    """
    # The context manager closes the handle even if read() raises; the
    # previous version left the file open until garbage collection.
    with open(file_name) as handle:
        raw_text = handle.read()
    return word_tokenize(raw_text)


def most_frequent_unigrams(num_unigrams,
                           corpus_path='data/full_training_corpus.txt'):
    """Return the most frequent tokens of a corpus file.

    Args:
        num_unigrams: How many of the top tokens to return.
        corpus_path: Text file to count tokens in. Defaults to the
            training corpus path this function previously hard-coded,
            so existing one-argument calls behave as before.

    Returns:
        A list of ``(token, count)`` pairs as returned by
        ``FreqDist.most_common``, most frequent first.
    """
    tokens = open_and_tokenize_file(corpus_path)
    fdist = nltk.FreqDist(tokens)
    return fdist.most_common(num_unigrams)
    
def get_vector(file_name, unigrams, pos_tags):
    """Build the relative-frequency feature vector for one text file.

    Args:
        file_name: Path of the text file to featurize.
        unigrams: Sequence of items whose first element is a token
            (e.g. the ``(token, count)`` pairs from
            ``most_frequent_unigrams``); only element ``[0]`` is used.
        pos_tags: Sequence of part-of-speech tag strings.

    Returns:
        A list of floats: for each unigram, its occurrences divided by
        the file's token count, followed by the same ratio for each POS
        tag. All zeros when the file contains no tokens.
    """
    tokens = open_and_tokenize_file(file_name)
    num_tokens = len(tokens)
    if num_tokens == 0:
        # An empty file has no frequencies to report; returning zeros
        # avoids the ZeroDivisionError the ratios below would raise.
        return [0.0 for _ in unigrams] + [0.0 for _ in pos_tags]

    tags = [tag for (_, tag) in nltk.pos_tag(tokens)]

    # Count everything in a single pass each instead of rescanning the
    # full token/tag list once per feature with list.count().
    # FreqDist yields 0 for items that never occurred.
    token_counts = nltk.FreqDist(tokens)
    tag_counts = nltk.FreqDist(tags)

    unigram_vector = [token_counts[unigram[0]] / num_tokens
                      for unigram in unigrams]
    pos_vector = [tag_counts[pos_tag] / num_tokens for pos_tag in pos_tags]
    return unigram_vector + pos_vector

def create_vector_arrays(training_data, corpus_dict, unigrams, pos_tags):
    """Assemble the feature matrix and label vector for a set of files.

    Args:
        training_data: Mapping of file name to its numeric label.
        corpus_dict: Path prefix prepended (by plain string
            concatenation) to each file name, so it should end with a
            path separator.
        unigrams: Unigram features, passed through to ``get_vector``.
        pos_tags: POS-tag features, passed through to ``get_vector``.

    Returns:
        A tuple ``(vector_array, results_array)``: a 2-D array with one
        row per file and ``len(unigrams) + len(pos_tags)`` columns, and
        a 1-D array holding the label of each file in the same order.
    """
    # Derive the row width from the actual feature lists rather than a
    # fixed 135, which only matched 100 unigrams plus 35 tags.
    num_features = len(unigrams) + len(pos_tags)
    vector_array = numpy.zeros((len(training_data), num_features))
    results_array = numpy.zeros(len(training_data))
    for index, (data, age) in enumerate(training_data.items()):
        results_array[index] = age
        vector_array[index] = get_vector(corpus_dict + data, unigrams, pos_tags)
    return (vector_array, results_array)
    
    
# File name -> numeric label; create_vector_arrays stores each value as
# the target ("age") for the corresponding feature row.
training_data = {
    '121-0.txt': 28,
    '158-0.txt': 40,
    '161.txt': 36,
    '1342-0.txt': 38,
    'pg946.txt': 19,
    'pg105.txt': 42,
    '1212-0.txt': 15,
}

# Held-out file name(s); not consumed anywhere in this cell.
test_data = ['141-0.txt']

# Directory prefix joined to each file name by string concatenation,
# hence the trailing slash.
corpus_dict = 'data/JaneAusten/'

# Part-of-speech tags whose relative frequencies become features
# (presumably the Penn Treebank tags emitted by nltk.pos_tag).
tagset = [
    'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS',
    'MD', 'NN', 'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB',
    'RBR', 'RBS', 'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP',
    'VBZ', 'WDT', 'WP', 'WP$', 'WRB',
]

# Top 100 corpus tokens serve as the unigram features.
unigrams = most_frequent_unigrams(100)

# Bare expression so the notebook displays the resulting arrays.
create_vector_arrays(training_data, corpus_dict, unigrams, tagset)

(array([[6.68387522e-02, 3.21726714e-02, 2.41542179e-02, 2.40553603e-02,
         2.57798770e-02, 2.41761863e-02, 1.41036907e-02, 1.62126538e-02,
         1.63115114e-02, 1.34116872e-02, 1.21485062e-02, 1.28734622e-02,
         1.11599297e-02, 1.03141476e-02, 8.24912127e-03, 8.72144112e-03,
         1.03141476e-02, 7.66695958e-03, 8.78734622e-03, 7.23857645e-03,
         4.60237258e-03, 7.75483304e-03, 1.18629174e-02, 1.17640598e-02,
         5.01977153e-03, 6.98594025e-03, 5.45913884e-03, 5.25043937e-03,
         5.71177504e-03, 5.33831283e-03, 4.63532513e-03, 5.05272408e-03,
         4.75615114e-03, 5.07469244e-03, 3.72363796e-03, 4.00922671e-03,
         4.34973638e-03, 3.04261863e-03, 4.52548330e-03, 3.13049209e-03,
         3.89938489e-03, 3.62478032e-03, 3.17442882e-03, 0.00000000e+00,
         3.36115993e-03, 3.37214411e-03, 3.04261863e-03, 4.30579965e-03,
         3.07557118e-03, 1.70254833e-03, 2.20782074e-03, 0.00000000e+00,
         3.60281195e-03, 2.38356766e-03, 2.37258348