In [21]:
import os
from collections import Counter

import nltk
import numpy
from nltk import word_tokenize
from numpy import array


def open_and_tokenize_file(file_name):
    """Read the text file at *file_name* and return its word tokens.

    Args:
        file_name: Path of the file to read. It is opened in text mode
            with the platform's default encoding.

    Returns:
        The result of ``word_tokenize`` applied to the file's full contents.
    """
    # The context manager closes the handle even if reading raises, so
    # repeated calls no longer leak open file descriptors.
    with open(file_name) as text_file:
        raw_text = text_file.read()
    return word_tokenize(raw_text)


def most_frequent_unigrams(num_unigrams,
                           corpus_path='data/full_training_corpus.txt'):
    """Return the most common tokens of a corpus file with their counts.

    Args:
        num_unigrams: Maximum number of entries to return.
        corpus_path: Path of the text file to tokenize. Defaults to the
            full training corpus, which was previously hard-coded.

    Returns:
        The result of ``FreqDist.most_common(num_unigrams)``: entries ordered
        from most to least frequent. It may hold fewer than ``num_unigrams``
        entries when the corpus has fewer distinct tokens.
    """
    tokens = open_and_tokenize_file(corpus_path)
    fdist = nltk.FreqDist(tokens)
    return fdist.most_common(num_unigrams)
    
def get_vector(file_name, unigrams, pos_tags):
    """Build the count-based feature vector for one text file.

    Args:
        file_name: Path of the text file to tokenize and POS-tag.
        unigrams: Sequence whose items carry the token to count at index 0
            (e.g. the pairs returned by ``most_frequent_unigrams``).
        pos_tags: Sequence of part-of-speech tag strings to count.

    Returns:
        A list of ``len(unigrams) + len(pos_tags)`` integers: the number of
        occurrences of each unigram token, followed by the number of
        occurrences of each POS tag, in the order given.
    """
    tokens = open_and_tokenize_file(file_name)
    tagged_tokens = nltk.pos_tag(tokens)

    # Tally everything in a single pass each; calling list.count() per
    # feature would rescan the whole document for every unigram and tag.
    token_counts = Counter(tokens)
    tag_counts = Counter(tag for _, tag in tagged_tokens)

    # Counter lookups yield 0 for absent keys, matching list.count().
    unigram_vector = [token_counts[unigram[0]] for unigram in unigrams]
    pos_vector = [tag_counts[pos_tag] for pos_tag in pos_tags]
    return unigram_vector + pos_vector

def create_vector_arrays(training_data, corpus_dict, unigrams, pos_tags):
    """Build the feature matrix and label vector for a set of files.

    Args:
        training_data: Mapping of file name to numeric label (the loop
            calls it ``age``).
        corpus_dict: Path prefix concatenated in front of each file name,
            so it must include any trailing separator.
        unigrams: Unigram entries forwarded to ``get_vector``.
        pos_tags: POS tag strings forwarded to ``get_vector``.

    Returns:
        A tuple ``(vector_array, results_array)``. ``vector_array`` has one
        row per file and ``len(unigrams) + len(pos_tags)`` columns;
        ``results_array`` holds the matching labels. Rows follow the
        iteration order of ``training_data``.
    """
    # Derive the row width from the feature lists instead of assuming
    # exactly 100 unigrams + 35 tags (the former hard-coded 135).
    num_features = len(unigrams) + len(pos_tags)
    vector_array = numpy.zeros((len(training_data), num_features))
    results_array = numpy.zeros(len(training_data))
    for index, (data, age) in enumerate(training_data.items()):
        results_array[index] = age
        vector_array[index] = get_vector(corpus_dict + data, unigrams, pos_tags)
    return (vector_array, results_array)
    
    
# Training files (relative to corpus_dict), each mapped to the numeric
# label that create_vector_arrays stores as the expected result.
training_data = {
    '121-0.txt': 28,
    '158-0.txt': 40,
    '161.txt': 36,
    '1342-0.txt': 38,
    'pg946.txt': 19,
    'pg105.txt': 42,
    '1212-0.txt': 15,
}

# NOTE(review): presumably the file(s) held out for evaluation; nothing
# in this cell reads it.
test_data = ['141-0.txt']

# Path prefix prepended to every training file name.
corpus_dict = 'data/JaneAusten/'

# Part-of-speech tags counted per document, in feature order.
tagset = [
    'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS',
    'MD', 'NN', 'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB',
    'RBR', 'RBS', 'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP',
    'VBZ', 'WDT', 'WP', 'WP$', 'WRB',
]

# Take the 100 most common corpus tokens as unigram features, then build
# the (feature matrix, label vector) pair; as the final expression its
# value is what the notebook cell displays.
unigrams = most_frequent_unigrams(100)
create_vector_arrays(training_data, corpus_dict, unigrams, tagset)

(array([[6.0850e+03, 2.9290e+03, 2.1990e+03, 2.1900e+03, 2.3470e+03,
         2.2010e+03, 1.2840e+03, 1.4760e+03, 1.4850e+03, 1.2210e+03,
         1.1060e+03, 1.1720e+03, 1.0160e+03, 9.3900e+02, 7.5100e+02,
         7.9400e+02, 9.3900e+02, 6.9800e+02, 8.0000e+02, 6.5900e+02,
         4.1900e+02, 7.0600e+02, 1.0800e+03, 1.0710e+03, 4.5700e+02,
         6.3600e+02, 4.9700e+02, 4.7800e+02, 5.2000e+02, 4.8600e+02,
         4.2200e+02, 4.6000e+02, 4.3300e+02, 4.6200e+02, 3.3900e+02,
         3.6500e+02, 3.9600e+02, 2.7700e+02, 4.1200e+02, 2.8500e+02,
         3.5500e+02, 3.3000e+02, 2.8900e+02, 0.0000e+00, 3.0600e+02,
         3.0700e+02, 2.7700e+02, 3.9200e+02, 2.8000e+02, 1.5500e+02,
         2.0100e+02, 0.0000e+00, 3.2800e+02, 2.1700e+02, 2.1600e+02,
         2.5300e+02, 2.4100e+02, 1.7100e+02, 2.4300e+02, 1.9600e+02,
         2.2900e+02, 1.8000e+02, 1.7300e+02, 1.9000e+02, 2.0900e+02,
         1.5800e+02, 2.1100e+02, 2.1600e+02, 2.0300e+02, 1.7100e+02,
         1.6300e+02, 1.9700e+02, 2