In [10]:
import nltk
import os
import numpy
from nltk import word_tokenize
from numpy import array
from collections import Counter
import math


def clean_and_tokenize_file(file_name, encoding=None):
    """Read a text file and split it into whitespace-separated tokens.

    Args:
        file_name: Path of the file to read.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default, matching the previous behavior.

    Returns:
        A list of the tokens produced by ``str.split()`` on the file's
        contents (no lower-casing or punctuation stripping is done).
    """
    # The context manager guarantees the handle is closed even if read() fails.
    with open(file_name, encoding=encoding) as handle:
        raw_text = handle.read()
    return raw_text.split()

def clean_and_tokenize_corpus(directories, encoding=None):
    """Tokenize every file found directly inside the given directories.

    Args:
        directories: Iterable of directory paths to scan (not recursive).
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default.

    Returns:
        A list with one entry per file read; each entry is the list of
        whitespace-separated tokens of that file. Files are visited in
        the order the directories are given and, within a directory, in
        sorted name order.
    """
    file_list = []
    for directory in directories:
        # os.listdir order is arbitrary; sorting keeps the corpus (and any
        # vocabulary derived from its order) reproducible between runs.
        for file_name in sorted(os.listdir(directory)):
            if file_name == '.DS_Store':
                continue
            # listdir yields bare names, so the directory must be joined
            # back on or the file would be looked up in the working dir.
            path = os.path.join(directory, file_name)
            if not os.path.isfile(path):
                # Sub-directories cannot be opened as text; skip them.
                continue
            with open(path, encoding=encoding) as handle:
                file_list.append(handle.read().split())
    return file_list

def count_total_vocab(file_wordfreqs):
    """Count how often each word occurs across a collection of token lists.

    Args:
        file_wordfreqs: Iterable of token sequences (one per document).

    Returns:
        A dict mapping each distinct word to its total number of
        occurrences; keys appear in first-seen order.
    """
    totals = {}
    for token_list in file_wordfreqs:
        for token in token_list:
            totals[token] = totals.get(token, 0) + 1
    return totals

def get_corpus_vocab(corpus, unknown_threshold):
    """Return the leading distinct words of the corpus as the vocabulary.

    Args:
        corpus: Iterable of token sequences (one per document).
        unknown_threshold: Number of words to keep; used as a slice bound.

    Returns:
        A list of at most ``unknown_threshold`` distinct words, in the
        order they are first seen in the corpus.
    """
    # NOTE(review): the per-word counts are computed but not consulted, so
    # this keeps the first N words seen rather than the N most frequent —
    # confirm that is the intended meaning of `unknown_threshold`.
    word_counts = count_total_vocab(corpus)
    first_seen_words = list(word_counts)
    return first_seen_words[:unknown_threshold]

def feature_vector(novel, vocab, corpus, pos_tags):
    """Build the feature vector for one novel.

    The vector is the concatenation of three parts, in this order:
    the tf-idf weight of every word in ``vocab``, the fraction of
    tokens carrying each tag in ``pos_tags``, and a single value with
    the ratio of distinct words to total words.

    Args:
        novel: Path of the text file to featurize.
        vocab: Sequence of vocabulary words (its order fixes the order
            of the tf-idf features).
        corpus: Sequence of tokenized documents used for the idf part.
        pos_tags: Sequence of part-of-speech tag names to measure.

    Returns:
        A list of ``len(vocab) + len(pos_tags) + 1`` floats. A file with
        no tokens yields an all-zero vector instead of dividing by zero.
    """
    tokenized_novel = clean_and_tokenize_file(novel)
    num_tokens = len(tokenized_novel)
    if num_tokens == 0:
        # Every feature is a ratio over the token count; with no tokens
        # each one is defined as zero.
        return [0.0] * (len(vocab) + len(pos_tags) + 1)

    word_vector = tf_idf(vocab, tokenized_novel, corpus)

    # Tally all tags in one pass rather than rescanning the tag list once
    # per requested tag.
    tag_counts = Counter(tag for _, tag in nltk.pos_tag(tokenized_novel))
    pos_vector = [tag_counts[pos_tag] / num_tokens for pos_tag in pos_tags]

    # Ratio of unique words to total number of words.
    vocab_size = len(set(tokenized_novel)) / num_tokens

    return word_vector + pos_vector + [vocab_size]

def tf_idf(vocab, document, corpus):
    """Compute unsmoothed tf-idf weights of ``vocab`` words for a document.

    For each word, tf is its count in ``document`` divided by the
    document length, and idf is ``log(len(corpus) / df)`` where df is
    the number of corpus documents containing the word.

    Args:
        vocab: Iterable of words to score; output order follows it.
        document: Sequence of tokens of the document being scored.
        corpus: Sequence of tokenized documents (each an iterable of
            hashable tokens) used to compute document frequencies.

    Returns:
        A list of floats, one per word in ``vocab``. The weight is 0.0
        when ``document`` is empty or when the word occurs in no corpus
        document (both cases would otherwise divide by zero).
    """
    counts = Counter(document)
    doc_len = len(document)
    doc_num = len(corpus)
    # Reduce each corpus document to its distinct words once, so the
    # per-word membership tests below are O(1) instead of list scans.
    doc_word_sets = [set(doc) for doc in corpus]

    tf_idfs = []
    for word in vocab:
        doc_counter = sum(1 for words in doc_word_sets if word in words)
        if doc_len == 0 or doc_counter == 0:
            tf_idfs.append(0.0)
            continue
        tf = counts[word] / doc_len
        idf = math.log(doc_num / doc_counter)
        tf_idfs.append(tf * idf)
    return tf_idfs

def create_vector_arrays(training_data, corpus_dict, vocab, corpus, pos_tags):
    """Build the feature matrix and target array for the training files.

    Args:
        training_data: Dict mapping a file name to its numeric target
            value (stored in the results array).
        corpus_dict: Directory containing the files named in
            ``training_data``; a trailing separator is optional.
        vocab: Sequence of vocabulary words passed to ``feature_vector``.
        corpus: Sequence of tokenized documents passed to
            ``feature_vector``.
        pos_tags: Sequence of part-of-speech tag names passed to
            ``feature_vector``.

    Returns:
        A tuple ``(vector_array, results_array)``: a 2-D float array with
        one feature row per training file, and a 1-D float array with the
        matching target values, both in ``training_data`` iteration order.
    """
    len_feature_vector = len(vocab) + len(pos_tags) + 1
    print(len_feature_vector)
    vector_array = numpy.zeros((len(training_data), len_feature_vector))
    results_array = numpy.zeros(len(training_data))
    for index, (data, age) in enumerate(training_data.items()):
        results_array[index] = age
        # Join rather than concatenate so the directory works with or
        # without a trailing path separator.
        novel_path = os.path.join(corpus_dict, data)
        vector_array[index] = feature_vector(novel_path, vocab, corpus, pos_tags)
    return (vector_array, results_array)
    
    
# Training files mapped to the numeric target value that
# create_vector_arrays stores in its results array (named `age` there).
training_data = {'121-0.txt': 28, '158-0.txt':40}
# Tokenize every file in the three author directories; this corpus feeds
# both the vocabulary and the document-frequency part of tf-idf.
corpus = clean_and_tokenize_corpus(['data/JaneAusten/', 'data/CarrollLewis/', 'data/CharlesDickens/'])
# Keep the first 200 distinct words seen in the corpus as the vocabulary.
vocab = get_corpus_vocab(corpus, 200)
# Directory prefix prepended to each training file name.
corpus_dict = 'data/JaneAusten/'    

# Part-of-speech tags whose relative frequencies become features
# (presumably Penn Treebank tags as emitted by nltk.pos_tag — confirm).
tagset = ['CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB']

# NOTE(review): with at most 200 vocab words and 35 tags the printed
# feature length would be at most 236; the recorded output below shows
# 3036, so it appears to come from a different revision — confirm.
create_vector_arrays(training_data, corpus_dict, vocab, corpus, tagset)

3036


(array([[1.00000000e+00, 1.00000000e+00, 1.00000000e+00, ...,
         3.75934976e-04, 4.60196264e-03, 1.45499799e-01],
        [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, ...,
         2.41360256e-04, 4.56043851e-03, 1.10822467e-01]]), array([28., 40.]))