# Frequency Bag-of-Words

In [1]:
import numpy as np

In [2]:
def get_frequency_bag_of_words(idfile, nwords=5000):
    """Build a relative-frequency bag-of-words matrix and its label vector.

    Every non-blank line of ``idfile`` is read as whitespace-separated
    integers: the first entry is the class label and the remaining entries
    are the (1-based) word ids of one review. Ids outside ``1..nwords`` are
    treated as out-of-vocabulary and ignored. Blank lines are skipped.

    Parameters
    ----------
    idfile : str
        Path to the text file holding one review per line.
    nwords : int, optional
        Vocabulary size, i.e. the number of columns of the returned matrix.

    Returns
    -------
    frequency_bow : numpy.ndarray of float, shape (num_reviews, nwords)
        Entry ``[n, k]`` is the number of times word id ``k + 1`` occurs in
        review ``n`` divided by the number of in-vocabulary words of that
        review. A review without any in-vocabulary word keeps an all-zero
        row, and its ``idfile`` / row index are printed as a diagnostic.
    yvector : numpy.ndarray of int, shape (num_reviews,)
        The class label of every review.

    Raises
    ------
    ValueError
        If an entry of a line cannot be parsed as an integer.
    """
    # First pass: count the reviews so both outputs can be preallocated.
    # Streaming the file avoids holding every line in memory at once.
    with open(idfile, 'r') as textfile:
        num_reviews = sum(1 for line in textfile if line.strip())

    # Floating point entries: each one is a relative frequency in [0, 1].
    frequency_bow = np.zeros((num_reviews, nwords))
    yvector = np.zeros(num_reviews, dtype=int)

    # Second pass: fill one row per non-blank line.
    with open(idfile, 'r') as textfile:
        reviews = filter(None, (line.split() for line in textfile))
        for n, words in enumerate(reviews):
            # The first entry is the true label, all the others are word ids.
            yvector[n] = int(words[0])
            word_ids = np.array(list(map(int, words[1:])), dtype=int)

            # Keep only ids inside the vocabulary. An explicit mask is used
            # because np.histogram closes its last bin on the right, which
            # would count id nwords + 1 as if it were word nwords.
            in_vocab = word_ids[(word_ids >= 1) & (word_ids <= nwords)]
            total = in_vocab.size

            # Avoid a division by zero, e.g. when a line contains a single
            # word ("D-scust-ing", probably meaning "disgusting") that is
            # not in the vocabulary: report the row and leave it as zeros.
            if total == 0:
                print(idfile, n)
                continue

            counts = np.bincount(in_vocab - 1, minlength=nwords)
            frequency_bow[n] = counts / total

    return frequency_bow, yvector

In [None]:
# Get the frequency bag-of-words matrix and the target vector

# AG News Dataset
# Builds the features and labels of the train and test splits, then prints
# each array followed by its shape as a quick sanity check.

# Signature reminder: get_frequency_bag_of_words(idfile, nwords=5000)
train_freqbow, ytrain_true = get_frequency_bag_of_words("../Output/Dataset_with_ids/agnews-train.txt")
print("\n\nTrain\n\nFrequency Bag-of-Words")
print(train_freqbow)
print(train_freqbow.shape)
print("\nTarget Vector")
print(ytrain_true)
print(ytrain_true.shape)

test_freqbow, ytest_true = get_frequency_bag_of_words("../Output/Dataset_with_ids/agnews-test.txt")
print("\n\nTest\n\nFrequency Bag-of-Words")
print(test_freqbow)
print(test_freqbow.shape)
print("\nTarget Vector")
print(ytest_true)
print(ytest_true.shape)

In [None]:
# DBpedia Dataset
# Builds the features and labels of the train and test splits, then prints
# each array followed by its shape as a quick sanity check.

# Signature reminder: get_frequency_bag_of_words(idfile, nwords=5000)
train_freqbow, ytrain_true = get_frequency_bag_of_words("../Output/Dataset_with_ids/dbpedia-train.txt")
print("\n\nTrain\n\nFrequency Bag-of-Words")
print(train_freqbow)
print(train_freqbow.shape)
print("\nTarget Vector")
print(ytrain_true)
print(ytrain_true.shape)

test_freqbow, ytest_true = get_frequency_bag_of_words("../Output/Dataset_with_ids/dbpedia-test.txt")
print("\n\nTest\n\nFrequency Bag-of-Words")
print(test_freqbow)
print(test_freqbow.shape)
print("\nTarget Vector")
print(ytest_true)
print(ytest_true.shape)

In [None]:
# Yahoo Answers Dataset
# Builds the features and labels of the train and test splits, then prints
# each array followed by its shape as a quick sanity check.

# Signature reminder: get_frequency_bag_of_words(idfile, nwords=5000)
train_freqbow, ytrain_true = get_frequency_bag_of_words("../Output/Dataset_with_ids/yahoo_answers-train.txt")
print("\n\nTrain\n\nFrequency Bag-of-Words")
print(train_freqbow)
print(train_freqbow.shape)
print("\nTarget Vector")
print(ytrain_true)
print(ytrain_true.shape)

test_freqbow, ytest_true = get_frequency_bag_of_words("../Output/Dataset_with_ids/yahoo_answers-test.txt")
print("\n\nTest\n\nFrequency Bag-of-Words")
print(test_freqbow)
print(test_freqbow.shape)
print("\nTarget Vector")
print(ytest_true)
print(ytest_true.shape)

In [None]:
# Amazon Reviews (Full) Dataset
# Builds the features and labels of the train and test splits, then prints
# each array followed by its shape as a quick sanity check.

# Signature reminder: get_frequency_bag_of_words(idfile, nwords=5000)
train_freqbow, ytrain_true = get_frequency_bag_of_words("../Output/Dataset_with_ids/amazon_review_full-train.txt")
print("\n\nTrain\n\nFrequency Bag-of-Words")
print(train_freqbow)
print(train_freqbow.shape)
print("\nTarget Vector")
print(ytrain_true)
print(ytrain_true.shape)

test_freqbow, ytest_true = get_frequency_bag_of_words("../Output/Dataset_with_ids/amazon_review_full-test.txt")
print("\n\nTest\n\nFrequency Bag-of-Words")
print(test_freqbow)
print(test_freqbow.shape)
print("\nTarget Vector")
print(ytest_true)
print(ytest_true.shape)

In [None]:
# Amazon Reviews (Polarity) Dataset
# Builds the features and labels of the train and test splits, then prints
# each array followed by its shape as a quick sanity check.

# Signature reminder: get_frequency_bag_of_words(idfile, nwords=5000)
train_freqbow, ytrain_true = get_frequency_bag_of_words("../Output/Dataset_with_ids/amazon_review_polarity-train.txt")
print("\n\nTrain\n\nFrequency Bag-of-Words")
print(train_freqbow)
print(train_freqbow.shape)
print("\nTarget Vector")
print(ytrain_true)
print(ytrain_true.shape)

test_freqbow, ytest_true = get_frequency_bag_of_words("../Output/Dataset_with_ids/amazon_review_polarity-test.txt")
print("\n\nTest\n\nFrequency Bag-of-Words")
print(test_freqbow)
print(test_freqbow.shape)
print("\nTarget Vector")
print(ytest_true)
print(ytest_true.shape)