## Reformatting the <span style="color:brown">Train/Valid/Test Files</span> as Matrices of Word IDs

In [1]:
import re
import numpy as np
import csv

In [2]:
def get_vocab_dictionary(vocabfile, nwords=5000):
    """
    Build a word -> id lookup from the first entries of a vocabulary file.

    Each usable line of the file must hold at least two whitespace-separated
    fields: the word, followed by its id. Any further fields on the line are
    ignored. The file is expected to list the most common words of the
    training set first, so that the first `nwords` entries are the `nwords`
    most common words.

    Blank lines and lines with fewer than two fields are skipped and do not
    count towards `nwords`.

    Args:
        vocabfile: Path of the vocabulary file to read.
        nwords: Maximum number of entries to read (default 5000). Zero or a
            negative value yields an empty dictionary.

    Returns:
        A dict using (key, value) = (word, id). The ids are kept as the
        strings read from the file (they are not converted to int). If a word
        occurs more than once among the entries read, its last id wins.
    """
    vocab_dict = {}
    entries_read = 0
    with open(vocabfile, "r") as f:
        for line in f:
            if entries_read >= nwords:
                break
            fields = line.split()
            if len(fields) < 2:
                # Blank or malformed line: skip it instead of failing on the
                # two-name unpack below.
                continue
            word, idnum = fields[:2]
            vocab_dict[word] = idnum
            entries_read += 1
    return vocab_dict


def get_file_of_ids(input_file, output_file, vocab_dict):
    """
    Translate a csv file of labelled text into a file of word ids.

    Every non-empty row of `input_file` produces one line in `output_file`:

        label_number <TAB> sequence_of_ids <NEWLINE>

    where label_number is the first cell of the row, and sequence_of_ids holds
    the id of every word of the remaining cells that is present in
    `vocab_dict`, in order of appearance. Each id is followed by one space (so
    a non-empty sequence ends with a trailing space). Words that are not in
    `vocab_dict` are dropped.

    Preprocessing of the text: all cells after the first are joined with a
    space; every character other than ASCII letters, digits, space, apostrophe
    and hyphen is removed (no space is inserted in its place); the result is
    lower-cased and split on whitespace.

    Args:
        input_file: Path of the csv file to read (label in the first column,
            text in the remaining columns).
        output_file: Path of the text file to write; overwritten if it exists.
        vocab_dict: Mapping of word -> id. Ids are written via str().

    Notes:
        Empty csv rows (e.g. blank lines) are skipped; they produce no output
        line. The input file is opened before the output file, so a missing
        input does not truncate an existing output file.
    """
    # Matches runs of the characters that the preprocessing keeps.
    allowed_chars = re.compile("[a-zA-Z0-9 '-]*")

    with open(input_file) as origtext, open(output_file, "w") as idtext:
        for row in csv.reader(origtext):
            if not row:
                # csv.reader yields [] for a blank line: there is no label.
                continue

            # Join all cells after the label, since these contain the text
            text_in_row = " ".join(row[1:])
            # Removal of punctuation, and lower case
            text_in_row = "".join(allowed_chars.findall(text_in_row)).lower()

            # Explicit membership test: only out-of-vocabulary words are
            # ignored, any other error is allowed to surface.
            ids = [
                str(vocab_dict[word]) + " "
                for word in text_in_row.split()
                if word in vocab_dict
            ]

            idtext.write(row[0] + "\t" + "".join(ids) + "\n")

In [3]:
# Use the functions defined above to write, for each dataset, the train and
# test text files holding the sequence of word ids of every entry.

# AG News: vocabulary lookup (default number of words), then both splits
agnews_dict = get_vocab_dictionary("../Output/Vocabulary/ag_news.txt")

get_file_of_ids(
    input_file="../Datasets/ag_news_csv/train.csv",
    output_file="../Output/Dataset_with_ids/agnews-train.txt",
    vocab_dict=agnews_dict,
)
get_file_of_ids(
    input_file="../Datasets/ag_news_csv/test.csv",
    output_file="../Output/Dataset_with_ids/agnews-test.txt",
    vocab_dict=agnews_dict,
)

In [4]:
# DBpedia: vocabulary lookup (default number of words), then both splits
dbpedia_dict = get_vocab_dictionary("../Output/Vocabulary/dbpedia.txt")

get_file_of_ids(
    input_file="../Datasets/dbpedia_csv/train.csv",
    output_file="../Output/Dataset_with_ids/dbpedia-train.txt",
    vocab_dict=dbpedia_dict,
)
get_file_of_ids(
    input_file="../Datasets/dbpedia_csv/test.csv",
    output_file="../Output/Dataset_with_ids/dbpedia-test.txt",
    vocab_dict=dbpedia_dict,
)

In [5]:
# Yahoo Answers: vocabulary lookup (default number of words), then both splits
yahoo_answers_dict = get_vocab_dictionary("../Output/Vocabulary/yahoo_answers.txt")

get_file_of_ids(
    input_file="../Datasets/yahoo_answers_csv/train.csv",
    output_file="../Output/Dataset_with_ids/yahoo_answers-train.txt",
    vocab_dict=yahoo_answers_dict,
)
get_file_of_ids(
    input_file="../Datasets/yahoo_answers_csv/test.csv",
    output_file="../Output/Dataset_with_ids/yahoo_answers-test.txt",
    vocab_dict=yahoo_answers_dict,
)

In [6]:
# Amazon Review (Full): vocabulary lookup (default number of words), then both splits
amazon_review_full_dict = get_vocab_dictionary("../Output/Vocabulary/amazon_review_full.txt")

get_file_of_ids(
    input_file="../Datasets/amazon_review_full_csv/train.csv",
    output_file="../Output/Dataset_with_ids/amazon_review_full-train.txt",
    vocab_dict=amazon_review_full_dict,
)
get_file_of_ids(
    input_file="../Datasets/amazon_review_full_csv/test.csv",
    output_file="../Output/Dataset_with_ids/amazon_review_full-test.txt",
    vocab_dict=amazon_review_full_dict,
)

In [7]:
# Amazon Review (Polarity): vocabulary lookup (default number of words), then both splits
amazon_review_polarity_dict = get_vocab_dictionary("../Output/Vocabulary/amazon_review_polarity.txt")

get_file_of_ids(
    input_file="../Datasets/amazon_review_polarity_csv/train.csv",
    output_file="../Output/Dataset_with_ids/amazon_review_polarity-train.txt",
    vocab_dict=amazon_review_polarity_dict,
)
get_file_of_ids(
    input_file="../Datasets/amazon_review_polarity_csv/test.csv",
    output_file="../Output/Dataset_with_ids/amazon_review_polarity-test.txt",
    vocab_dict=amazon_review_polarity_dict,
)