In [1]:
import os
import re
from collections import OrderedDict
from operator import itemgetter

import numpy as np
import pandas as pd

In [2]:
def get_vocab(ifile):
    """Count word frequencies over the text columns of a headerless CSV file.

    The first column is skipped; every column from the second onward is
    treated as text. For each row the text cells are joined with a space,
    every character other than ASCII letters, digits, space, apostrophe and
    hyphen is deleted, the result is lower-cased and then split on spaces.

    Parameters
    ----------
    ifile : str, path object or file-like object
        Anything accepted by ``pandas.read_csv``. The file is read without a
        header row, so its first line is counted as data.

    Returns
    -------
    collections.OrderedDict
        Maps each word to its number of occurrences, ordered from the most
        to the least frequent word (ties keep first-seen order).

    Notes
    -----
    * Cells are read as plain strings with NA detection disabled, so empty
      cells contribute nothing and literal texts such as "NA" or "null" are
      counted as ordinary words instead of turning into the token "nan".
    * Rows are processed straight from the DataFrame instead of through a
      fixed-width NumPy unicode array, so long cells are never truncated and
      memory use does not scale with a fixed per-cell width.
    * HTML markup and escape sequences present in the text (for example
      ``<br />`` or a literal ``\\n``) are not interpreted: their
      non-alphanumeric characters are simply deleted, which can fuse the
      remaining letters with neighbouring words.
    """
    # header=None: without it pandas would consume the first data row as
    # column headings.
    dataset = pd.read_csv(ifile, header=None, dtype=str, keep_default_na=False)

    # Drop the first column; fillna("") covers rows that have fewer fields
    # than the widest row (pandas pads those with NaN).
    text_columns = dataset.iloc[:, 1:].fillna("")

    vocab = {}
    for row in text_columns.itertuples(index=False, name=None):
        text_in_row = re.sub(r"[^a-zA-Z0-9 '-]", "", " ".join(row)).lower()
        for word in text_in_row.split():
            vocab[word] = vocab.get(word, 0) + 1

    # sorted() is stable, so equally frequent words keep insertion order.
    return OrderedDict(sorted(vocab.items(), key=itemgetter(1), reverse=True))

In [3]:
def write_vocab_file(input_file, vocab_file, num_words=10000):
    """Write the most frequent words of a CSV dataset to a tab-separated file.

    The vocabulary is obtained from ``get_vocab(input_file)`` and written in
    the order that mapping yields its words. Each output line has the form
    ``word<TAB>rank<TAB>count``, where ``rank`` starts at 1.

    Parameters
    ----------
    input_file : str or path-like
        CSV file handed to ``get_vocab``.
    vocab_file : str or path-like
        Destination text file; it is overwritten if it already exists. Its
        parent directory is created when missing.
    num_words : int or None, optional
        Maximum number of words to write (default 10000). ``None`` writes
        the whole vocabulary; zero or a negative value writes nothing.
    """
    # Create the output directory before the (potentially slow) vocabulary
    # build, so a missing folder cannot discard the finished computation.
    out_dir = os.path.dirname(vocab_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    ord_vocab = get_vocab(input_file)
    with open(vocab_file, 'w', encoding='utf-8') as textfile:
        for rank, (word, count) in enumerate(ord_vocab.items(), start=1):
            # Stop as soon as the limit is reached instead of scanning the
            # rest of the vocabulary.
            if num_words is not None and rank > num_words:
                break
            textfile.write(word + "\t" + str(rank) + "\t" + str(count) + "\n")

In [4]:
# Build one vocabulary file per dataset.
# Call signature: write_vocab_file(input_file, vocab_file, num_words=10000)

# AG News
write_vocab_file(
    input_file="../Datasets/ag_news_csv/train.csv",
    vocab_file="./Output/Vocabulary2/ag_news.txt",
)

In [None]:
# Amazon Review (Full)
write_vocab_file(
    input_file="../Datasets/amazon_review_full_csv/train.csv",
    vocab_file="./Output/Vocabulary2/amazon_review_full.txt",
)

In [None]:
# Amazon Review (Polarity)
write_vocab_file(
    input_file="../Datasets/amazon_review_polarity_csv/train.csv",
    vocab_file="./Output/Vocabulary2/amazon_review_polarity.txt",
)

In [5]:
# DBpedia
write_vocab_file(
    input_file="../Datasets/dbpedia_csv/train.csv",
    vocab_file="./Output/Vocabulary2/dbpedia.txt",
)

In [None]:
# Sogou News
write_vocab_file(
    input_file="../Datasets/sogou_news_csv/train.csv",
    vocab_file="./Output/Vocabulary2/sogou_news.txt",
)

In [6]:
# Yahoo Answers
write_vocab_file(
    input_file="../Datasets/yahoo_answers_csv/train.csv",
    vocab_file="./Output/Vocabulary2/yahoo_answers.txt",
)