In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

## Function get_freqBoW_and_targetvect
This function obtains the <b>frequency bag-of-words matrix</b> representation and the <b>target vector</b> <u>for both training and test sets</u> of a given dataset.

Parameters:
1. dataset: string, one of {"ag_news", "dbpedia", "sogou_news", "yahoo", "amazon_full", "amazon_polarity"}. Name of the dataset whose train and test CSV files are loaded.

2. vocab_length: int, default=5000
   
   Represents the number of features in the frequency bag-of-words matrix (or, similarly, the number of words in the vocabulary)
   
3. ngram_min_max: tuple (min_n, max_n), default=(1,1).
   
   The lower and upper boundary of the range of n-values for different n-grams to be extracted. Examples: 
    - A value of (1,1) represents a 1-gram or single word,
    - A value of (2,2) represents a bigram,
    - And a value of (1,2) takes both 1-grams and bigrams (up to the number of features given by vocab_length).

In [2]:
def _load_text_and_targets(csv_path, text_columns):
    """Read one header-less dataset CSV and return ``(lines, targets)``.

    ``lines`` is a pandas Series in which the given text columns are joined
    with a single space (missing entries are replaced by the empty string).
    ``targets`` is column 0 of the file as a NumPy array.
    """
    frame = pd.read_csv(csv_path, header=None)

    # Combine the text columns, replacing NaN entries with "" so that the
    # string concatenation does not propagate NaN to the whole document.
    lines = frame.loc[:, text_columns[0]].fillna("")
    for column in text_columns[1:]:
        lines = lines + " " + frame.loc[:, column].fillna("")

    # Series.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
    targets = frame.loc[:, 0].to_numpy()
    return lines, targets


def get_freqBoW_and_targetvect(dataset, vocab_length=5000, ngram_min_max=(1,1)):
    """Build the frequency bag-of-words matrices and target vectors of a dataset.

    The vocabulary is learned on the training file only and then applied,
    unchanged, to the test file, so both matrices share the same columns.

    Parameters
    ----------
    dataset : str
        One of "ag_news", "dbpedia", "sogou_news", "yahoo", "amazon_full",
        "amazon_polarity".
    vocab_length : int, default=5000
        Passed to CountVectorizer as ``max_features`` (number of columns of
        the resulting matrices).
    ngram_min_max : tuple (min_n, max_n), default=(1, 1)
        Passed to CountVectorizer as ``ngram_range``.

    Returns
    -------
    x_train, y_train, x_test, y_test
        ``x_*`` are the document-term count matrices produced by
        CountVectorizer; ``y_*`` are NumPy arrays holding column 0 of the
        corresponding CSV file.

    Raises
    ------
    KeyError
        If ``dataset`` is not one of the supported names.
    """
    # Dictionary of the train and test files for all the datasets
    dict_datasets_files = {
        "ag_news": ["../Datasets/ag_news_csv/train.csv", "../Datasets/ag_news_csv/test.csv"],
        "dbpedia": ["../Datasets/dbpedia_csv/train.csv", "../Datasets/dbpedia_csv/test.csv"],
        "sogou_news": ["../Datasets/sogou_news_csv/train.csv", "../Datasets/sogou_news_csv/test.csv"],
        "yahoo": ["../Datasets/yahoo_answers_csv/train.csv", "../Datasets/yahoo_answers_csv/test.csv"],
        "amazon_full": ["../Datasets/amazon_review_full_csv/train.csv", "../Datasets/amazon_review_full_csv/test.csv"],
        "amazon_polarity": ["../Datasets/amazon_review_polarity_csv/train.csv", "../Datasets/amazon_review_polarity_csv/test.csv"],
    }

    if dataset not in dict_datasets_files:
        raise KeyError(
            "Unknown dataset {!r}; expected one of {}".format(dataset, sorted(dict_datasets_files))
        )

    # Get the corresponding train and test files according to the dataset chosen
    file_train, file_test = dict_datasets_files[dataset]

    # The Yahoo files carry three text columns, every other dataset two.
    text_columns = [1, 2, 3] if dataset == "yahoo" else [1, 2]

    # Sogou News is not English text, so no stop-word list is applied to it;
    # the other datasets use the built-in English stop words.
    stop_words = None if dataset == "sogou_news" else "english"

    # TRAIN: learn the vocabulary (top `vocab_length` terms) and count them.
    lines, y_train = _load_text_and_targets(file_train, text_columns)
    vectorizer = CountVectorizer(
        stop_words=stop_words,
        max_features=vocab_length,
        ngram_range=ngram_min_max,
    )
    x_train = vectorizer.fit_transform(lines)

    # TEST: reuse the fitted vectorizer so the test documents are tokenised
    # with the same stop words AND the same ngram_range as the training data.
    # Building a second vectorizer from the vocabulary alone would fall back
    # to unigrams and leave every n-gram column (n > 1) at zero.
    lines, y_test = _load_text_and_targets(file_test, text_columns)
    x_test = vectorizer.transform(lines)

    # Return the frequency bag-of-words and targets for the train and test sets
    return x_train, y_train, x_test, y_test

## AG News

In [3]:
# Build the AG News matrices with the default vocabulary size and unigrams,
# then report the shape of each feature matrix and target vector.
x_train, y_train, x_test, y_test = get_freqBoW_and_targetvect("ag_news")

print("TRAIN", x_train.shape, y_train.shape, sep="\n")
print("\n\nTEST", x_test.shape, y_test.shape, sep="\n")

TRAIN
(120000, 5000)
(120000,)


TEST
(7600, 5000)
(7600,)


## DBpedia

In [4]:
# Build the DBpedia matrices with the default vocabulary size and unigrams,
# then report the shape of each feature matrix and target vector.
x_train, y_train, x_test, y_test = get_freqBoW_and_targetvect("dbpedia")

print("TRAIN", x_train.shape, y_train.shape, sep="\n")
print("\n\nTEST", x_test.shape, y_test.shape, sep="\n")

TRAIN
(560000, 5000)
(560000,)


TEST
(70000, 5000)
(70000,)


## Sogou News

In [5]:
# Build the Sogou News matrices with the default vocabulary size and unigrams,
# then report the shape of each feature matrix and target vector.
x_train, y_train, x_test, y_test = get_freqBoW_and_targetvect("sogou_news")

print("TRAIN", x_train.shape, y_train.shape, sep="\n")
print("\n\nTEST", x_test.shape, y_test.shape, sep="\n")

TRAIN
(450000, 5000)
(450000,)


TEST
(60000, 5000)
(60000,)


## Yahoo Answers

In [6]:
# Build the Yahoo Answers matrices with the default vocabulary size and unigrams,
# then report the shape of each feature matrix and target vector.
x_train, y_train, x_test, y_test = get_freqBoW_and_targetvect("yahoo")

print("TRAIN", x_train.shape, y_train.shape, sep="\n")
print("\n\nTEST", x_test.shape, y_test.shape, sep="\n")

TRAIN
(1400000, 5000)
(1400000,)


TEST
(60000, 5000)
(60000,)


## Amazon Reviews (Full)

In [7]:
# Build the Amazon Reviews (Full) matrices with the default vocabulary size and
# unigrams, then report the shape of each feature matrix and target vector.
x_train, y_train, x_test, y_test = get_freqBoW_and_targetvect("amazon_full")

print("TRAIN", x_train.shape, y_train.shape, sep="\n")
print("\n\nTEST", x_test.shape, y_test.shape, sep="\n")

TRAIN
(3000000, 5000)
(3000000,)


TEST
(650000, 5000)
(650000,)


## Amazon Reviews (Polarity)

In [8]:
# Build the Amazon Reviews (Polarity) matrices with the default vocabulary size
# and unigrams, then report the shape of each feature matrix and target vector.
x_train, y_train, x_test, y_test = get_freqBoW_and_targetvect("amazon_polarity")

print("TRAIN", x_train.shape, y_train.shape, sep="\n")
print("\n\nTEST", x_test.shape, y_test.shape, sep="\n")

TRAIN
(3600000, 5000)
(3600000,)


TEST
(400000, 5000)
(400000,)
