In [9]:
import numpy as np 
import pickle 
from tqdm import tqdm 
import os
from scipy import sparse 
import gc

In [10]:
# Source root: each sub-directory of this folder is read as one dataset.
path_in = '/data/datn/final_data/holdout_SOLA-TPS-idrop-nograd-nobonus/SOLA-TPS-idrop-nograd-nobonus/dataset/6-statictarget-datasets/'
# Destination root: converted files are written to path_out + <dataset name>.
# Both paths are joined by plain string concatenation below, so they must end with '/'.
path_out = '/data/datn/final_data/holdout_data/'

In [11]:
# Every entry found directly under path_in is treated as a dataset name.
# os.listdir returns entries in arbitrary order, so the processing order may vary between runs.
lst_dataset = os.listdir(path_in)
lst_dataset

['Yahoo',
 'TMN',
 'TMNtitle',
 'Grolier',
 'Agnews-title',
 'NYtimes',
 'Agnews',
 'Twitter',
 '20newgroups']

In [12]:
def convert_to_bow(path_bow, vocab_len):
    """Parse a bag-of-words text file into a CSR matrix of term counts.

    Each line of the file describes one document as whitespace-separated
    tokens. The first token is skipped (presumably the number of distinct
    terms, as in the LDA-C format -- TODO confirm); every following token has
    the form ``index:count``.

    The CSR arrays are built directly, so memory use is proportional to the
    number of non-zero entries instead of ``n_docs * vocab_len``.

    Parameters
    ----------
    path_bow : str
        Path of the text file to read.
    vocab_len : int
        Number of columns of the resulting matrix (vocabulary size).

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape ``(n_lines, vocab_len)`` with dtype ``int32``.
        If a term index is repeated on a line, the last count wins; terms
        whose final count is 0 are not stored.

    Raises
    ------
    IndexError
        If a term index is outside ``[0, vocab_len)``. Negative indices are
        rejected rather than wrapped around.
    ValueError
        If a token is not of the form ``index:count`` with integer parts.
    """
    with open(path_bow, 'r') as f:
        data = f.read().splitlines()

    indptr = [0]   # indptr[i]:indptr[i + 1] delimits row i inside indices/counts
    indices = []
    counts = []
    for i in tqdm(range(len(data))):
        row = {}
        for term in data[i].split()[1:]:
            idx, cnt = term.split(':')
            idx = int(idx)
            if not 0 <= idx < vocab_len:
                raise IndexError(
                    'term index %d out of range for vocabulary of size %d (line %d)'
                    % (idx, vocab_len, i + 1))
            # Plain assignment: a repeated index overwrites the earlier count.
            row[idx] = int(cnt)
        # Sorted column order and no explicit zeros keep the matrix canonical.
        for idx in sorted(row):
            if row[idx] != 0:
                indices.append(idx)
                counts.append(row[idx])
        indptr.append(len(indices))

    sparse_vector = sparse.csr_matrix(
        (np.array(counts, dtype = np.int32),
         np.array(indices, dtype = np.int64),
         np.array(indptr, dtype = np.int64)),
        shape = (len(data), vocab_len))
    return sparse_vector

In [13]:
def convert_prior_vector(prior):
    """Turn text rows of whitespace-separated numbers into a float64 array.

    Parameters
    ----------
    prior : sequence of str
        One string per row; each string holds the row's values separated by
        whitespace.

    Returns
    -------
    numpy.ndarray
        Array of dtype ``float64`` with one row per input string.
    """
    rows = [prior[k].split() for k in tqdm(range(len(prior)))]
    return np.array(rows, dtype = np.float64)

In [14]:
def write_file(data, path, is_pickle = True):
    """Save ``data`` to ``path`` as a pickle or as newline-joined text.

    Parameters
    ----------
    data : object
        Any picklable object when ``is_pickle`` is true; otherwise an
        iterable of strings.
    path : str
        Destination file; it is overwritten if it already exists.
    is_pickle : bool, default True
        True  -> binary pickle using the highest available protocol.
        False -> the strings joined with '\\n' (no trailing newline is added).
    """
    if not is_pickle:
        with open(path,'w') as out:
            out.write('\n'.join(data))
        return
    with open(path,'wb') as out:
        pickle.dump(data, out, protocol = pickle.HIGHEST_PROTOCOL)
def read_file(path):
    """Read a text file and return its lines without line terminators.

    Parameters
    ----------
    path : str
        File to read in text mode.

    Returns
    -------
    list of str
        The file content split with ``str.splitlines``.
    """
    with open(path,'r') as handle:
        return handle.read().splitlines()

In [15]:
def process_data(path_in, path_out, dataset):
    """Convert one raw dataset directory into the hold-out data layout.

    Reads ``<path_in>/<dataset>`` and writes into ``<path_out>/<dataset>``:

    * ``vocab.txt`` and ``setting.txt`` are copied line by line as text.
    * every file whose name contains ``'train'`` is parsed with
      ``convert_to_bow`` and pickled as ``<stem>.pkl``;
    * every other file whose name contains ``'prior'`` is converted with
      ``convert_prior_vector`` and pickled as ``<stem>.pkl``;
    * every other file whose name contains ``'test'`` is copied as
      ``<stem>.txt``.

    ``<stem>`` is the part of the file name before its first ``'.'``. Text
    files are re-joined with ``'\\n'``, so copies carry no trailing newline.

    Parameters
    ----------
    path_in : str
        Root directory holding the raw datasets.
    path_out : str
        Root directory for the converted datasets. It is created together
        with the dataset sub-directory if missing.
    dataset : str
        Name of the dataset sub-directory to process.
    """
    # os.path.join removes the need for a trailing '/' on the root paths.
    dir_in = os.path.join(path_in, dataset)
    dir_out = os.path.join(path_out, dataset)

    lst_file = os.listdir(dir_in)
    # Create the output directory, including missing parents; exist_ok avoids
    # the check-then-create race of os.path.exists + os.mkdir.
    os.makedirs(dir_out, exist_ok = True)

    vocab = read_file(os.path.join(dir_in, 'vocab.txt'))
    setting = read_file(os.path.join(dir_in, 'setting.txt'))
    write_file(data = vocab,
               path = os.path.join(dir_out, 'vocab.txt'),
               is_pickle = False)
    write_file(data = setting,
               path = os.path.join(dir_out, 'setting.txt'),
               is_pickle = False)

    for f in lst_file:
        src = os.path.join(dir_in, f)
        stem = f.split('.')[0]
        if 'train' in f:
            # The vocabulary size fixes the number of columns of the matrix.
            sparse_vector = convert_to_bow(path_bow = src,
                                           vocab_len = len(vocab))
            write_file(data = sparse_vector,
                       path = os.path.join(dir_out, stem + '.pkl'),
                       is_pickle = True)
            del sparse_vector
            gc.collect()
        elif 'prior' in f:
            prior = convert_prior_vector(read_file(src))
            write_file(data = prior,
                       path = os.path.join(dir_out, stem + '.pkl'),
                       is_pickle = True)
            del prior
            gc.collect()
        elif 'test' in f:
            test = read_file(src)
            write_file(data = test,
                       path = os.path.join(dir_out, stem + '.txt'),
                       is_pickle = False)
            del test
            gc.collect()

In [16]:
# Alternative selections kept for reference:
# lst_dataset_use = ['Agnews', 'Agnews-title','TMN','TMNtitle',\
#                       'Yahoo', 'Grolier']
# lst_dataset_use = ['20newgroups']
lst_dataset_use = ['Agnews', 'TMN','20newgroups']

# Walk the datasets found on disk and convert only the selected ones, so the
# processing order follows lst_dataset rather than lst_dataset_use.
for dataset in lst_dataset:
    if dataset not in lst_dataset_use:
        continue
    print('Process dataset: ', dataset)
    process_data(path_in, path_out, dataset)

Process dataset:  TMN


100%|██████████████████████████████████| 11599/11599 [00:00<00:00, 77336.94it/s]
100%|██████████████████████████████████| 31604/31604 [00:00<00:00, 40827.08it/s]


Process dataset:  Agnews


100%|██████████████████████████████████| 32483/32483 [00:00<00:00, 70872.71it/s]
100%|████████████████████████████████| 110000/110000 [00:03<00:00, 33959.42it/s]


Process dataset:  20newgroups


100%|██████████████████████████████████| 24792/24792 [00:00<00:00, 73597.82it/s]
100%|██████████████████████████████████| 17846/17846 [00:01<00:00, 14684.64it/s]


# Compute document embeddings

In [17]:
import pickle 
import numpy as np 
import os 
from tqdm import tqdm 
import gc
from scipy import sparse 

In [18]:
# Root of the converted datasets written by the first half of the notebook.
path_folder = '/data/datn/final_data/holdout_data/'
# Datasets for which document vectors are computed.
lst_data = ['Agnews', 'TMN','20newgroups']
# lst_data = ['20newgroups']
# One directory path per dataset (plain concatenation: path_folder must end with '/').
lst_path = [path_folder + f for f in lst_data]

In [23]:
def read_data(path):
    """Load and return the object stored in a pickle file.

    Parameters
    ----------
    path : str
        Pickle file to open in binary mode.

    Returns
    -------
    object
        Whatever object was pickled into the file.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)
def get_docs_vector(prior, bows):
    """Embed each document as the count-weighted mean of its word vectors.

    Parameters
    ----------
    prior : array-like of shape (vocab_len, dim)
        One vector per vocabulary word; row ``k`` belongs to word index ``k``.
    bows : scipy sparse matrix of shape (n_docs, vocab_len)
        Term counts per document. Counts are expected to be non-negative
        integers.

    Returns
    -------
    numpy.ndarray of shape (n_docs, dim)
        Row ``i`` is the mean of the word vectors of document ``i``, each word
        repeated as many times as it occurs. A document without any word
        gets an all-zero vector of length ``dim``.
    """
    prior = np.asarray(prior)
    # Take the embedding size from the data instead of assuming 200, so the
    # zero vector of an empty document always matches the other rows.
    dim = prior.shape[1]

    docs_vector = []
    for i in tqdm(range(bows.shape[0])):
        # ravel() always yields a 1-D vector, even for a one-word vocabulary
        # where squeeze() would collapse to a 0-d array.
        bow = bows[i].toarray().ravel()
        idx = bow.nonzero()[0]
        # Repeat each word index by its count: one entry per word occurrence.
        word_idx_appear = np.repeat(idx, bow[idx])
        if len(word_idx_appear) == 0:
            vector = np.zeros(dim)
        else:
            vector = np.mean(prior[word_idx_appear], axis = 0)
        docs_vector.append(vector)

    if not docs_vector:
        # Keep the result 2-D even when there are no documents.
        return np.zeros((0, dim))
    return np.array(docs_vector)

def write_data(path, data):
    """Pickle ``data`` into the file at ``path``.

    Parameters
    ----------
    path : str
        Destination file; it is overwritten if it already exists.
    data : object
        Any picklable object. The highest available pickle protocol is used.
    """
    with open(path, 'wb') as sink:
        pickle.dump(data, sink, protocol = pickle.HIGHEST_PROTOCOL)
        
def process_docs_vector(path):
    """Build and store the document vectors of one converted dataset.

    Loads ``prior.pkl`` and ``train.pkl`` from the directory ``path``,
    computes the document vectors with ``get_docs_vector`` and pickles the
    result to ``docs_vector.pkl`` in the same directory.

    Parameters
    ----------
    path : str
        Dataset directory, without a trailing '/'.
    """
    word_vectors = read_data(path + '/prior.pkl')
    train_bows = read_data(path + '/train.pkl')
    embedded_docs = get_docs_vector(word_vectors, train_bows)
    write_data(path + '/docs_vector.pkl', embedded_docs)
    # Drop the large objects before the next dataset is loaded.
    del word_vectors, train_bows, embedded_docs
    gc.collect()

In [24]:
# Compute and store docs_vector.pkl for each selected dataset directory.
for path_data in lst_path:
    print('process data:', path_data)
    process_docs_vector(path_data)

process data: /data/datn/final_data/holdout_data/Agnews


100%|█████████████████████████████████| 110000/110000 [00:16<00:00, 6762.61it/s]


process data: /data/datn/final_data/holdout_data/TMN


100%|███████████████████████████████████| 31604/31604 [00:03<00:00, 8764.54it/s]


process data: /data/datn/final_data/holdout_data/20newgroups


100%|███████████████████████████████████| 17846/17846 [00:03<00:00, 5551.83it/s]
