In [23]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import os
from os import listdir
from os.path import isfile, join, isdir
import pickle


In [2]:
data_folder = '../data/'
#test_folder = data_folder + 'test/'
train_folder = data_folder + 'train/'
list_train_paths = [join(train_folder,f+'/train.txt') for f in listdir(train_folder)]


In [3]:
list_train_paths

['../data/train/cr/train.txt',
 '../data/train/cardio/train.txt',
 '../data/train/pc/train.txt',
 '../data/train/subj/train.txt',
 '../data/train/sst2/train.txt',
 '../data/train/kaggle_med/train.txt',
 '../data/train/trec/train.txt']

In [10]:

def check_format_correction(path):
    with open(path,'r') as f:    
        lines = f.readlines()    
        for line in lines:
            if '\t' not in line:
                try:
                    line.replace(' ', '\t', 1)

                except:
                    print('error')
                    return False
    return True


def load_data(path):
    """
    Loads data from a txt file.
    """
    # check file format
    if path.endswith('.txt'):
        df = pd.read_csv(path, sep='|', header=None, names=['text'])
        try:
            df['class'] = df['text'].apply(lambda x: x.split('\t')[0])
            df['text'] = df['text'].apply(lambda x: x.split('\t')[1])
        except:
            df['class'] = df['text'].apply(lambda x: x.split(' ',1)[0])
            df['text'] = df['text'].apply(lambda x: x.split(' ',1)[1])

        df = df[['class', 'text']]
        return df
    else:
        raise ValueError('File format not supported.')


def generate_sample(path,sample_size):    
    if check_format_correction(path):
        output_path = path.replace('.txt','1000_sample.txt')
        df = load_data(path)
        number_of_classes = df['class'].nunique()
        lowest_class_count = df['class'].value_counts().min()
        samples_per_class = int(sample_size/number_of_classes)
        if samples_per_class > lowest_class_count:
            samples_per_class = lowest_class_count
        new_df = df.groupby('class').apply(lambda x: x.sample(samples_per_class))
        new_df = new_df.sample(frac=1).reset_index(drop=True)   
        np.savetxt(output_path, new_df.values,fmt='%s',delimiter='\t')
        sample_name = output_path.split('/')[-2]
        print(f'sample {sample_name} with {len(new_df)} sentences is saved to: ',output_path)
    else:
        print('check the format of the file: ',path)

In [7]:
list_train_paths = ['../data/train/cr/train.txt',
 '../data/train/pc/train.txt',
 '../data/train/subj/train.txt',
 ]
sample_size = 1000                 
for path in tqdm(list_train_paths):
    generate_sample(path,sample_size)


100%|██████████| 3/3 [00:00<00:00, 17.91it/s]

sample cr with 1000 sentences is saved to:  ../data/train/cr/train1000_sample.txt
sample pc with 1000 sentences is saved to:  ../data/train/pc/train1000_sample.txt
sample subj with 1000 sentences is saved to:  ../data/train/subj/train1000_sample.txt





In [93]:
train_folder = '../eda_code/txt_for_test/train/'
list_train_paths = [join(train_folder,f+'/train1000_sample.txt') for f in listdir(train_folder)]
list_train_paths

['../eda_code/txt_for_test/train/sst2/train1000_sample.txt',
 '../eda_code/txt_for_test/train/trec/train1000_sample.txt',
 '../eda_code/txt_for_test/train/cr/train1000_sample.txt',
 '../eda_code/txt_for_test/train/cardio/train1000_sample.txt',
 '../eda_code/txt_for_test/train/subj/train1000_sample.txt',
 '../eda_code/txt_for_test/train/pc/train1000_sample.txt',
 '../eda_code/txt_for_test/train/kaggle_med/train1000_sample.txt']

In [94]:
for i in list_train_paths:
    check_format_correction(i)

In [8]:
df = load_data('../data/train/cr/train1000_sample.txt')

In [16]:
df.groupby(['class']).count()

Unnamed: 0_level_0,text
class,Unnamed: 1_level_1
0,500
1,500


## prepare word2vec dictionary for 1000 sample data

In [5]:
data_folder = '../experiments/data/'
list_train_paths = [join(data_folder,f+'/train1000_sample.txt') for f in listdir(data_folder)]

In [6]:
list_train_paths

['../experiments/data/cr/train1000_sample.txt',
 '../experiments/data/pc/train1000_sample.txt',
 '../experiments/data/subj/train1000_sample.txt']

In [18]:
def gen_vocab(file_path,glove_path):
    vocab = set()
    vocab_not_in_glove = set()
    _glove = open(glove_path, 'r').readlines()
    w2v = {}
    df = load_data(file_path)
    for i in df['text']:
        for j in i.split(' '):
            if j not in vocab:
                vocab.add(j)
    print(f'size of unique words in {file_path} : {len(vocab)}')
    for i in _glove:
        word = i.split(' ')[0]
        if word in vocab:
            vec = i.split(' ')[1:]
            w2v[i.split(' ')[0]] = np.asarray(vec, dtype='float32')
    for i in vocab:
        if i not in w2v:
            vocab_not_in_glove.add(i)
            

    print(f'matches between vocab and glove for {file_path}: {len(w2v)}')
    print(f'words in vocab but not in glove for {file_path}: {len(vocab_not_in_glove)}')
    
    return w2v, vocab_not_in_glove
    


In [30]:
glove = 'glove.840B.300d.txt'
file_path = '../experiments/data/subj/train1000_sample.txt'
pickel_path = '../experiments/data/subj/wor2vec_1000_sample.p'

In [31]:
new_w2v, not_in_vocab = gen_vocab(file_path,glove)

size of unique words in ../experiments/data/subj/train1000_sample.txt : 6013
matches between vocab and glove for ../experiments/data/subj/train1000_sample.txt: 5547
words in vocab but not in glove for ../experiments/data/subj/train1000_sample.txt: 466


In [32]:
for i in not_in_vocab:
    print(i)


shyamalan's
breen's
'fun
pouqussimos
b-movie-and-proud-of-it
mazzotta
'grasp'
'great
'enough'
hartdegen
there's
self-narrated
petser&#180
half-sister's
chill'
hope'
risotadas
maslakh
1970's
danny's
mouglalis
'my
sarah's
hobbit'
prozium
women's
brosnan's
mackendrick
renner's
jesus'
bale's
version's
lemle
guzmn
disney's
wedding]
guys'
shouldn't
stuart's
[griffith]
talkiness
[dong]
geonosis
roommate's
shreve's
unconned
script's
girlfriend's
boss's
invadir&#237
naipaul
payami
haven't
tornatore's
year's
busc&#243
salenger
bielinsky's
$20
superlarge
snowball's
enfrentarse
bogdanich
makhmalbaf
you've
resources]
feature-
alicia's
pulpiness
walter's
'starchildren'
fererra's
zimmett
necessary-manipulation
budcasso
they've
kasner
pair's
po'boys
world's
serry
it'll
&#193
'safe
'monsters'
bettien
#626
perrini
[a]
rock'
otte's
'difficult'
'been
tomaselli
harris's
natalio
waiting'
tornatore
couldn't
sprechers
dean's
baumel
wells'
ana's
woman's
margr&#233
won't
andr&#225
flash/tony
scherfig
kid's]
th

In [28]:
pickle.dump(new_w2v, open(pickel_path, 'wb'))