In [1]:
from os import listdir
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm
import os

In [2]:
def parse_vector(s):
    """Parse a bracketed, whitespace-separated vector string into floats.

    Parameters
    ----------
    s : str
        Text such as ``'[0.1 0.2 0.3]'``. Every ``[`` and ``]`` character is
        removed before the numbers are parsed.

    Returns
    -------
    list of float
        The parsed components, in their original order. Input that holds no
        numbers (for example ``'[]'``) yields an empty list.

    Raises
    ------
    ValueError
        If a token is not a valid float literal.
    """
    s = s.replace('[', '').replace(']', '')
    # split() without an argument splits on any run of whitespace, so repeated
    # spaces, tabs and line breaks between numbers never produce empty tokens
    # (split(' ') does, and float('') raises ValueError).
    # The comprehension returns a real list on both Python 2 and Python 3,
    # whereas map() is a one-shot iterator on Python 3.
    return [float(token) for token in s.split()]

def save_dataframe(path, df):
    """Pickle ``df`` to ``path``, creating the parent directory when missing.

    Parameters
    ----------
    path : str
        Destination file path. The data is written with ``df.to_pickle``, so
        the file is a pickle regardless of the extension in ``path``.
    df : object
        Anything exposing ``to_pickle(path)``; in this notebook a pandas
        DataFrame.

    Returns
    -------
    None
        A ``file saved - <path>`` line is printed after the write.
    """
    directory = os.path.dirname(path)
    # A bare file name has an empty directory part; os.makedirs('') raises,
    # so only create the directory when there is one to create.
    if directory and not os.path.exists(directory):
        os.makedirs(directory)
    df.to_pickle(path)
    print('file saved - {}'.format(path))

# TRAIN FILES

In [3]:
# Input directories, one per split. Both end with '/' because the loading
# cells build each file path by joining the directory string and the file
# name directly, with no separator added in between.
TRAIN_PATH = 'train_db/'
TEST_PATH = 'test_db/'

In [4]:
# Read every file found in TRAIN_PATH into one DataFrame row: the parsed
# vector plus a person id taken from the first four characters of the file name.

train_files = sorted(listdir(TRAIN_PATH))

vecs, person_ids = [], []

for file_name in tqdm(train_files):
    with open(TRAIN_PATH + file_name, 'r') as handle:
        vector = parse_vector(handle.read())
    vecs.append(vector)
    person_ids.append(file_name[:4])

df = pd.DataFrame({'vec': vecs, 'person_id': person_ids})

100%|██████████| 52284/52284 [00:05<00:00, 9875.19it/s] 


In [5]:
# Build random pairs of training vectors and label each pair by whether both
# vectors carry the same person_id. Two rounds are drawn; in each round two
# independent half-size samples of df are paired row by row (row k of `left`
# with row k of `right`).

# One seeded generator shared by all sample() calls: the pairs are the same on
# every run, while `left` and `right` still differ from each other (passing
# the same integer seed to both calls would make them identical).
shuffle_rng = np.random.RandomState(0)

full_vec1 = []
full_vec2 = []
is_duplicate = []
for _ in range(2):
    left = df.sample(frac=0.5, replace=False, random_state=shuffle_rng)
    right = df.sample(frac=0.5, replace=False, random_state=shuffle_rng)
    person_id1 = left['person_id'].values
    person_id2 = right['person_id'].values

    # 1 when both sides of the pair belong to the same person, else 0.
    is_duplicate.extend(int(a == b) for a, b in zip(person_id1, person_id2))

    full_vec1.extend(left['vec'].values)
    full_vec2.extend(right['vec'].values)

shuffled = pd.DataFrame({'vec1': full_vec1, 'vec2': full_vec2,
                         'is_duplicate': is_duplicate})
# info() prints its report itself and returns None, so it is not wrapped in print.
shuffled.info()
n_same = int(shuffled.is_duplicate.sum())
print('shuffled: len - {}, 0 - {}, 1 - {}'.format(len(shuffled), len(shuffled) - n_same, n_same))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52284 entries, 0 to 52283
Data columns (total 3 columns):
is_duplicate    52284 non-null int64
vec1            52284 non-null object
vec2            52284 non-null object
dtypes: int64(1), object(2)
memory usage: 1.2+ MB
None
shuffled: len - 52284, 0 - 52027, 1 - 257


In [6]:
# Build same-person ("duplicate") pairs: for every person, draw two samples of
# that person's vectors and pair them row by row. Repeated 10 times.

# Seeded generator shared by all sample() calls so the pairs are reproducible
# while the left and right samples still differ from each other.
duplicate_rng = np.random.RandomState(1)

# Split the frame by person once. Filtering df with a boolean mask for each
# person on each repeat rescans the whole frame every time.
# sort=False keeps the persons in order of first appearance, like unique().
per_person = [group for _, group in df.groupby('person_id', sort=False)]

full_vec1 = []
full_vec2 = []
for _ in range(10):
    for one_person in per_person:
        # Sampling is with replacement, so a vector can be paired with itself.
        left = one_person.sample(frac=0.99, replace=True, random_state=duplicate_rng)
        right = one_person.sample(frac=0.99, replace=True, random_state=duplicate_rng)

        full_vec1.extend(left['vec'].values)
        full_vec2.extend(right['vec'].values)

# Every pair built above comes from a single person, so every label is 1.
is_duplicate = [1] * len(full_vec1)
duplicates = pd.DataFrame({'vec1': full_vec1, 'vec2': full_vec2,
                           'is_duplicate': is_duplicate})
# info() prints its report itself and returns None, so it is not wrapped in print.
duplicates.info()
n_same = int(duplicates.is_duplicate.sum())
print('duplicates: len - {}, 0 - {}, 1 - {}'.format(len(duplicates), len(duplicates) - n_same, n_same))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517020 entries, 0 to 517019
Data columns (total 3 columns):
is_duplicate    517020 non-null int64
vec1            517020 non-null object
vec2            517020 non-null object
dtypes: int64(1), object(2)
memory usage: 11.8+ MB
None
duplicates: len - 517020, 0 - 0, 1 - 517020


In [7]:
# Stack the random pairs and the same-person pairs into a single frame.
# NOTE(review): this step does no balancing of the 0/1 labels; the counts
# printed below show the actual split.

frames_to_stack = [shuffled, duplicates]
full = pd.concat(frames_to_stack, ignore_index=True)

print(full.info())
zero_rows = full[full.is_duplicate == 0]
one_rows = full[full.is_duplicate == 1]
print('len - {}, 0 - {}, 1 - {}'.format(len(full), len(zero_rows), len(one_rows)))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569304 entries, 0 to 569303
Data columns (total 3 columns):
is_duplicate    569304 non-null int64
vec1            569304 non-null object
vec2            569304 non-null object
dtypes: int64(1), object(2)
memory usage: 13.0+ MB
None
len - 569304, 0 - 52027, 1 - 517277


In [8]:
# Persist the three training frames. save_dataframe writes each one with
# to_pickle, so the files are pickles despite the '.h5' suffix.
for target_path, frame in [
    ('dataset/train/raw_train.h5', df),
    ('dataset/train/shuffled+duplicates.h5', full),
    ('dataset/train/shuffled_min.h5', shuffled),
]:
    save_dataframe(target_path, frame)

file saved - dataset/train/raw_train.h5
file saved - dataset/train/shuffled+duplicates.h5
file saved - dataset/train/shuffled_min.h5


# TEST FILES

In [9]:
# Read every file found in TEST_PATH into one DataFrame row: the parsed vector
# plus a person id taken from the first four characters of the file name.
test_files = sorted(listdir(TEST_PATH))

vecs, person_ids = [], []

for file_name in tqdm(test_files):
    with open(TEST_PATH + file_name, 'r') as handle:
        vector = parse_vector(handle.read())
    vecs.append(vector)
    person_ids.append(file_name[:4])

raw_test = pd.DataFrame({'vec': vecs, 'person_id': person_ids})
print(raw_test.info())

100%|██████████| 953/953 [00:00<00:00, 12107.87it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 953 entries, 0 to 952
Data columns (total 2 columns):
person_id    953 non-null object
vec          953 non-null object
dtypes: object(2)
memory usage: 15.0+ KB
None





In [10]:
# Pair every test vector with every other test vector (ordered pairs, the row
# itself excluded) and label a pair 1 when both rows share a person_id.
vec1, vec2, is_duplicate = [], [], []

for row in tqdm(range(len(raw_test))):
    anchor_vec = raw_test.vec[row]
    anchor_person = raw_test.person_id[row]

    # All rows except the anchor row itself.
    is_anchor = raw_test.index.isin([row])
    others = raw_test[~is_anchor]
    vecs = others['vec'].values
    persons = others['person_id'].values

    for other_person, other_vec in zip(persons, vecs):
        vec1.append(anchor_vec)
        vec2.append(other_vec)
        is_duplicate.append(1 if other_person == anchor_person else 0)


test_df = pd.DataFrame({'vec1': vec1, 'vec2': vec2, 'is_duplicate': is_duplicate})
print(test_df.info())
zero_rows = test_df[test_df.is_duplicate == 0]
one_rows = test_df[test_df.is_duplicate == 1]
print('len - {}, 0 - {}, 1 - {}'.format(len(test_df), len(zero_rows), len(one_rows)))

100%|██████████| 953/953 [00:00<00:00, 1125.12it/s]


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 907256 entries, 0 to 907255
Data columns (total 3 columns):
is_duplicate    907256 non-null int64
vec1            907256 non-null object
vec2            907256 non-null object
dtypes: int64(1), object(2)
memory usage: 20.8+ MB
None
len - 907256, 0 - 899640, 1 - 7616


In [11]:
# Persist the raw test frame and the pairwise test frame. save_dataframe
# writes each one with to_pickle, so the files are pickles despite the '.h5' suffix.
for target_path, frame in [
    ('dataset/test/raw_test.h5', raw_test),
    ('dataset/test/test.h5', test_df),
]:
    save_dataframe(target_path, frame)

file saved - dataset/test/raw_test.h5
file saved - dataset/test/test.h5
