In [1]:
from os import listdir
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm

In [2]:
def parse_vector(s):
    """Parse a bracketed, whitespace-separated vector string into floats.

    Parameters
    ----------
    s : str
        Text such as ``'[0.1 0.2 0.3]'``. Square brackets are removed
        wherever they occur; the remaining tokens may be separated by any
        run of whitespace (spaces, tabs, newlines).

    Returns
    -------
    list of float
        The parsed components, in order. An input with no numeric tokens
        (e.g. ``'[]'``) yields an empty list.

    Raises
    ------
    ValueError
        If a token cannot be converted by ``float``.
    """
    s = s.replace('[', '').replace(']', '')
    # split() with no argument collapses repeated whitespace and ignores
    # leading/trailing blanks, so wrapped or padded vectors do not produce
    # empty tokens that float() would reject.
    return [float(token) for token in s.split()]

# TRAIN FILES

In [3]:
# Read every file in train_db/ (in sorted filename order), parse its content
# into a vector, and keep the first four characters of the filename as the
# person id. The resulting frame is cached with pandas' pickle writer.
train_files = sorted(listdir('train_db/'))

vecs = []
person_ids = []

for fname in tqdm(train_files):
    path = 'train_db/{}'.format(fname)
    with open(path, 'r') as handle:
        raw_text = handle.read()
    vecs.append(parse_vector(raw_text))
    person_ids.append(fname[:4])

df = pd.DataFrame({'vec': vecs, 'person_id': person_ids})
df.to_pickle('raw_train.h5')

100%|██████████| 52284/52284 [03:35<00:00, 243.18it/s] 


In [5]:
# Reload the raw training frame cached by the loading cell above.
# NOTE: despite the .h5 extension this file is a pandas pickle (it is written
# with to_pickle); only unpickle files you produced yourself.
df = pd.read_pickle('raw_train.h5')

In [6]:
# Sanity check: print the row count, column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52284 entries, 0 to 52283
Data columns (total 2 columns):
person_id    52284 non-null object
vec          52284 non-null object
dtypes: object(2)
memory usage: 817.0+ KB


In [7]:
# Empty placeholder frame with the intended pair columns.
# NOTE(review): `full` is reassigned from pd.concat further down, so this
# placeholder looks unused — confirm before relying on it.
full = pd.DataFrame({'vec1': [], 'vec2': [], 'is_duplicate': [], 'person_id1': [], 'person_id2': []})

In [8]:
# random shuffle
# Build candidate pairs by drawing two independent half-samples of `df` and
# aligning them row by row; a pair is labelled 1 when both rows carry the
# same person_id and 0 otherwise. Two rounds give len(df) pairs in total.
SHUFFLE_SEED = 0
# One shared generator: its state advances between calls, so `left` and
# `right` are different draws while the whole cell stays reproducible.
shuffle_rng = np.random.RandomState(SHUFFLE_SEED)

full_vec1 = []
full_vec2 = []
is_duplicate = []
for _ in range(2):
    left = df.sample(frac=0.5, replace=False, random_state=shuffle_rng)
    right = df.sample(frac=0.5, replace=False, random_state=shuffle_rng)
    person_id1 = left['person_id'].values
    person_id2 = right['person_id'].values

    # Label each aligned pair; int() turns the boolean into a 0/1 flag.
    is_duplicate += [int(pid_a == pid_b)
                     for pid_a, pid_b in zip(person_id1, person_id2)]

    full_vec1 += list(left['vec'].values)
    full_vec2 += list(right['vec'].values)

shuffled = pd.DataFrame({'vec1': full_vec1, 'vec2': full_vec2,
                         'is_duplicate': is_duplicate})

In [9]:
# Summarise the shuffled pairs: frame structure, then total / negative /
# positive pair counts.
# info() writes its report itself and returns None, so it is called directly
# rather than printed (printing it appends a stray "None" line).
shuffled.info()
print('len - {}, 0 - {}, 1 - {}'.format(len(shuffled),
                                        len(shuffled[shuffled.is_duplicate == 0]),
                                        len(shuffled[shuffled.is_duplicate == 1])))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52284 entries, 0 to 52283
Data columns (total 3 columns):
is_duplicate    52284 non-null int64
vec1            52284 non-null object
vec2            52284 non-null object
dtypes: int64(1), object(2)
memory usage: 1.2+ MB
None
len - 52284, 0 - 52013, 1 - 271


In [None]:
# Cache the shuffled pairs. The file is a pandas pickle despite the .h5
# extension, so it must be read back with pd.read_pickle.
shuffled.to_pickle('train_shuffled.h5')

In [10]:
# Build positive pairs: for each person, draw two independent samples (with
# replacement, 90% of that person's rows each) and align them row by row.
# Every resulting pair shares a person_id, so all labels are 1.
full_vec1 = []
full_vec2 = []

# sort=False keeps persons in order of first appearance, and each group holds
# that person's rows in their original order.
for _, one_person in df.groupby('person_id', sort=False):
    left = one_person.sample(frac=0.9, replace=True)
    right = one_person.sample(frac=0.9, replace=True)

    full_vec1.extend(left['vec'].values)
    full_vec2.extend(right['vec'].values)

is_duplicate = [1] * len(full_vec1)
duplicates = pd.DataFrame({'vec1': full_vec1, 'vec2': full_vec2,
                           'is_duplicate': is_duplicate})

In [11]:
# Summarise the same-person pairs: frame structure, then total / negative /
# positive pair counts.
# info() writes its report itself and returns None, so it is called directly
# rather than printed (printing it appends a stray "None" line).
duplicates.info()
print('len - {}, 0 - {}, 1 - {}'.format(len(duplicates),
                                        len(duplicates[duplicates.is_duplicate == 0]),
                                        len(duplicates[duplicates.is_duplicate == 1])))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47047 entries, 0 to 47046
Data columns (total 3 columns):
is_duplicate    47047 non-null int64
vec1            47047 non-null object
vec2            47047 non-null object
dtypes: int64(1), object(2)
memory usage: 1.1+ MB
None
len - 47047, 0 - 0, 1 - 47047


In [12]:
# Stack the randomly shuffled pairs and the same-person pairs into one
# training frame with a fresh 0..n-1 index.
full = pd.concat([shuffled, duplicates], ignore_index=True)

# info() writes its report itself and returns None, so it is called directly
# rather than printed (printing it appends a stray "None" line).
full.info()
print('len - {}, 0 - {}, 1 - {}'.format(len(full),
                                        len(full[full.is_duplicate == 0]),
                                        len(full[full.is_duplicate == 1])))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99331 entries, 0 to 99330
Data columns (total 3 columns):
is_duplicate    99331 non-null int64
vec1            99331 non-null object
vec2            99331 non-null object
dtypes: int64(1), object(2)
memory usage: 2.3+ MB
None
len - 99331, 0 - 52013, 1 - 47318


In [13]:
# Persist the combined training pairs. The file is a pandas pickle despite
# the .h5 extension, so it must be read back with pd.read_pickle.
full.to_pickle('train_shuffled+duplicates.h5')

# TEST FILES

In [84]:
# Read every file in test_db/ (in sorted filename order), parse its content
# into a vector, and keep the first four characters of the filename as the
# person id.
test_files = sorted(listdir('test_db/'))

vecs = []
person_ids = []

for fname in tqdm(test_files):
    path = 'test_db/{}'.format(fname)
    with open(path, 'r') as handle:
        raw_text = handle.read()
    vecs.append(parse_vector(raw_text))
    person_ids.append(fname[:4])

df = pd.DataFrame({'vec': vecs, 'person_id': person_ids})



  0%|          | 0/953 [00:00<?, ?it/s][A[A

100%|██████████| 953/953 [00:00<00:00, 11893.34it/s][A[A

In [85]:
# Cache the raw test frame (a pandas pickle despite the .h5 extension), then
# print its row count, column dtypes and non-null counts as a sanity check.
df.to_pickle('raw_test.h5')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 953 entries, 0 to 952
Data columns (total 2 columns):
person_id    953 non-null object
vec          953 non-null object
dtypes: object(2)
memory usage: 15.0+ KB


In [86]:
# Pair every test row with every other test row. Pairs are ordered, so both
# (a, b) and (b, a) are emitted; a pair is labelled 1 when the two rows share
# a person_id and 0 otherwise.
vec1, vec2, is_duplicate = [], [], []

for i in tqdm(range(len(df))):
    current_vec = df.vec[i]
    current_person = df.person_id[i]

    # All rows except the one labelled i.
    sample = df[~df.index.isin([i])]
    vecs = sample['vec'].values
    persons = sample['person_id'].values

    for other_person, other_vec in zip(persons, vecs):
        vec1.append(current_vec)
        vec2.append(other_vec)
        is_duplicate.append(1 if other_person == current_person else 0)

test_df = pd.DataFrame({'vec1': vec1, 'vec2': vec2, 'is_duplicate': is_duplicate})



  0%|          | 0/953 [00:00<?, ?it/s][A[A

 14%|█▎        | 131/953 [00:00<00:00, 1304.40it/s][A[A

 27%|██▋       | 258/953 [00:00<00:00, 1289.43it/s][A[A

 41%|████      | 392/953 [00:00<00:00, 1301.69it/s][A[A

 55%|█████▌    | 528/953 [00:00<00:00, 1314.15it/s][A[A

 69%|██████▉   | 659/953 [00:00<00:00, 1309.71it/s][A[A

 83%|████████▎ | 791/953 [00:00<00:00, 1312.00it/s][A[A

 97%|█████████▋| 920/953 [00:00<00:00, 1301.52it/s][A[A

100%|██████████| 953/953 [00:00<00:00, 1297.11it/s][A[A

In [None]:
# Summarise the test pairs (frame structure, then total / negative / positive
# pair counts) and cache them.
# info() writes its report itself and returns None, so it is called directly
# rather than printed (printing it appends a stray "None" line).
test_df.info()
print('len - {}, 0 - {}, 1 - {}'.format(len(test_df),
                                        len(test_df[test_df.is_duplicate == 0]),
                                        len(test_df[test_df.is_duplicate == 1])))
# Written with pickle despite the .h5 extension; read back with pd.read_pickle.
test_df.to_pickle('test.h5')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 907256 entries, 0 to 907255
Data columns (total 3 columns):
is_duplicate    907256 non-null int64
vec1            907256 non-null object
vec2            907256 non-null object
dtypes: int64(1), object(2)
memory usage: 20.8+ MB
None
len - 907256, 0 - 899640, 1 - 7616
