In [1]:
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import glob
from scipy.stats import pearsonr
import seaborn as sns

In [2]:
def convert_data2string(df):
    """Encode every row of ``df`` as a single space-separated string.

    Each cell is cast to ``str`` and the cells of a row are joined with one
    space, so identical rows map to identical strings and can be compared or
    counted with plain string equality.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose cells must all render as digit-only strings
        (``str.isdigit()`` holds for every cell).

    Returns
    -------
    numpy.ndarray
        1-D string array with one entry per row of ``df``. A frame without
        rows yields an empty string array.

    Raises
    ------
    AssertionError
        If any cell's string form is not purely digits (e.g. a negative
        number, a float, or a value containing whitespace).
    """
    cells = df.values.astype(str)

    # Validate cell by cell before joining. Checking the joined text by
    # splitting on ' ' breaks down for a frame with no rows and for cells that
    # themselves contain a space; checking the cells directly handles both.
    assert all(cell.isdigit() for row in cells for cell in row), \
        'every cell must convert to a digit-only string'

    # dtype=str keeps the result a string array even when there are no rows.
    return np.array([' '.join(row) for row in cells], dtype=str)

In [3]:
# Tracts random
# Both listings are sorted so that position i in one list can be paired with
# position i in the other (later cells index the two lists with the same i).
files_orig_rand = sorted(glob.glob('../results/init_random/tracts_without_block/*original*.t7'))
files_rec_rand = sorted(glob.glob('../results/init_random/tracts_without_block/*reconstructed*.t7'))
print(len(files_orig_rand), len(files_rec_rand))

50 50


In [4]:
# The total number of records, and the total number of record prototypes in the dataset
# A "prototype" is a distinct row: each file's rows are de-duplicated and the
# per-file counts are summed (duplicates across files are not merged).
records_in_orig = 0
prototypes_in_orig = 0
for orig_path in files_orig_rand:
    orig_data = torch.load(orig_path)
    uniq_orig_data = orig_data.drop_duplicates().reset_index(drop=True)
    records_in_orig += len(orig_data)
    prototypes_in_orig += len(uniq_orig_data)
print('Tract without block')
print('Records in D:', records_in_orig)
print('Prototypes in D:', prototypes_in_orig)

Tracts (without block)
Records in D: 103093
Prototypes in D: 12092


In [5]:
# The total number of records, and the total number of record prototypes in the reconstructions (random)
records_in_rec = 0
prototypes_in_rec = 0
for rec_path in files_rec_rand:
    # NOTE(review): torch.load unpickles the file, so only point it at trusted
    # results; newer PyTorch releases may need weights_only=False to load
    # non-tensor objects -- confirm against the installed version.
    rec_data = torch.load(rec_path)

    # Each file holds an iterable of frames (presumably one per reconstruction
    # run -- confirm). Stack them with a single concat: concatenating inside a
    # loop re-copies the accumulated frame on every iteration.
    all_rec_data = pd.concat(list(rec_data))

    # Prototypes are distinct rows within this file's stacked reconstructions.
    uniq_rec_data = all_rec_data.drop_duplicates().reset_index(drop=True)
    records_in_rec += len(all_rec_data)
    prototypes_in_rec += len(uniq_rec_data)

print('Tract without block (random)')
print('Records in D\':', records_in_rec)
print('Prototypes in D\':', prototypes_in_rec)

Tracts (without block, random)
Records in D': 10309300
Prototypes in D': 96886


In [4]:
# The total number of records, and the total number of record prototypes in the reconstructions that are not in the dataset (random)
# For every distinct reconstructed row (prototype) we record how often it
# occurs in the original file and in the stacked reconstructions of that file.
in_orig_rand = []
in_rec_rand = []

for i in range(len(files_orig_rand)):
    # NOTE(review): torch.load unpickles the file, so only point it at trusted
    # results; newer PyTorch releases may need weights_only=False to load
    # non-tensor objects -- confirm against the installed version.
    orig_data = torch.load(files_orig_rand[i])
    rec_data = torch.load(files_rec_rand[i])

    # One concat over all frames instead of growing a frame inside a loop,
    # which re-copies the accumulated rows on every iteration.
    all_rec_data = pd.concat(list(rec_data))

    # Rows become strings so that row equality is plain string equality.
    orig_data_str = convert_data2string(orig_data)
    all_rec_data_str = convert_data2string(all_rec_data)
    uniq_rec_data = all_rec_data.drop_duplicates().reset_index(drop=True)
    uniq_rec_data_str = convert_data2string(uniq_rec_data)

    # Count every distinct row once per array, then look the prototypes up.
    # This replaces a full array scan per prototype with a single pass.
    orig_counts = pd.Series(orig_data_str).value_counts()
    rec_counts = pd.Series(all_rec_data_str).value_counts()
    # fill_value=0: a prototype missing from an array occurs zero times there.
    in_orig_rand.extend(orig_counts.reindex(uniq_rec_data_str, fill_value=0).to_numpy())
    in_rec_rand.extend(rec_counts.reindex(uniq_rec_data_str, fill_value=0).to_numpy())

in_orig_rand = np.array(in_orig_rand)
in_rec_rand = np.array(in_rec_rand)
# Keep only the reconstructed prototypes that never occur in the original data.
idxs = (in_orig_rand == 0)
print('Tract without block (random):')
print('Records in D\':', in_rec_rand[idxs].sum())
print('Prototypes in D\':', len(in_rec_rand[idxs]))

Tract (random):
Records in D': 1658178
Prototypes in D': 85033


In [4]:
# The total number of records, and the total number of record prototypes in the dataset that are not in the reconstructions (random)
# For every distinct original row (prototype) we record how often it occurs in
# the original file and in the stacked reconstructions of that file.

in_orig_rand = []
in_rec_rand = []

for i in range(len(files_orig_rand)):
    # NOTE(review): torch.load unpickles the file, so only point it at trusted
    # results; newer PyTorch releases may need weights_only=False to load
    # non-tensor objects -- confirm against the installed version.
    orig_data = torch.load(files_orig_rand[i])
    rec_data = torch.load(files_rec_rand[i])

    # One concat over all frames instead of growing a frame inside a loop,
    # which re-copies the accumulated rows on every iteration.
    all_rec_data = pd.concat(list(rec_data))

    # Rows become strings so that row equality is plain string equality.
    orig_data_str = convert_data2string(orig_data)
    all_rec_data_str = convert_data2string(all_rec_data)
    uniq_orig_data_str = convert_data2string(orig_data.drop_duplicates().reset_index(drop=True))

    # Count every distinct row once per array, then look the prototypes up.
    # This replaces a full array scan per prototype with a single pass.
    orig_counts = pd.Series(orig_data_str).value_counts()
    rec_counts = pd.Series(all_rec_data_str).value_counts()
    # fill_value=0: a prototype missing from an array occurs zero times there.
    in_orig_rand.extend(orig_counts.reindex(uniq_orig_data_str, fill_value=0).to_numpy())
    in_rec_rand.extend(rec_counts.reindex(uniq_orig_data_str, fill_value=0).to_numpy())

in_orig_rand = np.array(in_orig_rand)
in_rec_rand = np.array(in_rec_rand)
# Keep only the original prototypes that never show up in the reconstructions.
idxs = (in_rec_rand == 0)
print('Tract without block (random):')
print('Records in D:', in_orig_rand[idxs].sum())
print('Prototypes in D:', len(in_orig_rand[idxs]))

Tract (random):
Records in D: 364
Prototypes in D: 239


In [6]:
# Percentage of appearances in reconstructions of rare record prototypes that occur once, twice, or three times in D
# NOTE(review): this cell reads in_orig_rand / in_rec_rand from whichever
# earlier cell ran last; more than one cell above assigns them, so confirm
# which arrays the reported percentages were computed from.
#
# Each ratio below is
#   sum of in_rec_rand over entries matching both the in_orig_rand and in_rec_rand conditions
#   ---------------------------------------------------------------------------------------
#   sum of in_rec_rand over entries matching the in_rec_rand condition alone

# Exactly one occurrence in D, exactly one appearance in the reconstructions.
idxs2 = (in_rec_rand == 1)
idxs1 = (in_orig_rand == 1) & idxs2
r1 = in_rec_rand[idxs1].sum() / in_rec_rand[idxs2].sum()

print('Tract without block (random) 1 in D:{:0.2f}%'.format(100 * r1))

# Exactly two occurrences in D, at most two appearances in the reconstructions.
idxs2 = (in_rec_rand <= 2)
idxs1 = (in_orig_rand == 2) & idxs2
r2 = in_rec_rand[idxs1].sum() / in_rec_rand[idxs2].sum()

print('Tract without block (random) 2 in D:{:0.2f}%'.format(100 * r2))


# Exactly three occurrences in D, at most three appearances in the reconstructions.
idxs2 = (in_rec_rand <= 3)
idxs1 = (in_orig_rand == 3) & idxs2
r3 = in_rec_rand[idxs1].sum() / in_rec_rand[idxs2].sum()

print('Tract without block (random) 3 in D:{:0.2f}%'.format(100 * r3))

Tract without block (random) 1 in D:0.15%
Tract without block (random) 2 in D:0.16%
Tract without block (random) 3 in D:0.15%


In [6]:
# For visualizing correlation
# For every distinct original row (prototype) we record how often it occurs in
# the original file and in the stacked reconstructions of that file.
in_orig_rand = []
in_rec_rand = []

for i in range(len(files_orig_rand)):
    # NOTE(review): torch.load unpickles the file, so only point it at trusted
    # results; newer PyTorch releases may need weights_only=False to load
    # non-tensor objects -- confirm against the installed version.
    orig_data = torch.load(files_orig_rand[i])
    rec_data = torch.load(files_rec_rand[i])

    # One concat over all frames instead of growing a frame inside a loop,
    # which re-copies the accumulated rows on every iteration.
    all_rec_data = pd.concat(list(rec_data))

    # Rows become strings so that row equality is plain string equality.
    orig_data_str = convert_data2string(orig_data)
    all_rec_data_str = convert_data2string(all_rec_data)
    orig_data_prototypes = convert_data2string(orig_data.drop_duplicates().reset_index(drop=True))

    # Count every distinct row once per array, then look the prototypes up.
    # This replaces a full array scan per prototype with a single pass.
    orig_counts = pd.Series(orig_data_str).value_counts()
    rec_counts = pd.Series(all_rec_data_str).value_counts()
    # fill_value=0: a prototype missing from an array occurs zero times there.
    in_orig_rand.extend(orig_counts.reindex(orig_data_prototypes, fill_value=0).to_numpy())
    in_rec_rand.extend(rec_counts.reindex(orig_data_prototypes, fill_value=0).to_numpy())


in_orig_rand = np.array(in_orig_rand)
in_rec_rand = np.array(in_rec_rand)

In [None]:
# Scatter of per-prototype counts: occurrences in D (x) against appearances in
# the reconstructions (y), annotated with Pearson's r and saved to disk.
fig, ax = plt.subplots(figsize=(6, 4), dpi=300, facecolor='w', edgecolor='k')

# Draw on the axes created above rather than on the implicit current axes, and
# skip the hue legend up front instead of creating it and removing it afterwards.
sns.scatterplot(x=in_orig_rand, y=in_rec_rand, hue=in_rec_rand,
                edgecolor="black", legend=False, ax=ax)
ax.set(xlabel='Occurrences in D', ylabel='Frequency in reconstructions')
pr = pearsonr(in_orig_rand, in_rec_rand)

# Only the left and bottom spines are shown, drawn thin.
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['bottom'].set_linewidth(0.5)
ax.spines['left'].set_linewidth(0.5)

# Anchor the visible spines at data coordinate zero so the axes cross at the origin.
ax.spines['left'].set_position('zero')
ax.spines['bottom'].set_position('zero')
ax.yaxis.tick_left()
ax.xaxis.tick_bottom()

# Figure-relative coordinates: the annotation sits in the upper-left area of the figure.
fig.text(0.3, 0.65, f'Pearson\'s r  = {pr.statistic:0.4f}')
fig.tight_layout()
fig.savefig('figures/corr_tract_without_block_random.png', dpi=300)
plt.show()
