# Singular value decomposition
Natalia Vélez, July 2020

In [11]:
%matplotlib inline

import os, re, glob
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
from scipy.linalg import svd
import matplotlib.pyplot as plt

Find matrix files:

In [12]:
# Gather the job-matrix files in sorted order. The pattern requires a digit
# immediately before '.txt', so files named '*_labels.txt' in the same folder
# are not picked up here.
mtx_files = sorted(glob.glob('outputs/jobmatrix/*[0-9].txt'))
# mtx_files = mtx_files[:3] # Debug only!

# Preview the first ten paths, one per line.
print('\n'.join(mtx_files[:10]))

outputs/jobmatrix/jobmatrix_release-284_start-1573895672.txt
outputs/jobmatrix/jobmatrix_release-284_start-1573982073.txt
outputs/jobmatrix/jobmatrix_release-284_start-1574068473.txt
outputs/jobmatrix/jobmatrix_release-285_start-1574102503.txt
outputs/jobmatrix/jobmatrix_release-287_start-1574151678.txt
outputs/jobmatrix/jobmatrix_release-287_start-1574238079.txt
outputs/jobmatrix/jobmatrix_release-287_start-1574324479.txt
outputs/jobmatrix/jobmatrix_release-287_start-1574410879.txt
outputs/jobmatrix/jobmatrix_release-287_start-1574497279.txt
outputs/jobmatrix/jobmatrix_release-289_start-1574552311.txt


Find corresponding labels:

In [13]:
# Derive one label path per matrix path (same order as mtx_files) by swapping
# the '.txt' ending for '_labels.txt'. The files are not opened here.
label_files = list(map(lambda path: path.replace('.txt', '_labels.txt'), mtx_files))

# Preview the first ten label paths, one per line.
print('\n'.join(label_files[:10]))

outputs/jobmatrix/jobmatrix_release-284_start-1573895672_labels.txt
outputs/jobmatrix/jobmatrix_release-284_start-1573982073_labels.txt
outputs/jobmatrix/jobmatrix_release-284_start-1574068473_labels.txt
outputs/jobmatrix/jobmatrix_release-285_start-1574102503_labels.txt
outputs/jobmatrix/jobmatrix_release-287_start-1574151678_labels.txt
outputs/jobmatrix/jobmatrix_release-287_start-1574238079_labels.txt
outputs/jobmatrix/jobmatrix_release-287_start-1574324479_labels.txt
outputs/jobmatrix/jobmatrix_release-287_start-1574410879_labels.txt
outputs/jobmatrix/jobmatrix_release-287_start-1574497279_labels.txt
outputs/jobmatrix/jobmatrix_release-289_start-1574552311_labels.txt


Concatenate labels:

In [14]:
# Read the player IDs from every label file and join them into one flat vector,
# in the same order as label_files.
#
# ndmin=1: a file holding a single ID would otherwise load as a 0-d array,
# which np.concatenate cannot join.
# astype(int): the `np.int` alias was deprecated in NumPy 1.20 and removed in
# 1.24; the builtin `int` is the type it aliased, so the result is unchanged.
all_labels = np.concatenate(
    [np.loadtxt(f, ndmin=1) for f in label_files]
).astype(int)
print(all_labels)

[2276905 2276906 2276907 ... 3079302 3079308 3079310]


Sanity check: Are any labels repeated across log files?

In [5]:
# Count how often each player ID occurs across all label files.
# unique_labels is sorted (np.unique guarantees this) and is reused by later
# cells, so its name and contents must stay as they are.
unique_labels, label_counts = np.unique(all_labels, return_counts=True)
is_repeated = label_counts > 1
repeated_labels = unique_labels[is_repeated]

print('%i total entries' % len(all_labels))
print('%i unique player IDs' % len(unique_labels))

if repeated_labels.size > 0:
    # Only summarise the range when there is something to summarise.
    repeat_counts = label_counts[is_repeated]
    min_repeats = np.min(repeat_counts)
    print('%i repeated labels (range %i-%i)' % (len(repeated_labels),
                                                min_repeats,
                                                np.max(repeat_counts)))
else:
    # np.min / np.max raise ValueError on an empty array, so when no ID is
    # repeated we report the count alone instead of crashing the notebook.
    min_repeats = None
    print('%i repeated labels' % len(repeated_labels))

7193 total entries
7170 unique player IDs
23 repeated labels (range 2-2)


Concatenate matrices:

In [6]:
# Load every matrix file and stack them row-wise, in the same order as
# mtx_files (and therefore in the same order as the concatenated labels).
all_mtx = []
print('Loading matrix files...')
for f in tqdm_notebook(mtx_files):
    # ndmin=2 keeps a file with a single row as shape (1, n_columns). Without
    # it np.loadtxt returns a 1-D vector, which cannot be stacked along axis 0
    # with the 2-D matrices from the other files.
    all_mtx.append(np.loadtxt(f, ndmin=2))
print('Concatenating matrices...')
# loadtxt parses to float; cast back to integers after stacking.
all_mtx = np.concatenate(all_mtx, axis=0).astype(int)
print(all_mtx.shape)

Loading matrix files...


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))


Concatenating matrices...
(7193, 2805)


Sum over repeated labels, so that each player ID ends up with exactly one row (rows that share an ID are added together):

In [7]:
# Collapse all_mtx to one row per unique player ID: every row whose label
# equals a given ID is selected with a boolean mask and summed column-wise.
# Rows of unique_mtx follow the order of unique_labels.
unique_mtx = np.array([
    all_mtx[all_labels == player_id].sum(axis=0)
    for player_id in tqdm_notebook(unique_labels)
])
print('Reducing repeated labels:')
print(unique_mtx.shape)

HBox(children=(IntProgress(value=0, max=7170), HTML(value='')))


Reducing repeated labels:
(7170, 2805)


The main event: SVD!

In [8]:
# Decompose the de-duplicated matrix (one row per player ID).
# full_matrices=False requests the reduced factors, so U and Vh are not padded
# out to square matrices; s holds the singular values as a 1-D vector.
U, s, Vh = svd(
    unique_mtx,
    full_matrices=False,
)

Check output: multiplying the SVD factors back together should reproduce the input matrix.

In [9]:
print('Check: Can we reconstruct the original values from the SVD?')
# Multiply the factors back together (U · diag(s) · Vh) and compare the result
# element-wise with the SVD input, within np.isclose's default tolerances.
reconstruction = np.dot(np.dot(U, np.diag(s)), Vh)
print(np.isclose(unique_mtx, reconstruction).all())

Check: Can we reconstruct the original values from the SVD?
True


Save outputs to file:

In [10]:
# np.savetxt does not create missing directories, so make sure the output
# folder exists before writing (a no-op when it is already there).
os.makedirs('outputs/svd', exist_ok=True)

# SVD factors.
np.savetxt('outputs/svd/U.txt', U)
np.savetxt('outputs/svd/s.txt', s)
np.savetxt('outputs/svd/Vh.txt', Vh)

# The exact SVD input and the player ID for each of its rows, so the rows of U
# can be mapped back to players.
# NOTE(review): these two arrays are integers, but savetxt's default format
# writes them in scientific notation; fmt='%d' would give much smaller files
# if downstream readers accept it. Confirm before changing.
np.savetxt('outputs/svd/input_mtx.txt',unique_mtx)
np.savetxt('outputs/svd/input_playerIDs.txt', unique_labels)