In [45]:
from collections import defaultdict
import operator
import os
import re

import numpy as np
import pandas as pd
import pickle
import random
import sparse
#import tensorly as tl
#import tensorly.decomposition as decomp
import sktensor
import urllib3
#import wget

import matplotlib.pyplot as plt
%pylab inline
pylab.rcParams['figure.figsize'] = (10, 6)

import logging
logging.basicConfig(level=logging.DEBUG, 
                    format='%(msecs)d %(levelname)-8s [%(lineno)d] %(message)s')

# Fetch cp_orth.py (the module that provides `orth_als`) on first use.
# NOTE(review): this downloads Python source over plain HTTP and then imports
# (i.e. executes) it -- consider vendoring the file or checking a checksum.
if not os.path.exists('cp_orth.py'):
    # `wget` is not imported in this notebook (its import is commented out),
    # so the download is done with the standard library instead.
    import urllib.request
    urllib.request.urlretrieve('http://web.stanford.edu/~vsharan/cp_orth.py',
                               'cp_orth.py')
from cp_orth import orth_als

2019-03-03 17:43:54,663,663 DEBUG    [211] Loaded backend module://ipykernel.pylab.backend_inline version unknown.


Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [6]:
def mazsola_reader(
        pickle_path='/mnt/store/home/makrai/project/verb-tensor/mazsola.pkl',
        path='/mnt/permanent/Language/Hungarian/Dic/sass15-535k-igei-szerkezet/mazsola_adatbazis.txt'):
    """Count (NOM, stem, ACC) triples in the Mazsola database.

    Every input line is split on whitespace into ``case@@stem`` tokens.
    Tokens without ``@@`` are ignored; tokens that do not split into exactly
    two parts are logged as a warning and skipped.  A case missing from a
    line is represented by the empty string.

    Parameters
    ----------
    pickle_path : str
        Cache file.  If it exists, its content is returned without reading
        ``path``; otherwise the computed result is written to it.
    path : str
        The text file to read.

    Returns
    -------
    (occurrence, marginals)
        ``occurrence`` maps ``(NOM, stem, ACC)`` tuples to their frequency;
        ``marginals`` is a list of three frequency dicts, one for each of
        NOM, stem and ACC (in this order).
    """
    if os.path.exists(pickle_path):
        logging.info('Loading mazsola dict from {}'.format(pickle_path))
        # NOTE: unpickling executes arbitrary code; only load trusted caches.
        with open(pickle_path, mode='rb') as cache_file:
            return pickle.load(cache_file)
    logging.info('Reading mazsola...')
    modes = ('NOM', 'stem', 'ACC')
    occurrence = defaultdict(int)
    marginals = [defaultdict(int) for _ in modes]
    with open(path) as infile:
        for line_no, line in enumerate(infile):
            if not line_no % 500000:
                # 27970403 is the hard-coded line count used for the
                # progress percentage only.
                logging.info('{:.0%}'.format(line_no/27970403))
            record = defaultdict(str)
            for token in line.strip().split():
                case_stem = token.split('@@')
                if len(case_stem) == 1:
                    continue
                if len(case_stem) != 2:
                    # Malformed token: report the line and skip the token
                    # instead of re-using the previous token's case/stem.
                    logging.warning(line.strip())
                    continue
                case, stem = case_stem
                record[case] = stem
            occurrence[record['NOM'], record['stem'], record['ACC']] += 1
            for marginal, mode in zip(marginals, modes):
                marginal[record[mode]] += 1
    result = occurrence, marginals
    with open(pickle_path, mode='wb') as cache_file:
        pickle.dump(result, cache_file)
    return result

In [82]:
def get_tensor(middle_end='sktensor', cutoff=10):
    """Build (or load from cache) the sparse 3-mode tensor of log frequencies.

    The modes are the three marginals returned by ``mazsola_reader`` (NOM,
    stem, ACC).  In each mode only the words whose marginal frequency is at
    least ``cutoff`` get an index; indices are assigned in order of
    decreasing marginal frequency.  A triple is stored only if all three of
    its words are indexed, with value ``np.log(freq)``.

    Parameters
    ----------
    middle_end : str
        ``'sktensor'`` builds an ``sktensor.sptensor``; ``'tensorly'`` builds
        a ``sparse.COO``.  Also selects the cache sub-directory.
    cutoff : int
        Minimum marginal frequency for a word to be kept.

    Returns
    -------
    The sparse tensor, which is also pickled to the cache path.

    Raises
    ------
    NotImplementedError
        If ``middle_end`` is not one of the two supported values and no
        cached tensor exists for it.
    """
    logging.info('Reweighting: log')
    verb_tensor_path = '/mnt/store/home/makrai/project/verb-tensor/exper/{}/tensor_{}.pkl'.format(middle_end, cutoff)
    if os.path.exists(verb_tensor_path):
        logging.info('Loading {}'.format(verb_tensor_path))
        with open(verb_tensor_path, mode='rb') as cache_file:
            tensor = pickle.load(cache_file)
        logging.debug(tensor)
        return tensor
    if middle_end not in ('tensorly', 'sktensor'):
        # Reject unsupported back ends before the expensive build.
        raise NotImplementedError
    occurrence, marginals = mazsola_reader()

    def get_index(freq_dict):
        """Map each word with frequency >= cutoff to its frequency rank."""
        items = sorted((item for item in freq_dict.items() if item[1] >= cutoff),
                       key=operator.itemgetter(1), reverse=True)
        logging.debug(items[-3:])
        return {word: rank for rank, (word, _) in enumerate(items)}

    coords, data = ([], [], []), []
    indices = [get_index(fd) for fd in marginals]
    logging.info('Building tensor...')
    logging.info('  Pupulating lists...')
    for triple_no, (svo, freq) in enumerate(occurrence.items()):
        if not triple_no % 2000000:
            logging.debug('    {:,}'.format(triple_no))
        # Keep the triple only if every one of its words survived the cutoff.
        if all(word in index for word, index in zip(svo, indices)):
            for mode_coords, word, index in zip(coords, svo, indices):
                mode_coords.append(index[word])
            # NOTE(review): log(1) == 0, so triples seen exactly once are
            # stored with an explicit zero value.
            data.append(np.log(freq))
    logging.info('  Creating array')
    shape = tuple(map(len, indices))
    logging.info(shape)
    if middle_end == 'tensorly':
        tensor = sparse.COO(coords, data, shape=shape)
    else:
        tensor = sktensor.sptensor(coords, data, shape=shape)
    # Make sure the cache directory exists so the computed tensor is not lost.
    os.makedirs(os.path.dirname(verb_tensor_path), exist_ok=True)
    with open(verb_tensor_path, mode='wb') as cache_file:
        pickle.dump(tensor, cache_file)
    logging.info(tensor)
    return tensor

|cutoff|         shape          | d=2  | d=4  | d=5  | d=6  | d=7  | d=10 |
|------|------------------------|------|------|------|------|------|------|
|     1| 383 k, 110 k, 200 k    |      |      |      |      |      |      |
|     2| 186 k,  60 k,  99 k    |      |      |      |      |      |      |
|     5|  97 k,  33 k,  50 k    |      |      |      |      |      |      |
|    10|  64 k,  23 k,  32 k    |      |      |      |      |      |      |
|    16|  49 k,  19 k,  24 k    |      |      |      |      |      |      |
|    32| 32 708, 13 951, 15 740 |14 min|      |      |      |      |      |
|    64| 21 033, 10 294,  9 874 | 6 min|      |      |      |      |      |
|   128| 13 093,  7 345,  6 116 | 2 min|      |      |      |      |      |
|   256|  7 957,  5 054,  3 740 | 2 min|      |      |      |      |      |
|   512|  4 598,  3 269,  2 183 | 1 min|      |      |      |      |      |
|  1024|  2 489,  2 084,  1 196 |  51 s|      |      |      |      |      |
|  2048|  1 251,  1 293,    579 |  27 s| 2 min|14 min|      |      |      |
|  4096|    622,    773,    243 |  12 s| 1 min| 2 min|    m |    s|15 min|
|  8192|    278,    414,    101 |     s|  55 s| 1 min| 1 min|    s| 4 min|
| 16384|     87,    185,     43 |     s|     s|  17 s|  29 s| 27 s|  50 s|

10-es gyakorisági cutoff-fal a ginnyn még lefut, 5-össel már nem.

In [125]:
def decomp(cutoff, dim):
    """Decompose the cutoff-filtered verb tensor with ``orth_als`` and save it.

    The tensor comes from ``get_tensor(cutoff=cutoff)`` (default back end).
    Whatever ``orth_als(tensor, dim)`` returns is pickled to
    ``.../exper/sktensor/decomp_<cutoff>_<dim>.pkl``; nothing is returned.

    Parameters
    ----------
    cutoff : int
        Marginal frequency cutoff passed on to ``get_tensor``.
    dim : int
        Second argument of ``orth_als`` (presumably the decomposition rank --
        confirm against cp_orth.py).
    """
    vtensor = get_tensor(cutoff=cutoff)
    logging.info(vtensor.shape)
    result = orth_als(vtensor, dim)
    out_path = '/mnt/store/home/makrai/project/verb-tensor/exper/sktensor/decomp_{}_{}.pkl'.format(cutoff, dim)
    # Close the file deterministically so the pickle is fully flushed to disk.
    with open(out_path, mode='wb') as out_file:
        pickle.dump(result, out_file)

In [130]:
# Decompose the cutoff-2048 tensor with dim=6; the result is written to
# decomp_2048_6.pkl by `decomp`.
decomp(2048, 6)

2019-03-03 21:34:39,178,178 INFO     [2] Reweighting: log
2019-03-03 21:34:39,182,182 INFO     [5] Loading /mnt/store/home/makrai/project/verb-tensor/exper/sktensor/tensor_2048.pkl
2019-03-03 21:34:39,331,331 DEBUG    [7] <sktensor.sptensor.sptensor object at 0x7fdfdd6faf28>
2019-03-03 21:34:39,332,332 INFO     [3] (1251, 1293, 579)
2019-03-03 21:34:47,798,798 DEBUG    [218] [  0] fit: 0.41862 | delta: 4.2e-01 | secs: 7.70923
2019-03-03 21:34:47,799,799 DEBUG    [191] Zero norm, mode 0, count 0
2019-03-03 21:34:47,800,800 DEBUG    [191] Zero norm, mode 1, count 0
2019-03-03 21:34:47,801,801 DEBUG    [191] Zero norm, mode 2, count 0
2019-03-03 21:34:55,015,15 DEBUG    [218] [  1] fit: 0.43960 | delta: 2.1e-02 | secs: 7.21347
2019-03-03 21:34:55,016,16 DEBUG    [191] Zero norm, mode 0, count 0
2019-03-03 21:34:55,017,17 DEBUG    [191] Zero norm, mode 1, count 0
2019-03-03 21:34:55,018,18 DEBUG    [191] Zero norm, mode 2, count 0
2019-03-03 21:35:02,232,232 DEBUG    [218] [  2] fit: 0.453

2019-03-03 21:43:41,758,758 DEBUG    [218] [ 74] fit: 0.51183 | delta: 5.4e-05 | secs: 7.20975
2019-03-03 21:43:48,973,973 DEBUG    [218] [ 75] fit: 0.51189 | delta: 6.1e-05 | secs: 7.21128
2019-03-03 21:43:56,189,189 DEBUG    [218] [ 76] fit: 0.51196 | delta: 7.0e-05 | secs: 7.21308
2019-03-03 21:44:03,407,407 DEBUG    [218] [ 77] fit: 0.51204 | delta: 8.1e-05 | secs: 7.21463
2019-03-03 21:44:10,620,620 DEBUG    [218] [ 78] fit: 0.51214 | delta: 9.5e-05 | secs: 7.21021
2019-03-03 21:44:17,834,834 DEBUG    [218] [ 79] fit: 0.51225 | delta: 1.1e-04 | secs: 7.21050
2019-03-03 21:44:25,048,48 DEBUG    [218] [ 80] fit: 0.51238 | delta: 1.3e-04 | secs: 7.20995
2019-03-03 21:44:32,261,261 DEBUG    [218] [ 81] fit: 0.51254 | delta: 1.6e-04 | secs: 7.21071
2019-03-03 21:44:39,475,475 DEBUG    [218] [ 82] fit: 0.51273 | delta: 1.9e-04 | secs: 7.20962
2019-03-03 21:44:46,690,690 DEBUG    [218] [ 83] fit: 0.51295 | delta: 2.2e-04 | secs: 7.21259
2019-03-03 21:44:53,940,940 DEBUG    [218] [ 84] fi

In [None]:
# Batch run: a few low cutoffs with dim=2, then every dim in 3..10 for the
# power-of-two cutoffs 2**14 down to 2**5.
decomp(16, 2)
# NOTE(review): 15 is not among the cutoffs in the timing table -- confirm it
# is intended.
decomp(15, 2)
decomp(10, 2)
for dim in range(3, 11):
    for exp in range(14, 4, -1):
        decomp(2**exp, dim)

2019-03-03 21:51:13,929,929 INFO     [2] Reweighting: log
2019-03-03 21:51:13,932,932 INFO     [5] Loading /mnt/store/home/makrai/project/verb-tensor/exper/sktensor/tensor_16.pkl
2019-03-03 21:51:14,507,507 DEBUG    [7] <sktensor.sptensor.sptensor object at 0x7fe011543518>
2019-03-03 21:51:14,508,508 INFO     [3] (49356, 19026, 24551)
2019-03-03 21:51:58,161,161 DEBUG    [218] [  0] fit: 0.32377 | delta: 3.2e-01 | secs: 13.91326
2019-03-03 21:51:58,163,163 DEBUG    [191] Zero norm, mode 0, count 1
2019-03-03 21:51:58,165,165 DEBUG    [191] Zero norm, mode 1, count 1
2019-03-03 21:51:58,166,166 DEBUG    [191] Zero norm, mode 2, count 1


In [131]:
# Load the result that `decomp(2048, 6)` pickled; it unpacks into four values.
# NOTE: unpickling executes arbitrary code; only load files you produced.
with open('/mnt/store/home/makrai/project/verb-tensor/exper/sktensor/decomp_2048_6.pkl',
          mode='rb') as decomp_file:
    ktensor, fit, n_iterations, exectimes = pickle.load(decomp_file)

In [132]:
# Total of the recorded execution times divided by 60, i.e. minutes if
# `exectimes` is in seconds (the orth_als log lines report per-iteration `secs`).
sum(exectimes)/60

13.830557099999988

In [124]:
# Shape of the first factor matrix.
# NOTE(review): the recorded output has 10 columns although the file loaded
# above is decomp_2048_6; the execution counts are out of order, so this output
# is probably stale -- re-run after Restart & Run All.
ktensor.U[0].shape

(1251, 10)

In [102]:
# Number of factor matrices held in `ktensor.U`.
len(ktensor.U)

3

In [103]:
# NOTE(review): same expression as an earlier cell, but the recorded output has
# 622 rows (the cutoff-4096 shape in the table), so it was produced with a
# different `ktensor` -- candidate for deletion.
ktensor.U[0].shape

(622, 10)

In [56]:
# `ndim` of the loaded decomposition (presumably the number of tensor modes).
ktensor.ndim

3

In [None]:
def plot_factor(factor=0, add_smooth=0):
    """Scatter-plot the first three columns of one factor matrix of `ktensor`.

    Column 0 is drawn on the x axis, column 1 on the y axis and column 2 is
    used as the point colour.  The first row of the matrix is left out.

    Parameters
    ----------
    factor : int
        Index into ``ktensor.U`` (the notebook-level decomposition).
    add_smooth : number
        Constant added to the x and y values (not to the colour).  When it is
        non-zero, both axes are switched to a logarithmic scale.
    """
    # NOTE(review): row 0 is skipped; presumably it is the most frequent item
    # of the mode (indices are frequency-ranked) -- confirm the intent.
    embedding = ktensor.U[factor][1:]
    plt.scatter(embedding[:, 0] + add_smooth, embedding[:, 1] + add_smooth,
                c=embedding[:, 2])
    plt.xlabel('latent dimension 0')
    plt.ylabel('latent dimension 1')
    plt.colorbar(label='latent dimension 2')
    plt.title('Factor matrix {}'.format(factor))
    if add_smooth:
        plt.xscale('log')
        plt.yscale('log')

plot_factor(2)