In [None]:
from collections import defaultdict
from itertools import groupby
import operator
import os
import re

import numpy as np
import pandas as pd
import pickle
import random
import sparse
#import tensorly as tl
#import tensorly.decomposition as decomp
import sktensor
import urllib3
#import wget

import matplotlib.pyplot as plt
# NOTE(review): `%pylab inline` star-imports numpy and matplotlib.pylab into the
# notebook namespace -- that is where the bare name `pylab` used on the next
# line comes from. It can also shadow builtins such as `sum`; confirm that no
# later cell relies on the builtin versions.
%pylab inline
# Default size for every figure drawn in this notebook.
pylab.rcParams['figure.figsize'] = (10, 6)

import logging
# Each log line shows the millisecond part of the timestamp, the level, the
# line number of the logging call and the message.
logging.basicConfig(level=logging.DEBUG, 
                    format='%(msecs)d %(levelname)-8s [%(lineno)d] %(message)s')

# cp_orth.py (which provides orth_als) is fetched into the working directory
# the first time the notebook runs without it.
# NOTE(review): this downloads Python code over plain HTTP and then imports it;
# confirm that the source is trusted.
if not os.path.exists('cp_orth.py'):
    # `wget` is not imported in this notebook (its import is commented out), so
    # the download goes through the standard library instead.
    import urllib.request
    urllib.request.urlretrieve('http://web.stanford.edu/~vsharan/cp_orth.py',
                               'cp_orth.py')
from cp_orth import orth_als

In [2]:
def mazsola_reader(
        pickle_path='/mnt/store/home/makrai/project/verb-tensor/mazsola.pkl',
        path='/mnt/permanent/Language/Hungarian/Dic/sass15-535k-igei-szerkezet/mazsola_adatbazis.txt'):
    """Count (NOM, stem, ACC) triples in the mazsola verb-frame file.

    Every whitespace-separated token of the form ``case@@stem`` on a line
    contributes one ``case -> stem`` entry to that line's record; tokens
    without ``@@`` are ignored. Slots missing from a line count as the empty
    string.

    Parameters
    ----------
    pickle_path : str
        Cache file. If it exists, its content is returned and the corpus is
        not read; otherwise the freshly computed result is written to it.
    path : str
        The corpus text file, one record per line.

    Returns
    -------
    (occurrence, marginals)
        ``occurrence`` maps ``(NOM, stem, ACC)`` tuples to their number of
        lines; ``marginals`` is a list of three dicts holding the per-slot
        counts in the order NOM, stem, ACC.
    """
    if os.path.exists(pickle_path):
        logging.info('Loading mazsola dict from {}'.format(pickle_path))
        # NOTE(review): unpickling executes arbitrary code -- only point this
        # at cache files you wrote yourself.
        with open(pickle_path, mode='rb') as cache_file:
            return pickle.load(cache_file)
    logging.info('Reading mazsola...')
    modes = ('NOM', 'stem', 'ACC')
    occurrence = defaultdict(int)
    marginals = [defaultdict(int) for _ in modes]
    with open(path) as infile:
        for line_no, line in enumerate(infile):
            if not line_no % 500000:
                # 27970403 is presumably the number of lines in the default
                # corpus file; it only affects the progress percentage.
                logging.info('{:.0%}'.format(line_no/27970403))
            record = defaultdict(str)
            for token in line.strip().split():
                case_stem = token.split('@@')
                if len(case_stem) == 1:
                    continue
                if len(case_stem) != 2:
                    # More than one '@@': report the line and skip the token,
                    # so that no value left over from an earlier token or line
                    # ends up in this record.
                    logging.warning(line.strip())
                    continue
                case, stem = case_stem
                record[case] = stem
            occurrence[record['NOM'], record['stem'], record['ACC']] += 1
            for marginal, mode in zip(marginals, modes):
                marginal[record[mode]] += 1
    result = occurrence, marginals
    with open(pickle_path, mode='wb') as cache_file:
        pickle.dump(result, cache_file)
    return result

In [3]:
def get_tensor(middle_end='sktensor', cutoff=10):
    """Build (or load from cache) the sparse tensor of log triple counts.

    Parameters
    ----------
    middle_end : str
        'sktensor' builds an ``sktensor.sptensor``, 'tensorly' a
        ``sparse.COO``. The value is also part of the cache path.
    cutoff : int
        Words whose marginal count in their slot is below this value are
        dropped; a triple is kept only if all three of its words survive.

    Returns
    -------
    (tensor, indices)
        ``indices`` is a list of three dicts (one per slot, in the order of
        the marginals returned by ``mazsola_reader``) mapping a word to its
        coordinate; coordinates are assigned by decreasing marginal count.

    Raises
    ------
    NotImplementedError
        If no cached tensor exists and ``middle_end`` is not supported.
    """
    logging.info('Reweighting: log')
    verb_tensor_path = '/mnt/store/home/makrai/project/verb-tensor/{}/tensor_{}.pkl'.format(middle_end, cutoff)
    if os.path.exists(verb_tensor_path):
        logging.info('Loading tensor from {}'.format(verb_tensor_path))
        # NOTE(review): unpickling executes arbitrary code -- trusted files only.
        with open(verb_tensor_path, mode='rb') as tensor_file:
            tensor, indices = pickle.load(tensor_file)
        logging.debug(tensor.shape)
        return tensor, indices
    if middle_end not in ('tensorly', 'sktensor'):
        # Reject an unsupported backend before the expensive corpus pass
        # rather than after it.
        raise NotImplementedError(middle_end)
    occurrence, marginals = mazsola_reader()

    def get_index(freq_dict):
        """Map each word with count >= cutoff to its rank by decreasing count."""
        kept = [item for item in freq_dict.items() if item[1] >= cutoff]
        # Stable sort: words with equal counts keep their dict order.
        kept.sort(key=operator.itemgetter(1), reverse=True)
        logging.debug(kept[-3:])
        return {word: rank for rank, (word, _) in enumerate(kept)}

    coords, data = ([], [], []), []
    indices = [get_index(fd) for fd in marginals]
    logging.info('Building tensor...')
    logging.info('  Pupulating lists...')
    for triple_no, (svo, freq) in enumerate(occurrence.items()):
        if not triple_no % 2000000:
            logging.debug('    {:,}'.format(triple_no))
        # Keep the triple only if every word survived the cutoff in its slot.
        if not all(word in index for word, index in zip(svo, indices)):
            continue
        for mode_coords, index, word in zip(coords, indices, svo):
            mode_coords.append(index[word])
        # NOTE(review): a triple seen once gets log(1) == 0 and is therefore
        # stored as an explicit zero -- confirm this is intended.
        data.append(np.log(freq))
    logging.info('  Creating array')
    shape = tuple(map(len, indices))
    logging.info(shape)
    if middle_end == 'tensorly':
        tensor = sparse.COO(coords, data, shape=shape)
    else:  # 'sktensor' -- anything else was rejected above
        tensor = sktensor.sptensor(coords, data, shape=shape)
    # Make sure the per-backend cache directory exists so that the result of
    # the long build is not lost to a failing dump.
    os.makedirs(os.path.dirname(verb_tensor_path), exist_ok=True)
    with open(verb_tensor_path, mode='wb') as tensor_file:
        pickle.dump((tensor, indices), tensor_file)
    logging.info(tensor)
    return tensor, indices

|cutoff|         shape         | d=2  | d=4  | d=5 | d=6 | d=7 | d=10|
|------|-----------------------|------|------|-----|-----|-----|-----|
|     1| 383 k, 110 k, 200 k   |      |      |     |     |     |     |
|     2| 186 k,  60 k,  99 k   |      |      |     |     |     |     |
|     5|  97 k,  33 k,  50 k   |      |      |     |     |     |     |
|    10|  64 k,  23 k,  32 k   |      |      |     |     |     |     |
|    16|  49 k,  19 k,  24 k   |      |      |     |     |     |     |
|    32|  32 708 13 951 15 740 |14 min|      |     |     |     |     |
|    64|  21 033 10 294  9 874 | 6 min|      |     |     |     |     |
|   128|  13 093  7 345  6 116 | 2 min|      |     |     |     |     |
|------|-----------------------|------|------|-----|-----|-----|-----|
|   256|   7 957  5 054  3 740 | 2 min|      |     |     |     |     |
|   512|   4 598  3 269  2 183 | 1 min|      |     |     |     |     |
|  1024|   2 489  2 084  1 196 |  51 s|      |     |     |     |     |
|  2048|   1 251  1 293    579 |  27 s| 2 min|14 min|     |     |     |
|  4096|     622    773    243 |  12 s| 1 min| 2 min|  m  |   s|15 min|
|  8192|     278    414    101 |     s|  55 s| 1 min|1 min|   s| 4 min|
| 16384|      87    185     43 |     s|     s|  17 s| 29 s| 27 s| 50 s|

10-es gyakorisági cutoffal a ginnyn még lefut, 5-össel már nem.

In [6]:
def decomp(cutoff, dim):
    """Run orth_als on the tensor for `cutoff` and pickle the result.

    The result is written to ``decomp_<cutoff>_<dim>.pkl``; if that file
    already exists nothing is computed. If the decomposition or the dump
    raises, the error is logged and a short description is written to
    ``decomp_<cutoff>_<dim>.err`` instead, so that a surrounding loop over
    many settings can continue.

    Parameters
    ----------
    cutoff : int
        Frequency cutoff passed on to ``get_tensor``.
    dim : int
        Second argument of ``orth_als`` (presumably the rank of the
        decomposition).
    """
    logging.info((cutoff, dim))
    filen_base = '/mnt/store/home/makrai/project/verb-tensor/sktensor/decomp_{}_{}'.format(cutoff, dim)
    result_path = '{}.{}'.format(filen_base, 'pkl')
    if os.path.isfile(result_path):
        logging.info('File exists {} {}'.format(cutoff, dim))
        return
    vtensor, _ = get_tensor(cutoff=cutoff)
    try:
        result = orth_als(vtensor, dim)
        with open(result_path, mode='wb') as result_file:
            pickle.dump(result, result_file)
    except Exception as e:
        # Keep the traceback in the notebook log ...
        logging.exception('Decomposition failed for {} {}'.format(cutoff, dim))
        # ... and leave a marker file naming the actual error. write() needs a
        # string, so the caught instance is formatted here.
        with open('{}.{}'.format(filen_base, 'err'), mode='w') as logfile:
            logfile.write('{}: {}\n'.format(type(e).__name__, e))

In [None]:
# Cutoffs tried for every rank: the powers of two from 2**14 down to 2**4,
# followed by the two small extra values.
cutoff_schedule = [2**exponent for exponent in range(14, 3, -1)] + [15, 10]
for dim in range(3, 11):
    for cutoff in cutoff_schedule:
        decomp(cutoff, dim)

308 INFO     [2] (16384, 3)
310 INFO     [5] File exists 16384 3
312 INFO     [2] (8192, 3)
313 INFO     [5] File exists 8192 3
315 INFO     [2] (4096, 3)
316 INFO     [5] File exists 4096 3
318 INFO     [2] (2048, 3)
319 INFO     [5] File exists 2048 3
320 INFO     [2] (1024, 3)
321 INFO     [5] File exists 1024 3
322 INFO     [2] (512, 3)
323 INFO     [5] File exists 512 3
325 INFO     [2] (256, 3)
326 INFO     [5] File exists 256 3
327 INFO     [2] (128, 3)
328 INFO     [5] File exists 128 3
329 INFO     [2] (64, 3)
330 INFO     [5] File exists 64 3
330 INFO     [2] (32, 3)
331 INFO     [5] File exists 32 3
332 INFO     [2] (16, 3)
332 INFO     [2] Reweighting: log
333 INFO     [5] Loading tensor from /mnt/store/home/makrai/project/verb-tensor/sktensor/tensor_16.pkl
763 DEBUG    [7] (49356, 19025, 24551)
628 DEBUG    [218] [  0] fit: 0.35748 | delta: 3.6e-01 | secs: 21.13787
632 DEBUG    [191] Zero norm, mode 0, count 1
635 DEBUG    [191] Zero norm, mode 1, count 1
637 DEBUG    [191

In [None]:
# Load one stored decomposition (cutoff 2048, dim 6) for inspection; the file
# holds the 4-tuple pickled by `decomp`.
# NOTE(review): unpickling executes arbitrary code -- trusted files only.
with open('/mnt/store/home/makrai/project/verb-tensor/sktensor/decomp_2048_6.pkl',
          mode='rb') as decomp_file:
    ktensor, fit, n_iterations, exectimes = pickle.load(decomp_file)

In [None]:
# Sum of the values in exectimes divided by 60 -- presumably seconds converted
# to minutes; TODO confirm the unit.
sum(exectimes)/60

In [None]:
# Shape of the first entry of ktensor.U (presumably the factor matrix of the
# first tensor mode).
ktensor.U[0].shape

In [None]:
# Number of entries in ktensor.U (presumably one factor matrix per tensor mode).
len(ktensor.U)

In [None]:
# NOTE(review): this repeats the ktensor.U[0].shape cell two cells earlier.
ktensor.U[0].shape

In [None]:
# The ndim attribute of the loaded decomposition.
ktensor.ndim

In [None]:
def plot_factor(factor=0, add_smooth=0):
    """Scatter-plot one entry of the global ``ktensor.U``.

    Column 0 goes on the x axis, column 1 on the y axis and column 2 sets the
    point colour; the first row is left out. ``add_smooth`` is added to both
    coordinates, and when it is non-zero both axes are switched to log scale.
    """
    factor_matrix = ktensor.U[factor]
    xs = factor_matrix[1:,0] + add_smooth
    ys = factor_matrix[1:,1] + add_smooth
    colours = factor_matrix[1:,2]
    plt.scatter(xs, ys, c=colours)
    if not add_smooth:
        return
    for set_scale in (plt.xscale, plt.yscale):
        set_scale('log')

plot_factor(2)