In [119]:
from collections import defaultdict
import logging
# Log format: timestamp, padded level name, line number of the logging call, message.
# %(asctime)s in its default form already ends in ",<milliseconds>", so %(msecs)d is
# not appended again (it used to print the milliseconds twice, e.g. "15:19:02,306,306").
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(levelname)-8s [%(lineno)d] %(message)s')
import operator
import re

import numpy as np
import os
import pandas as pd
import pickle
import random
import sparse
#import tensorly as tl
#import tensorly.decomposition as decomp
import sktensor
import urllib3
import wget

import matplotlib.pyplot as plt
# NOTE(review): %pylab star-imports numpy and matplotlib names into the interactive
# namespace (see the "Populating the interactive namespace" message it prints).
# `pylab` is not imported explicitly anywhere in this cell; presumably the magic
# provides the name used on the next line -- confirm before removing the magic.
%pylab inline
# Default (width, height) for every figure drawn in this notebook.
pylab.rcParams['figure.figsize'] = (10, 6)

# Fetch the cp_orth helper module into the working directory on first run, then
# import it.
# NOTE(review): the file is downloaded over plain HTTP and imported (i.e. executed)
# without any integrity check -- consider vendoring it or pinning a checksum.
if not os.path.exists('cp_orth.py'):
    wget.download('http://web.stanford.edu/~vsharan/cp_orth.py')
import cp_orth

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [2]:
def mazsola_reader():
    """Count (NOM, stem, ACC) triples in the Mazsola verb-frame database.

    Every input line is split on whitespace.  Tokens of the form
    ``key@@value`` are collected into a per-line record; tokens without
    ``@@`` are ignored.  The record's ``NOM``, ``stem`` and ``ACC`` values
    (the empty string when a key is absent from the line) form the triple
    that is counted.

    The result is cached as a pickle at a fixed path; when that file exists
    it is loaded and returned without reading the database.

    Returns:
        tuple: ``(occurrence, [marginal_s, marginal_v, marginal_o])``.
        ``occurrence`` maps ``(NOM, stem, ACC)`` tuples to their counts, and
        the three marginals map each individual NOM / stem / ACC value to
        its count.  All four are ``defaultdict(int)`` instances.
    """
    pickle_path = '/mnt/store/home/makrai/project/verb-tensor/mazsola.pkl'
    if os.path.exists(pickle_path):
        # NOTE(review): unpickling can execute arbitrary code; this is only safe
        # as long as the cache file is written exclusively by this function.
        with open(pickle_path, mode='rb') as cache_file:
            return pickle.load(cache_file)
    path = '/mnt/permanent/Language/Hungarian/Dic/sass15-535k-igei-szerkezet/mazsola_adatbazis.txt'
    occurrence = defaultdict(int)
    marginal_s, marginal_v, marginal_o = defaultdict(int), defaultdict(int), defaultdict(int)
    # NOTE(review): no encoding is passed, so the locale default is used --
    # confirm that it matches the encoding of the database file.
    with open(path) as infile:
        for i, line in enumerate(infile):
            if not i % 500000:
                # 27970403 is presumably the number of lines in the database -- TODO confirm.
                logging.info('{:.0%}'.format(i/27970403))
            record = defaultdict(str)
            for token in line.strip().split():
                case_stem = token.split('@@')
                if len(case_stem) == 1:
                    # No '@@' separator: not a key/value token.
                    continue
                if len(case_stem) != 2:
                    # More than one '@@' in a token: report the line and stop
                    # parsing it.  Whatever was collected before this token is
                    # still counted below (same as the previous behaviour).
                    logging.warning(line.strip())
                    break
                case, stem = case_stem
                record[case] = stem
            occurrence[record['NOM'], record['stem'], record['ACC']] += 1
            marginal_s[record['NOM']] += 1
            marginal_v[record['stem']] += 1
            marginal_o[record['ACC']] += 1
    result = occurrence, [marginal_s, marginal_v, marginal_o]
    with open(pickle_path, mode='wb') as cache_file:
        pickle.dump(result, cache_file)
    return result

In [22]:
def get_tensor(cutoff=500, middle_end='sktensor'):
    """Build, or load from its pickle cache, the sparse 3-mode count tensor.

    The counts come from ``mazsola_reader()``.  For each of the three modes
    the words whose marginal frequency is at least ``cutoff`` are numbered
    in order of decreasing frequency.  A triple contributes its count to the
    tensor only when all three of its words received an index.

    The tensor is cached under a path that depends on ``middle_end`` and
    ``cutoff``; an existing cache file is returned as is.

    Args:
        cutoff: minimum marginal frequency a word needs to be kept.
        middle_end: ``'sktensor'`` builds an ``sktensor.sptensor``,
            ``'tensorly'`` builds a ``sparse.COO``.

    Returns:
        The sparse tensor, of the type selected by ``middle_end``.

    Raises:
        ValueError: if no cache exists and ``middle_end`` is not one of the
            two supported values.  (This used to surface only after the
            whole build, as an ``UnboundLocalError``.)
    """
    verb_tensor_path = '/mnt/store/home/makrai/project/verb-tensor/{}/tensor_{}.pkl'.format(middle_end, cutoff)
    if os.path.exists(verb_tensor_path):
        # NOTE(review): unpickling can execute arbitrary code; only safe while
        # the cache is written exclusively by this function.
        with open(verb_tensor_path, mode='rb') as cache_file:
            return pickle.load(cache_file)
    if middle_end not in ('tensorly', 'sktensor'):
        # Fail before the expensive build rather than after it.
        raise ValueError('unknown middle_end: {!r}'.format(middle_end))
    occurrence, marginals = mazsola_reader()

    def get_index(freq_dict):
        """Map each word with frequency >= cutoff to its rank by decreasing frequency."""
        items = sorted(filter(lambda item: item[1] >= cutoff, freq_dict.items()), key=operator.itemgetter(1),
                       reverse=True)
        logging.debug(items[-3:])
        return {word: rank for rank, (word, _) in enumerate(items)}

    coords, data = [[], [], []], []
    indices = [get_index(fd) for fd in marginals]
    logging.info('Pupulating lists...')
    for n_seen, (svo, freq) in enumerate(occurrence.items()):
        if not n_seen % 2000000:
            logging.debug(n_seen)
        # Keep the triple only if every word survived the cutoff in its own mode.
        if all(word in index for word, index in zip(svo, indices)):
            for mode_coords, index, word in zip(coords, indices, svo):
                mode_coords.append(index[word])
            data.append(freq)
    logging.info('Creating array')
    shape = tuple(map(len, indices))
    logging.info(shape)
    if middle_end == 'tensorly':
        tensor = sparse.COO(coords, data, shape=shape)
    else:  # 'sktensor', guaranteed by the validation above
        tensor = sktensor.sptensor(tuple(coords), data, shape=shape)
    with open(verb_tensor_path, mode='wb') as cache_file:
        pickle.dump(tensor, cache_file)
    logging.info(tensor)
    return tensor

In [133]:
# Build the sparse (NOM, stem, ACC) count tensor, or load it from its pickle cache,
# keeping only words whose marginal frequency is at least 10.
vtensor = get_tensor(cutoff=10)

10-es gyakorisági cutoff-fal a ginnyn még lefut, 5-össel már nem. (With a frequency cutoff of 10 this still runs on ginny; with a cutoff of 5 it no longer does.)

In [138]:
%%time
# Decompose vtensor with cp_orth.orth_als; the second argument (2) is presumably the
# number of components (rank) -- TODO confirm against cp_orth.
ktensor, fit_, n_epoch_, seconds = cp_orth.orth_als(vtensor, 2)
# TODO: persist the result, e.g.
#   pickle.dump(ktensor, open('/mnt/store/home/makrai/project/verb-tensor/kruskal_...', mode='wb'))
# `seconds` is summed below; in the logged output the total matches the sum of the
# per-iteration "secs" values.
logging.info("{} seconds".format((sum(seconds))))

2018-10-02 15:19:02,306,306 DEBUG    [215] [  0] fit: 0.99893 | delta: 1.0e+00 | secs: 11.84999
2018-10-02 15:19:02,320,320 DEBUG    [188] Zero norm, mode 0, count 0
2018-10-02 15:19:02,331,331 DEBUG    [188] Zero norm, mode 1, count 0
2018-10-02 15:19:02,334,334 DEBUG    [188] Zero norm, mode 2, count 0
2018-10-02 15:19:14,231,231 DEBUG    [215] [  1] fit: 0.99893 | delta: 1.1e-10 | secs: 12.49378
2018-10-02 15:19:14,232,232 INFO     [3] 24.343774999999823 seconds


CPU times: user 1min 9s, sys: 18.5 s, total: 1min 27s
Wall time: 1min 10s


In [139]:
%%time
# Decompose vtensor with cp_orth.orth_als; the second argument (5) is presumably the
# number of components (rank) -- TODO confirm against cp_orth.
ktensor, fit_, n_epoch_, seconds = cp_orth.orth_als(vtensor, 5)
# TODO: persist the result, e.g.
#   pickle.dump(ktensor, open('/mnt/store/home/makrai/project/verb-tensor/kruskal_...', mode='wb'))
# `seconds` is summed below; in the logged output the total matches the sum of the
# per-iteration "secs" values.
logging.info("{} seconds".format((sum(seconds))))

2018-10-02 15:20:43,966,966 DEBUG    [215] [  0] fit: 0.99916 | delta: 1.0e+00 | secs: 31.08208
2018-10-02 15:20:43,974,974 DEBUG    [188] Zero norm, mode 0, count 0
2018-10-02 15:20:43,979,979 DEBUG    [188] Zero norm, mode 1, count 0
2018-10-02 15:20:43,983,983 DEBUG    [188] Zero norm, mode 2, count 0
2018-10-02 15:21:14,127,127 DEBUG    [215] [  1] fit: 0.99921 | delta: 5.2e-05 | secs: 31.84082
2018-10-02 15:21:14,142,142 DEBUG    [188] Zero norm, mode 0, count 0
2018-10-02 15:21:14,146,146 DEBUG    [188] Zero norm, mode 1, count 0
2018-10-02 15:21:14,151,151 DEBUG    [188] Zero norm, mode 2, count 0
2018-10-02 15:21:44,306,306 DEBUG    [215] [  2] fit: 0.99921 | delta: 1.2e-08 | secs: 31.91515
2018-10-02 15:21:44,307,307 INFO     [3] 94.83803900000021 seconds


CPU times: user 2min 24s, sys: 25.8 s, total: 2min 49s
Wall time: 2min 23s


In [140]:
%%time
# Decompose vtensor with cp_orth.orth_als; the second argument (10) is presumably the
# number of components (rank) -- TODO confirm against cp_orth.
ktensor, fit_, n_epoch_, seconds = cp_orth.orth_als(vtensor, 10)
# TODO: persist the result, e.g.
#   pickle.dump(ktensor, open('/mnt/store/home/makrai/project/verb-tensor/kruskal_...', mode='wb'))
# `seconds` is summed below; in the logged output the total matches the sum of the
# per-iteration "secs" values.
logging.info("{} seconds".format((sum(seconds))))

2018-10-02 15:23:49,801,801 DEBUG    [215] [  0] fit: 0.99918 | delta: 1.0e+00 | secs: 61.22235
2018-10-02 15:23:49,843,843 DEBUG    [188] Zero norm, mode 0, count 0
2018-10-02 15:23:49,856,856 DEBUG    [188] Zero norm, mode 1, count 0
2018-10-02 15:23:49,873,873 DEBUG    [188] Zero norm, mode 2, count 0
2018-10-02 15:24:50,672,672 DEBUG    [215] [  1] fit: 0.99924 | delta: 6.5e-05 | secs: 62.92251
2018-10-02 15:24:50,711,711 DEBUG    [188] Zero norm, mode 0, count 0
2018-10-02 15:24:50,725,725 DEBUG    [188] Zero norm, mode 1, count 0
2018-10-02 15:24:50,742,742 DEBUG    [188] Zero norm, mode 2, count 0
2018-10-02 15:25:51,481,481 DEBUG    [215] [  2] fit: 0.99924 | delta: 2.3e-07 | secs: 62.85185
2018-10-02 15:25:51,482,482 INFO     [3] 186.99671200000012 seconds


CPU times: user 4min 9s, sys: 34 s, total: 4min 43s
Wall time: 4min 7s


In [141]:
%%time
# Decompose vtensor with cp_orth.orth_als; the second argument (20) is presumably the
# number of components (rank) -- TODO confirm against cp_orth.
ktensor, fit_, n_epoch_, seconds = cp_orth.orth_als(vtensor, 20)
# TODO: persist the result, e.g.
#   pickle.dump(ktensor, open('/mnt/store/home/makrai/project/verb-tensor/kruskal_...', mode='wb'))
# `seconds` is summed below; in the logged output the total matches the sum of the
# per-iteration "secs" values.
logging.info("{} seconds".format((sum(seconds))))

2018-10-02 15:29:24,206,206 DEBUG    [215] [  0] fit: 0.99919 | delta: 1.0e+00 | secs: 121.74696
2018-10-02 15:29:24,572,572 DEBUG    [188] Zero norm, mode 0, count 0
2018-10-02 15:29:24,706,706 DEBUG    [188] Zero norm, mode 1, count 0
2018-10-02 15:29:24,881,881 DEBUG    [188] Zero norm, mode 2, count 0
2018-10-02 15:31:26,900,900 DEBUG    [215] [  1] fit: 0.99926 | delta: 7.2e-05 | secs: 128.61299
2018-10-02 15:31:27,279,279 DEBUG    [188] Zero norm, mode 0, count 0
2018-10-02 15:31:27,338,338 DEBUG    [188] Zero norm, mode 1, count 0
2018-10-02 15:31:27,541,541 DEBUG    [188] Zero norm, mode 2, count 0
2018-10-02 15:33:29,237,237 DEBUG    [215] [  2] fit: 0.99926 | delta: 2.8e-07 | secs: 128.00657
2018-10-02 15:33:29,238,238 INFO     [3] 378.3665209999997 seconds


CPU times: user 7min 46s, sys: 57.3 s, total: 8min 43s
Wall time: 7min 37s


In [142]:
%%time
# Decompose vtensor with cp_orth.orth_als; the second argument (50) is presumably the
# number of components (rank) -- TODO confirm against cp_orth.
ktensor, fit_, n_epoch_, seconds = cp_orth.orth_als(vtensor, 50)
# TODO: persist the result, e.g.
#   pickle.dump(ktensor, open('/mnt/store/home/makrai/project/verb-tensor/kruskal_...', mode='wb'))
# `seconds` is summed below; in the logged output the total matches the sum of the
# per-iteration "secs" values.
logging.info("{} seconds".format((sum(seconds))))

2018-10-02 15:41:14,613,613 DEBUG    [215] [  0] fit: 0.99920 | delta: 1.0e+00 | secs: 304.56916
2018-10-02 15:41:17,132,132 DEBUG    [188] Zero norm, mode 0, count 0
2018-10-02 15:41:17,684,684 DEBUG    [188] Zero norm, mode 1, count 0
2018-10-02 15:41:18,422,422 DEBUG    [188] Zero norm, mode 2, count 0
2018-10-02 15:46:25,015,15 DEBUG    [215] [  1] fit: 0.99927 | delta: 7.6e-05 | secs: 336.35242
2018-10-02 15:46:27,487,487 DEBUG    [188] Zero norm, mode 0, count 0
2018-10-02 15:46:27,903,903 DEBUG    [188] Zero norm, mode 1, count 0
2018-10-02 15:46:28,380,380 DEBUG    [188] Zero norm, mode 2, count 0
2018-10-02 15:51:32,637,637 DEBUG    [215] [  2] fit: 0.99928 | delta: 5.5e-07 | secs: 330.50908
2018-10-02 15:51:32,638,638 INFO     [3] 971.4306630000001 seconds


CPU times: user 18min 33s, sys: 2min 13s, total: 20min 46s
Wall time: 18min 3s


In [143]:
%%time
# Decompose vtensor with cp_orth.orth_als; the second argument (100) is presumably the
# number of components (rank) -- TODO confirm against cp_orth.
ktensor, fit_, n_epoch_, seconds = cp_orth.orth_als(vtensor, 100)
# TODO: persist the result, e.g.
#   pickle.dump(ktensor, open('/mnt/store/home/makrai/project/verb-tensor/kruskal_...', mode='wb'))
# `seconds` is summed below; in the logged output the total matches the sum of the
# per-iteration "secs" values.
logging.info("{} seconds".format((sum(seconds))))

2018-10-02 16:06:01,730,730 DEBUG    [215] [  0] fit: 0.99920 | delta: 1.0e+00 | secs: 625.71191
2018-10-02 16:06:12,329,329 DEBUG    [188] Zero norm, mode 0, count 0
2018-10-02 16:06:13,742,742 DEBUG    [188] Zero norm, mode 1, count 0
2018-10-02 16:06:15,928,928 DEBUG    [188] Zero norm, mode 2, count 0
2018-10-02 16:16:43,180,180 DEBUG    [215] [  1] fit: 0.99928 | delta: 7.8e-05 | secs: 732.96041
2018-10-02 16:16:53,047,47 DEBUG    [188] Zero norm, mode 0, count 0
2018-10-02 16:16:54,359,359 DEBUG    [188] Zero norm, mode 1, count 0
2018-10-02 16:16:56,756,756 DEBUG    [188] Zero norm, mode 2, count 0
2018-10-02 16:27:24,337,337 DEBUG    [215] [  2] fit: 0.99928 | delta: 3.2e-07 | secs: 727.82894
2018-10-02 16:27:24,337,337 INFO     [3] 2086.501265 seconds


CPU times: user 36min 31s, sys: 5min 26s, total: 41min 58s
Wall time: 35min 51s


In [None]:
%%time
# Decompose vtensor with cp_orth.orth_als; the second argument (200) is presumably the
# number of components (rank) -- TODO confirm against cp_orth.
ktensor, fit_, n_epoch_, seconds = cp_orth.orth_als(vtensor, 200)
# TODO: persist the result, e.g.
#   pickle.dump(ktensor, open('/mnt/store/home/makrai/project/verb-tensor/kruskal_...', mode='wb'))
# `seconds` is summed below; in the logged output the total matches the sum of the
# per-iteration "secs" values.
logging.info("{} seconds".format((sum(seconds))))

2018-10-02 16:56:20,468,468 DEBUG    [215] [  0] fit: 0.99920 | delta: 1.0e+00 | secs: 1264.42019
2018-10-02 16:57:13,016,16 DEBUG    [188] Zero norm, mode 0, count 0
2018-10-02 16:57:22,285,285 DEBUG    [188] Zero norm, mode 1, count 0
2018-10-02 16:57:32,979,979 DEBUG    [188] Zero norm, mode 2, count 0
2018-10-02 17:18:49,207,207 DEBUG    [215] [  1] fit: 0.99928 | delta: 7.8e-05 | secs: 1802.89244
2018-10-02 17:19:36,844,844 DEBUG    [188] Zero norm, mode 0, count 0
2018-10-02 17:19:45,003,3 DEBUG    [188] Zero norm, mode 1, count 0
2018-10-02 17:19:59,029,29 DEBUG    [188] Zero norm, mode 2, count 0
2018-10-02 17:42:47,626,626 DEBUG    [215] [  2] fit: 0.99928 | delta: 2.9e-07 | secs: 1878.53085
2018-10-02 17:42:47,628,628 INFO     [3] 4945.84348 seconds


CPU times: user 1h 19min 26s, sys: 17min 55s, total: 1h 37min 22s
Wall time: 1h 15min 23s


In [None]:
%%time
# Decompose vtensor with cp_orth.orth_als; the second argument (300) is presumably the
# number of components (rank) -- TODO confirm against cp_orth.
# No output is recorded for this cell in the notebook.
ktensor, fit_, n_epoch_, seconds = cp_orth.orth_als(vtensor, 300)
# TODO: persist the result, e.g.
#   pickle.dump(ktensor, open('/mnt/store/home/makrai/project/verb-tensor/kruskal_...', mode='wb'))
# `seconds` is summed below to report the total decomposition time.
logging.info("{} seconds".format((sum(seconds))))

In [None]:
# Inspect the factor matrices of the most recent decomposition; plot_factor below
# indexes this attribute by mode and then by (row, column).
ktensor.U

In [None]:
# Inspect ktensor.lmbda -- presumably the per-component weights of the
# decomposition; TODO confirm against cp_orth / sktensor.
ktensor.lmbda

In [None]:
def plot_factor(factor=0, add_smooth=0):
    """Scatter-plot the first three components of one factor matrix.

    Reads the notebook-global ``ktensor``.  Column 0 of ``ktensor.U[factor]``
    is drawn on the x axis, column 1 on the y axis, and column 2 sets the
    point colour, so the decomposition needs at least three components.

    Args:
        factor: index into ``ktensor.U`` selecting the factor matrix (mode).
        add_smooth: constant added to the x and y values.  When it is
            non-zero, both axes are also switched to a logarithmic scale.
    """
    matrix = ktensor.U[factor]
    # Row 0 is left out of the plot.  NOTE(review): presumably because index 0
    # is the most frequent entry of the mode (possibly the empty string used
    # for a missing argument) -- confirm.
    points = plt.scatter(matrix[1:, 0]+add_smooth, matrix[1:, 1]+add_smooth, c=matrix[1:, 2])
    if add_smooth:
        plt.xscale('log')
        plt.yscale('log')
    # Label the figure so that it can be read on its own.
    plt.xlabel('component 0')
    plt.ylabel('component 1')
    plt.colorbar(points, label='component 2')
    plt.title('Factor matrix of mode {}'.format(factor))

# Plot the factor matrix at index 2 -- presumably the ACC (object) mode, given the
# (NOM, stem, ACC) order in which the tensor is built; TODO confirm.
plot_factor(2)