In [1]:
from collections import defaultdict
import glob
from itertools import groupby
import operator
import os
import re

import numpy as np
import pandas as pd
import pickle
import random
import sparse
#import tensorly as tl
#import tensorly.decomposition as decomp
import sktensor
import urllib3
#import wget

import matplotlib.pyplot as plt
# `%pylab inline` switches matplotlib to the inline backend and populates the
# interactive namespace from numpy and matplotlib; the `pylab` name used on
# the next line comes from this magic.
%pylab inline
# Default size for every figure drawn in this notebook.
pylab.rcParams['figure.figsize'] = (10, 6)

import logging
# Show DEBUG and above. Each record is prefixed with the millisecond part of
# its timestamp, the level name and the line number of the logging call.
logging.basicConfig(level=logging.DEBUG, 
                    format='%(msecs)d %(levelname)-8s [%(lineno)d] %(message)s')

# Fetch the orth-ALS implementation into the working directory on first use.
# The `wget` package is not imported in this notebook, so the download uses the
# standard library instead.
# NOTE(security): this downloads Python code over plain HTTP and then imports
# it; review the downloaded file before running on a machine you care about.
if not os.path.exists('cp_orth.py'):
    import urllib.request
    urllib.request.urlretrieve('http://web.stanford.edu/~vsharan/cp_orth.py',
                               'cp_orth.py')
from cp_orth import orth_als

`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"
429 DEBUG    [211] Loaded backend module://ipykernel.pylab.backend_inline version unknown.


Populating the interactive namespace from numpy and matplotlib


In [2]:
# Root directory for the cached pickles (corpus counts, tensors, decompositions).
# Set the VERB_TENSOR_PROJDIR environment variable to run the notebook on
# another machine; the default is the original hard-coded location.
projdir = os.environ.get('VERB_TENSOR_PROJDIR',
                         '/mnt/permanent/home/makrai/project/verb-tensor')

In [3]:
def mazsola_reader():
    """Load, or build and cache, co-occurrence counts from the Mazsola file.

    If ``<projdir>/mazsola.pkl`` exists it is unpickled and returned as is.
    Otherwise the corpus file is read line by line. Every whitespace-separated
    token of the form ``case@@stem`` is stored in a per-line record under its
    case label; tokens without ``@@`` are ignored. For each line the triple
    ``(record['NOM'], record['stem'], record['ACC'])`` is counted, with ``''``
    standing in for any label missing from the line.

    Returns:
        tuple: ``(occurrence, marginals)``. ``occurrence`` maps each triple to
        its count. ``marginals`` is a list of three dicts holding the count of
        each value per position, in the order ``NOM``, ``stem``, ``ACC``.
    """
    pickle_path = os.path.join(projdir, 'mazsola.pkl')
    if os.path.exists(pickle_path):
        logging.info('Loading mazsola dict from {}'.format(pickle_path))
        # NOTE(security): unpickling executes code from the file; only load
        # caches that this notebook wrote itself.
        with open(pickle_path, mode='rb') as cache_file:
            return pickle.load(cache_file)
    logging.info('Reading mazsola...')
    path = '/mnt/permanent/Language/Hungarian/Dic/sass15-535k-igei-szerkezet/mazsola_adatbazis.txt'
    # Denominator of the progress percentage -- presumably the number of lines
    # in the corpus file (TODO confirm).
    n_lines = 27970403
    modes = ('NOM', 'stem', 'ACC')
    occurrence = defaultdict(int)
    marginals = [defaultdict(int) for _ in modes]
    with open(path) as infile:
        for line_no, line in enumerate(infile):
            if not line_no % 500000:
                logging.info('{:.0%}'.format(line_no/n_lines))
            record = defaultdict(str)
            for token in line.strip().split():
                case_stem = token.split('@@')
                if len(case_stem) == 1:
                    continue
                if len(case_stem) != 2:
                    # Malformed token (more than one '@@'): report the line and
                    # skip the token, so that values left over from an earlier
                    # token or line cannot leak into this record.
                    logging.warning(line.strip())
                    continue
                case, stem = case_stem
                record[case] = stem
            occurrence[tuple(record[mode] for mode in modes)] += 1
            for marginal, mode in zip(marginals, modes):
                marginal[record[mode]] += 1
    result = occurrence, marginals
    with open(pickle_path, mode='wb') as cache_file:
        pickle.dump(result, cache_file)
    return result

In [4]:
def get_tensor(middle_end='sktensor', cutoff=10):
    """Load, or build and cache, the sparse three-mode co-occurrence tensor.

    The tensor is cached in ``<projdir>/<middle_end>/tensor_<cutoff>.pkl``.
    When building it, each mode keeps only the words whose marginal count is
    at least ``cutoff``, indexed by descending marginal count. A triple from
    ``mazsola_reader()`` is stored only if all three of its words are kept,
    and its value is the natural logarithm of its count.

    Args:
        middle_end: ``'sktensor'`` builds an ``sktensor.sptensor``;
            ``'tensorly'`` builds a ``sparse.COO`` array.
        cutoff: minimum marginal count for a word to get an index.

    Returns:
        tuple: ``(tensor, indices)`` where ``indices`` is a list of three
        dicts mapping a word to its position along the corresponding mode.

    Raises:
        NotImplementedError: if ``middle_end`` is neither supported value and
            no cached tensor exists for it.
    """
    logging.info('Reweighting: log')
    verb_tensor_path = os.path.join(projdir, '{}/tensor_{}.pkl'.format(middle_end, cutoff))
    if os.path.exists(verb_tensor_path):
        logging.info('Loading tensor from {}'.format(verb_tensor_path))
        with open(verb_tensor_path, mode='rb') as cache_file:
            tensor, indices = pickle.load(cache_file)
        logging.debug(tensor.shape)
        return tensor, indices
    if middle_end not in ('tensorly', 'sktensor'):
        # Reject an unknown backend before the expensive corpus pass, not after.
        raise NotImplementedError(middle_end)
    occurrence, marginals = mazsola_reader()

    def get_index(freq_dict):
        """Map each word with count >= cutoff to its rank by descending count."""
        frequent = [item for item in freq_dict.items() if item[1] >= cutoff]
        frequent.sort(key=operator.itemgetter(1), reverse=True)
        logging.debug(frequent[-3:])
        return {word: rank for rank, (word, _) in enumerate(frequent)}

    coords, data = ([], [], []), []
    indices = [get_index(fd) for fd in marginals]
    logging.info('Building tensor...')
    logging.info('  Pupulating lists...')
    for n_seen, (svo, freq) in enumerate(occurrence.items()):
        if not n_seen % 2000000:
            logging.debug('    {:,}'.format(n_seen))
        # Keep the triple only if every word survived the cutoff in its mode.
        if all(word in index for word, index in zip(svo, indices)):
            for mode_coords, word, index in zip(coords, svo, indices):
                mode_coords.append(index[word])
            # NOTE: a triple seen exactly once is stored with weight log(1) = 0.
            data.append(np.log(freq))
    logging.info('  Creating array')
    shape = tuple(map(len, indices))
    logging.info(shape)
    if middle_end == 'tensorly':
        tensor = sparse.COO(coords, data, shape=shape)
    else:
        tensor = sktensor.sptensor(coords, data, shape=shape)
    # Make sure the cache directory exists so the result of the long build is
    # not lost to a FileNotFoundError.
    os.makedirs(os.path.dirname(verb_tensor_path), exist_ok=True)
    with open(verb_tensor_path, mode='wb') as cache_file:
        pickle.dump((tensor, indices), cache_file)
    logging.info(tensor)
    return tensor, indices

In [None]:
def decomp(cutoff, dim):
    """Decompose the tensor for ``cutoff`` with ``orth_als`` and cache the result.

    The result of ``orth_als(tensor, dim)`` is pickled to
    ``<projdir>/sktensor/decomp_<cutoff>_<dim>.pkl``. If the decomposition or
    the pickling raises, the error is logged and written to a sibling ``.err``
    file instead. When either file already exists nothing is recomputed.

    Args:
        cutoff: frequency cutoff passed on to ``get_tensor``.
        dim: second argument of ``orth_als``.

    Returns:
        None
    """
    logging.info((cutoff, dim))
    filen_base = os.path.join(projdir, 'sktensor/decomp_{}_{}'.format(cutoff, dim))
    pkl_path = '{}.{}'.format(filen_base, 'pkl')
    err_path = '{}.{}'.format(filen_base, 'err')
    if os.path.isfile(pkl_path) or os.path.isfile(err_path):
        logging.info('File exists {}'.format(glob.glob(filen_base+'.*')))
        return
    vtensor, _ = get_tensor(cutoff=cutoff)
    try:
        result = orth_als(vtensor, dim)
        with open(pkl_path, mode='wb') as outfile:
            pickle.dump(result, outfile)
    except Exception as e:
        logging.error(e)
        with open(err_path, mode='w') as logfile:
            # A text file only accepts str; writing the exception object itself
            # would raise TypeError and leave an empty .err file behind.
            logfile.write('{}: {}\n'.format(type(e).__name__, e))

In [None]:
def show_expers(feature='exectimes'):
    """Summarise the cached decompositions as a printed list and a scatter plot.

    Reads every ``<projdir>/sktensor/decomp_<cutoff>_<dim>.pkl``, prints the
    sorted ``(cutoff, dim, ktensor.shape)`` triples, and plots ``dim`` against
    ``cutoff`` (log-scaled x axis) coloured by ``sum(exectimes)/60``.

    Args:
        feature: currently unused; kept so existing calls keep working.

    Returns:
        None
    """
    tabular = []
    mx = []
    for filen in glob.glob(os.path.join(projdir, 'sktensor/decomp_*.pkl')):
        logging.debug(filen)
        # Parse only the file name, so that underscores in the directory part
        # of the path cannot break the unpacking.
        stem = os.path.splitext(os.path.basename(filen))[0]
        _, cutoff, dim = stem.split('_')
        cutoff, dim = map(int, (cutoff, dim))
        with open(filen, mode='rb') as infile:
            ktensor, fit, n_iterations, exectimes = pickle.load(infile)
        tabular.append((cutoff, dim, ktensor.shape))
        # presumably exectimes are seconds per iteration, making this minutes
        mx.append([cutoff, dim, sum(exectimes)/60])
    print(sorted(tabular))
    if not mx:
        # Nothing to plot; indexing the empty array below would raise IndexError.
        logging.info('No decomposition results found')
        return
    cutoffs, dims, costs = np.array(mx).T
    plt.scatter(cutoffs, dims, c=costs)
    plt.colorbar(label='sum(exectimes)/60')
    plt.xscale('log')
    plt.xlabel('cutoff')
    plt.ylabel('dim')

In [None]:
def rand_elem(list1):
    """Return one element of ``list1`` at a position drawn by numpy's global RNG.

    Raises ValueError (from ``np.random.randint``) when ``list1`` is empty.
    """
    position = np.random.randint(len(list1))
    return list1[position]

In [None]:
# Run one decomposition (cutoff 2**11 = 2048, dim 50) under the %time magic.
%time decomp(2**11, 50)

586 INFO     [2] (2048, 50)
588 INFO     [2] Reweighting: log
589 INFO     [6] Reading mazsola...
591 INFO     [13] 0%
904 INFO     [13] 2%
125 INFO     [13] 4%
779 INFO     [13] 5%
585 INFO     [13] 7%
303 INFO     [13] 9%
638 INFO     [13] 11%
861 INFO     [13] 13%
330 INFO     [13] 14%
156 INFO     [13] 16%
99 INFO     [13] 18%
66 INFO     [13] 20%
130 INFO     [13] 21%
256 INFO     [13] 23%
0 INFO     [13] 25%
259 INFO     [13] 27%
885 INFO     [13] 29%
848 INFO     [13] 30%
126 INFO     [13] 32%
221 INFO     [13] 34%
754 INFO     [13] 36%
421 INFO     [13] 38%
293 INFO     [13] 39%
395 INFO     [13] 41%
289 INFO     [13] 43%
103 INFO     [13] 45%
499 INFO     [13] 46%
508 INFO     [13] 48%
990 INFO     [13] 50%
301 INFO     [13] 52%
54 INFO     [13] 54%
412 INFO     [13] 55%
580 INFO     [13] 57%
849 INFO     [13] 59%
113 INFO     [13] 61%
680 INFO     [13] 63%
626 INFO     [13] 64%
72 INFO     [13] 66%
600 INFO     [13] 68%
513 INFO     [13] 70%
401 INFO     [13] 72%
616 INFO    

510 INFO     [13] 80%
416 INFO     [13] 82%
70 INFO     [13] 84%


981 INFO     [13] 86%
562 INFO     [13] 88%
244 INFO     [13] 89%
956 INFO     [13] 91%
210 INFO     [13] 93%
986 INFO     [13] 95%
688 INFO     [13] 97%
771 INFO     [13] 98%
838 DEBUG    [13] [('tendencia', 2053), ('létrehozásPOSS', 2049), ('rendezvény', 2048)]
901 DEBUG    [13] [('visszalép', 2056), ('felügyel', 2050), ('megnyugszik', 2049)]
973 DEBUG    [13] [('alkalmazásPOSS', 2056), ('nyereség', 2050), ('beszédPOSS', 2049)]
974 INFO     [18] Building tensor...
975 INFO     [19]   Pupulating lists...
976 DEBUG    [22]     0
384 DEBUG    [22]     2,000,000
381 DEBUG    [22]     4,000,000
272 DEBUG    [22]     6,000,000
524 INFO     [30]   Creating array
526 INFO     [32] (1251, 1293, 579)
384 INFO     [40] <sktensor.sptensor.sptensor object at 0x7f074de2dcc0>
923 DEBUG    [218] [  0] fit: 0.46677 | delta: 4.7e-01 | secs: 131.58682
980 DEBUG    [191] Zero norm, mode 0, count 0
36 DEBUG    [191] Zero norm, mode 1, count 0
75 DEBUG    [191] Zero norm, mode 2, count 0
645 DEBUG    [218

221 DEBUG    [191] Zero norm, mode 0, count 0
275 DEBUG    [191] Zero norm, mode 1, count 0
311 DEBUG    [191] Zero norm, mode 2, count 0
477 DEBUG    [218] [  4] fit: 0.54042 | delta: 9.1e-04 | secs: 128.16875
654 DEBUG    [218] [  5] fit: 0.57004 | delta: 3.0e-02 | secs: 127.54461
781 DEBUG    [218] [  6] fit: 0.59256 | delta: 2.3e-02 | secs: 127.40361
72 DEBUG    [218] [  7] fit: 0.61683 | delta: 2.4e-02 | secs: 127.32590
418 DEBUG    [218] [  8] fit: 0.62938 | delta: 1.3e-02 | secs: 127.92924
603 DEBUG    [218] [  9] fit: 0.63540 | delta: 6.0e-03 | secs: 127.46217
741 DEBUG    [218] [ 10] fit: 0.63852 | delta: 3.1e-03 | secs: 128.74450
795 DEBUG    [218] [ 11] fit: 0.64149 | delta: 3.0e-03 | secs: 127.57490
917 DEBUG    [218] [ 12] fit: 0.64477 | delta: 3.3e-03 | secs: 127.27361
948 DEBUG    [218] [ 13] fit: 0.64746 | delta: 2.7e-03 | secs: 128.08325
303 DEBUG    [218] [ 14] fit: 0.64980 | delta: 2.3e-03 | secs: 129.10427
414 DEBUG    [218] [ 15] fit: 0.65182 | delta: 2.0e-03 | sec