In [20]:
from collections import defaultdict
import glob
from itertools import groupby
import operator
import os
import re

import numpy as np
import pandas as pd
import pickle
import random
import sparse
#import tensorly as tl
#import tensorly.decomposition as decomp
import sktensor
import urllib3
#import wget

import matplotlib.pyplot as plt
%pylab inline
pylab.rcParams['figure.figsize'] = (10, 6)

import logging
logging.basicConfig(level=logging.DEBUG, 
                    format='%(levelname)-8s [%(lineno)d] %(message)s')

# Fetch the Orth-ALS implementation on first use, then import it.
if not os.path.exists('cp_orth.py'):
    # `import wget` is commented out above, so the former `wget.download(...)`
    # call raised NameError whenever the file was missing. The standard
    # library does the same job: save the URL as ./cp_orth.py.
    # NOTE(review): this downloads code over plain HTTP and then imports it;
    # consider vendoring the file or verifying a checksum.
    import urllib.request
    urllib.request.urlretrieve('http://web.stanford.edu/~vsharan/cp_orth.py',
                               'cp_orth.py')
from cp_orth import orth_als

572 DEBUG    [211] Loaded backend module://ipykernel.pylab.backend_inline version unknown.


Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [12]:
# Column names supplied by hand because the file is read with header=None.
names = ['lemma', 'token_freq', 'pos', 'doc_freq', 'normalized']
# Tab-separated table; the `lemma` column is later split on '+' by get_prev_verb.
# NOTE(review): absolute, machine-specific path -- the cell only runs where this file exists.
prevlex = pd.read_csv('/home/makrai/repo/prevlex/PrevLex.txt', sep='\t', header=None, names=names)
prevlex.head()

Unnamed: 0,lemma,token_freq,pos,doc_freq,normalized
0,abba+hagy,9496,FIN,1258,abba+hagy
1,abba+marad,1277,FIN,620,abba+marad
2,abba+hagyat,45,FIN,41,abba+hagyat
3,abba+fejez,27,UNKNOWN,24,abba+fejez
4,abba+szakad,3,UNKNOWN,3,abba+szakad


In [14]:
class keydefault_dict(dict):
    """Dictionary whose lookups of absent keys yield ``('', key)``.

    Unlike ``collections.defaultdict`` the fallback depends on the key that
    was asked for, and it is not stored in the dictionary.
    """

    def __missing__(self, key):
        # Hook invoked by dict.__getitem__ when `key` is not present.
        fallback = ('', key)
        return fallback

def get_prev_verb():
    """Map each joined lemma to its ``(prev, verb)`` pair.

    Every entry of the global ``prevlex.lemma`` column is split on '+'; the
    two parts concatenated form the key and the pair itself is the value.
    The result is a ``keydefault_dict``, so looking up an unknown string
    gives ``('', string)``.
    """
    lookup = keydefault_dict()
    split_lemmas = (lemma.split('+') for lemma in prevlex.lemma)
    lookup.update(
        (preverb + stem, (preverb, stem)) for preverb, stem in split_lemmas)
    return lookup

In [18]:
def mazsola_reader():
    """Count (NOM, prev, verb, ACC) co-occurrences in the Mazsola database.

    The result is cached: if the pickle at ``pickle_path`` exists it is loaded
    and returned, otherwise the text database is read line by line, the counts
    are computed, written to the pickle and returned.

    Each whitespace-separated token of the form ``case@@stem`` contributes
    ``record[case] = stem``; tokens without '@@' are ignored. The value stored
    under 'stem' is split into preverb and verb with ``get_prev_verb()``.

    Returns:
        tuple ``(occurrence, marginals)`` where ``occurrence`` maps
        ``(NOM, prev, verb, ACC)`` tuples to counts and ``marginals`` is a list
        of four dicts counting the values seen in each of those positions, in
        that order. Missing slots are represented by the empty string.
    """
    pickle_path = '/mnt/store/home/makrai/project/verb-tensor/prev_sep/mazsola.pkl'
    if os.path.exists(pickle_path):
        logging.info('Loading mazsola dict from {}'.format(pickle_path))
        # NOTE: unpickling can execute arbitrary code; only load trusted caches.
        with open(pickle_path, mode='rb') as cache_file:
            return pickle.load(cache_file)
    logging.info('Reading mazsola...')
    path = '/mnt/permanent/Language/Hungarian/Dic/sass15-535k-igei-szerkezet/mazsola_adatbazis.txt'
    modes = ('NOM', 'prev', 'verb', 'ACC')
    prev_verb = get_prev_verb()
    occurrence = defaultdict(int)
    marginals = [defaultdict(int) for _ in modes]
    with open(path) as infile:
        for line_no, line in enumerate(infile):
            if not line_no % 500000:
                # 27970403 is the hard-coded line count used for the progress display.
                logging.info('{:.0%}'.format(line_no/27970403))
            record = defaultdict(str)
            for token in line.strip().split():
                case_stem = token.split('@@')
                if len(case_stem) == 1:
                    continue
                if len(case_stem) != 2:
                    # Malformed token (more than one '@@'): report and skip it.
                    # Previously execution fell through and re-used the case/stem
                    # of an earlier token, possibly from a previous line.
                    logging.warning(line.strip())
                    continue
                case, stem = case_stem
                record[case] = stem
            record['prev'], record['verb'] = prev_verb[record['stem']]
            key = tuple(record[mode] for mode in modes)
            occurrence[key] += 1
            for marginal, value in zip(marginals, key):
                marginal[value] += 1
    result = occurrence, marginals
    with open(pickle_path, mode='wb') as cache_file:
        pickle.dump(result, cache_file)
    return result

In [30]:
def get_tensor(middle_end='sktensor', cutoff=10):
    """Build, or load from cache, the sparse 4-mode co-occurrence tensor.

    The cache file name depends on ``middle_end`` and ``cutoff``. On a cache
    miss the counts from ``mazsola_reader()`` are turned into a tensor whose
    entries are ``np.log(freq)``; a tuple is kept only if all four of its
    components reach ``cutoff`` in the corresponding marginal counts.

    Args:
        middle_end: 'sktensor' builds an ``sktensor.sptensor``, 'tensorly'
            builds a ``sparse.COO``.
        cutoff: minimum marginal frequency for a value to receive an index.

    Returns:
        tuple ``(tensor, indices)``; ``indices`` is a list of four dicts
        mapping each kept value to its position along that mode (values are
        numbered by descending marginal frequency).

    Raises:
        NotImplementedError: for any other ``middle_end`` (only reached when
            no cached tensor exists).
    """
    logging.info('Reweighting: log')
    verb_tensor_path = '/mnt/store/home/makrai/project/verb-tensor/prev_sep/tensor_{}_{}.pkl'.format(
        middle_end, cutoff)
    if os.path.exists(verb_tensor_path):
        logging.info('Loading tensor from {}'.format(verb_tensor_path))
        # NOTE: unpickling can execute arbitrary code; only load trusted caches.
        with open(verb_tensor_path, mode='rb') as cache_file:
            tensor, indices = pickle.load(cache_file)
        logging.debug(tensor.shape)
        return tensor, indices
    occurrence, marginals = mazsola_reader()

    def get_index(freq_dict):
        # Keep values with frequency >= cutoff, most frequent first.
        items = sorted(filter(lambda item: item[1] >= cutoff, freq_dict.items()),
                       key=operator.itemgetter(1), reverse=True)
        logging.debug(items[-3:])
        return {word: position for position, (word, _) in enumerate(items)}

    coords, data = ([], [], [], []), []
    indices = [get_index(fd) for fd in marginals]
    logging.info('Building tensor...')
    logging.info('  Pupulating lists...')
    for n_seen, (svo, freq) in enumerate(occurrence.items()):
        if not n_seen % 2000000:
            logging.debug('    {:,}'.format(n_seen))
        # Skip the tuple unless every component survived the cutoff in its mode.
        if all(word in index for word, index in zip(svo, indices)):
            for mode_coords, word, index in zip(coords, svo, indices):
                mode_coords.append(index[word])
            data.append(np.log(freq))
    logging.info('  Creating array')
    shape = tuple(map(len, indices))
    logging.info(shape)
    if middle_end == 'tensorly':
        tensor = sparse.COO(coords, data, shape=shape)
    elif middle_end == 'sktensor':
        tensor = sktensor.sptensor(coords, data, shape=shape)
    else:
        raise NotImplementedError
    with open(verb_tensor_path, mode='wb') as cache_file:
        pickle.dump((tensor, indices), cache_file)
    logging.info(tensor)
    return tensor, indices

In [84]:
def decomp(cutoff, dim):
    """Run ``orth_als`` on the tensor for ``cutoff`` and store the result.

    Nothing is done when ``decomp_<cutoff>_<dim>.pkl`` already exists.
    Otherwise the tensor from ``get_tensor(cutoff=cutoff)`` is decomposed with
    ``orth_als(tensor, dim)`` and the result is pickled to that file. Any
    exception raised while decomposing or saving is logged and its message is
    written to the sibling ``.err`` file instead of propagating, so a series
    of experiments can continue after one failure.

    Args:
        cutoff: frequency cutoff passed on to ``get_tensor``.
        dim: second argument passed to ``orth_als``.

    Returns:
        None.
    """
    logging.info((cutoff, dim))
    filen_base = '/mnt/store/home/makrai/project/verb-tensor/prev_sep/decomp_{}_{}'.format(cutoff, dim)
    result_path = '{}.{}'.format(filen_base, 'pkl')
    if os.path.isfile(result_path):
        logging.info('File exists {} {}'.format(cutoff, dim))
        return
    vtensor, indices = get_tensor(cutoff=cutoff)
    try:
        result = orth_als(vtensor, dim)
        # The context manager closes the file even if pickling fails.
        with open(result_path, mode='wb') as outfile:
            pickle.dump(result, outfile)
    except Exception as e:
        with open('{}.{}'.format(filen_base, 'err'), mode='w') as logfile:
            logfile.write('{}'.format(e))
        logging.exception(e)

In [101]:
def show_expers(feature='exectimes'):
    """Summarise the stored decomposition experiments.

    Every ``decomp_<cutoff>_<dim>.pkl`` file is loaded; the sorted list of
    ``(cutoff, dim, ktensor.shape)`` is printed and a scatter plot of dim
    against cutoff (log-scaled x axis) is drawn, coloured by
    ``sum(exectimes)/60``.

    Args:
        feature: currently unused; kept for interface compatibility.

    Returns:
        None. When no result file is found a warning is logged and nothing is
        printed or plotted.
    """
    tabular = []
    mx = []
    for filen in glob.glob('/mnt/store/home/makrai/project/verb-tensor/prev_sep/decomp_*.pkl'):
        # File names end in _<cutoff>_<dim>.pkl
        _, cutoff, dim = os.path.splitext(filen)[0].rsplit('_', 2)
        cutoff, dim = map(int, (cutoff, dim))
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with open(filen, mode='rb') as infile:
            ktensor, fit, n_iterations, exectimes = pickle.load(infile)
        tabular.append((cutoff, dim, ktensor.shape))
        mx.append([cutoff, dim, sum(exectimes)/60])
    if not mx:
        # An empty array has no columns to index, which used to raise IndexError.
        logging.warning('No decomposition results found')
        return
    mx = np.array(mx)
    print(sorted(tabular))
    plt.scatter(mx.T[0], mx.T[1], c=mx.T[2])
    # presumably minutes, if exectimes holds seconds -- TODO confirm
    plt.colorbar().set_label('sum(exectimes) / 60')
    plt.xscale('log')
    plt.xlabel('cutoff')
    plt.ylabel('dim')

In [102]:
# Print the (cutoff, dim, shape) table and draw the scatter plot for all stored decompositions.
show_expers()

[(2048, 2, (1251, 58, 898, 579)),
 (4096, 2, (622, 48, 616, 243)),
 (8192, 2, (278, 33, 384, 101)),
 (16384, 2, (87, 25, 219, 43)),
 (32768, 2, (24, 16, 120, 16)),
 (32768, 3, (24, 16, 120, 16)),
 (32768, 5, (24, 16, 120, 16)),
 (32768, 10, (24, 16, 120, 16)),
 (65536, 2, (10, 15, 48, 9)),
 (65536, 3, (10, 15, 48, 9)),
 (65536, 4, (10, 15, 48, 9)),
 (65536, 5, (10, 15, 48, 9)),
 (131072, 2, (3, 9, 19, 5))]

In [37]:
def rand_elem(list1):
    """Return one element of ``list1`` picked with NumPy's global random state."""
    position = np.random.randint(len(list1))
    return list1[position]

In [None]:
# Decompose the cutoff-2**11 (= 2048) tensor for several dim values;
# decomp() returns immediately for combinations whose .pkl file already exists.
for dim in [3,5,10,25,50]:
    decomp(2**11, dim)

458 INFO     [2] (2048, 3)
459 INFO     [2] Reweighting: log
460 INFO     [6] Loading tensor from /mnt/store/home/makrai/project/verb-tensor/prev_sep/tensor_sktensor_2048.pkl
559 DEBUG    [8] (1251, 58, 898, 579)
96 DEBUG    [218] [  0] fit: 0.30606 | delta: 3.1e-01 | secs: 7.36870
97 DEBUG    [191] Zero norm, mode 0, count 0
98 DEBUG    [191] Zero norm, mode 1, count 0
99 DEBUG    [191] Zero norm, mode 2, count 0
100 DEBUG    [191] Zero norm, mode 3, count 0
47 DEBUG    [218] [  1] fit: 0.31196 | delta: 5.9e-03 | secs: 6.91210
48 DEBUG    [191] Zero norm, mode 0, count 0
48 DEBUG    [191] Zero norm, mode 1, count 0
49 DEBUG    [191] Zero norm, mode 2, count 0
50 DEBUG    [191] Zero norm, mode 3, count 0
999 DEBUG    [218] [  2] fit: 0.31996 | delta: 8.0e-03 | secs: 6.91431
0 DEBUG    [191] Zero norm, mode 0, count 0
0 DEBUG    [191] Zero norm, mode 1, count 0
1 DEBUG    [191] Zero norm, mode 2, count 0
2 DEBUG    [191] Zero norm, mode 3, count 0
946 DEBUG    [218] [  3] fit: 0.32432 |