In [1]:
from collections import defaultdict
import glob
from itertools import groupby
import operator
import os
import re

import numpy as np
import pandas as pd
import pickle
import random
import sparse
#import tensorly as tl
#import tensorly.decomposition as decomp
import sktensor
import urllib3
#import wget

import matplotlib.pyplot as plt
%pylab inline
pylab.rcParams['figure.figsize'] = (10, 6)

import logging
logging.basicConfig(level=logging.DEBUG, 
                    format='%(levelname)-8s [%(lineno)d] %(message)s')

# Fetch the cp_orth helper module on first use, then import orth_als from it.
if not os.path.exists('cp_orth.py'):
    # The `wget` import above is commented out, so calling wget.download here
    # raised NameError on a fresh checkout; use the standard library instead.
    # NOTE(review): this downloads code over plain HTTP and then imports it;
    # consider vendoring cp_orth.py or verifying a checksum.
    import urllib.request
    urllib.request.urlretrieve('http://web.stanford.edu/~vsharan/cp_orth.py',
                               'cp_orth.py')
from cp_orth import orth_als

`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"
DEBUG    [211] Loaded backend module://ipykernel.pylab.backend_inline version unknown.


Populating the interactive namespace from numpy and matplotlib


In [2]:
# Column labels for PrevLex.txt, which is read without a header row.
names = ['lemma', 'token_freq', 'pos', 'doc_freq', 'normalized']
# Tab-separated lexicon; `lemma` holds entries such as 'abba+hagy'.
prevlex = pd.read_csv('/home/makrai/repo/prevlex/PrevLex.txt', sep='\t', header=None, names=names)
prevlex.head()

Unnamed: 0,lemma,token_freq,pos,doc_freq,normalized
0,abba+hagy,9496,FIN,1258,abba+hagy
1,abba+marad,1277,FIN,620,abba+marad
2,abba+hagyat,45,FIN,41,abba+hagyat
3,abba+fejez,27,UNKNOWN,24,abba+fejez
4,abba+szakad,3,UNKNOWN,3,abba+szakad


In [3]:
class keydefault_dict(dict):
    """Dictionary whose lookups of absent keys yield ``('', key)``.

    The fallback pair is only returned to the caller; it is not stored in
    the dictionary.
    """

    def __missing__(self, key):
        # Pair the unknown key with an empty first component.
        fallback = ('', key)
        return fallback

def get_prev_verb():
    """Build a lookup from concatenated lemmas to their two '+'-separated parts.

    Every entry of ``prevlex.lemma`` is split on '+' into a ``(prev, verb)``
    pair, which is stored under the key ``prev + verb``.  The result is a
    ``keydefault_dict``, so looking up a key that was never stored gives
    ``('', key)``.
    """
    split_lemmas = (lemma.split('+') for lemma in prevlex.lemma)
    return keydefault_dict(
        (prev + verb, (prev, verb)) for prev, verb in split_lemmas)

In [4]:
# Root directory for the cached corpus counts, tensors and decomposition results.
# Set the VERB_TENSOR_PROJDIR environment variable to run against another location.
projdir = os.environ.get('VERB_TENSOR_PROJDIR',
                         '/mnt/permanent/home/makrai/project/verb-tensor')

In [5]:
def mazsola_reader():
    """Count (NOM, prev, verb, ACC) tuples in the Mazsola database file.

    The result is cached as a pickle under ``projdir``; when that cache file
    exists it is loaded and returned without touching the corpus.

    Each corpus line is split on whitespace into ``case@@stem`` tokens.  Tokens
    without ``@@`` are ignored; tokens with more than one ``@@`` are logged
    (the whole line is written as a warning) and skipped.  The value stored
    under the ``stem`` label is looked up in the table from ``get_prev_verb``
    to obtain the ``prev`` and ``verb`` fields.

    Returns:
        A pair ``(occurrence, marginals)``.  ``occurrence`` maps
        ``(NOM, prev, verb, ACC)`` tuples to counts; ``marginals`` is a list of
        four count dictionaries, one per field in that same order.  A field
        that is absent from a line is counted as the empty string.
    """
    pickle_path = os.path.join(projdir, 'prev_sep/mazsola.pkl')
    if os.path.exists(pickle_path):
        logging.info('Loading mazsola dict from {}'.format(pickle_path))
        # NOTE: unpickling can execute arbitrary code; only load caches that
        # this notebook wrote itself.
        with open(pickle_path, mode='rb') as cache_file:
            return pickle.load(cache_file)
    logging.info('Reading mazsola...')
    path = '/mnt/permanent/Language/Hungarian/Dic/sass15-535k-igei-szerkezet/mazsola_adatbazis.txt'
    # Only used to scale the progress messages; presumably the number of lines
    # in the corpus file -- TODO confirm.
    n_lines_expected = 27970403
    modes = ['NOM', 'prev', 'verb', 'ACC']
    prev_verb = get_prev_verb()
    occurrence = defaultdict(int)
    marginals = [defaultdict(int) for _ in modes]
    with open(path) as infile:
        for line_no, line in enumerate(infile):
            if not line_no % 500000:
                logging.info('{:.0%}'.format(line_no / n_lines_expected))
            record = defaultdict(str)
            for token in line.strip().split():
                case_stem = token.split('@@')
                if len(case_stem) == 1:
                    continue
                if len(case_stem) != 2:
                    # Previously the failed unpacking was swallowed and the
                    # case/stem of an earlier token (possibly from the previous
                    # line) was written into this record.  Skip the token.
                    logging.warning(line.strip())
                    continue
                case, stem = case_stem
                record[case] = stem
            record['prev'], record['verb'] = prev_verb[record['stem']]
            occurrence[record['NOM'], record['prev'], record['verb'], record['ACC']] += 1
            for marginal, mode in zip(marginals, modes):
                marginal[record[mode]] += 1
    result = occurrence, marginals
    # Close the cache file explicitly so the pickle is fully flushed to disk.
    with open(pickle_path, mode='wb') as cache_file:
        pickle.dump(result, cache_file)
    return result

In [6]:
def get_tensor(middle_end='sktensor', cutoff=10):
    """Build (or load from cache) the sparse tensor of log co-occurrence counts.

    Args:
        middle_end: Which sparse container to build: ``'sktensor'`` gives an
            ``sktensor.sptensor``, ``'tensorly'`` gives a ``sparse.COO``.
        cutoff: Minimum marginal frequency an item needs in its own mode to
            receive an index.  Tuples containing any item below the cutoff are
            left out of the tensor.

    Returns:
        A pair ``(tensor, indices)``.  ``indices`` is a list with one dict per
        mode (in the order of the marginals returned by ``mazsola_reader``),
        mapping each retained item to its coordinate; coordinates are assigned
        by descending marginal frequency.

    Raises:
        NotImplementedError: If ``middle_end`` is not one of the two supported
            values.  This is checked up front, before any expensive work.
    """
    if middle_end not in ('tensorly', 'sktensor'):
        raise NotImplementedError(
            'Unknown middle_end: {!r}'.format(middle_end))
    logging.info('Reweighting: log')
    verb_tensor_path = os.path.join(
        projdir, 'prev_sep/tensor_{}_{}.pkl'.format(middle_end, cutoff))
    if os.path.exists(verb_tensor_path):
        logging.info('Loading tensor from {}'.format(verb_tensor_path))
        # NOTE: unpickling can execute arbitrary code; only load caches that
        # this notebook wrote itself.
        with open(verb_tensor_path, mode='rb') as cache_file:
            tensor, indices = pickle.load(cache_file)
        logging.debug(tensor.shape)
        return tensor, indices
    occurrence, marginals = mazsola_reader()

    def get_index(freq_dict):
        """Map items with frequency >= cutoff to their rank by descending frequency."""
        items = sorted(
            (item for item in freq_dict.items() if item[1] >= cutoff),
            key=operator.itemgetter(1), reverse=True)
        # Show the least frequent items that still made the cut.
        logging.debug(items[-3:])
        return {word: rank for rank, (word, _) in enumerate(items)}

    indices = [get_index(fd) for fd in marginals]
    # One coordinate list per mode, plus the matching list of values.
    coords, data = tuple([] for _ in indices), []
    logging.info('Building tensor...')
    logging.info('  Pupulating lists...')
    for n_seen, (svo, freq) in enumerate(occurrence.items()):
        if not n_seen % 2000000:
            logging.debug('    {:,}'.format(n_seen))
        # Keep a tuple only if every one of its items survived the cutoff.
        if any(word not in index for index, word in zip(indices, svo)):
            continue
        for axis_coords, index, word in zip(coords, indices, svo):
            axis_coords.append(index[word])
        # NOTE(review): a raw count of 1 becomes an explicitly stored 0.0 here;
        # confirm that this is intended.
        data.append(np.log(freq))
    logging.info('  Creating array')
    shape = tuple(map(len, indices))
    logging.info(shape)
    if middle_end == 'tensorly':
        tensor = sparse.COO(coords, data, shape=shape)
    else:
        tensor = sktensor.sptensor(coords, data, shape=shape)
    # Close the cache file explicitly so the pickle is fully flushed to disk.
    with open(verb_tensor_path, mode='wb') as cache_file:
        pickle.dump((tensor, indices), cache_file)
    logging.info(tensor)
    return tensor, indices

In [7]:
def decomp(cutoff, dim):
    """Decompose the tensor for ``cutoff`` with ``orth_als`` and pickle the result.

    The result is written to ``prev_sep/decomp_<cutoff>_<dim>.pkl`` under
    ``projdir``.  If that file already exists, nothing is computed.  If the
    decomposition or the write fails, the exception text is saved to the
    matching ``.err`` file and the traceback is logged; the exception is not
    re-raised.

    Args:
        cutoff: Frequency cutoff passed on to ``get_tensor``.
        dim: Second argument passed to ``orth_als``.
    """
    logging.info((cutoff, dim))
    filen_base = os.path.join(projdir, 'prev_sep/decomp_{}_{}').format(cutoff, dim)
    result_path = '{}.{}'.format(filen_base, 'pkl')
    if os.path.isfile(result_path):
        logging.info('File exists {} {}'.format(cutoff, dim))
        return
    vtensor, _ = get_tensor(cutoff=cutoff)
    try:
        result = orth_als(vtensor, dim)
        # The .pkl file doubles as the "already done" marker above, so write to
        # a temporary name first and move it into place only once the dump has
        # completed; an interrupted write must not look like a finished run.
        partial_path = result_path + '.tmp'
        with open(partial_path, mode='wb') as outfile:
            pickle.dump(result, outfile)
        os.replace(partial_path, result_path)
    except Exception as e:
        # Best effort: record the failure next to the expected result and go on.
        with open('{}.{}'.format(filen_base, 'err'), mode='w') as logfile:
            logfile.write('{}'.format(e))
        logging.exception(e)

In [8]:
def show_expers(feature='exectimes'):
    """List the finished decompositions and plot their total running time.

    Every ``prev_sep/decomp_<cutoff>_<dim>.pkl`` under ``projdir`` is loaded.
    The sorted ``(cutoff, dim, ktensor.shape)`` triples are printed, and a
    scatter plot of dim against cutoff (log-scaled x axis) is drawn, coloured
    by ``sum(exectimes) / 60``.

    Args:
        feature: Currently unused; kept so that existing calls keep working.
    """
    tabular = []
    mx = []
    for filen in glob.glob(os.path.join(projdir, 'prev_sep/decomp_*.pkl')):
        # File names end in ..._<cutoff>_<dim>.pkl
        _, cutoff, dim = os.path.splitext(filen)[0].rsplit('_', 2)
        cutoff, dim = int(cutoff), int(dim)
        # NOTE: unpickling can execute arbitrary code; only load result files
        # that this notebook wrote itself.
        with open(filen, mode='rb') as infile:
            ktensor, fit, n_iterations, exectimes = pickle.load(infile)
        tabular.append((cutoff, dim, ktensor.shape))
        mx.append([cutoff, dim, sum(exectimes)/60])
    print(sorted(tabular))
    if not mx:
        # Without any result file the column indexing below would raise IndexError.
        logging.warning('No decomposition results found under {}'.format(projdir))
        return
    cutoffs, dims, durations = np.array(mx).T
    plt.scatter(cutoffs, dims, c=durations)
    plt.colorbar()
    plt.xscale('log')
    plt.xlabel('cutoff')
    plt.ylabel('dim')

In [10]:
def rand_elem(list1):
    """Return one element of ``list1`` chosen with NumPy's global random state."""
    position = np.random.randint(len(list1))
    return list1[position]

In [None]:
%time decomp(2**11, 50)

INFO     [2] (2048, 50)
INFO     [2] Reweighting: log
INFO     [6] Reading mazsola...
INFO     [14] 0%
INFO     [14] 2%
INFO     [14] 4%
INFO     [14] 5%
INFO     [14] 7%
INFO     [14] 9%
INFO     [14] 11%
INFO     [14] 13%
INFO     [14] 14%
INFO     [14] 16%
INFO     [14] 18%
INFO     [14] 20%
INFO     [14] 21%
INFO     [14] 23%
INFO     [14] 25%
INFO     [14] 27%
INFO     [14] 29%
INFO     [14] 30%
INFO     [14] 32%
INFO     [14] 34%
INFO     [14] 36%
INFO     [14] 38%
INFO     [14] 39%
INFO     [14] 41%
INFO     [14] 43%
INFO     [14] 45%
INFO     [14] 46%
INFO     [14] 48%
INFO     [14] 50%
INFO     [14] 52%
INFO     [14] 54%
INFO     [14] 55%
INFO     [14] 57%
INFO     [14] 59%
INFO     [14] 61%
INFO     [14] 63%
INFO     [14] 64%
INFO     [14] 66%
INFO     [14] 68%
INFO     [14] 70%
INFO     [14] 72%
INFO     [14] 73%
INFO     [14] 75%
INFO     [14] 77%
INFO     [14] 79%


INFO     [14] 80%
INFO     [14] 82%
INFO     [14] 84%


INFO     [14] 86%
INFO     [14] 88%
INFO     [14] 89%
INFO     [14] 91%
INFO     [14] 93%
INFO     [14] 95%
INFO     [14] 97%
INFO     [14] 98%
DEBUG    [14] [('tendencia', 2053), ('létrehozásPOSS', 2049), ('rendezvény', 2048)]
DEBUG    [14] [('utol', 2348), ('számon', 2237), ('ellent', 2237)]
DEBUG    [14] [('gyötör', 2069), ('tapsol', 2062), ('mérsékel', 2051)]
DEBUG    [14] [('alkalmazásPOSS', 2056), ('nyereség', 2050), ('beszédPOSS', 2049)]
INFO     [19] Building tensor...
INFO     [20]   Pupulating lists...
DEBUG    [23]     0
DEBUG    [23]     2,000,000
DEBUG    [23]     4,000,000
DEBUG    [23]     6,000,000
INFO     [31]   Creating array
INFO     [33] (1251, 58, 898, 579)
INFO     [41] <sktensor.sptensor.sptensor object at 0x7f6531e2c9e8>
DEBUG    [218] [  0] fit: 0.32861 | delta: 3.3e-01 | secs: 248.00038
DEBUG    [191] Zero norm, mode 0, count 0
DEBUG    [191] Zero norm, mode 1, count 0
DEBUG    [191] Zero norm, mode 2, count 0
DEBUG    [191] Zero norm, mode 3, count 0
DEBUG  

DEBUG    [191] Zero norm, mode 0, count 0
DEBUG    [191] Zero norm, mode 1, count 0
DEBUG    [191] Zero norm, mode 2, count 0
DEBUG    [191] Zero norm, mode 3, count 0
DEBUG    [218] [  4] fit: 0.39543 | delta: 4.0e-03 | secs: 247.78426
DEBUG    [218] [  5] fit: 0.44796 | delta: 5.3e-02 | secs: 246.76174
DEBUG    [218] [  6] fit: 0.48891 | delta: 4.1e-02 | secs: 246.91536
DEBUG    [218] [  7] fit: 0.51033 | delta: 2.1e-02 | secs: 246.94977
DEBUG    [218] [  8] fit: 0.52230 | delta: 1.2e-02 | secs: 246.95661
DEBUG    [218] [  9] fit: 0.53243 | delta: 1.0e-02 | secs: 246.90402
DEBUG    [218] [ 10] fit: 0.54052 | delta: 8.1e-03 | secs: 246.95840
DEBUG    [218] [ 11] fit: 0.54579 | delta: 5.3e-03 | secs: 247.12152
DEBUG    [218] [ 12] fit: 0.54925 | delta: 3.5e-03 | secs: 247.07687
DEBUG    [218] [ 13] fit: 0.55176 | delta: 2.5e-03 | secs: 246.99708
DEBUG    [218] [ 14] fit: 0.55368 | delta: 1.9e-03 | secs: 247.00363
DEBUG    [218] [ 15] fit: 0.55524 | delta: 1.6e-03 | secs: 246.89555
DEBU