In [1]:
from collections import defaultdict
import logging
logging.basicConfig(level=logging.DEBUG, 
                    format='%(asctime)s,%(msecs)d %(levelname)-8s [%(lineno)d] %(message)s')
import operator
import re

import numpy as np
import os
import pandas as pd
import pickle
import random
import sparse
#import tensorly as tl
#import tensorly.decomposition as decomp
import sktensor
import urllib3
import wget

import matplotlib.pyplot as plt
%pylab inline
pylab.rcParams['figure.figsize'] = (10, 6)

# Fetch cp_orth.py (the module providing `orth_als`, used by the
# decomposition cells below) on first run only; the download is skipped
# when a file named 'cp_orth.py' already exists in the working directory.
# NOTE(review): the module is fetched over plain HTTP and then imported
# (i.e. executed) without any integrity check -- consider vendoring the
# file or verifying a checksum before importing it.
if not os.path.exists('cp_orth.py'):
    wget.download('http://web.stanford.edu/~vsharan/cp_orth.py')
import cp_orth

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [2]:
def mazsola_reader(
        path='/mnt/permanent/Language/Hungarian/Dic/sass15-535k-igei-szerkezet/mazsola_adatbazis.txt',
        pickle_path='/mnt/store/home/makrai/project/verb-tensor/mazsola.pkl'):
    """Count (NOM, stem, ACC) triples in the Mazsola verb-frame file.

    Every line of the input is split on whitespace and each token of the
    form ``case@@stem`` is stored in a per-line record.  Tokens without
    ``@@`` are ignored.  A token with more than one ``@@`` is logged as a
    warning and the rest of that line is skipped; whatever was parsed from
    the line so far is still counted.  Cases missing from a line count as
    the empty string.

    Parameters
    ----------
    path : str
        Text file to read when no cache exists.
    pickle_path : str
        Pickle cache.  If it exists it is loaded and returned without
        touching ``path``; otherwise the freshly computed result is
        written to it.

    Returns
    -------
    (occurrence, marginals)
        ``occurrence`` maps ``(NOM, stem, ACC)`` tuples to their counts;
        ``marginals`` is a list of three count dicts, one per position, in
        the same order.
    """
    if os.path.exists(pickle_path):
        # NOTE: unpickling can execute arbitrary code -- only point
        # pickle_path at caches this notebook wrote itself.
        with open(pickle_path, mode='rb') as cache_file:
            return pickle.load(cache_file)
    modes = ('NOM', 'stem', 'ACC')
    # Denominator of the progress percentage; presumably the line count of
    # the full corpus -- TODO confirm.  It does not affect the counts.
    total_lines = 27970403
    occurrence = defaultdict(int)
    marginals = [defaultdict(int) for _ in modes]
    with open(path) as infile:
        for line_no, line in enumerate(infile):
            if not line_no % 500000:
                logging.info('{:.0%}'.format(line_no / total_lines))
            record = defaultdict(str)
            for token in line.strip().split():
                case_stem = token.split('@@')
                if len(case_stem) == 1:
                    continue
                if len(case_stem) != 2:
                    # Malformed token: report the line and stop parsing it.
                    logging.warning(line.strip())
                    break
                case, stem = case_stem
                record[case] = stem
            occurrence[tuple(record[mode] for mode in modes)] += 1
            for counts, mode in zip(marginals, modes):
                counts[record[mode]] += 1
    result = occurrence, marginals
    with open(pickle_path, mode='wb') as cache_file:
        pickle.dump(result, cache_file)
    return result

In [20]:
def get_tensor(middle_end='sktensor', cutoff=2):
    """Build (or load from cache) the sparse, log-weighted verb tensor.

    Words are indexed separately for each of the three marginals returned
    by ``mazsola_reader``; a word is kept only if its marginal frequency
    is at least ``cutoff`` and its index is its rank by descending
    frequency.  A triple becomes a tensor entry only when all three of its
    words are indexed; the stored value is ``np.log`` of the triple count.

    Parameters
    ----------
    middle_end : {'sktensor', 'tensorly'}
        Selects the container: ``sktensor.sptensor`` for 'sktensor',
        ``sparse.COO`` for 'tensorly'.
    cutoff : int
        Minimum marginal frequency for a word to be indexed.

    Returns
    -------
    The sparse tensor, also pickled to a path that depends on
    ``middle_end`` and ``cutoff``.

    Raises
    ------
    ValueError
        If ``middle_end`` is not one of the supported values.
    """
    # Fail fast: previously an unknown back end only surfaced as an
    # UnboundLocalError after the whole computation had run.
    if middle_end not in ('tensorly', 'sktensor'):
        raise ValueError(
            "middle_end must be 'tensorly' or 'sktensor', got {!r}".format(middle_end))
    logging.info('Reweighting: log')
    verb_tensor_path = '/mnt/store/home/makrai/project/verb-tensor/{}/tensor_{}.pkl'.format(middle_end, cutoff)
    if os.path.exists(verb_tensor_path):
        logging.info('Loading {}'.format(verb_tensor_path))
        # NOTE: unpickling can execute arbitrary code -- trusted caches only.
        with open(verb_tensor_path, mode='rb') as cache_file:
            return pickle.load(cache_file)
    occurrence, marginals = mazsola_reader()

    def get_index(freq_dict):
        """Map each word with frequency >= cutoff to its descending-frequency rank."""
        items = sorted(filter(lambda item: item[1] >= cutoff, freq_dict.items()),
                       key=operator.itemgetter(1), reverse=True)
        logging.debug(items[-3:])  # the three rarest words that survived
        return {word: rank for rank, (word, _) in enumerate(items)}

    indices = [get_index(fd) for fd in marginals]
    coords, data = ([], [], []), []
    logging.info('Pupulating lists...')
    for n_seen, (svo, freq) in enumerate(occurrence.items()):
        if not n_seen % 2000000:
            logging.debug(n_seen)
        try:
            position = [index[word] for index, word in zip(indices, svo)]
        except KeyError:
            # At least one word of the triple fell below the cutoff.
            continue
        for axis_coords, coordinate in zip(coords, position):
            axis_coords.append(coordinate)
        # NOTE(review): log(1) == 0, so triples seen exactly once are stored
        # with weight zero -- confirm this is intended.
        data.append(np.log(freq))
    logging.info('Creating array')
    shape = tuple(map(len, indices))
    logging.info(shape)
    if middle_end == 'tensorly':
        tensor = sparse.COO(coords, data, shape=shape)
    else:
        tensor = sktensor.sptensor(coords, data, shape=shape)
    # Make sure the per-back-end cache directory exists so the result of
    # the long computation is not lost to a missing directory.
    os.makedirs(os.path.dirname(verb_tensor_path), exist_ok=True)
    with open(verb_tensor_path, mode='wb') as cache_file:
        pickle.dump(tensor, cache_file)
    logging.info(tensor)
    return tensor

In [19]:
# Build the log-weighted sparse tensor (or load it from its pickle cache),
# indexing only words whose marginal frequency is at least 2.
vtensor = get_tensor(cutoff=2)

2018-10-15 12:33:46,552,552 INFO     [2] Reweighting: log
2018-10-15 12:33:52,259,259 DEBUG    [11] [('2110.', 2), ('Fedett', 2), ('Növénytelepítés', 2)]
2018-10-15 12:33:52,431,431 DEBUG    [11] [('rafinálódik', 2), ('belemennydörög', 2), ('elfesteget', 2)]
2018-10-15 12:33:52,534,534 DEBUG    [11] [('Emi', 2), ('turistabotPOSS', 2), ('aldehid', 2)]
2018-10-15 12:33:52,601,601 INFO     [16] Pupulating lists...
2018-10-15 12:33:52,602,602 DEBUG    [19] 0
2018-10-15 12:33:57,353,353 DEBUG    [19] 2000000
2018-10-15 12:34:02,056,56 DEBUG    [19] 4000000
2018-10-15 12:34:06,822,822 DEBUG    [19] 6000000
2018-10-15 12:34:11,017,17 INFO     [27] Creating array
2018-10-15 12:34:11,018,18 INFO     [29] (186470, 60495, 98745)
2018-10-15 12:34:13,072,72 INFO     [35] <sktensor.sptensor.sptensor object at 0x7fe2702cc470>


|cutoff|         shape        |
|------|----------------------|
|     0| 383 k, 110 k, 200 k  |
|     1| 186 k,  60 k,  99 k  |

10-es gyakorisági cutoff-fal a ginnyn még lefut, 5-össel már nem. (With a frequency cutoff of 10 it still runs on ginny; with 5 it no longer does.)

In [21]:
%%time
# Decompose vtensor with cp_orth.orth_als; the second argument (2) is
# presumably the number of components (rank) -- TODO confirm in cp_orth.
# NOTE: the recorded run of this cell ended in a MemoryError.
ktensor, fit_, n_epoch_, seconds = cp_orth.orth_als(vtensor, 2)
# TODO: persist the result with pickle.dump, presumably under
# /mnt/store/home/makrai/project/verb-tensor/kruskal_<rank> (the original
# note omitted the leading '/' and the object to dump).
# `seconds` is summed, so it is an iterable of durations.
logging.info("{} seconds".format((sum(seconds))))

MemoryError: 

In [None]:
%%time
# Same decomposition call with second argument 5 (presumably the rank --
# TODO confirm in cp_orth); overwrites the notebook-global `ktensor`.
ktensor, fit_, n_epoch_, seconds = cp_orth.orth_als(vtensor, 5)
# TODO: persist the result with pickle.dump, presumably under
# /mnt/store/home/makrai/project/verb-tensor/kruskal_<rank> (the original
# note omitted the leading '/' and the object to dump).
logging.info("{} seconds".format((sum(seconds))))

In [None]:
%%time
# Same decomposition call with second argument 10 (presumably the rank --
# TODO confirm in cp_orth); overwrites the notebook-global `ktensor`.
ktensor, fit_, n_epoch_, seconds = cp_orth.orth_als(vtensor, 10)
# TODO: persist the result with pickle.dump, presumably under
# /mnt/store/home/makrai/project/verb-tensor/kruskal_<rank> (the original
# note omitted the leading '/' and the object to dump).
logging.info("{} seconds".format((sum(seconds))))

In [None]:
%%time
# Same decomposition call with second argument 20 (presumably the rank --
# TODO confirm in cp_orth); overwrites the notebook-global `ktensor`.
ktensor, fit_, n_epoch_, seconds = cp_orth.orth_als(vtensor, 20)
# TODO: persist the result with pickle.dump, presumably under
# /mnt/store/home/makrai/project/verb-tensor/kruskal_<rank> (the original
# note omitted the leading '/' and the object to dump).
logging.info("{} seconds".format((sum(seconds))))

In [None]:
%%time
# Same decomposition call with second argument 50 (presumably the rank --
# TODO confirm in cp_orth); overwrites the notebook-global `ktensor`.
ktensor, fit_, n_epoch_, seconds = cp_orth.orth_als(vtensor, 50)
# TODO: persist the result with pickle.dump, presumably under
# /mnt/store/home/makrai/project/verb-tensor/kruskal_<rank> (the original
# note omitted the leading '/' and the object to dump).
logging.info("{} seconds".format((sum(seconds))))

In [None]:
%%time
# Same decomposition call with second argument 100 (presumably the rank --
# TODO confirm in cp_orth); overwrites the notebook-global `ktensor`.
ktensor, fit_, n_epoch_, seconds = cp_orth.orth_als(vtensor, 100)
# TODO: persist the result with pickle.dump, presumably under
# /mnt/store/home/makrai/project/verb-tensor/kruskal_<rank> (the original
# note omitted the leading '/' and the object to dump).
logging.info("{} seconds".format((sum(seconds))))

In [None]:
%%time
# Same decomposition call with second argument 200 (presumably the rank --
# TODO confirm in cp_orth); overwrites the notebook-global `ktensor`.
ktensor, fit_, n_epoch_, seconds = cp_orth.orth_als(vtensor, 200)
# TODO: persist the result with pickle.dump, presumably under
# /mnt/store/home/makrai/project/verb-tensor/kruskal_<rank> (the original
# note omitted the leading '/' and the object to dump).
logging.info("{} seconds".format((sum(seconds))))

In [None]:
%%time
# Same decomposition call with second argument 300 (presumably the rank --
# TODO confirm in cp_orth); this is the last assignment to `ktensor`, so
# the inspection and plotting cells below see this result when the
# notebook is run top to bottom.
ktensor, fit_, n_epoch_, seconds = cp_orth.orth_als(vtensor, 300)
# TODO: persist the result with pickle.dump, presumably under
# /mnt/store/home/makrai/project/verb-tensor/kruskal_<rank> (the original
# note omitted the leading '/' and the object to dump).
logging.info("{} seconds".format((sum(seconds))))

In [None]:
# Display the `U` attribute of the decomposition; plot_factor below indexes
# it as U[factor][row, component], i.e. one 2-D matrix per entry.
ktensor.U

In [None]:
# Display the `lmbda` attribute of the decomposition (presumably the
# per-component weights -- TODO confirm against the sktensor docs).
ktensor.lmbda

In [None]:
def plot_factor(factor=0, add_smooth=0, decomposition=None):
    """Scatter-plot the first two components of one factor matrix.

    Parameters
    ----------
    factor : int
        Index into ``decomposition.U`` selecting the factor matrix.
    add_smooth : number
        Constant added to both plotted coordinates.  When non-zero, both
        axes are switched to a log scale.
    decomposition : object with a ``U`` attribute, optional
        Decomposition to plot; defaults to the notebook-global ``ktensor``.

    Component 0 is drawn on the x axis and component 1 on the y axis.
    Component 2 colours the points when it exists; with only two
    components the default colour is used (previously this raised
    ``IndexError``).

    Raises
    ------
    ValueError
        If the factor matrix has fewer than two components.
    """
    if decomposition is None:
        decomposition = ktensor  # result of the most recent orth_als cell
    # Row 0 is skipped as in the original code.  NOTE(review): presumably
    # because index 0 is the most frequent item of the mode -- TODO confirm.
    matrix = decomposition.U[factor][1:]
    n_components = matrix.shape[1]
    if n_components < 2:
        raise ValueError(
            'plot_factor needs at least 2 components, got {}'.format(n_components))
    colors = matrix[:, 2] if n_components > 2 else None
    plt.scatter(matrix[:, 0] + add_smooth, matrix[:, 1] + add_smooth, c=colors)
    plt.xlabel('component 0')
    plt.ylabel('component 1')
    if add_smooth:
        plt.xscale('log')
        plt.yscale('log')

# Plot the factor matrix at index 2 of ktensor.U (presumably the 'ACC'
# mode, if U follows the tensor's NOM/stem/ACC axis order -- TODO confirm).
plot_factor(2)