In [80]:
from collections import defaultdict
import logging
logging.basicConfig(level=logging.DEBUG, 
                    format='%(asctime)s,%(msecs)d %(levelname)-8s [%(lineno)d] %(message)s')
import operator
import re

import numpy as np
import sparse
import tensorly as tl
import tensorly.decomposition as decomp
import pandas as pd

In [2]:
def mazsola_reader():
    path = '/mnt/permanent/Language/Hungarian/Dic/sass15-535k-igei-szerkezet/mazsola_adatbazis.txt'
    occurrence = defaultdict(int)#lambda: defaultdict(lambda: defaultdict(int)))
    margianl_s, margianl_v, margianl_o = defaultdict(int), defaultdict(int), defaultdict(int)
    with open(path) as infile:
        for i, line in enumerate(infile):
            if not i % 500000:
                logging.info('{:.0%}'.format(i/27970403))
            record = defaultdict(str)
            for token in line.strip().split():
                case_stem = re.split('@@', token)
                if len(case_stem) == 1:
                    continue
                try:
                    case, stem = case_stem
                except:
                    logging.warning(line.strip())
                    break
                record[case] = stem
            occurrence[record['NOM'], record['stem'], record['ACC']] += 1
            margianl_s[record['NOM']] += 1
            margianl_v[record['stem']] += 1
            margianl_o[record['ACC']] += 1
    return occurrence, [margianl_s, margianl_v, margianl_o]

In [3]:
occurrence, marginals = mazsola_reader()

2018-10-01 15:28:26,283,283 INFO     [8] 0%
2018-10-01 15:28:33,047,47 INFO     [8] 2%
2018-10-01 15:28:41,883,883 INFO     [8] 4%
2018-10-01 15:28:52,518,518 INFO     [8] 5%
2018-10-01 15:28:59,806,806 INFO     [8] 7%
2018-10-01 15:29:09,982,982 INFO     [8] 9%
2018-10-01 15:29:19,780,780 INFO     [8] 11%
2018-10-01 15:29:28,996,996 INFO     [8] 13%
2018-10-01 15:29:36,873,873 INFO     [8] 14%
2018-10-01 15:29:45,987,987 INFO     [8] 16%
2018-10-01 15:29:54,342,342 INFO     [8] 18%
2018-10-01 15:30:01,677,677 INFO     [8] 20%
2018-10-01 15:30:08,302,302 INFO     [8] 21%
2018-10-01 15:30:15,395,395 INFO     [8] 23%
2018-10-01 15:30:21,652,652 INFO     [8] 25%
2018-10-01 15:30:26,993,993 INFO     [8] 27%
2018-10-01 15:30:32,415,415 INFO     [8] 29%
2018-10-01 15:30:36,946,946 INFO     [8] 30%
2018-10-01 15:30:42,380,380 INFO     [8] 32%
2018-10-01 15:30:48,424,424 INFO     [8] 34%
2018-10-01 15:30:52,942,942 INFO     [8] 36%
2018-10-01 15:30:58,174,174 INFO     [8] 38%
2018-10-01 15:31:

2018-10-01 15:33:18,530,530 INFO     [8] 86%
2018-10-01 15:33:23,076,76 INFO     [8] 88%
2018-10-01 15:33:27,715,715 INFO     [8] 89%
2018-10-01 15:33:29,149,149 INFO     [8] 91%
2018-10-01 15:33:33,446,446 INFO     [8] 93%
2018-10-01 15:33:37,650,650 INFO     [8] 95%
2018-10-01 15:33:41,718,718 INFO     [8] 97%
2018-10-01 15:33:46,479,479 INFO     [8] 98%


In [117]:
def coords_data(occurrence, cutoff=500):
    def get_index(freq_dict):
        items = sorted(filter(lambda item: item[1] >= cutoff, freq_dict.items()), key=operator.itemgetter(1), 
                       reverse=True)
        logging.debug(items[-10:])
        return dict([(w, i) for i, (w, f) in enumerate(items)])

    coords, data = [[], [], []], []
    indices = [get_index(fd) for fd in marginals]
    logging.info('Pupulating lists...')
    for i, ((svo), freq) in enumerate(occurrence.items()):
        if not i % 2000000:
            logging.debug(i)#'{} {}'.format(svo[1], freq))
        for i, word in enumerate(svo):
            if svo[i] not in indices[i]:
                break
        else:
            for i, word in enumerate(svo):
                coords[i].append(indices[i][svo[i]])
            data.append(freq)
    logging.info('Creating array')
    shape = tuple(map(len, indices))
    logging.info(shape)
    logging.debug(coords[0][:10])
    tensor = sparse.COO(coords, data, shape=shape)#, has_duplicates=False)
    logging.info(tensor)
    return tensor

In [118]:
vtensor = coords_data(occurrence)

2018-10-01 16:48:16,879,879 DEBUG    [5] [('éj', 501), ('pasas', 501), ('összességPOSS', 501), ('szennyezés', 501), ('zenekarPOSS', 500), ('igazgatóhelyettesPOSS', 500), ('matematikus', 500), ('készlet', 500), ('műsorvezető', 500), ('70.', 500)]
2018-10-01 16:48:16,938,938 DEBUG    [5] [('előremegy', 502), ('helytelenít', 502), ('győzköd', 501), ('leselkedik', 501), ('költöztet', 501), ('elég', 500), ('túlszárnyal', 500), ('tudakol', 500), ('megsejt', 500), ('fej', 500)]
2018-10-01 16:48:17,010,10 DEBUG    [5] [('végkielégítés', 503), ('működés', 503), ('angol', 502), ('Jugoszlávia', 502), ('kis', 502), ('időszakPOSS', 501), ('integráció', 501), ('lánc', 501), ('ez--az', 500), ('mozi', 500)]
2018-10-01 16:48:17,012,12 INFO     [10] Pupulating lists...
2018-10-01 16:48:17,013,13 DEBUG    [13] 0
2018-10-01 16:48:20,203,203 DEBUG    [13] 2000000
2018-10-01 16:48:23,926,926 DEBUG    [13] 4000000
2018-10-01 16:48:27,913,913 DEBUG    [13] 6000000
2018-10-01 16:48:31,035,35 INFO     [21] Crea

In [112]:
vtensor

<COO: shape=(2552, 2123, 1231), dtype=int64, nnz=2214019, sorted=False, duplicates=False>

In [119]:
factors = decomp.parafac(vtensor, rank=2)

MemoryError: 