In [23]:
import os.path
import pickle
import random

import numpy as np
from scipy.stats import entropy
import pandas as pd

from mazsola_to_pd import get_mazsola_df

import logging
logging.basicConfig(level=logging.DEBUG, format='%(levelname)-8s [%(lineno)d] %(message)s')

# Describe

In [75]:
def column_entropy(name):
    vals = mazsola.groupby(name).size().values
    vals += [mazsola[name].isna().sum()]
    #logging.debug((name, vals))
    return entropy(vals)

In [76]:
names = sorted(mazsola.columns, key=column_entropy, reverse=True)

In [77]:
mazsola[names[:18]].describe()

Unnamed: 0,NOM,ACC,INE,INS,SUB,DAT,SUP,ILL,DEL,ELA,ALL,ABL,stem,szerint,ADE,után,FOR,CAU
count,16489,10932.0,2998,2134,2012,1739,2024,798,754,655,543,472,27762.0,409,303,231,192,167
unique,7333,3599.0,1372,1270,1143,1005,793,532,471,455,374,326,4149.0,208,203,171,159,135
top,aki,,az,ez,az,én,alapPOSS,figyelem,az,szempont,az,az,,én,mi,év,első,kedvPOSS
freq,224,2567.0,57,85,157,79,87,21,95,20,42,32,7646.0,45,21,9,8,10


# PMI

In [101]:
def get_svo_pmi(mazsola):
    modes = ['NOM', 'stem', 'ACC']
    svo_count = mazsola.groupby(modes).size().reset_index(name='count_')
    svo_count['pmi'] = svo_count.count_
    for i, ser in svo_count.iterrows():
        if not i % 1000:
            logging.debug('{:.0%}'.format(i/svo_count.shape[0]))
        for mode in modes:
            ser.pmi /= svo_count[svo_count[mode]==ser[mode]].count_.sum()
    svo_count.pmi = np.log(svo_count.pmi)
    svo_count.pmi += 2 * np.log(svo_count.count_.sum())
    return svo_count

In [84]:
svo_count = get_svo_pmi()

DEBUG    [5] 0%
DEBUG    [5] 18%
DEBUG    [5] 37%
DEBUG    [5] 55%
DEBUG    [5] 74%
DEBUG    [5] 92%


In [99]:
svo_count.sort_values('pmi', ascending=False)[svo_count.ACC.apply(lambda w: w not in ['NULL', 'az'])].head(10)

  """Entry point for launching an IPython kernel.


Unnamed: 0,NOM,stem,ACC,count_,pmi
5360,úr,köszön,szó,4,18.616748
4908,vita,lezár,szakaszPOSS,4,18.616748
2859,képviselőtársPOSS,kap,ajánlásPOSS,3,18.329066
950,asszony,köszön,szó,2,17.923601
5276,önkormányzat,költ,forint,2,17.923601
832,amely,tart,lépés,2,17.923601
5365,úr,mond,ami,2,17.923601
710,aki,érez,maga,2,17.923601
2831,képviselő,megad,szó,2,17.923601
2860,képviselőtársPOSS,kap,előterjesztés,2,17.923601
