# Checking the uroboros property, taking ambiguity into account

In [1]:
from collections import Counter

import numpy as np
import pandas as pd
import re
import urllib  # the lib that handles the url stuff

In [2]:
# Column names passed as `names=` to the pd.read_csv calls below that load the
# tab-separated 4lang and Reform/600 files.
names=['en', 'hu', 'la', 'pl', 'id', 'uro', 'pos', 'defi', 'comment']

## Thematic-role version

In [3]:
fl = pd.read_csv('4lang', sep='\t', names=names).fillna('')

In [4]:
# Characters that may appear in a concept token. find_concepts splices this
# string into a regex character class, so the trailing "-" is a literal hyphen.
concept_chars = '=@a-zA-Z_/0-9-'

In [5]:
parens_and_stuff = "[]<>,()'"

In [6]:
def find_concepts(defi):
    """Return every maximal run of concept characters found in ``defi``.

    The character class is built from the module-level ``concept_chars``;
    any other character in the definition string acts as a separator.
    The result is a list of strings in order of appearance.
    """
    concept_pattern = re.compile(f'[{concept_chars}]+')
    return concept_pattern.findall(defi)

In [7]:
# Flatten the concept tokens of all definitions into one list (duplicates kept,
# so that they can be counted afterwards).
defining_tokens = [
    token
    for definition in fl.defi.astype(str)
    for token in find_concepts(definition)
]

In [8]:
defining_vocab = pd.Series(Counter(defining_tokens)).sort_values(ascending=False)

In [9]:
defining_vocab

HAS               690
=AGT              450
=PAT              390
lack              356
IN/2758           330
                 ... 
rapid               1
hate                1
@Socrates           1
@Sigmund_Freud      1
title               1
Length: 1712, dtype: int64

In [10]:
headwords = pd.Series(Counter(fl.en.values)).sort_values(ascending=False)

In [11]:
headwords[headwords > 1].head()

for     5
mill    4
hail    4
base    3
duty    3
dtype: int64

### Undefined/primitive words used in definitions

In [12]:
undefined = set(defining_vocab.index.str.lower()) - set(headwords.index.str.lower())

In [13]:
def is_special(defing_token):
    """Tell whether a defining token is marked as special.

    A token counts as special when it starts with ``@`` or ``=``, or when it
    contains a ``/``. The return value is meant to be used for its truthiness:
    it is the ``re.Match`` object when the prefix test succeeds, otherwise the
    boolean result of the ``/`` membership test.
    """
    prefix_match = re.match('^[@=]', defing_token)
    if prefix_match:
        return prefix_match
    return '/' in defing_token

In [14]:
[word for word in undefined if not is_special(word)]

['er',
 'passionate',
 'enthusiastic',
 'has',
 'target',
 'part_of',
 'inherent',
 'palm',
 'identity',
 'tired',
 'celebrate',
 'next_to',
 'is_a']

### Ambiguous words used without disambiguation

In [15]:
headwords[headwords>1]

for      5
mill     4
hail     4
base     3
duty     3
        ..
map      2
march    2
mass     2
man      2
well     2
Length: 291, dtype: int64

In [16]:
# Defining tokens that coincide exactly with a headword having more than one entry.
# print() always emits the list's compact one-line repr; %pprint only changes how
# Out[] values are displayed, so the display mode does not need to be toggled here.
print(sorted(set(defining_vocab.index).intersection(set(headwords[headwords>1].index))))

Pretty printing has been turned OFF
['act', 'area', 'at', 'bake', 'ball', 'before', 'card', 'case', 'centre', 'change', 'close', 'cloth', 'collar', 'colour', 'control', 'corner', 'crop', 'curl', 'damage', 'defeat', 'detail', 'drink', 'electricity', 'equal', 'experience', 'express', 'fear', 'formal', 'habit', 'honour', 'information', 'jump', 'keep', 'level', 'limb', 'male', 'manner', 'map', 'mass', 'material', 'meal', 'model', 'moral', 'musician', 'nervous', 'object', 'official', 'oppose', 'pen', 'plant', 'pound', 'power', 'practice', 'produce', 'product', 'property', 'quantity', 'raise', 'relative', 'root', 'same', 'sew', 'society', 'step', 'style', 'succeed', 'system', 'tense', 'this', 'throw', 'trade', 'trouble', 'turn', 'vegetable', 'vessel', 'warm', 'wave', 'wish']
Pretty printing has been turned ON


### Is there an erroneous disambiguation?

In [17]:
# Every "<en>/<id>" string that some row of the lexicon licenses.
possible_disambed_words = {f'{en}/{sense_id}' for en, sense_id in zip(fl['en'], fl['id'])}
# Lowercased defining tokens that contain "/" but correspond to no such row.
set(defining_vocab.index[defining_vocab.index.str.contains('/')].str.lower()) - possible_disambed_words

{'march/1563', 'may/1560', 'miss/1357', 'mrs/35'}

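No: these four differ from existing headwords only in casing, because the tokens were lowercased before the comparison while the headwords were not.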

### Words with several headwords but only one disambiguated sense used in definitions

In [18]:
# For each word, count the distinct defining tokens of the form "word/..."
# (tokens without a "/" are ignored).
n_senses_used = pd.Series(Counter(
    word
    for word, *sense_ids in defining_vocab.index.str.split('/')
    if sense_ids
))
n_senses_used.loc[n_senses_used.gt(1)].sort_values(ascending=False).head()

right      3
place      2
close      2
temple     2
develop    2
dtype: int64

In [19]:
# Headwords with more than one entry for which exactly one "word/..." token
# occurs in the definitions.
# print() always emits the list's compact one-line repr; %pprint only changes how
# Out[] values are displayed, so the display mode does not need to be toggled here.
print(sorted(set(headwords[headwords>1].index).intersection(set(n_senses_used[n_senses_used==1].index))))

Pretty printing has been turned OFF
['-th', 'area', 'at', 'balance', 'base', 'bathe', 'beam', 'bite', 'bore', 'bow', 'cage', 'can', 'care', 'catch', 'chance', 'charge', 'club', 'comfort', 'company', 'cook', 'cool', 'course', 'cup', 'cure', 'desert', 'draw', 'effect', 'expression', 'fan', 'fast', 'figure', 'firm', 'get', 'hail', 'hair', 'horn', 'host', 'interest', 'keep', 'lead', 'lot', 'march', 'match', 'mean', 'minister', 'miss', 'mock', 'open', 'pet', 'post', 'regard', 'return', 'room', 'season', 'set', 'shut', 'sink', 'society', 'spirit', 'spoil', 'spring', 'steep', 'straight', 'stroke', 'succeed', 'take', 'tire', 'trick', 'weary', 'with', 'wound']
Pretty printing has been turned ON


## Current version of `kornai/4lang/4lang`

In [20]:
refrom = pd.read_csv('../4lang/4lang', sep='\t', names=names, dtype=str).fillna('')

In [21]:
# Rows whose id is the literal string '#' get NaN as their id instead.
refrom.loc[refrom.id=='#','id']=np.nan

In [22]:
refrom.head()

Unnamed: 0,en,hu,la,pl,id,uro,pos,defi,comment
0,-able,-hato1,-bilis,-alny,21,e,G,"gen allow {gen stem_ =agt}, ""_-able"" mark_ stem",%detestable
1,-al,-sa1g,-men,-enie,42,u,G,=REL,%HUN: -a1s % disapprov-
2,-an,-i,-anus,-anin,23,u,G,member =REL,%geographical
3,-ance,-a1s,-tia,-stwo,1,u,G,=REL,%
4,-ar,-t,,,54,u,G,TODO,%


In [23]:
def is_printname(token):
    """Check whether the string form of ``token`` is a well-formed printname.

    The whole string must consist of one or more of: ASCII letters, digits,
    ``=``, ``_``, ``.``, ``#``, ``/`` and ``-``. Returns the ``re.Match``
    object on success and ``None`` otherwise.
    """
    printname_re = re.compile('^[=a-zA-Z_0-9.#/-]+')
    return printname_re.fullmatch(str(token))

In [24]:
refrom.groupby('uro').size()

uro
c     110
e     358
p      14
u    3045
dtype: int64

In [25]:
refrom.groupby('pos').size()

pos
#       6
A     681
D     101
G     170
N    1821
U     143
V     605
dtype: int64

In [26]:
refrom.groupby('defi').size().sort_values(ascending=False).head()

defi
TODO        422
=REL         10
lack          7
good          6
together      5
dtype: int64

In [27]:
refrom.groupby('comment').size().sort_values(ascending=False).head()

comment
%          2861
%1           72
%no xml      43
%ND          33
%2           17
dtype: int64

In [28]:
refrom[~refrom.comment.str.startswith('%')]

Unnamed: 0,en,hu,la,pl,id,uro,pos,defi,comment


## Reform/600

In [29]:
reform600 = pd.read_csv('../4lang/Reform/600', sep='\t', names=names)

In [30]:
reform600.head()

Unnamed: 0,en,hu,la,pl,id,uro,pos,defi,comment
0,-able,-hato1,-bilis,-alny,21,e,G,"gen allow {gen stem_ =agt}, ""_-able"" mark_ stem",%detestable
1,-er,-bb,-ior/-ius,-szy,14,e,G,"er_, =agt has quality, ""_-er"" mark_ stem_[qual...",%
2,-er,-o1,-tor/-trix,-ac1/ic1,3627,e,G,"stem_-er is_a =agt, ""_ -er"" mark_ stem_",%nomen agentis
3,-est,leg-bb,-issimus,naj-szyo,1513,e,G,er_ other %,
4,-ing,-a1s,-endi,-anie,2,e,G,"stem_-ing is_a event, ""_-ing"" mark_ stem_",%


In [31]:
# Same pattern text as the literal used by the first is_printname definition.
# NOTE(review): the next cell reassigns printname_pattern, so when the cells run
# in order this value is never used.
printname_pattern = '^[=a-zA-Z_0-9.#/-]+'

In [32]:
# Lowercase ASCII letters and hyphens only; this replaces the broader pattern
# assigned in the preceding cell.
printname_pattern = '^[a-z-]+$'

In [33]:
def is_printname(token):
    """Tell whether ``str(token)`` matches ``printname_pattern`` in its entirety.

    The pattern is read from the notebook namespace at call time, so a later
    reassignment of ``printname_pattern`` changes the result without this
    function being redefined. Returns the ``re.Match`` object on success and
    ``None`` otherwise.
    """
    candidate = str(token)
    return re.fullmatch(printname_pattern, candidate)

In [34]:
reform600.groupby('uro').size()

uro
c    110
e    358
p     14
u    293
dtype: int64

In [35]:
reform600.groupby('pos').size()

pos
A    144
D      6
G     31
N    440
U     24
V    130
dtype: int64

In [36]:
reform600.groupby('defi').size().sort_values(ascending=False).head()

defi
man/659                        2
lack move                      2
quantity                       2
=pat in mind, =agt has mind    2
gen want                       2
dtype: int64

In [37]:
reform600.groupby('comment').size().sort_values(ascending=False).head()

comment
%        702
%ND        9
%RA        3
%RG        2
%   %      2
dtype: int64

## V2/700

In [38]:
v2_700 = pd.read_csv('../4lang/V2/700.tsv', sep='\t')

In [39]:
v2_700.head()

Unnamed: 0,en,hu,la,pl,ja,zh,num,s,l,def,%comment
0,/-able,/-ható,/-bilis,/-alni,_ -e-_,ke3- 可,21,e,G,"gen allow {gen stem_ =agt}, ""_-able"" mark_ stem",%detestable
1,/-er,/-bb,/-ior//-ius,/-szy,_ yori _ _-i,bi3jiao4 比较,14,e,G,"er_, =agt has quality, ""_-er"" mark_ stem_[qual...",%
2,/-er,/-ó,/-tor//-trix,/-arz,/-sha 者,/// -zhe3 者,3627,e,G,"stem_-er is_a =agt, ""_ -er"" mark_ stem_",%nomen agentis
3,/-est,leg-bb,/-issimus,naj...szy,ichiban 一番,zui4- 最,3625,e,G,er_ all %,
4,/-est,leg...bb,/-issimus,naj...szy,ichiban _ 一番,zui4- 最,1513,e,G,er_ other %,


In [40]:
# Raw string, so that "\w" reaches the regex engine as written instead of being
# parsed by Python as an (invalid) string escape. The pattern text is unchanged:
# one or more word characters or any of = _ / . # - spanning the whole string.
printname_pattern = r'^[=\w_/.#-]+$'

There is only one capitalized word, _Asia_

In [41]:
def is_printname(token):
    """Full-match the string form of ``token`` against ``printname_pattern``.

    Behaves like the earlier definition in the Reform/600 section: the global
    ``printname_pattern`` is resolved on each call, so the pattern assigned
    most recently is the one in effect. Returns a ``re.Match`` or ``None``.
    """
    return re.compile(printname_pattern).fullmatch(str(token))

In [42]:
sorted(v2_700.num.astype(int))[:20]

[2, 4, 5, 10, 12, 14, 17, 21, 29, 33, 66, 68, 72, 73, 74, 76, 78, 82, 86, 88]

In [43]:
v2_700.groupby('s').size()

s
c    110
e    359
p     14
u    293
dtype: int64

In [44]:
v2_700.groupby('l').size()

l
A    144
D      6
G     32
N    440
U     24
V    130
dtype: int64

In [45]:
v2_700.groupby('def').size().sort_values(ascending=False).head()

def
many         2
lack move    2
big          2
sex          2
gen want     2
dtype: int64

In [46]:
v2_700.groupby('%comment').size().sort_values(ascending=False).head()

%comment
%       701
%ND       9
%RA       3
%RG       2
%ZsA      2
dtype: int64