# Looking for language structures in conllu files

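The per-language structure files in lang_structures/ were generated from the all_LANGUAGE.conllu files in joined_by_lang/data_joined_by_lang/ by running the following pipeline once per language (shown here for Spanish):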

gfud pattern-replace 'PRUNE TRUE 1' < joined_by_lang/data_joined_by_lang/all_spanish.conllu | grep -v '#' | cut -f 1,4,7,8 | gfud reduced2conll x__x__xx | gfud conll2tree | cut -f 4,8 > lang_structures/spanish_structures.txt

In [12]:
import os
# Build creating_shell_scripts/lang_structures.sh: a bash script that defines
# a `lang_structures` function holding one gfud pipeline per .conllu file found
# in joined_by_lang/data_joined_by_lang/, and then calls that function.
list_of_files = os.listdir('joined_by_lang/data_joined_by_lang/')
script_lines = ['#! /bin/bash \n', 'function lang_structures { \n']
for conllu_name in list_of_files:
    if not conllu_name.endswith('.conllu'):
        continue
    # Drop the '.conllu' suffix and the first four characters of the name
    # ('all_' in names such as all_spanish.conllu) to get the language.
    lang = conllu_name.split('.conllu')[0][4:]
    script_lines.append(
        f"gfud pattern-replace 'PRUNE TRUE 1' < joined_by_lang/data_joined_by_lang/{conllu_name} | grep -v '#' | cut -f 1,4,7,8 | gfud reduced2conll x__x__xx | gfud conll2tree | cut -f 4,8 > lang_structures/{lang}_structures.txt \n"
    )
script_lines += ['} \n', 'lang_structures']

with open('creating_shell_scripts/lang_structures.sh', 'w') as f:
    f.writelines(script_lines)

In [19]:
def dict_with_structures(filepath):
    """
    Count how often each structure occurs in one lang_structures file.

    The file is read as blocks of non-blank lines separated by blank
    (or whitespace-only) lines. The first line of a block is the root
    and the remaining lines are its dependents. A structure is the
    tuple made of the root followed by the dependents in sorted order,
    so blocks that only differ in the order of their dependents count
    as the same structure.

    Args:
      - filepath: path of the file with the
        structures, from lang_structures directory.
    Returns:
      - d: a dictionary with the structures (tuples of
        lines, without the line break) as the keys and the
        number of times they happen as the values.
    """
    d = {}
    block = []  # lines of the structure currently being read

    # The context manager closes the file even if reading fails.
    with open(filepath) as f:
        for line in f:
            if line.strip() == '':
                # A blank line ends the current block. Extra blank lines
                # (leading or repeated) are skipped, never used as a root.
                if block:
                    key = (block[0], *sorted(block[1:]))
                    d[key] = d.get(key, 0) + 1
                    block = []
            else:
                # Only remove the line break, so a final line that has no
                # trailing newline keeps its last character.
                block.append(line.rstrip('\n'))

    # Count the last block when the file does not end with a blank line.
    if block:
        key = (block[0], *sorted(block[1:]))
        d[key] = d.get(key, 0) + 1

    return d

In [20]:
# Count the structures found in the English structures file and display
# the resulting dictionary.
english_structures = dict_with_structures('lang_structures/english_structures.txt')
english_structures

{('PROPN\troot', 'PROPN\tappos', 'PUNCT\tpunct'): 14,
 ('NOUN\troot',
  'AUX\tcop',
  'DET\tdet',
  'PROPN\tnmod',
  'PROPN\tnsubj',
  'PUNCT\tpunct',
  'PUNCT\tpunct'): 5,
 ('VERB\troot',
  'AUX\taux:pass',
  'NOUN\tobl',
  'PRON\tnsubj:pass',
  'PROPN\tobl',
  'PROPN\tobl',
  'PUNCT\tpunct'): 1,
 ('VERB\troot',
  'NOUN\tobj',
  'PROPN\tnsubj',
  'PUNCT\tpunct',
  'PUNCT\tpunct',
  'VERB\tadvcl'): 38,
 ('NUM\troot',
  'ADJ\tappos',
  'AUX\tcop',
  'NOUN\tnsubj',
  'NUM\tcompound',
  'PUNCT\tpunct',
  'PUNCT\tpunct'): 1,
 ('PROPN\troot', 'AUX\tcop', 'NOUN\tnsubj', 'PUNCT\tpunct'): 7,
 ('NOUN\troot',
  'AUX\tcop',
  'NUM\tnmod',
  'PROPN\tnsubj',
  'PUNCT\tpunct',
  'PUNCT\tpunct',
  'VERB\tadvcl'): 1,
 ('VERB\troot',
  'ADV\tadvmod',
  'AUX\taux:pass',
  'NOUN\tobl',
  'PRON\tnsubj:pass',
  'PROPN\tobl',
  'PUNCT\tpunct',
  'VERB\tconj'): 1,
 ('VERB\troot',
  'NOUN\tnsubj',
  'NOUN\tobl',
  'NOUN\tobl',
  'PROPN\tobl',
  'PUNCT\tpunct',
  'VERB\tconj'): 1,
 ('VERB\troot',
  'NOUN\tnsub

In [21]:
# Show the English structures ordered from the most to the least frequent.
dict(sorted(english_structures.items(), key=lambda pair: pair[1], reverse=True))

{('NOUN\troot',): 622,
 ('VERB\troot', 'NOUN\tnsubj', 'NOUN\tobj', 'PUNCT\tpunct'): 509,
 ('NOUN\troot', 'ADJ\tamod'): 323,
 ('VERB\troot', 'NOUN\tobj', 'PROPN\tnsubj', 'PUNCT\tpunct'): 259,
 ('VERB\troot',
  'AUX\taux:pass',
  'NOUN\tnsubj:pass',
  'NOUN\tobl',
  'PUNCT\tpunct'): 239,
 ('VERB\troot', 'NOUN\tnsubj', 'NOUN\tobl', 'PUNCT\tpunct'): 201,
 ('PROPN\troot',): 152,
 ('VERB\troot', 'NOUN\tnsubj', 'PUNCT\tpunct', 'VERB\tccomp'): 140,
 ('NOUN\troot', 'NOUN\tconj'): 127,
 ('VERB\troot', 'NOUN\tnsubj', 'PROPN\tobj', 'PUNCT\tpunct'): 124,
 ('VERB\troot', 'NOUN\tnsubj', 'NOUN\tobj', 'NOUN\tobl', 'PUNCT\tpunct'): 104,
 ('VERB\troot',
  'NOUN\tnsubj',
  'NOUN\tobj',
  'NOUN\tobl',
  'PUNCT\tpunct',
  'PUNCT\tpunct'): 98,
 ('VERB\troot',
  'NOUN\tobj',
  'NOUN\tobl',
  'PROPN\tnsubj',
  'PUNCT\tpunct',
  'PUNCT\tpunct'): 89,
 ('VERB\troot', 'ADV\tadvmod', 'NOUN\tnsubj', 'NOUN\tobj', 'PUNCT\tpunct'): 83,
 ('NOUN\troot', 'NOUN\tcompound'): 82,
 ('VERB\troot', 'NOUN\tobj', 'PRON\tnsubj', '

In [22]:
# Number of distinct structures found in the English file.
len(english_structures)

8131

In [32]:
def dict_with_all_structures(dirpath):
    """
    Count how often each structure occurs across all files of a directory.

    Each file is read as blocks of non-blank lines separated by blank
    (or whitespace-only) lines. The first line of a block is the root
    and the remaining lines are its dependents. A structure is the
    tuple made of the root followed by the dependents in sorted order.
    The counts of all files are added together.

    Every regular file in dirpath is read, so the directory should
    only contain structure files. Sub-directories are skipped.

    Args:
      - dirpath: path of the directory with the
        structures (lang_structures).
    Returns:
      - d: a dictionary with the structures (tuples of
        lines, without the line break) as the keys and the
        number of times they happen as the values.
    """
    import os

    d = {}

    # Sorted so that the insertion order of the result does not depend on
    # the arbitrary order returned by os.listdir.
    for filename in sorted(os.listdir(dirpath)):
        path = os.path.join(dirpath, filename)
        if not os.path.isfile(path):
            # e.g. a checkpoint directory; open() would fail on it.
            continue

        block = []  # lines of the structure currently being read
        # The context manager closes the file even if reading fails.
        with open(path) as f:
            for line in f:
                if line.strip() == '':
                    # A blank line ends the current block. Extra blank
                    # lines are skipped, never used as a root.
                    if block:
                        key = (block[0], *sorted(block[1:]))
                        d[key] = d.get(key, 0) + 1
                        block = []
                else:
                    # Only remove the line break, so a final line with no
                    # trailing newline keeps its last character.
                    block.append(line.rstrip('\n'))

        # Count the last block when the file does not end with a blank line.
        if block:
            key = (block[0], *sorted(block[1:]))
            d[key] = d.get(key, 0) + 1

    return d

In [34]:
# Count the structures over every file in the lang_structures directory.
all_structures = dict_with_all_structures('lang_structures')

In [35]:
# Show the structures of all files ordered from the most to the least frequent.
dict(sorted(all_structures.items(), key=lambda pair: pair[1], reverse=True))

{('NOUN\troot',): 25407,
 ('VERB\troot', 'NOUN\tnsubj', 'NOUN\tobj', 'PUNCT\tpunct'): 11680,
 ('NOUN\troot', 'ADJ\tamod'): 10047,
 ('VERB\troot', 'NOUN\tnsubj', 'NOUN\tobl', 'PUNCT\tpunct'): 9824,
 ('NOUN\troot', 'NOUN\tnmod'): 5423,
 ('VERB\troot', 'NOUN\tnsubj', 'NOUN\tobj', 'NOUN\tobl', 'PUNCT\tpunct'): 5375,
 ('PROPN\troot',): 3877,
 ('NOUN\troot', 'NOUN\tconj'): 3417,
 ('CCONJ\troot', 'PUNCT\tpunct', 'VERB\tparataxis'): 3341,
 ('VERB\troot', 'NOUN\tnsubj', 'NOUN\tobl', 'NOUN\tobl', 'PUNCT\tpunct'): 3264,
 ('VERB\troot', 'NOUN\tobj', 'PROPN\tnsubj', 'PUNCT\tpunct'): 3056,
 ('VERB\troot', 'NOUN\tnsubj', 'PUNCT\tpunct', 'VERB\tccomp'): 2516,
 ('VERB\troot',
  'ADV\tadvmod',
  'NOUN\tnsubj',
  'NOUN\tobj',
  'PUNCT\tpunct'): 2332,
 ('VERB\troot',
  'NOUN\tnsubj',
  'NOUN\tobj',
  'PUNCT\tpunct',
  'VERB\tconj'): 2286,
 ('VERB\troot', 'NOUN\tnsubj', 'PUNCT\tpunct', 'VERB\txcomp'): 2111,
 ('VERB\troot',
  'NOUN\tobj',
  'NOUN\tobl',
  'PROPN\tnsubj',
  'PUNCT\tpunct'): 2107,
 ('VERB\tro

In [41]:
def dict_with_common_structures(dirpath, top_n=20):
    """
    Count in how many files each structure is among the most frequent ones.

    For every file in dirpath the structures are counted, the top_n
    most frequent ones are kept, and each of them adds 1 to its entry
    in the result. A value of k therefore means that the structure is
    among the top_n structures of k files.

    Each file is read as blocks of non-blank lines separated by blank
    (or whitespace-only) lines. The first line of a block is the root
    and the remaining lines are its dependents. A structure is the
    tuple made of the root followed by the dependents in sorted order.

    Every regular file in dirpath is read, so the directory should
    only contain structure files. Sub-directories are skipped.

    Args:
      - dirpath: path of the directory with the
        structures (lang_structures).
      - top_n: how many of the most frequent structures
        of each file are kept (non-negative, 20 by default).
    Returns:
      - big_d: a dictionary with the structures as the
        keys and the number of files in which they are
        among the top_n structures as the values.
    """
    import os

    big_d = {}

    # Sorted so that the insertion order of the result does not depend on
    # the arbitrary order returned by os.listdir.
    for filename in sorted(os.listdir(dirpath)):
        path = os.path.join(dirpath, filename)
        if not os.path.isfile(path):
            # e.g. a checkpoint directory; open() would fail on it.
            continue

        d = {}      # structure counts of this file only
        block = []  # lines of the structure currently being read
        # The context manager closes the file even if reading fails.
        with open(path) as f:
            for line in f:
                if line.strip() == '':
                    # A blank line ends the current block. Extra blank
                    # lines are skipped, never used as a root.
                    if block:
                        key = (block[0], *sorted(block[1:]))
                        d[key] = d.get(key, 0) + 1
                        block = []
                else:
                    # Only remove the line break, so a final line with no
                    # trailing newline keeps its last character.
                    block.append(line.rstrip('\n'))

        # Count the last block when the file does not end with a blank line.
        if block:
            key = (block[0], *sorted(block[1:]))
            d[key] = d.get(key, 0) + 1

        # The sort is stable, so structures with the same count keep the
        # order in which they first appeared in the file.
        for key in sorted(d, key=d.get, reverse=True)[:top_n]:
            big_d[key] = big_d.get(key, 0) + 1

    return big_d

In [42]:
# For each structure, count in how many files of lang_structures it is among
# the most frequent structures of that file.
common_structures = dict_with_common_structures('lang_structures')

In [43]:
# Display the dictionary of common structures.
common_structures

{('NOUN\troot',): 55,
 ('VERB\troot', 'NOUN\tnsubj', 'NOUN\tobj', 'PUNCT\tpunct'): 41,
 ('NOUN\troot', 'ADJ\tamod'): 42,
 ('VERB\troot', 'NOUN\tobj', 'PROPN\tnsubj', 'PUNCT\tpunct'): 19,
 ('VERB\troot',
  'AUX\taux:pass',
  'NOUN\tnsubj:pass',
  'NOUN\tobl',
  'PUNCT\tpunct'): 10,
 ('VERB\troot', 'NOUN\tnsubj', 'NOUN\tobl', 'PUNCT\tpunct'): 42,
 ('PROPN\troot',): 26,
 ('VERB\troot', 'NOUN\tnsubj', 'PUNCT\tpunct', 'VERB\tccomp'): 21,
 ('NOUN\troot', 'NOUN\tconj'): 35,
 ('VERB\troot', 'NOUN\tnsubj', 'PROPN\tobj', 'PUNCT\tpunct'): 1,
 ('VERB\troot', 'NOUN\tnsubj', 'NOUN\tobj', 'NOUN\tobl', 'PUNCT\tpunct'): 36,
 ('VERB\troot',
  'NOUN\tnsubj',
  'NOUN\tobj',
  'NOUN\tobl',
  'PUNCT\tpunct',
  'PUNCT\tpunct'): 3,
 ('VERB\troot',
  'NOUN\tobj',
  'NOUN\tobl',
  'PROPN\tnsubj',
  'PUNCT\tpunct',
  'PUNCT\tpunct'): 1,
 ('VERB\troot', 'ADV\tadvmod', 'NOUN\tnsubj', 'NOUN\tobj', 'PUNCT\tpunct'): 16,
 ('NOUN\troot', 'NOUN\tcompound'): 6,
 ('VERB\troot', 'NOUN\tobj', 'PRON\tnsubj', 'PUNCT\tpunct'):

In [58]:
# Order the common structures from the highest to the lowest count and
# display the result.
ordered = dict(sorted(common_structures.items(), key=lambda pair: pair[1], reverse=True))
ordered

{('NOUN\troot',): 55,
 ('NOUN\troot', 'ADJ\tamod'): 42,
 ('VERB\troot', 'NOUN\tnsubj', 'NOUN\tobl', 'PUNCT\tpunct'): 42,
 ('VERB\troot', 'NOUN\tnsubj', 'NOUN\tobj', 'PUNCT\tpunct'): 41,
 ('NOUN\troot', 'NOUN\tnmod'): 39,
 ('VERB\troot', 'NOUN\tnsubj', 'NOUN\tobj', 'NOUN\tobl', 'PUNCT\tpunct'): 36,
 ('NOUN\troot', 'NOUN\tconj'): 35,
 ('PROPN\troot',): 26,
 ('VERB\troot', 'NOUN\tnsubj', 'NOUN\tobl', 'NOUN\tobl', 'PUNCT\tpunct'): 26,
 ('VERB\troot', 'NOUN\tnsubj', 'PUNCT\tpunct', 'VERB\tccomp'): 21,
 ('VERB\troot', 'NOUN\tnsubj', 'NOUN\tobj', 'PUNCT\tpunct', 'VERB\tconj'): 20,
 ('VERB\troot', 'NOUN\tobj', 'PROPN\tnsubj', 'PUNCT\tpunct'): 19,
 ('VERB\troot', 'NOUN\tnsubj', 'PUNCT\tpunct'): 18,
 ('PUNCT\troot',): 18,
 ('VERB\troot', 'ADV\tadvmod', 'NOUN\tnsubj', 'NOUN\tobj', 'PUNCT\tpunct'): 16,
 ('VERB\troot', 'NOUN\tobj', 'NOUN\tobl', 'PROPN\tnsubj', 'PUNCT\tpunct'): 16,
 ('VERB\troot', 'ADV\tadvmod', 'NOUN\tnsubj', 'NOUN\tobl', 'PUNCT\tpunct'): 14,
 ('VERB\troot', 'NOUN\tnsubj', 'PUNCT\t

In [60]:
# Save every structure whose count in `ordered` is greater than 1: one
# element of the structure per line, with a blank line after each structure.
with open('lang_structures/common_lang_structures.txt', 'w') as f:
    for structure, count in ordered.items():
        if count <= 1:
            continue
        f.write(''.join(entry + '\n' for entry in structure))
        f.write('\n')

In [52]:
# n: structures with a count of exactly 1
# o: structures with any other count
# m: sum of the counts of those other structures
counts = list(common_structures.values())
n = sum(1 for count in counts if count == 1)
o = len(counts) - n
m = sum(count for count in counts if count != 1)
print(n, o, m, n+m)
# Sanity check: this total must equal n+m printed above.
print(sum(counts))

266 112 862 1128
1128
