# Looking for language structures in conllu files

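The structure files were produced from the `all_<language>.conllu` files in `joined_by_lang/data_joined_by_lang/` by running the following pipeline once per language (Spanish shown as an example):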

gfud pattern-replace 'PRUNE TRUE 1' < joined_by_lang/data_joined_by_lang/all_spanish.conllu | grep -v '#' | cut -f 1,4,7,8 | gfud reduced2conll x__x__xx | gfud conll2tree | cut -f 4,8 > lang_structures/spanish_structures.txt

In [12]:
import os
# Generate a bash script that runs the gfud extraction pipeline once for
# every `*.conllu` file in the joined-by-language directory.
list_of_files = os.listdir('joined_by_lang/data_joined_by_lang/')

script_lines = ['#! /bin/bash \n', 'function lang_structures { \n']
for conllu_name in list_of_files:
    if not conllu_name.endswith('.conllu'):
        continue
    # Drop the extension, then the first four characters of the name
    # (the `all_` prefix in names such as all_spanish.conllu).
    lang = conllu_name.split('.conllu')[0][4:]
    script_lines.append(
        f"gfud pattern-replace 'PRUNE TRUE 1' < joined_by_lang/data_joined_by_lang/{conllu_name} | grep -v '#' | cut -f 1,4,7,8 | gfud reduced2conll x__x__xx | gfud conll2tree | cut -f 4,8 > lang_structures/{lang}_structures.txt \n"
    )
# Close the shell function and call it on the last line of the script.
script_lines.extend(['} \n', 'lang_structures'])

with open('creating_shell_scripts/lang_structures.sh', 'w') as f:
    f.writelines(script_lines)

In [85]:
def dict_with_structures(filepath):
    """
    Count the root-level structures stored in one lang_structures file.

    The file is read as a sequence of records separated by blank lines.
    The first line of a record is the root and every following line is
    one of its dependents (each line looks like "VERB<TAB>root").
    A structure is the tuple (root, *dependents sorted alphabetically),
    so the order in which the dependents appear in the file is ignored.

    Args:
      - filepath: path of the file with the
        structures, from lang_structures directory.
    Returns:
      - d: a dictionary with the structures as the
        keys and the number of times they happen as
        the values.
    """
    from itertools import chain

    d = {}
    root = None  # None means "no record is currently open"
    dependents = []
    # `with` closes the file even if reading fails part-way through.
    with open(filepath) as f:
        # The extra '' behaves like a final separator line, so the last
        # record is counted even when the file does not end with a blank line.
        for line in chain(f, ['']):
            if line.strip() == '':
                # Separator: store the open record, if any. Repeated or
                # leading blank lines are ignored instead of becoming a root.
                if root is not None:
                    structure = (root, *sorted(dependents))
                    d[structure] = d.get(structure, 0) + 1
                    root = None
            elif root is None:
                # rstrip('\n') rather than [:-1]: an unterminated last line
                # must not lose its final character.
                root, dependents = line.rstrip('\n'), []
            else:
                dependents.append(line.rstrip('\n'))

    return d

In [86]:
# Count the root-level structures of the English file; the bare name on the
# last line makes the notebook display the resulting dict.
english_structures = dict_with_structures('lang_structures/separated_lang_structures/english_structures.txt')
english_structures

{('PROPN\troot', 'PROPN\tappos', 'PUNCT\tpunct'): 14,
 ('NOUN\troot',
  'AUX\tcop',
  'DET\tdet',
  'PROPN\tnmod',
  'PROPN\tnsubj',
  'PUNCT\tpunct',
  'PUNCT\tpunct'): 5,
 ('VERB\troot',
  'AUX\taux:pass',
  'NOUN\tobl',
  'PRON\tnsubj:pass',
  'PROPN\tobl',
  'PROPN\tobl',
  'PUNCT\tpunct'): 1,
 ('VERB\troot',
  'NOUN\tobj',
  'PROPN\tnsubj',
  'PUNCT\tpunct',
  'PUNCT\tpunct',
  'VERB\tadvcl'): 38,
 ('NUM\troot',
  'ADJ\tappos',
  'AUX\tcop',
  'NOUN\tnsubj',
  'NUM\tcompound',
  'PUNCT\tpunct',
  'PUNCT\tpunct'): 1,
 ('PROPN\troot', 'AUX\tcop', 'NOUN\tnsubj', 'PUNCT\tpunct'): 7,
 ('NOUN\troot',
  'AUX\tcop',
  'NUM\tnmod',
  'PROPN\tnsubj',
  'PUNCT\tpunct',
  'PUNCT\tpunct',
  'VERB\tadvcl'): 1,
 ('VERB\troot',
  'ADV\tadvmod',
  'AUX\taux:pass',
  'NOUN\tobl',
  'PRON\tnsubj:pass',
  'PROPN\tobl',
  'PUNCT\tpunct',
  'VERB\tconj'): 1,
 ('VERB\troot',
  'NOUN\tnsubj',
  'NOUN\tobl',
  'NOUN\tobl',
  'PROPN\tobl',
  'PUNCT\tpunct',
  'VERB\tconj'): 1,
 ('VERB\troot',
  'NOUN\tnsub

In [87]:
# Same dict, re-ordered from the most to the least frequent structure.
dict(sorted(english_structures.items(), key=lambda pair: pair[1], reverse=True))

{('NOUN\troot',): 622,
 ('VERB\troot', 'NOUN\tnsubj', 'NOUN\tobj', 'PUNCT\tpunct'): 509,
 ('NOUN\troot', 'ADJ\tamod'): 323,
 ('VERB\troot', 'NOUN\tobj', 'PROPN\tnsubj', 'PUNCT\tpunct'): 259,
 ('VERB\troot',
  'AUX\taux:pass',
  'NOUN\tnsubj:pass',
  'NOUN\tobl',
  'PUNCT\tpunct'): 239,
 ('VERB\troot', 'NOUN\tnsubj', 'NOUN\tobl', 'PUNCT\tpunct'): 201,
 ('PROPN\troot',): 152,
 ('VERB\troot', 'NOUN\tnsubj', 'PUNCT\tpunct', 'VERB\tccomp'): 140,
 ('NOUN\troot', 'NOUN\tconj'): 127,
 ('VERB\troot', 'NOUN\tnsubj', 'PROPN\tobj', 'PUNCT\tpunct'): 124,
 ('VERB\troot', 'NOUN\tnsubj', 'NOUN\tobj', 'NOUN\tobl', 'PUNCT\tpunct'): 104,
 ('VERB\troot',
  'NOUN\tnsubj',
  'NOUN\tobj',
  'NOUN\tobl',
  'PUNCT\tpunct',
  'PUNCT\tpunct'): 98,
 ('VERB\troot',
  'NOUN\tobj',
  'NOUN\tobl',
  'PROPN\tnsubj',
  'PUNCT\tpunct',
  'PUNCT\tpunct'): 89,
 ('VERB\troot', 'ADV\tadvmod', 'NOUN\tnsubj', 'NOUN\tobj', 'PUNCT\tpunct'): 83,
 ('NOUN\troot', 'NOUN\tcompound'): 82,
 ('VERB\troot', 'NOUN\tobj', 'PRON\tnsubj', '

In [88]:
# Number of distinct structures found in the English file.
len(english_structures)

8131

In [90]:
def dict_with_all_structures(dirpath):
    """
    Count the root-level structures over every file in a directory.

    Each file is read as blank-line-separated records: the first line of
    a record is the root, the following lines are its dependents. A
    structure is the tuple (root, *dependents sorted alphabetically).
    Counts from all files are accumulated in a single dictionary.

    Files are visited in sorted name order so that the order of the
    returned keys is the same on every run; sub-directories are skipped.

    Args:
      - dirpath: path of the directory with the
        structures (lang_structures/separated_lang_structures).
    Returns:
      - d: a dictionary with the structures as the
        keys and the number of times they happen as
        the values.
      - list(d.keys()): the structures in a list.
    """
    import os
    from itertools import chain

    d = {}

    for filename in sorted(os.listdir(dirpath)):
        path = os.path.join(dirpath, filename)
        if not os.path.isfile(path):
            # e.g. a notebook checkpoint folder: open() would fail on it.
            continue

        root = None  # None means "no record is currently open"
        dependents = []
        # `with` closes the file even if reading fails part-way through.
        with open(path) as f:
            # The extra '' behaves like a final separator line, so the last
            # record of a file without a trailing blank line is still counted.
            for line in chain(f, ['']):
                if line.strip() == '':
                    # Separator: store the open record, if any. Repeated or
                    # leading blank lines are ignored.
                    if root is not None:
                        structure = (root, *sorted(dependents))
                        d[structure] = d.get(structure, 0) + 1
                        root = None
                elif root is None:
                    root, dependents = line.rstrip('\n'), []
                else:
                    dependents.append(line.rstrip('\n'))

    return d, list(d)

In [91]:
# Pool every language file: `all_structures_dict` maps structure -> total
# count, and `structures` is the list of its keys (used further down to give
# each structure a fixed position in the per-language vectors).
all_structures_dict, structures = dict_with_all_structures('lang_structures/separated_lang_structures')

In [93]:
len(structures) # 223848
# structures[223847]

223848

In [94]:
# Pooled counts ordered from the most to the least frequent structure.
# The pooled dict is bound to `all_structures_dict` (no `all_structures`
# name is ever assigned), so that is the name that has to be read here.
{k: v for k, v in sorted(all_structures_dict.items(), key=lambda item: item[1], reverse=True)}

{('NOUN\troot',): 25408,
 ('VERB\troot', 'NOUN\tnsubj', 'NOUN\tobj', 'PUNCT\tpunct'): 11681,
 ('NOUN\troot', 'ADJ\tamod'): 10048,
 ('VERB\troot', 'NOUN\tnsubj', 'NOUN\tobl', 'PUNCT\tpunct'): 9825,
 ('NOUN\troot', 'NOUN\tnmod'): 5424,
 ('VERB\troot', 'NOUN\tnsubj', 'NOUN\tobj', 'NOUN\tobl', 'PUNCT\tpunct'): 5376,
 ('PROPN\troot',): 3878,
 ('NOUN\troot', 'NOUN\tconj'): 3418,
 ('CCONJ\troot', 'PUNCT\tpunct', 'VERB\tparataxis'): 3341,
 ('VERB\troot', 'NOUN\tnsubj', 'NOUN\tobl', 'NOUN\tobl', 'PUNCT\tpunct'): 3265,
 ('VERB\troot', 'NOUN\tobj', 'PROPN\tnsubj', 'PUNCT\tpunct'): 3057,
 ('VERB\troot', 'NOUN\tnsubj', 'PUNCT\tpunct', 'VERB\tccomp'): 2517,
 ('VERB\troot',
  'ADV\tadvmod',
  'NOUN\tnsubj',
  'NOUN\tobj',
  'PUNCT\tpunct'): 2333,
 ('VERB\troot',
  'NOUN\tnsubj',
  'NOUN\tobj',
  'PUNCT\tpunct',
  'VERB\tconj'): 2287,
 ('VERB\troot', 'NOUN\tnsubj', 'PUNCT\tpunct', 'VERB\txcomp'): 2112,
 ('VERB\troot',
  'NOUN\tobj',
  'NOUN\tobl',
  'PROPN\tnsubj',
  'PUNCT\tpunct'): 2108,
 ('VERB\tro

In [95]:
def dict_with_common_structures(dirpath, top_n=20):
    """
    Count in how many files each structure is among the most frequent.

    For every file in `dirpath` the structures are counted (blank-line
    separated records; first line is the root, following lines are the
    dependents; a structure is the tuple (root, *sorted dependents)).
    The `top_n` most frequent structures of each file are kept, and the
    result tells, for each kept structure, in how many files it was kept.

    Files are visited in sorted name order; sub-directories are skipped.

    Args:
      - dirpath: path of the directory with the structure files.
      - top_n: how many of the most frequent structures to keep
        per file (20 by default).
    Returns:
      - big_d: a dictionary with the structures as the keys and the
        number of files in which they reached the top `top_n` as values.
    """
    import os
    from itertools import chain

    big_d = {}

    for filename in sorted(os.listdir(dirpath)):
        path = os.path.join(dirpath, filename)
        if not os.path.isfile(path):
            # e.g. a notebook checkpoint folder: open() would fail on it.
            continue

        d = {}
        root = None  # None means "no record is currently open"
        dependents = []
        # `with` closes the file even if reading fails part-way through.
        with open(path) as f:
            # The extra '' behaves like a final separator line, so the last
            # record of a file without a trailing blank line is still counted.
            for line in chain(f, ['']):
                if line.strip() == '':
                    # Separator: store the open record, if any. Repeated or
                    # leading blank lines are ignored.
                    if root is not None:
                        structure = (root, *sorted(dependents))
                        d[structure] = d.get(structure, 0) + 1
                        root = None
                elif root is None:
                    root, dependents = line.rstrip('\n'), []
                else:
                    dependents.append(line.rstrip('\n'))

        # sorted() is stable, so structures with equal counts keep the order
        # in which they were first seen in the file.
        for key in sorted(d, key=d.get, reverse=True)[:top_n]:
            big_d[key] = big_d.get(key, 0) + 1

    return big_d

In [96]:
# For each structure: the number of language files in which it ranked among
# that file's 20 most frequent structures.
common_structures = dict_with_common_structures('lang_structures/separated_lang_structures')

In [97]:
common_structures

{('NOUN\troot',): 55,
 ('NOUN\troot', 'ADJ\tamod'): 42,
 ('VERB\troot', 'NOUN\tnsubj', 'NOUN\tobj', 'PUNCT\tpunct'): 41,
 ('ADJ\troot', 'AUX\tcop', 'NOUN\tnsubj', 'NOUN\tobl'): 1,
 ('VERB\troot',
  'AUX\taux:pass',
  'NOUN\tnsubj:pass',
  'NOUN\tobl',
  'PUNCT\tpunct'): 10,
 ('ADJ\troot', 'AUX\tcop', 'NOUN\tnsubj'): 1,
 ('VERB\troot', 'NOUN\tnsubj', 'NOUN\tobl', 'PUNCT\tpunct'): 42,
 ('NOUN\troot', 'AUX\tcop', 'DET\tdet', 'NOUN\tnmod', 'NOUN\tnsubj'): 1,
 ('NOUN\troot', 'NOUN\tnmod'): 39,
 ('VERB\troot', 'NOUN\tnsubj', 'NOUN\tobj'): 4,
 ('NOUN\troot',
  'ADJ\tamod',
  'AUX\tcop',
  'DET\tdet',
  'NOUN\tnmod',
  'NOUN\tnsubj'): 1,
 ('NOUN\troot', 'CCONJ\tcc', 'NOUN\tconj'): 2,
 ('VERB\troot', 'AUX\taux', 'NOUN\tnsubj', 'NOUN\tobj', 'PUNCT\tpunct'): 10,
 ('VERB\troot',
  'AUX\taux',
  'NOUN\tnsubj',
  'NOUN\tobj',
  'NOUN\tobl',
  'PUNCT\tpunct'): 8,
 ('VERB\troot',
  'ADV\tadvmod',
  'AUX\taux:pass',
  'NOUN\tnsubj:pass',
  'NOUN\tobl',
  'PUNCT\tpunct'): 2,
 ('VERB\troot', 'ADV\tadvmod

In [98]:
# `common_structures` re-ordered from the highest to the lowest count.
ordered = dict(sorted(common_structures.items(), key=lambda pair: pair[1], reverse=True))
ordered

{('NOUN\troot',): 55,
 ('NOUN\troot', 'ADJ\tamod'): 42,
 ('VERB\troot', 'NOUN\tnsubj', 'NOUN\tobl', 'PUNCT\tpunct'): 42,
 ('VERB\troot', 'NOUN\tnsubj', 'NOUN\tobj', 'PUNCT\tpunct'): 41,
 ('NOUN\troot', 'NOUN\tnmod'): 39,
 ('VERB\troot', 'NOUN\tnsubj', 'NOUN\tobj', 'NOUN\tobl', 'PUNCT\tpunct'): 36,
 ('NOUN\troot', 'NOUN\tconj'): 35,
 ('VERB\troot', 'NOUN\tnsubj', 'NOUN\tobl', 'NOUN\tobl', 'PUNCT\tpunct'): 26,
 ('PROPN\troot',): 26,
 ('VERB\troot', 'NOUN\tnsubj', 'PUNCT\tpunct', 'VERB\tccomp'): 21,
 ('VERB\troot', 'NOUN\tnsubj', 'NOUN\tobj', 'PUNCT\tpunct', 'VERB\tconj'): 20,
 ('VERB\troot', 'NOUN\tobj', 'PROPN\tnsubj', 'PUNCT\tpunct'): 19,
 ('VERB\troot', 'NOUN\tnsubj', 'PUNCT\tpunct'): 18,
 ('PUNCT\troot',): 18,
 ('VERB\troot', 'ADV\tadvmod', 'NOUN\tnsubj', 'NOUN\tobj', 'PUNCT\tpunct'): 16,
 ('VERB\troot', 'NOUN\tobj', 'NOUN\tobl', 'PROPN\tnsubj', 'PUNCT\tpunct'): 16,
 ('VERB\troot', 'ADV\tadvmod', 'NOUN\tnsubj', 'NOUN\tobl', 'PUNCT\tpunct'): 14,
 ('VERB\troot', 'NOUN\tnsubj', 'PUNCT\t

In [149]:
# Worked example of the compact notation ROOT(DEP-rel, DEP-rel, ...).
a = ('VERB\troot', 'NOUN\tnsubj', 'NOUN\tobl', 'PUNCT\tpunct')
b = list(map(lambda item: item.replace('\t', '-'), a))
head, *rest = b
# head[:-5] removes the trailing '-root' (5 characters) from the root entry.
c = f"{head[:-5]}({', '.join(rest)})"
c

'VERB(NOUN-nsubj, NOUN-obl, PUNCT-punct)'

In [150]:
# make into df and then latex
import pandas as pd
lst = []  # [label, count] for every structure in `ordered`
lst_bigger_5 = []  # same rows, restricted to a count of at least 5
for structure, count in ordered.items():
    # Tabs become dashes; the first entry is the root, the rest its dependents.
    head, *rest = (part.replace('\t', '-') for part in structure)
    # head[:-5] removes the trailing '-root' (5 characters).
    label = f"{head[:-5]}({', '.join(rest)})"
    lst.append([label, count])
    if count < 5:
        continue
    lst_bigger_5.append([label, count])

In [151]:
lst

[['NOUN()', 55],
 ['NOUN(ADJ-amod)', 42],
 ['VERB(NOUN-nsubj, NOUN-obl, PUNCT-punct)', 42],
 ['VERB(NOUN-nsubj, NOUN-obj, PUNCT-punct)', 41],
 ['NOUN(NOUN-nmod)', 39],
 ['VERB(NOUN-nsubj, NOUN-obj, NOUN-obl, PUNCT-punct)', 36],
 ['NOUN(NOUN-conj)', 35],
 ['VERB(NOUN-nsubj, NOUN-obl, NOUN-obl, PUNCT-punct)', 26],
 ['PROPN()', 26],
 ['VERB(NOUN-nsubj, PUNCT-punct, VERB-ccomp)', 21],
 ['VERB(NOUN-nsubj, NOUN-obj, PUNCT-punct, VERB-conj)', 20],
 ['VERB(NOUN-obj, PROPN-nsubj, PUNCT-punct)', 19],
 ['VERB(NOUN-nsubj, PUNCT-punct)', 18],
 ['PUNCT()', 18],
 ['VERB(ADV-advmod, NOUN-nsubj, NOUN-obj, PUNCT-punct)', 16],
 ['VERB(NOUN-obj, NOUN-obl, PROPN-nsubj, PUNCT-punct)', 16],
 ['VERB(ADV-advmod, NOUN-nsubj, NOUN-obl, PUNCT-punct)', 14],
 ['VERB(NOUN-nsubj, PUNCT-punct, VERB-xcomp)', 13],
 ['VERB(NOUN-obj, NOUN-obl, PUNCT-punct)', 12],
 ['VERB(AUX-aux, NOUN-nsubj, NOUN-obl, PUNCT-punct)', 11],
 ['VERB(AUX-aux:pass, NOUN-nsubj:pass, NOUN-obl, PUNCT-punct)', 10],
 ['VERB(AUX-aux, NOUN-nsubj, NOUN

In [152]:
# Table of every formatted structure with its count, written tab-separated
# and without the index column.
df1 = pd.DataFrame(data=lst, columns=['Structure', 'Frequency'])
df1.to_csv('lang_structures/frequency_structures.csv', index=False, sep='\t')

In [153]:
# Same table restricted to the structures with a count of at least 5,
# written tab-separated and without the index column.
df2 = pd.DataFrame(data=lst_bigger_5, columns=['Structure', 'Frequency'])
df2.to_csv('lang_structures/frequency_structures_5.csv', index=False, sep='\t')

In [140]:
# Write every structure whose count is above 1: one entry per line
# (root first), followed by a blank line that closes the record.
with open('lang_structures/common_lang_structures.txt', 'w') as out:
    for structure, count in ordered.items():
        if count <= 1:
            continue
        out.write(''.join(f'{entry}\n' for entry in structure) + '\n')

In [103]:
# n: structures with a count of exactly 1
# o: structures with any other count
# m: sum of the counts of those other structures
n = sum(1 for count in common_structures.values() if count == 1)
o = len(common_structures) - n
# Each of the n structures contributes exactly 1 to the grand total.
m = sum(common_structures.values()) - n
print(n, o, m, n+m)
print(sum(common_structures.values()))

266 112 862 1128
1128


## Cosine similarity of language structures

In [104]:
# Give every structure its position in `structures` (structure -> index).
structures_mapping_dict = dict(zip(structures, range(len(structures))))

In [41]:
# structures_mapping_dict

In [22]:
import torch
# Toy check of the scatter step: `a` maps label -> position, `b` maps
# label -> value, and `z` receives each value at its label's position.
a = dict(a=0, b=1, c=2)
b = dict(b=93, c=4)
z = torch.zeros(len(a))
for label in b:
    z[a[label]] = b[label]
z

tensor([ 0., 93.,  4.])

In [105]:
import os
import torch
# One count vector per language file: position i holds how often the
# i-th entry of `structures` occurs in that file (0 when it never does).
path = 'lang_structures/separated_lang_structures/'
list_of_files = os.listdir(path)
# name[:-15] drops the last 15 characters, the length of '_structures.txt'.
list_of_lang = [name[:-15] for name in list_of_files]
list_of_tensors = []
for name in list_of_files:
    counts = dict_with_structures(path + name)
    vector = torch.zeros(len(structures))
    for structure, frequency in counts.items():
        vector[structures_mapping_dict[structure]] = frequency
    list_of_tensors.append(vector)

In [106]:
len(list_of_tensors)

58

In [42]:
# list_of_lang

In [58]:
import pandas as pd
# Toy frame to check how row and column labels are attached.
la = [[1, 2, 3], [4, 5, 6]]
df = pd.DataFrame(la, index=['cuack', 'miau'], columns=['Test1', 'Test2', 'Test3'])
df

Unnamed: 0,Test1,Test2,Test3
cuack,1,2,3
miau,4,5,6


In [107]:
# dim=0: the language vectors are 1-D (torch.zeros(len(structures))), so the
# similarity is computed along their only axis and yields a single value.
cos = torch.nn.CosineSimilarity(dim=0)

In [108]:
# Full pairwise matrix: entry [r][c] is the cosine similarity between the
# r-th and the c-th language vector, rounded to 4 decimals.
cosine_sim_to_df = [
    [round(float(cos(row_vector, col_vector)), 4) for col_vector in list_of_tensors]
    for row_vector in list_of_tensors
]

In [109]:
def no_underscores(text):
    """
    Turn an underscore-separated name into space-separated words,
    each one capitalised (e.g. 'classical_chinese' -> 'Classical Chinese').
    A name without underscores is simply capitalised.
    """
    # Without an underscore, split() gives back the single word, so one
    # code path covers both cases.
    words = text.split('_')
    return ' '.join(word.capitalize() for word in words)
    
# Display names for the languages, in the same order as `list_of_lang`.
capitalised_lang = list(map(no_underscores, list_of_lang))

In [110]:
# Square language-by-language similarity table: the same labels are used
# for the rows and for the columns.
df = pd.DataFrame(cosine_sim_to_df, columns = capitalised_lang, index = capitalised_lang)

In [111]:
df

Unnamed: 0,Afrikaans,Arabic,Armenian,Basque,Belarusian,Bulgarian,Catalan,Chinese,Classical Chinese,Croatian,...,Swedish,Tamil,Telugu,Turkish,Ukrainian,Urdu,Uyghur,Vietnamese,Welsh,Wolof
Afrikaans,1.0,0.187,0.5937,0.7108,0.5729,0.6763,0.7317,0.6016,0.1087,0.715,...,0.7619,0.5324,0.5545,0.5715,0.6278,0.5932,0.4334,0.545,0.5319,0.1742
Arabic,0.187,1.0,0.1392,0.1858,0.194,0.1999,0.2069,0.1776,0.0156,0.2047,...,0.215,0.1399,0.1696,0.1607,0.2096,0.1504,0.1421,0.1569,0.1476,0.0567
Armenian,0.5937,0.1392,1.0,0.6547,0.4384,0.5011,0.6312,0.4076,0.0348,0.6638,...,0.6214,0.3668,0.4356,0.4705,0.4841,0.5316,0.3307,0.3858,0.3909,0.1176
Basque,0.7108,0.1858,0.6547,1.0,0.4886,0.6409,0.7181,0.7307,0.0664,0.7261,...,0.7862,0.5746,0.6262,0.6287,0.5588,0.6504,0.5032,0.5592,0.6284,0.2719
Belarusian,0.5729,0.194,0.4384,0.4886,1.0,0.7103,0.6617,0.4429,0.041,0.6799,...,0.7144,0.4303,0.6728,0.6098,0.9387,0.3843,0.5517,0.5924,0.4986,0.1678
Bulgarian,0.6763,0.1999,0.5011,0.6409,0.7103,1.0,0.8008,0.5556,0.068,0.7831,...,0.759,0.4793,0.5652,0.5514,0.7435,0.5259,0.4459,0.6036,0.4855,0.1892
Catalan,0.7317,0.2069,0.6312,0.7181,0.6617,0.8008,1.0,0.6198,0.0539,0.8119,...,0.8102,0.5224,0.583,0.5744,0.7111,0.5955,0.4561,0.603,0.5148,0.2322
Chinese,0.6016,0.1776,0.4076,0.7307,0.4429,0.5556,0.6198,1.0,0.1446,0.585,...,0.6984,0.5545,0.6018,0.5169,0.5126,0.5131,0.4577,0.5765,0.5328,0.2884
Classical Chinese,0.1087,0.0156,0.0348,0.0664,0.041,0.068,0.0539,0.1446,1.0,0.0493,...,0.0647,0.3454,0.0714,0.0656,0.0409,0.0568,0.1005,0.2289,0.0617,0.0801
Croatian,0.715,0.2047,0.6638,0.7261,0.6799,0.7831,0.8119,0.585,0.0493,1.0,...,0.8017,0.5046,0.5821,0.6005,0.7356,0.6166,0.464,0.552,0.5159,0.2163


In [112]:
# Save the similarity table. No `sep` is passed, so despite the .txt
# extension the file is comma-separated, with the language names as the
# first column and as the header row.
df.to_csv('lang_structures/cosine-sim-per-lang-structure.txt')