# Looking for language structures in conllu files

We got the data from the all_lang files in joined_by_lang/data_joined_by_lang/ by:

gfud pattern-replace 'PRUNE TRUE 1' < joined_by_lang/data_joined_by_lang/all_spanish.conllu | grep -v '#' | cut -f 1,4,7,8 | gfud reduced2conll x__x__xx | gfud conll2tree | cut -f 4,8 > lang_structures/spanish_structures.txt

In [88]:
import os
list_of_files = os.listdir('joined_by_lang/data_joined_by_lang/')
with open(f'creating_shell_scripts/lang_structures.sh', 'w') as f:
        f.write('#! /bin/bash \n')
        f.write('function lang_structures { \n')
        for filename in list_of_files:
            if filename.endswith('.conllu'):
                lang = filename.split('.conllu')[0][4:]
                command = f"gfud pattern-replace 'PRUNE TRUE 1' < joined_by_lang/data_joined_by_lang/{filename} | grep -v $'# ' | cut -f 1,4,7,8 | gfud reduced2conll x__x__xx | gfud conll2tree | cut -f 4,8 > lang_structures/separated_lang_structures/{lang}_structures.txt \n"
                f.write(command)
            
        f.write('} \n')
        f.write('lang_structures')

## Structure per language

In [89]:
def dict_with_structures(filepath):
    """
    Returns a dictionary with the structures 
    in the lang_structures file per language, with 
    the keys as the structure and the value as the 
    number of times it happens.
    
    Args:
      - filepath: path of the file with the 
        structures, from lang_structures directory.
    Returns:
      - d: a dictionary with the structures as the
        keys and the number of times they happen as 
        the values.
    """
    f = open(filepath)

    d = {}
    n = 0
    for line in f.readlines():
        if n == 0:
            root = [line[:-1]]
            dependents = []
            n += 1
        elif line.strip() == '':
            root.extend(sorted(dependents))
            if tuple(root) in d:
                d[tuple(root)] += 1
            else:
                d[tuple(root)] = 1
            n = 0        
        else:
            dependents.append(line[:-1])
            n += 1
    f.close()
    
    return d

In [90]:
english_structures = dict_with_structures('lang_structures/separated_lang_structures/english_structures.txt')
english_structures

{('PROPN\troot', 'PROPN\tappos', 'PUNCT\tpunct'): 17,
 ('NOUN\troot',
  'AUX\tcop',
  'DET\tdet',
  'PROPN\tnmod',
  'PROPN\tnsubj',
  'PUNCT\tpunct',
  'PUNCT\tpunct'): 5,
 ('VERB\troot',
  'AUX\taux:pass',
  'NOUN\tobl',
  'PRON\tnsubj:pass',
  'PROPN\tobl',
  'PROPN\tobl',
  'PUNCT\tpunct'): 1,
 ('VERB\troot',
  'NOUN\tobj',
  'PROPN\tnsubj',
  'PUNCT\tpunct',
  'PUNCT\tpunct',
  'VERB\tadvcl'): 77,
 ('NUM\troot',
  'ADJ\tappos',
  'AUX\tcop',
  'NOUN\tnsubj',
  'NUM\tcompound',
  'PUNCT\tpunct',
  'PUNCT\tpunct'): 1,
 ('PROPN\troot', 'AUX\tcop', 'NOUN\tnsubj', 'PUNCT\tpunct'): 9,
 ('NOUN\troot',
  'AUX\tcop',
  'NUM\tnmod',
  'PROPN\tnsubj',
  'PUNCT\tpunct',
  'PUNCT\tpunct',
  'VERB\tadvcl'): 1,
 ('VERB\troot',
  'ADV\tadvmod',
  'AUX\taux:pass',
  'NOUN\tobl',
  'PRON\tnsubj:pass',
  'PROPN\tobl',
  'PUNCT\tpunct',
  'VERB\tconj'): 1,
 ('VERB\troot',
  'NOUN\tnsubj',
  'NOUN\tobl',
  'NOUN\tobl',
  'PROPN\tobl',
  'PUNCT\tpunct',
  'VERB\tconj'): 1,
 ('VERB\troot',
  'NOUN\tnsub

In [87]:
{k: v for k, v in sorted(english_structures.items(), key=lambda item: item[1], reverse=True)}

{('NOUN\troot',): 622,
 ('VERB\troot', 'NOUN\tnsubj', 'NOUN\tobj', 'PUNCT\tpunct'): 509,
 ('NOUN\troot', 'ADJ\tamod'): 323,
 ('VERB\troot', 'NOUN\tobj', 'PROPN\tnsubj', 'PUNCT\tpunct'): 259,
 ('VERB\troot',
  'AUX\taux:pass',
  'NOUN\tnsubj:pass',
  'NOUN\tobl',
  'PUNCT\tpunct'): 239,
 ('VERB\troot', 'NOUN\tnsubj', 'NOUN\tobl', 'PUNCT\tpunct'): 201,
 ('PROPN\troot',): 152,
 ('VERB\troot', 'NOUN\tnsubj', 'PUNCT\tpunct', 'VERB\tccomp'): 140,
 ('NOUN\troot', 'NOUN\tconj'): 127,
 ('VERB\troot', 'NOUN\tnsubj', 'PROPN\tobj', 'PUNCT\tpunct'): 124,
 ('VERB\troot', 'NOUN\tnsubj', 'NOUN\tobj', 'NOUN\tobl', 'PUNCT\tpunct'): 104,
 ('VERB\troot',
  'NOUN\tnsubj',
  'NOUN\tobj',
  'NOUN\tobl',
  'PUNCT\tpunct',
  'PUNCT\tpunct'): 98,
 ('VERB\troot',
  'NOUN\tobj',
  'NOUN\tobl',
  'PROPN\tnsubj',
  'PUNCT\tpunct',
  'PUNCT\tpunct'): 89,
 ('VERB\troot', 'ADV\tadvmod', 'NOUN\tnsubj', 'NOUN\tobj', 'PUNCT\tpunct'): 83,
 ('NOUN\troot', 'NOUN\tcompound'): 82,
 ('VERB\troot', 'NOUN\tobj', 'PRON\tnsubj', '

In [88]:
len(english_structures.keys())

8131

In [191]:
count_little = 0
for k, v in english_structures.items():
    if v <=5:
        count_little += 1
print(count_little)

7699


In [256]:
wolof_structures = dict_with_structures('lang_structures/separated_lang_structures/wolof_structures.txt')
print(len(wolof_structures.keys()))
count_little_wolof = 0
for k, v in wolof_structures.items():
    if v <=5:
        count_little_wolof += 1
print(count_little_wolof)

590
579


In [257]:
chinese_structures = dict_with_structures('lang_structures/separated_lang_structures/chinese_structures.txt')
print(len(chinese_structures.keys()))
count_little_chinese = 0
for k, v in chinese_structures.items():
    if v <=5:
        count_little_chinese += 1
print(count_little_chinese)

10344
10192


In [258]:
spanish_structures = dict_with_structures('lang_structures/separated_lang_structures/spanish_structures.txt')
print(len(spanish_structures.keys()))
count_little_spanish = 0
for k, v in spanish_structures.items():
    if v <=5:
        count_little_spanish += 1
print(count_little_spanish)

7856
7332


## Structure of ALL languages

In [91]:
def dict_with_all_structures(dirpath):
    """
    Returns a dictionary with the structures 
    in the lang_structures file of all languages, with 
    the keys as the structure and the value as the 
    number of times it happens.
    
    Args:
      - dirpath: path of the directory with the 
        structures (lang_structures/separated_lang_structures).
    Returns:
      - d: a dictionary with the structures as the
        keys and the number of times they happen as 
        the values.
      - list(d.keys()): the structures in a list.
    """
    import os
    list_of_files = os.listdir(dirpath)
    
    d = {}
    
    for filepath in list_of_files:
        f = open(f'{dirpath}/{filepath}')

        n = 0
        for line in f.readlines():
            if n == 0:
                root = [line[:-1]]
                dependents = []
                n += 1
            elif line.strip() == '':
                root.extend(sorted(dependents))
                if tuple(root) in d:
                    d[tuple(root)] += 1
                else:
                    d[tuple(root)] = 1
                n = 0        
            else:
                dependents.append(line[:-1])
                n += 1
        f.close()
    
    return d, list(d.keys())

In [92]:
all_structures_dict, structures = dict_with_all_structures('lang_structures/separated_lang_structures')

In [93]:
print(len(structures), len(set(structures)))

241998 241998


In [94]:
len(structures) # 241998
# structures[223847]

241998

In [95]:
unique_constructions = 0
for k, v in all_structures_dict.items():
    if v == 1:
        unique_constructions += 1
print(unique_constructions) # what patterns are found only in one language

189727


In [96]:
223848-176069

47779

In [99]:
# {k: v for k, v in sorted(all_structures_dict.items(), key=lambda item: item[1], reverse=True)}

In [101]:
sum(all_structures_dict.values())

784748

### Another way of doing it

In [235]:
def dict_with_common_structures_no_limit(dirpath):
    """
    Get all common structures from each file, 
    then compare.
    """
    import os
    list_of_files = os.listdir(dirpath)
    
    big_d = {}
    all_values = 0
    
    for filepath in list_of_files:
        d = {}
        f = open(f'{dirpath}/{filepath}')

        n = 0
        for line in f.readlines():
            if n == 0:
                root = [line[:-1]]
                dependents = []
                n += 1
            elif line.strip() == '':
                root.extend(sorted(dependents))
                if tuple(root) in d:
                    d[tuple(root)] += 1
                else:
                    d[tuple(root)] = 1
                n = 0        
            else:
                dependents.append(line[:-1])
                n += 1
        f.close()
        
        # I am doing nothing with the values
        all_values += sum(d.values())
        
        my_keys = sorted(d, key=d.get, reverse=True)
        for key in my_keys:
            if key in big_d:
                big_d[key] += 1
            else:
                big_d[key] = 1
    
    return big_d, all_values

In [236]:
common_structures_no_limit, values_no_limit = dict_with_common_structures_no_limit('lang_structures/separated_lang_structures')
common_structures_no_limit

{('NOUN\troot',): 56,
 ('NOUN\troot', 'ADJ\tamod'): 51,
 ('VERB\troot', 'NOUN\tnsubj', 'NOUN\tobj', 'PUNCT\tpunct'): 49,
 ('ADJ\troot', 'AUX\tcop', 'NOUN\tnsubj', 'NOUN\tobl'): 12,
 ('VERB\troot',
  'AUX\taux:pass',
  'NOUN\tnsubj:pass',
  'NOUN\tobl',
  'PUNCT\tpunct'): 12,
 ('ADJ\troot', 'AUX\tcop', 'NOUN\tnsubj'): 11,
 ('VERB\troot', 'NOUN\tnsubj', 'NOUN\tobl', 'PUNCT\tpunct'): 49,
 ('NOUN\troot', 'AUX\tcop', 'DET\tdet', 'NOUN\tnmod', 'NOUN\tnsubj'): 4,
 ('NOUN\troot', 'NOUN\tnmod'): 53,
 ('VERB\troot', 'NOUN\tnsubj', 'NOUN\tobj'): 39,
 ('NOUN\troot',
  'ADJ\tamod',
  'AUX\tcop',
  'DET\tdet',
  'NOUN\tnmod',
  'NOUN\tnsubj'): 5,
 ('NOUN\troot', 'CCONJ\tcc', 'NOUN\tconj'): 4,
 ('VERB\troot', 'AUX\taux', 'NOUN\tnsubj', 'NOUN\tobj', 'PUNCT\tpunct'): 32,
 ('VERB\troot',
  'AUX\taux',
  'NOUN\tnsubj',
  'NOUN\tobj',
  'NOUN\tobl',
  'PUNCT\tpunct'): 29,
 ('VERB\troot',
  'ADV\tadvmod',
  'AUX\taux:pass',
  'NOUN\tnsubj:pass',
  'NOUN\tobl',
  'PUNCT\tpunct'): 12,
 ('VERB\troot', 'ADV\ta

In [241]:
print('all common structures: ', len(common_structures_no_limit))
print('number of times they happen: ', values_no_limit)
print(sum(common_structures_no_limit.values())) # what was this?

all common structures:  223848
number of times they happen:  711394
303950


## 100 most common structures per language

In [4]:
def dict_with_common_structures_100(dirpath):
    """
    Get 100 common structures from each file, 
    then compare.
    """
    import os
    list_of_files = os.listdir(dirpath)
    
    big_d = {}
    
    for filepath in list_of_files:
        d = {}
        f = open(f'{dirpath}/{filepath}')

        n = 0
        for line in f.readlines():
            if n == 0:
                root = [line[:-1]]
                dependents = []
                n += 1
            elif line.strip() == '':
                root.extend(sorted(dependents))
                if tuple(root) in d:
                    d[tuple(root)] += 1
                else:
                    d[tuple(root)] = 1
                n = 0        
            else:
                dependents.append(line[:-1])
                n += 1
        f.close()
        
        my_keys = sorted(d, key=d.get, reverse=True)[:100]
        for key in my_keys:
            if key in big_d:
                big_d[key] += 1
            else:
                big_d[key] = 1
    
    return big_d

In [5]:
common_structures_100 = dict_with_common_structures_100('lang_structures/separated_lang_structures')

In [6]:
# common_structures_100

In [8]:
ordered_100 = {k: v for k, v in sorted(common_structures_100.items(), key=lambda item: item[1], reverse=True)}
# ordered_100

In [9]:
# make into df and then latex
import pandas as pd
lst_100 = [] # all structures
lst_100_bigger_5 = [] # all structures repeated in at least 5 languages
roots_100 = {}
for key, value in ordered_100.items():
    newsep = [x.replace('\t', '-') for x in key]
    root = newsep[0][:-5]
    newkey = root + '(' + ', '.join(newsep[1:]) + ')'
    lst_100.append([newkey, value, str(round(value/58*100, 2))+'%'])
    if value >= 5:
        lst_100_bigger_5.append(([newkey, value, str(round(value/58*100, 2))+'%']))
    if root in roots_100:
        roots_100[root] += 1
    else:
        roots_100[root] = 1

In [10]:
# lst_100

In [11]:
{k: v for k, v in sorted(roots_100.items(), key=lambda item: item[1], reverse=True)}

{'VERB': 1582,
 'NOUN': 445,
 'ADJ': 198,
 'PROPN': 87,
 'CCONJ': 46,
 'ADV': 35,
 'AUX': 23,
 'PRON': 19,
 'NUM': 15,
 'PART': 15,
 'PUNCT': 8,
 'X': 8,
 'DET': 6,
 'ADP': 6,
 'SYM': 2,
 'INTJ': 1}

In [12]:
# sum(roots.values()) # just checking 20 was 378
print(len(lst_100), len(lst_100_bigger_5))

2496 205


## 20 most common structures per language

In [146]:
def dict_with_common_structures_20(dirpath):
    """
    Get 20 common structures from each file, 
    then compare.
    """
    import os
    list_of_files = os.listdir(dirpath)
    
    big_d = {}
    
    for filepath in list_of_files:
        d = {}
        f = open(f'{dirpath}/{filepath}')

        n = 0
        for line in f.readlines():
            if n == 0:
                root = [line[:-1]]
                dependents = []
                n += 1
            elif line.strip() == '':
                root.extend(sorted(dependents))
                if tuple(root) in d:
                    d[tuple(root)] += 1
                else:
                    d[tuple(root)] = 1
                n = 0        
            else:
                dependents.append(line[:-1])
                n += 1
        f.close()
        
        my_keys = sorted(d, key=d.get, reverse=True)[:20]
        for key in my_keys:
            if key in big_d:
                big_d[key] += 1
            else:
                big_d[key] = 1
    
    return big_d

In [147]:
common_structures_20 = dict_with_common_structures_20('lang_structures/separated_lang_structures')
# common_structures_20

In [104]:
len(common_structures_20.keys())

381

In [148]:
ordered_20 = {k: v for k, v in sorted(common_structures_20.items(), key=lambda item: item[1], reverse=True)}
# ordered_20

In [151]:
# make into df and then latex
import pandas as pd
lst_20 = [] # all structures
lst_20_bigger_5 = [] # all structures repeated in at least 5 languages
roots_20 = {}
for key, value in ordered_20.items():
    newsep = [x.replace('\t', '-') for x in key]
    root = newsep[0][:-5]
    newkey = root + '(' + ', '.join(newsep[1:]) + ')'
    lst_20.append([newkey, value, str(round(value/58*100, 2))+'%'])
    if value >= 5:
        lst_20_bigger_5.append(([newkey, value, str(round(value/58*100, 2))+'%']))
    if root in roots_20:
        roots_20[root] += 1
    else:
        roots_20[root] = 1

In [107]:
print(len(lst_20), len(lst_20_bigger_5))

381 46


In [152]:
with open('lang_structures/common_lang_structures.txt', 'w') as f:
    for k, v in ordered_20.items():
        if v > 1:
            for x in k:
                f.write(x + '\n')
            f.write('\n')

In [153]:
n = 0
m = 0
o = 0
for el in common_structures_20:
    if common_structures_20[el] == 1:
        n+=1
    else:
        o += 1
        m+=common_structures_20[el]
print(n, o, m, n+m)
print(sum(common_structures_20.values()))

273 108 887 1160
1160


In [154]:
{k: v for k, v in sorted(roots_20.items(), key=lambda item: item[1], reverse=True)}

{'VERB': 217,
 'NOUN': 72,
 'ADJ': 34,
 'PROPN': 18,
 'CCONJ': 12,
 'PRON': 8,
 'ADV': 7,
 'PART': 4,
 'NUM': 3,
 'PUNCT': 2,
 'AUX': 2,
 'X': 1,
 'ADP': 1}

In [155]:
sum(roots_20.values())

381

## Saving into file

In [156]:
df1 = pd.DataFrame(lst_20, columns =['Structure', 'Frequency', '% of Lang'])
df1.to_csv('lang_structures/frequency_structures.csv', sep = '\t', index = False)
df2 = pd.DataFrame(lst_20_bigger_5, columns =['Structure', 'Frequency', '% of Lang'])
df2.to_csv('lang_structures/frequency_structures_5.csv', sep = '\t', index = False)

In [283]:
# looking for examples in English

file = 'lang_structures/separated_lang_structures/english_structures.txt'
fil2 = 'help.txt'
counter = 0
sentence = ''
check = 'VERB\troot\nNOUN\tnsubj\nNOUN\tobj\nPUNCT\tpunct\n'

f = open(fil2)
for line in f.readlines():
    if 'root' in line:
        counter += 1
    if line.strip() == '':
        if sentence == check:
            print(counter)
        else:
            sentence = ''
    else:
        sentence += line
f.close()
# sentence

7
56
180
189
198
217
246
251
287
312
344
363
390
404
425


## Cosine similarity of language structures

In [113]:
structures_mapping_dict = {x:i for i, x in enumerate(structures)}

In [41]:
# structures_mapping_dict

In [22]:
import torch
# fake example to test if it works
a = {'a':0, 'b':1, 'c':2}
b = {'b':93, 'c':4}
z = torch.zeros(3)
for k, v in b.items():
    position = a[k]
    z[position] = v
z

tensor([ 0., 93.,  4.])

In [114]:
import os
import torch
path = 'lang_structures/separated_lang_structures/'
list_of_files = os.listdir(path)
list_of_tensors = []
list_of_lang = []
for file in list_of_files:
    lang = file[:-15]
    list_of_lang.append(lang)
    dictionary_with_structures = dict_with_structures(path+file)
    zeros = torch.zeros(len(structures))
    for structure, frequency in dictionary_with_structures.items():
        position = structures_mapping_dict[structure]
        zeros[position] = frequency
    list_of_tensors.append(zeros)

In [182]:
blablabla = 0
for x in list_of_tensors:
    blablabla += torch.sum(x)
blablabla

tensor(711394.)

In [106]:
len(list_of_tensors)

58

In [144]:
print(list_of_lang[18], list_of_lang[29])

scottish_gaelic latin


In [83]:
# list_of_lang.index('finnish') # 15
list_of_lang.index('english') # 15

13

In [115]:
sum(list_of_tensors[15])

tensor(16172.)

In [42]:
# list_of_lang

In [58]:
la = [[1, 2, 3], [4, 5, 6]]
import pandas as pd
df = pd.DataFrame(la, columns =['Test1', 'Test2', 'Test3'])
df.index = ['cuack', 'miau']
df

Unnamed: 0,Test1,Test2,Test3
cuack,1,2,3
miau,4,5,6


In [116]:
cos = torch.nn.CosineSimilarity(dim=0)

In [117]:
cosine_sim_to_df = []
for i, tensor1 in enumerate(list_of_tensors):
    lang_tensor = []
    for tensor2 in list_of_tensors:
        lang_tensor.append(round(float(cos(tensor1, tensor2)), 4))
    cosine_sim_to_df.append(lang_tensor)

In [118]:
def no_underscores(text):
    if '_' in text:
        sep = text.split('_')
        cap = [x.capitalize() for x in sep]
        newtext = ' '.join(cap)
        return newtext
    else:
        return text.capitalize()
    
capitalised_lang = [no_underscores(x) for x in list_of_lang]

In [119]:
df = pd.DataFrame(cosine_sim_to_df, columns = capitalised_lang, index = capitalised_lang)

In [120]:
df

Unnamed: 0,English,Afrikaans,Arabic,Belarusian,Bulgarian,Catalan,Czech,Welsh,Danish,North Sami,...,Telugu,Turkish,Ukrainian,Urdu,Uyghur,Vietnamese,Classical Chinese,Wolof,Chinese,Old Church Slavonic
English,1.0,0.7095,0.1882,0.724,0.7987,0.7958,0.7665,0.4629,0.7921,0.489,...,0.5664,0.5609,0.7337,0.5198,0.4121,0.6479,0.0474,0.1747,0.5479,0.3716
Afrikaans,0.7095,1.0,0.187,0.5729,0.6763,0.7317,0.6662,0.5319,0.7636,0.5268,...,0.5545,0.5715,0.6278,0.5932,0.4334,0.545,0.1087,0.1742,0.5883,0.4519
Arabic,0.1882,0.187,1.0,0.194,0.1999,0.2069,0.224,0.1476,0.2083,0.1306,...,0.1696,0.1607,0.2096,0.1504,0.1421,0.1569,0.0156,0.0567,0.1756,0.1002
Belarusian,0.724,0.5729,0.194,1.0,0.7103,0.6617,0.8242,0.4986,0.79,0.4891,...,0.6728,0.6098,0.9387,0.3843,0.5517,0.5924,0.041,0.1678,0.44,0.2762
Bulgarian,0.7987,0.6763,0.1999,0.7103,1.0,0.8008,0.8039,0.4855,0.7547,0.4839,...,0.5652,0.5514,0.7435,0.5259,0.4459,0.6036,0.068,0.1892,0.5507,0.3494
Catalan,0.7958,0.7317,0.2069,0.6617,0.8008,1.0,0.773,0.5148,0.8025,0.5056,...,0.583,0.5744,0.7111,0.5955,0.4561,0.603,0.0539,0.2322,0.6123,0.411
Czech,0.7665,0.6662,0.224,0.8242,0.8039,0.773,1.0,0.5081,0.8315,0.4957,...,0.632,0.6202,0.87,0.5202,0.4909,0.5842,0.0406,0.1654,0.5326,0.3349
Welsh,0.4629,0.5319,0.1476,0.4986,0.4855,0.5148,0.5081,1.0,0.5831,0.5503,...,0.5529,0.5579,0.4888,0.4779,0.4631,0.4348,0.0617,0.1614,0.5228,0.3808
Danish,0.7921,0.7636,0.2083,0.79,0.7547,0.8025,0.8315,0.5831,1.0,0.6245,...,0.7238,0.6939,0.8328,0.5953,0.53,0.668,0.0572,0.1736,0.6337,0.4717
North Sami,0.489,0.5268,0.1306,0.4891,0.4839,0.5056,0.4957,0.5503,0.6245,1.0,...,0.6042,0.5899,0.4973,0.4349,0.4601,0.4902,0.1037,0.1395,0.5327,0.443


In [121]:
df.to_csv('lang_structures/cosine-sim-per-lang-structure.csv')

In [122]:
len(structures)

241998

In [123]:
df.iloc[-1]

English                0.3716
Afrikaans              0.4519
Arabic                 0.1002
Belarusian             0.2762
Bulgarian              0.3494
Catalan                0.4110
Czech                  0.3349
Welsh                  0.3808
Danish                 0.4717
North Sami             0.4430
German                 0.4220
Estonian               0.4897
Greek                  0.3436
Spanish                0.3693
Basque                 0.5154
Persian                0.4141
French                 0.3703
Irish                  0.4500
Scottish Gaelic        0.4678
Galician               0.4159
Gothic                 0.5242
Korean                 0.5568
Armenian               0.3258
Hindi                  0.4596
Croatian               0.4017
Indonesian             0.2428
Italian                0.4277
Hebrew                 0.1733
Kazakh                 0.2297
Latin                  0.5404
Latvian                0.3317
Lithuanian             0.4785
Hungarian              0.4107
Maltese   

In [124]:
df.describe()['English']

count    58.000000
mean      0.613203
std       0.199068
min       0.047400
25%       0.523275
50%       0.655700
75%       0.746600
max       1.000000
Name: English, dtype: float64

## Let's do the same, normalising the vectors

The structures will be divided by the total number of sentences

In [125]:
# Getting the total number of sentences per language

list_of_dirs = os.listdir('data/')

sent_per_lang = {}

for directory in list_of_dirs:
    dir_path = f'data/{directory}'
    files = os.listdir(dir_path)
    for file in files:
        lang = file.split('-')[1].split('.')[0]
        file_path = f'data/{directory}/{file}'
        with FileReadBackwards(file_path, encoding="utf-8") as frb:
            for l in frb:
                if l.startswith('# sent_id = '):
                    if lang in sent_per_lang:
                        sent_per_lang[lang] += int(l.split(' = ')[1])
                    else:
                        sent_per_lang[lang] = int(l.split(' = ')[1])
                    break

In [126]:
sent_per_lang

{'english': 35168,
 'afrikaans': 12198,
 'arabic': 18835,
 'belarusian': 9426,
 'bulgarian': 14458,
 'catalan': 16777,
 'czech': 19715,
 'welsh': 4699,
 'danish': 11635,
 'north_sami': 568,
 'german': 37194,
 'estonian': 10378,
 'greek': 11063,
 'spanish': 22452,
 'basque': 11691,
 'persian': 16172,
 'french': 34226,
 'irish': 3837,
 'scottish_gaelic': 814,
 'galician': 11144,
 'gothic': 351,
 'korean': 14641,
 'armenian': 15605,
 'hindi': 10096,
 'croatian': 10907,
 'indonesian': 16164,
 'italian': 22724,
 'hebrew': 13226,
 'kazakh': 9128,
 'latin': 5570,
 'latvian': 7473,
 'lithuanian': 7435,
 'hungarian': 19667,
 'maltese': 4448,
 'marathi': 7057,
 'dutch': 19499,
 'japanese': 23379,
 'polish': 18858,
 'portuguese': 22673,
 'romanian': 14080,
 'russian': 23025,
 'sanskrit': 8028,
 'slovak': 11056,
 'slovenian': 11379,
 'serbian': 13557,
 'finnish': 18579,
 'swedish': 16726,
 'tamil': 9313,
 'telugu': 13987,
 'turkish': 18012,
 'ukrainian': 17176,
 'urdu': 7017,
 'uyghur': 3599,
 'vi

In [127]:
import os
import torch
structure_path = 'lang_structures/separated_lang_structures/'
list_of_files_structures = os.listdir(path)
list_of_tensors_structures_normalised = []
list_of_lang = []
for file in list_of_files:
    lang = file[:-15]
    list_of_lang.append(lang)
    dictionary_with_structures = dict_with_structures(path+file)
    zeros = torch.zeros(len(structures))
    lang_size = sent_per_lang[lang]
#     print(lang_size)
    for structure, frequency in dictionary_with_structures.items():
        position = structures_mapping_dict[structure]
        zeros[position] = frequency/lang_size
    list_of_tensors_structures_normalised.append(zeros)

In [128]:
# list_of_tensors_structures_normalised

In [129]:
cos = torch.nn.CosineSimilarity(dim=0)
cosine_sim_to_df_normalised = []
for tensor1 in list_of_tensors_structures_normalised:
    lang_tensor = []
    for tensor2 in list_of_tensors_structures_normalised:
        lang_tensor.append(round(float(cos(tensor1, tensor2)), 4))
    cosine_sim_to_df_normalised.append(lang_tensor)

In [130]:
cosine_sim_to_df_normalised == cosine_sim_to_df

False

In [145]:
help = 0
for i, (cs, normalisedcs) in enumerate(zip(cosine_sim_to_df, cosine_sim_to_df_normalised)):
    if cs != normalisedcs:
        for x, y in zip(cs, normalisedcs):
            if x != y:
                print(x, y)
#         print(cs)
#         print(normalisedcs)
        print(i)
# I HAVE DECIDED TO IGNORE THE DIFFERENCE OF 0.0001 - probably due to the rounding up

0.6702 0.6703
18
0.6702 0.6703
29


In [131]:
def no_underscores(text):
    if '_' in text:
        sep = text.split('_')
        cap = [x.capitalize() for x in sep]
        newtext = ' '.join(cap)
        return newtext
    else:
        return text.capitalize()
    
capitalised_lang = [no_underscores(x) for x in list_of_lang]

In [132]:
df_normalised = pd.DataFrame(cosine_sim_to_df_normalised, columns = capitalised_lang, index = capitalised_lang)

In [133]:
df_normalised

Unnamed: 0,English,Afrikaans,Arabic,Belarusian,Bulgarian,Catalan,Czech,Welsh,Danish,North Sami,...,Telugu,Turkish,Ukrainian,Urdu,Uyghur,Vietnamese,Classical Chinese,Wolof,Chinese,Old Church Slavonic
English,1.0,0.7095,0.1882,0.724,0.7987,0.7958,0.7665,0.4629,0.7921,0.489,...,0.5664,0.5609,0.7337,0.5198,0.4121,0.6479,0.0474,0.1747,0.5479,0.3716
Afrikaans,0.7095,1.0,0.187,0.5729,0.6763,0.7317,0.6662,0.5319,0.7636,0.5268,...,0.5545,0.5715,0.6278,0.5932,0.4334,0.545,0.1087,0.1742,0.5883,0.4519
Arabic,0.1882,0.187,1.0,0.194,0.1999,0.2069,0.224,0.1476,0.2083,0.1306,...,0.1696,0.1607,0.2096,0.1504,0.1421,0.1569,0.0156,0.0567,0.1756,0.1002
Belarusian,0.724,0.5729,0.194,1.0,0.7103,0.6617,0.8242,0.4986,0.79,0.4891,...,0.6728,0.6098,0.9387,0.3843,0.5517,0.5924,0.041,0.1678,0.44,0.2762
Bulgarian,0.7987,0.6763,0.1999,0.7103,1.0,0.8008,0.8039,0.4855,0.7547,0.4839,...,0.5652,0.5514,0.7435,0.5259,0.4459,0.6036,0.068,0.1892,0.5507,0.3494
Catalan,0.7958,0.7317,0.2069,0.6617,0.8008,1.0,0.773,0.5148,0.8025,0.5056,...,0.583,0.5744,0.7111,0.5955,0.4561,0.603,0.0539,0.2322,0.6123,0.411
Czech,0.7665,0.6662,0.224,0.8242,0.8039,0.773,1.0,0.5081,0.8315,0.4957,...,0.632,0.6202,0.87,0.5202,0.4909,0.5842,0.0406,0.1654,0.5326,0.3349
Welsh,0.4629,0.5319,0.1476,0.4986,0.4855,0.5148,0.5081,1.0,0.5831,0.5503,...,0.5529,0.5579,0.4888,0.4779,0.4631,0.4348,0.0617,0.1614,0.5228,0.3808
Danish,0.7921,0.7636,0.2083,0.79,0.7547,0.8025,0.8315,0.5831,1.0,0.6245,...,0.7238,0.6939,0.8328,0.5953,0.53,0.668,0.0572,0.1736,0.6337,0.4717
North Sami,0.489,0.5268,0.1306,0.4891,0.4839,0.5056,0.4957,0.5503,0.6245,1.0,...,0.6042,0.5899,0.4973,0.4349,0.4601,0.4902,0.1037,0.1395,0.5327,0.443


In [136]:
df.equals(df_normalised)

False

In [58]:
import pickle
f = open("ignore/my_lang_fam.pkl", "rb")
my_lang_fam = pickle.load(f)
f.close()

In [66]:
# my_lang_fam

In [169]:
lang_fam_mapping = {}
for big_fam, fam in my_lang_fam.items():
    for family, lang in fam.items():
        for l in lang:
            lang_fam_mapping[no_underscores(l)] = big_fam + ', ' + family

In [170]:
lang_fam_mapping

{'English': 'Indo-European languages, Germanic languages',
 'Afrikaans': 'Indo-European languages, Germanic languages',
 'Danish': 'Indo-European languages, Germanic languages',
 'German': 'Indo-European languages, Germanic languages',
 'Gothic': 'Indo-European languages, Germanic languages',
 'Dutch': 'Indo-European languages, Germanic languages',
 'Swedish': 'Indo-European languages, Germanic languages',
 'Belarusian': 'Indo-European languages, Slavic languages',
 'Bulgarian': 'Indo-European languages, Slavic languages',
 'Czech': 'Indo-European languages, Slavic languages',
 'Croatian': 'Indo-European languages, Slavic languages',
 'Polish': 'Indo-European languages, Slavic languages',
 'Russian': 'Indo-European languages, Slavic languages',
 'Slovak': 'Indo-European languages, Slavic languages',
 'Slovenian': 'Indo-European languages, Slavic languages',
 'Serbian': 'Indo-European languages, Slavic languages',
 'Ukrainian': 'Indo-European languages, Slavic languages',
 'Old Church S

In [67]:
cosine_sim_to_df

[[1.0,
  0.187,
  0.5937,
  0.7108,
  0.5729,
  0.6763,
  0.7317,
  0.6016,
  0.1087,
  0.715,
  0.6662,
  0.7636,
  0.7703,
  0.7148,
  0.7127,
  0.0,
  0.6818,
  0.7078,
  0.7211,
  0.3062,
  0.6432,
  0.4171,
  0.5638,
  0.3313,
  0.2904,
  0.5955,
  0.7843,
  0.5895,
  0.203,
  0.6097,
  0.7307,
  0.6617,
  0.6973,
  0.5484,
  0.6625,
  0.5268,
  0.4519,
  0.0,
  0.6385,
  0.5447,
  0.7541,
  0.5592,
  0.2751,
  0.5889,
  0.7338,
  0.7032,
  0.733,
  0.6593,
  0.7619,
  0.5324,
  0.5545,
  0.5715,
  0.6278,
  0.5932,
  0.4334,
  0.545,
  0.5319,
  0.1742],
 [0.187,
  1.0,
  0.1392,
  0.1858,
  0.194,
  0.1999,
  0.2069,
  0.1776,
  0.0156,
  0.2047,
  0.224,
  0.2083,
  0.2045,
  0.1874,
  0.2036,
  0.0,
  0.2135,
  0.2092,
  0.1911,
  0.0717,
  0.2066,
  0.1453,
  0.1469,
  0.0847,
  0.0863,
  0.1663,
  0.2191,
  0.1545,
  0.0513,
  0.1511,
  0.1998,
  0.2097,
  0.2361,
  0.1772,
  0.171,
  0.1306,
  0.1002,
  0.0004,
  0.2076,
  0.1522,
  0.2113,
  0.1858,
  0.0609,
  0.1541,
  0

In [None]:

for lang, vector in zip(capitalised_lang, cosine_sim_to_df):
    

In [73]:
a = [1, 2, 3, 4, 5, 9111, -4]
import numpy as np
# np.argsort(a)
a[-3:]

[5, 9111, -4]

In [139]:
pd.DataFrame(df.columns.values[np.argsort(-df.values, axis=1)[:, :3]], 
                  index=df.index,
                  columns = ['1st Max','2nd Max','3rd Max'])

Unnamed: 0,1st Max,2nd Max,3rd Max
English,English,Italian,Dutch
Afrikaans,Afrikaans,Italian,Dutch
Arabic,Arabic,Lithuanian,Czech
Belarusian,Belarusian,Ukrainian,Russian
Bulgarian,Bulgarian,Italian,Slovak
Catalan,Catalan,Spanish,Romanian
Czech,Czech,Slovak,Polish
Welsh,Welsh,Basque,Estonian
Danish,Danish,Swedish,Dutch
North Sami,North Sami,Swedish,Latin


In [140]:
pd.DataFrame(df.columns.values[np.argsort(-df.values, axis=1)[:, -3:]], 
                  index=df.index,
                  columns = ['3rd Min','2nd Min','1st Min'])

Unnamed: 0,3rd Min,2nd Min,1st Min
English,Wolof,Kazakh,Classical Chinese
Afrikaans,Arabic,Wolof,Classical Chinese
Arabic,Wolof,Kazakh,Classical Chinese
Belarusian,Wolof,Sanskrit,Classical Chinese
Bulgarian,Wolof,Kazakh,Classical Chinese
Catalan,Arabic,Kazakh,Classical Chinese
Czech,Kazakh,Wolof,Classical Chinese
Welsh,Wolof,Arabic,Classical Chinese
Danish,Kazakh,Wolof,Classical Chinese
North Sami,Wolof,Arabic,Classical Chinese


In [157]:
df.describe()

Unnamed: 0,English,Afrikaans,Arabic,Belarusian,Bulgarian,Catalan,Czech,Welsh,Danish,North Sami,...,Telugu,Turkish,Ukrainian,Urdu,Uyghur,Vietnamese,Classical Chinese,Wolof,Chinese,Old Church Slavonic
count,58.0,58.0,58.0,58.0,58.0,58.0,58.0,58.0,58.0,58.0,...,58.0,58.0,58.0,58.0,58.0,58.0,58.0,58.0,58.0,58.0
mean,0.613203,0.588071,0.183347,0.589314,0.608847,0.634193,0.634309,0.481741,0.680424,0.489617,...,0.570176,0.546971,0.61926,0.487312,0.448502,0.523333,0.080131,0.200412,0.530586,0.399512
std,0.199068,0.173438,0.119715,0.217516,0.199065,0.196427,0.216201,0.146266,0.202182,0.140476,...,0.165462,0.155957,0.22161,0.151345,0.138143,0.150831,0.127597,0.121626,0.148527,0.132491
min,0.0474,0.1087,0.0156,0.041,0.068,0.0539,0.0406,0.0617,0.0572,0.1037,...,0.0714,0.0656,0.0409,0.0568,0.1005,0.1387,0.0156,0.0507,0.162,0.1002
25%,0.523275,0.54585,0.150575,0.46285,0.4973,0.56145,0.511725,0.454725,0.62455,0.460275,...,0.5356,0.50975,0.490925,0.4174,0.41495,0.445775,0.047875,0.1574,0.48805,0.336275
50%,0.6557,0.6166,0.1858,0.64435,0.6427,0.671,0.67475,0.51575,0.7388,0.50335,...,0.61005,0.586,0.68225,0.52,0.464,0.566,0.057,0.183,0.5649,0.4086
75%,0.7466,0.709075,0.2063,0.74105,0.760975,0.7745,0.79685,0.5563,0.804675,0.5484,...,0.6622,0.630875,0.780275,0.578475,0.523275,0.603825,0.067525,0.22755,0.61825,0.46005
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [205]:
round(np.mean(df['Finnish'].values), 4)

0.6498

In [174]:
'\n'.join(str(df['English'].nlargest(4)).split('\n')[1:])

'Italian    0.8621\nDutch      0.8525\nGerman     0.8229\nName: English, dtype: float64'

In [191]:
c = df['English'].nlargest(4)
print(c.values[1:]) # array([1.    , 0.8621, 0.8525, 0.8229])
print(c.index[1:]) # array([1.    , 0.8621, 0.8525, 0.8229])
d = c.index[1:]
list(d)

[0.8621 0.8525 0.8229]
Index(['Italian', 'Dutch', 'German'], dtype='object')


['Italian', 'Dutch', 'German']

In [200]:
s = ''
for x, y in zip(c.values[1:], list(d)):
    s += str(x) + ' ' + str(y) + ', '
s[:-2]

'0.8621 Italian, 0.8525 Dutch, 0.8229 German'

In [172]:
str(df['English'].nsmallest(3))

'Classical Chinese    0.0474\nKazakh               0.1689\nWolof                0.1747\nName: English, dtype: float64'

In [206]:
big_table_structures_general = []
for lang in list(df.index):
    fam = lang_fam_mapping[lang]
    
    largest = df[lang].nlargest(4)
    s1 = ''
    for x, y in zip(largest.values[1:], list(largest.index[1:])):
        s1 += str(x) + ' ' + str(y) + ', '
    largest_str = s1[:-2]
    
    smallest = df['English'].nsmallest(3)
    s2 = ''
    for x, y in zip(smallest.values, list(smallest.index)):
        s2 += str(x) + ' ' + str(y) + ', '
    smallest_str = s2[:-2]
    
    avg = round(np.mean(df[lang].values), 4)
    
    lang_list = [lang, fam, largest_str, smallest_str, avg]
    
    big_table_structures_general.append(lang_list)

In [207]:
big_table_structures_general

[['English',
  'Indo-European languages, Germanic languages',
  '0.8621 Italian, 0.8525 Dutch, 0.8229 German',
  '0.0474 Classical Chinese, 0.1689 Kazakh, 0.1747 Wolof',
  0.6132],
 ['Afrikaans',
  'Indo-European languages, Germanic languages',
  '0.7843 Italian, 0.7703 Dutch, 0.7636 Danish',
  '0.0474 Classical Chinese, 0.1689 Kazakh, 0.1747 Wolof',
  0.5881],
 ['Arabic',
  'Afroasiatic languages, Semitic languages',
  '0.2361 Lithuanian, 0.224 Czech, 0.222 Slovak',
  '0.0474 Classical Chinese, 0.1689 Kazakh, 0.1747 Wolof',
  0.1833],
 ['Belarusian',
  'Indo-European languages, Slavic languages',
  '0.9387 Ukrainian, 0.8828 Russian, 0.8706 Latvian',
  '0.0474 Classical Chinese, 0.1689 Kazakh, 0.1747 Wolof',
  0.5893],
 ['Bulgarian',
  'Indo-European languages, Slavic languages',
  '0.8454 Italian, 0.8177 Slovak, 0.8065 Spanish',
  '0.0474 Classical Chinese, 0.1689 Kazakh, 0.1747 Wolof',
  0.6088],
 ['Catalan',
  'Indo-European languages, Romance languages',
  '0.9238 Spanish, 0.8673 R

In [211]:
df_structures_general = pd.DataFrame(big_table_structures_general, columns = ['Language', 'Lang fam', 'Max sim', 'Min sim', 'Avg sim'])

In [212]:
df_structures_general

Unnamed: 0,Language,Lang fam,Max sim,Min sim,Avg sim
0,English,"Indo-European languages, Germanic languages","0.8621 Italian, 0.8525 Dutch, 0.8229 German","0.0474 Classical Chinese, 0.1689 Kazakh, 0.174...",0.6132
1,Afrikaans,"Indo-European languages, Germanic languages","0.7843 Italian, 0.7703 Dutch, 0.7636 Danish","0.0474 Classical Chinese, 0.1689 Kazakh, 0.174...",0.5881
2,Arabic,"Afroasiatic languages, Semitic languages","0.2361 Lithuanian, 0.224 Czech, 0.222 Slovak","0.0474 Classical Chinese, 0.1689 Kazakh, 0.174...",0.1833
3,Belarusian,"Indo-European languages, Slavic languages","0.9387 Ukrainian, 0.8828 Russian, 0.8706 Latvian","0.0474 Classical Chinese, 0.1689 Kazakh, 0.174...",0.5893
4,Bulgarian,"Indo-European languages, Slavic languages","0.8454 Italian, 0.8177 Slovak, 0.8065 Spanish","0.0474 Classical Chinese, 0.1689 Kazakh, 0.174...",0.6088
5,Catalan,"Indo-European languages, Romance languages","0.9238 Spanish, 0.8673 Romanian, 0.8602 Italian","0.0474 Classical Chinese, 0.1689 Kazakh, 0.174...",0.6342
6,Czech,"Indo-European languages, Slavic languages","0.9457 Slovak, 0.911 Polish, 0.87 Ukrainian","0.0474 Classical Chinese, 0.1689 Kazakh, 0.174...",0.6343
7,Welsh,"Indo-European languages, Celtic languages","0.6284 Basque, 0.621 Estonian, 0.6209 Swedish","0.0474 Classical Chinese, 0.1689 Kazakh, 0.174...",0.4817
8,Danish,"Indo-European languages, Germanic languages","0.9208 Swedish, 0.8995 Dutch, 0.8966 Finnish","0.0474 Classical Chinese, 0.1689 Kazakh, 0.174...",0.6804
9,North Sami,"Uralic languages, Sámi languages","0.6435 Swedish, 0.6381 Latin, 0.6308 Estonian","0.0474 Classical Chinese, 0.1689 Kazakh, 0.174...",0.4896


In [None]:
df_structures_general.to_csv('lang_structures/.csv', sep = '\t', index = False)