# Analyze typological diversity by assessing language distances

## Imports and helper functions

In [2]:
import pandas as pd
import os

import plotly.express as px
import pandas as pd
import plotly.graph_objects as go

import lang2vec.lang2vec as l2v
import pkg_resources
from zipfile import ZipFile as zf
import scipy.sparse as sparse

import itertools
from tqdm import tqdm

import plotly.figure_factory as ff
import numpy as np


custom_template = dict(
    layout=go.Layout(title_font=dict(family="Times New Roman"))
)
"""
The following code was reused from https://github.com/antonisa/lang2vec, since there was a problem with installing the package. All rights reserved to the owners of the code.

"""


DISTANCES_LANGUAGE_FILE = pkg_resources.resource_filename(__name__, "/distances/distances_languages.txt")


DISTANCES_FILE = pkg_resources.resource_filename(__name__, "/distances/distances2.zip")

DISTANCES = ["genetic", "geographic", "syntactic", "inventory", "phonological", "featural"]

def available_distance_languages():
    with open(DISTANCES_LANGUAGE_FILE) as inp:
        l = inp.readlines()[0]
    return l.strip().split(',')


DISTANCE_LANGUAGES = available_distance_languages()

def map_distance_to_filename(distance):
    d = {"genetic": "genetic_upper_sparse.npz",
     "geographic": "geographic_upper_round1_sparse.npz", 
     "syntactic" : "syntactic_upper_round2_sparse.npz",
     "inventory" : "inventory_upper_sparse.npz",
     "phonological" : "phonological_upper_sparse.npz",
     "featural" : "featural_upper_round1_sparse.npz"}
    return d[distance]



def distance(distance, *args):

    if isinstance(distance, str):
        distance_list = [distance]
    elif isinstance(distance, list):
        distance_list = distance
    else:
        raise Exception("Unknown distance type. Provide a name (str) or a list of str.")

    for dist in distance_list:
        if dist not in DISTANCES:
            raise Exception("Unknown distance " + dist + ". The available ones are: " + ' '.join(DISTANCES))

    if len(args) == 1 and not isinstance(args[0],list):
        raise Exception("Error: You only provided one language argument.\nProvide multiple language arguments, or a single list of languages as arguments.")
    if len(args) == 1 and isinstance(args[0],list):
        langs = args[0]
    else:
        langs = [l for l in args]
    for l in langs:
        if l not in DISTANCE_LANGUAGES:
            raise Exception("Unknown language " + l + " (or maybe we don't have precomputed distances for this one).")
    indeces = [DISTANCE_LANGUAGES.index(l) for l in langs]


    N = len(indeces)
    if N == 2:
        out = []
        with zf(DISTANCES_FILE, 'r') as zp:
            for dist in distance_list:
                data = sparse.load_npz(zp.open(map_distance_to_filename(dist)))
                if indeces[0] > indeces[1]:
                    out.append(data[indeces[1],indeces[0]])
                else:
                    out.append(data[indeces[0],indeces[1]])
        if len(out) > 1:
            return out
        else:
            return out[0]
    else:
        arr_list = [np.zeros((N,N)) for dist in distance_list]
        with zf(DISTANCES_FILE, 'r') as zp:
            for k,dist in enumerate(distance_list):
                data = sparse.load_npz(zp.open(map_distance_to_filename(dist)))
                for a,i in enumerate(indeces):
                    for b,j in enumerate(indeces):
                        if a != b:
                            if i > j:
                                arr_list[k][a,b] = data[j,i]
                            else:
                                arr_list[k][a,b] = data[i,j]                
        if len(arr_list) > 1:
            return arr_list
        else:
            return arr_list[0]
        



def get_features(languages, feature_set_inp, header=False, minimal=False):    
    if isinstance(languages, str):
        lang_codes = languages.split()
    elif isinstance(languages, list):
        lang_codes = languages 
    else:
        raise Exception("Improper type "+type(languages)+" for languages.\nRequires string or list of strings.")
        
    feature_names, feature_values = get_concatenated_sets(lang_codes, feature_set_inp)
    feature_names = np.array([ f.replace(" ","_") for f in feature_names ])

    if minimal:
        mask = np.all(feature_values == 0.0, axis=0)
        mask |= np.all(feature_values == 1.0, axis=0)
        mask |= np.all(feature_values == -1.0, axis=0)
        unmasked_indices = np.where(np.logical_not(mask))
    else:
        unmasked_indices = np.where(np.ones(feature_values.shape[1]))
    
    output = {}
    if header:
        output['CODE']=list(feature_names[unmasked_indices])
        
    for i, lang_code in enumerate(lang_codes):
        values = feature_values[i,unmasked_indices].ravel()
        values = [ '--' if f == -1 else f for f in values ]
        #print("\t".join([lang_code]+values))
        output[lang_code] = values
    return output


def get_lang2vec_low_cov(lang_codes, threshold=0.95):
    syn_features = l2v.get_features(
        lang_codes,
        l2v.fs_concatenation(
            [l2v.fs_union(["syntax_wals", "syntax_sswl", "syntax_ethnologue"])]
        ),
    )

    low_cov = []
    for iso in lang_codes:
        syn_missing = syn_features[iso].count("--") / len(syn_features[iso])
        if syn_missing > threshold:
            low_cov.append(iso)

    return low_cov

## Load MultiQ Languages and perform distance calculation for all language combinations

In [3]:
multiq_path = '../../../data/model_language_fidelity/Mistral-7B-Instruct-v0.1.csv'

df_multiq =  pd.read_csv(multiq_path)

In [18]:
all_languages = list(df_multiq.iso_639_3.unique())
languages_to_test = [x for x in all_languages if not x in low_coverage or low_coverage.remove(x)]


syntactic = []
geographic = []
genetic = []
for combi in tqdm(itertools.combinations(languages_to_test, 2)):
    syntactic.append(distance("syntactic", combi[0], combi[1]))
    geographic.append(distance("geographic", combi[0], combi[1]))
    genetic.append(distance("genetic", combi[0], combi[1]))

7503it [2:22:14,  1.14s/it]


In [38]:

hist_data = [syntactic, geographic, genetic]

group_labels = ['Syntactic', 'Geographic', 'Genetic']
colors = ['#BDDB39', '#FF8A93', '#1BBB9B']

# Create distplot with curve_type set to 'normal'
fig = ff.create_distplot(hist_data, group_labels, colors=colors, bin_size=.05,
                         show_curve=False)

# Add title
fig.update_layout(
    font_family="Times New Roman",
    width=600,
    font=dict(size=24),
    height=400,
    margin=dict(l=25, r=0, t=0, b=30),
    template=custom_template,
    legend=dict(
    yanchor="top",
    y=0.99,
    xanchor="left",
    x=0.01
)
    )
fig.show()

fig.write_image("../../img/topological_coverage.pdf", format="pdf")