In [1]:
!pip install pycountry
!pip install basemap



In [2]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import matplotlib.patheffects as PathEffects
import lang2vec.lang2vec as l2v
import importlib.resources
from lang2vec import data as lang2vec_data
import umap
from mpl_toolkits.basemap import Basemap




In [3]:
# `available_uriel_languages` from lang2vec is bugged so we need to extract languages manually
# from the `feature_predictions.npz` archive that ships inside the package.
# The archive is read inside a `with` block so the underlying file handle is closed again;
# `dt` must therefore not be indexed after this block.
with importlib.resources.open_binary(lang2vec_data, 'feature_predictions.npz') as archive_file:
    dt = np.load(archive_file)
    uriel_languages = sorted(dt['langs'])

uriel_features = l2v.get_features(
    languages=uriel_languages,
    feature_set_inp='syntax_knn+phonology_knn+inventory_knn',
    header=True,
)

# CODE is a special value for feature names
uriel_codes = uriel_features['CODE']

# One row per language (indexed by language code), one column per URIEL feature.
df = pd.DataFrame(
    [uriel_features[language] for language in uriel_languages],
    index=uriel_languages,
    columns=uriel_codes,
)

In [4]:
# 2-D UMAP projection of the URIEL feature vectors; `random_state` keeps the layout reproducible.
umap_object = umap.UMAP(
    n_neighbors=200,
    metric='cosine',
    min_dist=0.5,
    random_state=1,
)
# Pass a real 2-D array: feeding `np.vstack` a generator of row tuples is deprecated in NumPy
# and only produced a warning here.
umap_object.fit(df[uriel_codes].to_numpy(dtype=float))

# Store the embedding coordinates next to the features so plots can read them from `df`.
df['uriel_x'] = umap_object.embedding_[:, 0]
df['uriel_y'] = umap_object.embedding_[:, 1]

  umap_object.fit(np.vstack(df[uriel_codes].itertuples(index=False, name=None)))


In [5]:
def normalize_iso(iso):
    """Reduce a WALS `iso_codes` value to a single code.

    A three-character value is returned unchanged. Any other value is treated
    as a ', '-separated list, and the first entry that occurs in the global
    `uriel_languages` is returned; `None` is returned when no entry matches.
    """
    if len(iso) == 3:
        return iso
    known_codes = (code for code in iso.split(', ') if code in uriel_languages)
    return next(known_codes, None)

# WALS files downloaded from https://github.com/cldf-datasets/wals/releases
# Files can be found in `raw` folder
lan_csv = pd.read_csv('language.csv').set_index('pk')        # Contains geographical coordinates
wal_csv = pd.read_csv('walslanguage.csv').set_index('pk')    # Contains ISO codes

# Normalize ISO codes, some are null, some have multiple variants.
# Built as one chain of new frames so no column is assigned on a filtered slice
# (which triggers pandas' SettingWithCopyWarning). Rows whose code cannot be resolved
# by `normalize_iso` (it returns None) are dropped instead of ending up in the index.
wsl_df = (
    lan_csv.join(wal_csv)
    .dropna(subset=['iso_codes'])
    .assign(iso_codes=lambda frame: frame['iso_codes'].apply(normalize_iso))
    .dropna(subset=['iso_codes'])
    .loc[:, ['iso_codes', 'latitude', 'longitude']]
    .set_index('iso_codes')
)

# Multiple records can share one ISO code (e.g. Zulu, Zulu (northern), Zulu (southern)) and they can even have different primary key
# in `languages.csv` and different geographical coordinates. Here we simply select the first record. This might not be an optimal
# solution, but it is only a handful of languages and I believe that the coordinates will still roughly match.
wsl_df = wsl_df[~wsl_df.index.duplicated()]

# Left join: every URIEL language is kept, coordinates are NaN where WALS has no match.
df = df.join(wsl_df, how='left')

In [7]:
# Language-family membership features from lang2vec, requested with a header
# so that the special 'CODE' entry carries the column names.
families = l2v.get_features(languages=uriel_languages, feature_set_inp='fam', header=True)
families_codes = families['CODE']

# One row per language, one column per family code.
fam_df = pd.DataFrame(
    data=[families[language] for language in uriel_languages],
    index=uriel_languages,
    columns=families_codes,
)

# Append the family columns to the main frame.
df = df.join(fam_df)

In [8]:
# Human-readable family string per language: the codes of all families whose
# membership value is truthy, with the first two characters (the 'F_' prefix
# seen in the column names) stripped, joined by spaces.
df['family_str'] = [
    ' '.join(
        code[2:]
        for is_member, code in zip(memberships, families_codes)
        if is_member
    )
    for memberships in fam_df.itertuples(index=False, name=None)
]

In [9]:
import pycountry

# Presumably forces pycountry to populate its lookup tables before `indices`
# is read below (private API -- verify when upgrading pycountry).
pycountry.languages._load()

# Language name for every code pycountry knows under `alpha_3`, None otherwise.
df['name'] = [
    (
        pycountry.languages.get(alpha_3=code).name
        if code in pycountry.languages.indices['alpha_3']
        else None
    )
    for code in uriel_languages
]

In [10]:
# Inspect the assembled frame: URIEL features, family columns, `family_str` and `name`.
df
# df.loc['slk']

Unnamed: 0,S_SVO,S_SOV,S_VSO,S_VOS,S_OVS,S_OSV,S_SUBJECT_BEFORE_VERB,S_SUBJECT_AFTER_VERB,S_OBJECT_AFTER_VERB,S_OBJECT_BEFORE_VERB,...,F_Berawan,F_Central-East_Berawan,F_Iwaidjan_Proper_(Unattested),F_West_Zapotec,F_West-Central_West_Zapotec,F_Coatec,F_Coatlan-Loxicha_Zapotec,F_Yongnan-Yongbei,family_str,name
aaa,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Atlantic-Congo Volta-Congo Benue-Congo Akpes-E...,Ghotuo
aab,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Atlantic-Congo Volta-Congo Benue-Congo Benue-C...,Alumu-Tesu
aac,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Suki-Gogodala Gogodalic,Ari
aad,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Sepik,Amal
aae,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Indo-European Albanian Albanian-Tosk,Arbëreshë Albanian
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zyj,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Tai-Kadai Kam-Tai Be-Tai Daic Northern_Daic No...,Youjiang Zhuang
zyn,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Tai-Kadai Kam-Tai Be-Tai Daic Northern_Daic Yo...,Yongnan Zhuang
zyp,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Sino-Tibetan Kuki-Chin-Naga Kuki-Chin Maraic N...,Zyphe Chin
zza,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Indo-European Indo-Iranian Iranian Western_Ira...,Zaza


In [14]:
%matplotlib widget

# Families highlighted when `visualize(..., color_families=True)` is used.
_DEFAULT_COLOR_FAMILIES = (
    'Atlantic-Congo',
    'Austronesian',
    'Indo-European',
    'Slavic',
    'Germanic',
    'Italic',
    'Afro-Asiatic',
    'Semitic',
    'Sino-Tibetan',
    'Nuclear_Trans_New_Guinea',
    'Pama-Nyungan',
    'Otomanguean',
    'Austroasiatic',
    'Dravidian',
    'Turkic',
    'Uralic',
)


def _scatter_language_layers(scatter, project, color_families, legend_families, alpha, plain_size, **extra):
    """Draw every language of the global `df` through `scatter`.

    `project(frame)` must return the `(x, y)` plot coordinates for `frame`.
    Without `color_families` all languages are drawn gray with `plain_size`;
    otherwise non-members are drawn as a small gray layer and each family
    column gets its own layer. `extra` is forwarded to every scatter call.
    """
    if not color_families:
        xs, ys = project(df)
        scatter(xs, ys, s=plain_size, c='gray', marker='o', alpha=alpha, **extra)
        return

    other = df[~df[color_families].any(axis=1)]
    xs, ys = project(other)
    scatter(xs, ys, s=3, c='gray', marker='o', label='Other' if legend_families else None, alpha=alpha, **extra)
    for family in color_families:
        family_df = df[df[family] == 1]
        xs, ys = project(family_df)
        scatter(xs, ys, s=10, marker='o', label=family if legend_families else None, alpha=alpha, **extra)
        # TODO: fix colors
        # TODO: select each language only once, e.g. Indo-European vs Slavic


def _scatter_evaluation(project, evaluation, **extra):
    """Overlay the languages of `evaluation` and add its legend entries."""
    subset_df = df.loc[evaluation.languages]
    xs, ys = project(subset_df)
    plt.scatter(xs, ys, s=evaluation.sizes, c=evaluation.colors, **extra)

    if evaluation.legend:
        legend = evaluation.legend
        for label, color, size in zip(legend['labels'], legend['colors'], legend['sizes']):
            # Empty proxy artist: contributes a legend entry without drawing a point.
            plt.scatter([], [], s=size, color=color, label=label)


def visualize(
    backend,
    features,
    color_families=False,
    legend_families=False,
    alpha=1,
    zoom=None,  # left right bottom up
    evaluation=None,
):
    """Plot all languages of the global `df`, optionally with an evaluation overlay.

    Parameters
    ----------
    backend : str
        'matplotlib' draws on the current pyplot figure. 'bokeh' is a
        placeholder that draws nothing; other values are ignored as well.
    features : str
        'uriel' plots the `uriel_x`/`uriel_y` columns, 'geo' plots
        `longitude`/`latitude` on a Mercator Basemap.
    color_families : bool or list of str
        True selects a built-in list of families; a list names families
        without the 'F_' column prefix. Falsy draws everything gray.
    legend_families : bool
        Whether the family layers get legend labels.
    alpha : float
        Opacity of the language layers (not of the evaluation overlay).
    zoom : sequence of four numbers, optional
        (left, right, bottom, top) limits. For 'geo' it defaults to
        (-180, 180, -60, 75).
    evaluation : object, optional
        Must provide `languages`, `sizes`, `colors` and `legend` attributes;
        its languages are drawn on top of the language layers.

    Raises
    ------
    AttributeError
        If `backend` is 'matplotlib' and `features` is neither 'uriel' nor 'geo'.
    """
    if color_families is True:
        color_families = list(_DEFAULT_COLOR_FAMILIES)

    if color_families:
        # Family columns in `df` carry an 'F_' prefix.
        color_families = ['F_' + family for family in color_families]

    if backend == 'matplotlib':

        plt.rcParams["figure.figsize"] = (15, 15)

        if features == 'uriel':

            def project(frame):
                return frame.uriel_x, frame.uriel_y

            _scatter_language_layers(
                plt.scatter, project, color_families, legend_families, alpha, plain_size=3,
            )

            if zoom is not None:
                plt.xlim(zoom[0], zoom[1])
                plt.ylim(zoom[2], zoom[3])

            if evaluation is not None:
                _scatter_evaluation(project, evaluation)

        elif features == 'geo':

            if zoom is None:
                zoom = (-180, 180, -60, 75)

            m = Basemap(
                projection='merc',
                llcrnrlat=zoom[2],
                urcrnrlat=zoom[3],
                llcrnrlon=zoom[0],
                urcrnrlon=zoom[1],
                lat_ts=20,
                resolution='c'
            )
            m.drawcoastlines()
            m.fillcontinents(color='white', lake_color='white')
            m.drawparallels(np.arange(-90., 91., 30.), labels=[True, False, False, False])
            m.drawmeridians(np.arange(-180., 181., 60.), labels=[False, True, True, False])
            m.drawmapboundary(fill_color='white')

            def project(frame):
                # Calling the Basemap instance converts lon/lat to map coordinates.
                return m(frame.longitude, frame.latitude)

            # zorder=3 is passed so the points are not hidden by the map layers.
            _scatter_language_layers(
                m.scatter, project, color_families, legend_families, alpha, plain_size=10, zorder=3,
            )

            if evaluation is not None:
                _scatter_evaluation(project, evaluation, zorder=3)

        else:
            raise AttributeError('Attribute `features` must be either `uriel` or `geo`.')

        # Only draw a legend when something is labelled; otherwise matplotlib
        # warns and renders an empty legend box.
        handles, _ = plt.gca().get_legend_handles_labels()
        if handles:
            plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))

    elif backend == 'bokeh':
        ...

class Evaluation:
    """Per-language markers (sizes, colors, legend) to overlay on a language plot.

    Attributes
    ----------
    languages : list
        Language codes; `visualize` uses them to select rows of the main frame.
    sizes : number or list of numbers
        Marker size(s) passed to `scatter`.
    colors : color or list of colors
        Marker color(s) passed to `scatter`.
    legend : dict or None
        Optional dict with parallel 'labels', 'colors' and 'sizes' lists.
    """

    def __init__(self, languages, sizes, colors, legend=None):
        self.languages = languages
        self.sizes = sizes
        self.colors = colors
        self.legend = legend

    def visualize(self, *args, **kwargs):
        """Call the module-level `visualize` with this object as `evaluation`."""
        visualize(*args, **kwargs, evaluation=self)

    @staticmethod
    def _marker_sizes(values):
        """Map `values` linearly onto marker sizes from 10 (min) to 100 (max).

        When all values are equal there is no range to scale by; every marker
        then gets the mid size 55 instead of raising ZeroDivisionError.
        """
        lowest, highest = min(values), max(values)
        span = highest - lowest
        if span == 0:
            return [55.0 for _ in values]
        return [(value - lowest) / span * 90 + 10 for value in values]

    @classmethod
    def compare_two_methods(cls, languages, score_diff, method_names):
        """Show which of two methods wins per language and by how much.

        A positive entry of `score_diff` is drawn green (first method wins),
        anything else red; marker size grows with the absolute difference.
        """
        colors = ['g' if s > 0 else 'r' for s in score_diff]
        margins = [abs(s) for s in score_diff]
        mx, mn = max(margins), min(margins)
        sizes = cls._marker_sizes(margins)  # 10-100 scale

        method_a, method_b = method_names
        legend = {
            'labels': [
                f'{method_a} wins by {mx}',
                f'{method_a} wins by {(mx + mn) / 2}',
                f'{method_a} wins by {mn}',
                f'{method_b} wins by {mn}',
                f'{method_b} wins by {(mx + mn) / 2}',
                f'{method_b} wins by {mx}',
            ],
            'colors': ['g', 'g', 'g', 'r', 'r', 'r'],
            'sizes': [100, 55, 10, 10, 55, 100],
        }

        return cls(
            languages=languages,
            sizes=sizes,
            colors=colors,
            legend=legend,
        )

    @classmethod
    def show_winner(cls, languages, winners):
        """Color each language by the name of its winning method."""
        # First-appearance order (instead of `set` order) keeps the
        # method -> color assignment stable between runs.
        methods = list(dict.fromkeys(winners))
        # `matplotlib.cm.get_cmap` was removed from recent matplotlib releases.
        palette = list(plt.get_cmap('tab20').colors)
        # Use the even-indexed tab20 colors first, then the odd-indexed ones.
        palette = palette[::2] + palette[1::2]
        # Cycle through the palette if there are more methods than colors.
        method_colors = {
            method: palette[position % len(palette)]
            for position, method in enumerate(methods)
        }

        legend = {
            'labels': methods,
            'colors': [method_colors[method] for method in methods],
            'sizes': [50 for _ in methods],
        }

        return cls(
            languages=languages,
            sizes=50,
            colors=[method_colors[w] for w in winners],
            legend=legend,
        )

    @classmethod
    def show_languages(cls, languages, color='r'):
        """Highlight `languages` with uniform markers and no legend."""
        return cls(
            languages=languages,
            sizes=50,
            colors=color,
            legend=None,
        )

    @classmethod
    def show_performance(cls, languages, scores, color='r'):  # scores should be scaled by user and with max being better
        """Size each language's marker by its score (larger score, larger marker)."""
        mn, mx = min(scores), max(scores)
        sizes = cls._marker_sizes(scores)  # 10-100 scale

        legend = {
            'labels': [
                f'{mn}',
                f'{(mx + mn) / 2}',
                f'{mx}',
            ],
            'colors': [color, color, color],
            'sizes': [10, 55, 100],
        }

        # TODO: legend
        return cls(
            languages=languages,
            sizes=sizes,
            colors=color,
            legend=legend,
        )
        
# Demo: size five languages by score on top of the URIEL embedding with faded family colors.
(
    Evaluation
    .show_performance(['slk', 'ces', 'hbs', 'rus', 'eng'], [75, 80, 79, 65, 90])
    .visualize('matplotlib', 'uriel', alpha=0.1, color_families=True, legend_families=False)  # , zoom=[0,10,-5,5])
)
...

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Ellipsis

In [None]:
import umap.plot
import bokeh.plotting  # import the submodule explicitly; also binds the name `bokeh`

# Hover text for the interactive plot: one row per language, in the row order of `df`
# (the order the UMAP model was fitted on). The index is reset to 0..n-1 based on the
# actual frame length instead of a hard-coded row count.
hover_data = pd.DataFrame({
    'family': df['family_str'],
    'iso': df.index,
    'name': df['name'],
}).reset_index(drop=True)

p = umap.plot.interactive(
    umap_object,
    hover_data=hover_data,
    interactive_text_search=True,
    point_size=4,
)
bokeh.plotting.output_notebook()
bokeh.plotting.show(p)

In [None]:
import numpy as np
from types import FunctionType
from utils import language_iso

def _read_score_table(path, first_score_column):
    """Read a whitespace-separated score table.

    The first field of every non-blank line is taken as the language, and the
    fields from `first_score_column` onwards are parsed as floats. The file is
    read once and closed afterwards.

    Returns a `(languages, scores)` pair: a list of first fields and a 2-D
    array with one row of scores per line.
    """
    languages, rows = [], []
    with open(path) as handle:
        for line in handle:
            fields = line.split()
            if not fields:
                continue  # tolerate blank / trailing empty lines
            languages.append(fields[0])
            rows.append([float(value) for value in fields[first_score_column:]])
    return languages, np.vstack(rows)


# NOTE(review): `language_iso` comes from `utils` and is not visible here; the code below
# assumes the decorated functions still return a `(languages, scores)` pair -- confirm.

@language_iso
def rahimi_ner():
    """Load './papers/rahimi_ner.txt'; scores start at the third field of each line."""
    return _read_score_table('./papers/rahimi_ner.txt', 2)

@language_iso
def heinzerling_ner():
    """Load './papers/heinzerling_ner.txt'; scores start at the second field of each line."""
    return _read_score_table('./papers/heinzerling_ner.txt', 1)

@language_iso
def heinzerling_pos():
    """Load './papers/heinzerling_pos.txt'; scores start at the second field of each line."""
    return _read_score_table('./papers/heinzerling_pos.txt', 1)

@language_iso
def heinzerling_pos_low():
    """Load './papers/heinzerling_pos_low.txt' and append the `heinzerling_pos` languages.

    Only columns 1, 2, 4 and 10 of the `heinzerling_pos` scores are appended,
    so that both score tables can be stacked row-wise.
    """
    languages, scores = _read_score_table('./papers/heinzerling_pos_low.txt', 1)
    r_languages, r_scores = heinzerling_pos()
    languages += r_languages
    scores = np.vstack([scores, r_scores[:, [1, 2, 4, 10]]])
    return languages, scores

In [None]:
# Read the language column of the Heinzerling NER table; the file is closed after reading
# and blank lines are skipped.
with open('./papers/heinzerling_ner.txt') as handle:
    languages = [line.split()[0] for line in handle if line.strip()]
# NOTE(review): `LETTER_CODES` is not defined in the visible part of this notebook --
# presumably a code mapping from `utils`; confirm it is in scope on a fresh kernel.
languages = [LETTER_CODES.get(l, l) for l in languages]
languages