# Visualisation of cosine similarity

I want to visualise the cosine similarities in a matrix. Let's start with one topic: Earth.

In [1]:
import os
import pandas as pd
import copy
import numpy as np

In [2]:
def get_lang_and_topics():
    """
    Gives information about the languages and topics
    of the data.
    
    Returns:
      - languages: a dictionary with the number of topics
        available for each language.
      - sorted_lang: a (sorted) list of languages.
      - lang_mapping: a dictionary mapping the sorted
        languages to an integer.
      - topics: a (sorted) list of topics.
    """
    languages = {}
    list_of_dirs = os.listdir('data/')
    
    print(f'There are {len(list_of_dirs)} directories')

    for directory in list_of_dirs:
        path = (f'data/{directory}')
        files = os.listdir(path)
        for f in files:
            lang = f.split('-')[1].split('.')[0]
            if lang in languages:
                languages[lang] += 1
            else:
                languages[lang] = 1
    #     print(f'There are {len(files)} files in {directory}.')
    print(f'There are {len(languages.keys())} languages')
    sorted_lang = sorted(list(languages.keys()))
    lang_mapping = {x:i for i, x in enumerate(sorted_lang)}
    
    topics = sorted(os.listdir('data/'))
    topic_mapping = {x:i for i, x in enumerate(topics)}
    
    return languages, sorted_lang, lang_mapping, topics, topic_mapping

In [3]:
languages, sorted_lang, lang_mapping, topics, topic_mapping = get_lang_and_topics()

There are 62 directories
There are 58 languages


In [7]:
print(languages.keys())

dict_keys(['english', 'afrikaans', 'arabic', 'belarusian', 'bulgarian', 'catalan', 'czech', 'welsh', 'danish', 'north_sami', 'german', 'estonian', 'greek', 'spanish', 'basque', 'persian', 'french', 'irish', 'scottish_gaelic', 'galician', 'gothic', 'korean', 'armenian', 'hindi', 'croatian', 'indonesian', 'italian', 'hebrew', 'kazakh', 'latin', 'latvian', 'lithuanian', 'hungarian', 'maltese', 'marathi', 'dutch', 'japanese', 'polish', 'portuguese', 'romanian', 'russian', 'sanskrit', 'slovak', 'slovenian', 'serbian', 'finnish', 'swedish', 'tamil', 'telugu', 'turkish', 'ukrainian', 'urdu', 'uyghur', 'vietnamese', 'classical_chinese', 'wolof', 'chinese', 'old_church_slavonic'])


In [8]:
print(topics)

['Adolf_Hitler', 'Africa', 'Asia', 'Association_football', 'Barack_Obama', 'Bible', 'Buddha', 'Buddhism', 'China', 'Christianity', 'Christmas', 'Dog', 'Earth', 'English_Language', 'Europe', 'Eye', 'George_W._Bush', 'Ghana', 'Gold', 'Hinduism', 'Human', 'India', 'Internet', 'Iran', 'Iraq', 'Iron', 'Islam', 'Italy', 'Japan', 'Jesus', 'Judaism', 'Julius_Caesar', 'Koran', 'Maize', 'Milk', 'Mohandas_Karamchand_Gandhi', 'Money', 'Moon', 'Moses', 'Muhammad', 'New_York_City', 'Niger', 'Osama_Bin_Laden', 'Paris', 'Periodic_table', 'Pope_Benedict_XVI', 'Pope_John_Paul_II', 'Religion', 'Rice', 'Roman_Catholic_Church', 'Rome', 'Russia', 'Silver', 'South_Africa', 'South_America', 'Soviet_Union', 'Sun', 'United_Kingdom', 'United_States', 'Water', 'Wikipedia', 'World_War_II']


In [4]:
def create_dataframe_by_topic(doc, lang, lang_mapping):
    """
    Creates pandas dataframe by topic
    
    Args:
      - doc: the document with the information.
      - lang: the list of sorted languages.
      - lang_mapping: a dictionary mapping the sorted
        languages to an integer.
    Returns:
      - df: pandas dataframe with information.
    """
    if '/' in doc:
        topic_name = doc.split('/')[-1].split('-cosine-sim')[0]
    else:
        topic_name = doc.split('-cosine-sim')[0]
    topic = {}
    for language in lang:
        topic[language] = [0]*len(lang)
        
    f = open(doc, "r")
    for line in f.readlines():
        if line.startswith('0'):
            cosine_sim = float(line[:-1])
            topic[lang1][lang_mapping[lang2]] = cosine_sim
            topic[lang2][lang_mapping[lang1]] = cosine_sim
            topic[lang1][lang_mapping[lang1]] = 1
            topic[lang2][lang_mapping[lang2]] = 1
        else: 
            files = line[:-1].split(' ')
            lang1 = files[0].split('-')[1].split('.')[0]
            lang2 = files[1].split('-')[1].split('.')[0]
    f.close()
    df = pd.DataFrame(topic)
    df.index = lang
    
    df.style.set_caption(f'{topic_name} by language')
    return df

In [18]:
# pd.set_option('display.max_columns', None)

In [6]:
Barack_Obama = create_dataframe_by_topic('cosine-sim-per-topic/Barack_Obama-cosine-sim.txt', sorted_lang, lang_mapping)
Barack_Obama

Unnamed: 0,afrikaans,arabic,armenian,basque,belarusian,bulgarian,catalan,chinese,classical_chinese,croatian,...,swedish,tamil,telugu,turkish,ukrainian,urdu,uyghur,vietnamese,welsh,wolof
afrikaans,1.0,0.740499,0.645476,0.709308,0.764804,0.917652,0.941216,0.636963,0,0.905333,...,0.898306,0.657098,0.650381,0.629815,0.864364,0.822071,0.691817,0.629302,0.896418,0
arabic,0.740499,1.0,0.460484,0.731411,0.813803,0.826204,0.675491,0.717349,0,0.771868,...,0.759872,0.854264,0.490425,0.519632,0.849624,0.740629,0.617805,0.52502,0.829936,0
armenian,0.645476,0.460484,1.0,0.723931,0.662375,0.638839,0.622327,0.617424,0,0.793283,...,0.788443,0.575944,0.690886,0.859562,0.718919,0.583799,0.877069,0.618555,0.71255,0
basque,0.709308,0.731411,0.723931,1.0,0.879942,0.752758,0.678833,0.841968,0,0.831769,...,0.745908,0.900607,0.588648,0.656925,0.884836,0.634647,0.846441,0.574505,0.817934,0
belarusian,0.764804,0.813803,0.662375,0.879942,1.0,0.871272,0.719914,0.798464,0,0.862767,...,0.80059,0.833767,0.544189,0.579708,0.945893,0.703386,0.782668,0.578098,0.880846,0
bulgarian,0.917652,0.826204,0.638839,0.752758,0.871272,1.0,0.860686,0.688282,0,0.935965,...,0.907253,0.710092,0.6158,0.602991,0.925761,0.852656,0.70897,0.633176,0.925746,0
catalan,0.941216,0.675491,0.622327,0.678833,0.719914,0.860686,1.0,0.651665,0,0.855523,...,0.869611,0.58692,0.645379,0.636297,0.805756,0.789681,0.627943,0.671813,0.876653,0
chinese,0.636963,0.717349,0.617424,0.841968,0.798464,0.688282,0.651665,1.0,0,0.733407,...,0.75648,0.846494,0.629613,0.557989,0.83306,0.668203,0.78915,0.738799,0.778571,0
classical_chinese,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
croatian,0.905333,0.771868,0.793283,0.831769,0.862767,0.935965,0.855523,0.733407,0,1.0,...,0.93707,0.75568,0.741359,0.705604,0.93603,0.860361,0.843813,0.709472,0.922124,0


In [5]:
earth = create_dataframe_by_topic('cosine-sim-per-topic/Earth-cosine-sim.txt', sorted_lang, lang_mapping)
earth

Unnamed: 0,afrikaans,arabic,armenian,basque,belarusian,bulgarian,catalan,chinese,classical_chinese,croatian,...,swedish,tamil,telugu,turkish,ukrainian,urdu,uyghur,vietnamese,welsh,wolof
afrikaans,1.0,0.717868,0.0,0.784911,0.0,0.0,0.0,0.635391,0.0,0.0,...,0.0,0.705331,0.0,0.0,0.0,0.0,0.0,0.718369,0.0,0.0
arabic,0.717868,1.0,0.0,0.765315,0.0,0.0,0.0,0.717639,0.0,0.0,...,0.0,0.88406,0.0,0.0,0.0,0.0,0.0,0.600479,0.0,0.0
armenian,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.679553,0.0,0.0,0.0,0.0
basque,0.784911,0.765315,0.0,1.0,0.0,0.0,0.0,0.859931,0.0,0.0,...,0.0,0.914378,0.0,0.0,0.0,0.0,0.0,0.68619,0.0,0.0
belarusian,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.744316,0.0,0.0,0.0,0.0,0.0,0.0,0.0
bulgarian,0.0,0.0,0.0,0.0,0.0,1.0,0.857342,0.0,0.586022,0.93988,...,0.929925,0.0,0.0,0.717805,0.902245,0.0,0.0,0.0,0.893354,0.0
catalan,0.0,0.0,0.0,0.0,0.0,0.857342,1.0,0.0,0.530181,0.79611,...,0.89873,0.0,0.0,0.677148,0.782854,0.0,0.0,0.0,0.94616,0.0
chinese,0.635391,0.717639,0.0,0.859931,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.820563,0.0,0.0,0.0,0.0,0.0,0.77314,0.0,0.0
classical_chinese,0.0,0.0,0.0,0.0,0.0,0.586022,0.530181,0.0,1.0,0.602436,...,0.617902,0.0,0.0,0.516433,0.541321,0.0,0.0,0.0,0.601543,0.0
croatian,0.0,0.0,0.0,0.0,0.0,0.93988,0.79611,0.0,0.602436,1.0,...,0.921881,0.0,0.0,0.795785,0.949439,0.0,0.0,0.0,0.870601,0.0


In [25]:
earth.to_csv('earth.csv', sep=',')

In [6]:
data_earth = earth.replace([0], np.nan).replace([1], np.nan)
data_earth.describe()

Unnamed: 0,afrikaans,arabic,armenian,basque,belarusian,bulgarian,catalan,chinese,classical_chinese,croatian,...,swedish,tamil,telugu,turkish,ukrainian,urdu,uyghur,vietnamese,welsh,wolof
count,19.0,19.0,4.0,19.0,7.0,19.0,19.0,19.0,19.0,19.0,...,19.0,19.0,7.0,19.0,19.0,4.0,4.0,19.0,19.0,4.0
mean,0.804369,0.741432,0.789566,0.76805,0.78415,0.807042,0.75457,0.70363,0.553191,0.839038,...,0.838135,0.729141,0.675299,0.752427,0.820766,0.704518,0.57951,0.70289,0.816097,0.55883
std,0.117396,0.130646,0.126364,0.104429,0.182235,0.133445,0.140435,0.101875,0.089463,0.117715,...,0.105673,0.136769,0.096046,0.0901,0.11836,0.117379,0.119536,0.094221,0.119838,0.242191
min,0.598216,0.313805,0.679553,0.56473,0.422579,0.575302,0.482671,0.466186,0.308191,0.602436,...,0.617902,0.391305,0.529034,0.516433,0.541321,0.600607,0.404694,0.484769,0.570531,0.240545
25%,0.718119,0.717754,0.682435,0.70771,0.754866,0.711308,0.648565,0.639683,0.523307,0.768482,...,0.815146,0.672569,0.620216,0.702702,0.760423,0.648912,0.553307,0.656892,0.747109,0.482733
50%,0.784911,0.765315,0.778479,0.765315,0.80921,0.857342,0.79611,0.720053,0.586022,0.870601,...,0.86866,0.718392,0.718204,0.792606,0.86343,0.672284,0.623048,0.709367,0.870601,0.583153
75%,0.906951,0.833908,0.88561,0.857701,0.88322,0.916987,0.847343,0.769214,0.609553,0.943374,...,0.914891,0.837152,0.737711,0.798303,0.924483,0.72789,0.649251,0.762304,0.893899,0.659251
max,0.960789,0.88406,0.921753,0.919311,0.981091,0.956683,0.95375,0.859931,0.660153,0.979507,...,0.939373,0.914378,0.764002,0.892693,0.949439,0.872899,0.667252,0.921201,0.94616,0.82847


In [8]:
data_earth.min().min()

0.24054482879856956

In [9]:
data_earth.max().max()

0.9947763899865332

In [7]:
def create_dataframe_by_lang(doc, topics, topic_mapping):
    """
    Creates pandas dataframe by language
    
    Args:
      - doc: the document with the information.
      - topics: the list of sorted topics.
      - topic_mapping: a dictionary mapping the sorted
        topics to an integer.
    Returns:
      - df: pandas dataframe with information.
    """
    if '/' in doc:
        language_name = doc.split('/')[-1].split('-cosine_sim')[0]
    else:
        language_name = doc.split('-cosine_sim')[0]
    
    lang = {}
    for topic in topics:
        lang[topic] = [0]*len(topics)
        
    f = open(doc, "r")
    for line in f.readlines():
        if line.startswith('0'):
            cosine_sim = float(line[:-1])
            lang[topic1][topic_mapping[topic2]] = cosine_sim
            lang[topic2][topic_mapping[topic1]] = cosine_sim
            lang[topic1][topic_mapping[topic1]] = 1
            lang[topic2][topic_mapping[topic2]] = 1
        else: 
            languages = line[:-1].split(' ')
            topic1 = languages[0].split('-')[0]
            topic2 = languages[1].split('-')[0]
    f.close()
    df = pd.DataFrame(lang)
    df.index = topics
    df.style.set_caption(f'{language_name} by topic')
    
    return df

In [8]:
english = create_dataframe_by_lang('cosine-sim-per-lang/english-cosine_sim.txt', topics, topic_mapping)
english

Unnamed: 0,Adolf_Hitler,Africa,Asia,Association_football,Barack_Obama,Bible,Buddha,Buddhism,China,Christianity,...,Silver,South_Africa,South_America,Soviet_Union,Sun,United_Kingdom,United_States,Water,Wikipedia,World_War_II
Adolf_Hitler,1.000000,0.975701,0.977857,0.985614,0.985782,0.974498,0.981897,0.972502,0.982869,0.980596,...,0.966793,0.978423,0.971761,0.987888,0.978980,0.978873,0.978302,0.984134,0.990179,0.985624
Africa,0.975701,1.000000,0.994797,0.976733,0.965361,0.988471,0.978266,0.990315,0.996866,0.991179,...,0.985633,0.991365,0.994513,0.990895,0.984339,0.991548,0.991452,0.992417,0.983504,0.987609
Asia,0.977857,0.994797,1.000000,0.977482,0.967870,0.988378,0.981573,0.991312,0.994839,0.992581,...,0.985958,0.990583,0.998255,0.988823,0.978005,0.994719,0.993356,0.994743,0.985114,0.983125
Association_football,0.985614,0.976733,0.977482,1.000000,0.975193,0.983742,0.980402,0.971284,0.980872,0.978592,...,0.964904,0.982058,0.973527,0.993619,0.991644,0.985044,0.979509,0.978977,0.985753,0.984915
Barack_Obama,0.985782,0.965361,0.967870,0.975193,1.000000,0.949647,0.953112,0.948126,0.979764,0.953694,...,0.960973,0.983000,0.964727,0.975057,0.961137,0.979197,0.980888,0.974872,0.986477,0.979243
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
United_Kingdom,0.978873,0.991548,0.994719,0.985044,0.979197,0.984397,0.973701,0.980435,0.994727,0.984588,...,0.977239,0.994979,0.994235,0.992554,0.981753,1.000000,0.996866,0.988862,0.985952,0.989701
United_States,0.978302,0.991452,0.993356,0.979509,0.980888,0.981583,0.974863,0.982548,0.995702,0.981814,...,0.983552,0.998223,0.994783,0.989393,0.976817,0.996866,1.000000,0.990525,0.987425,0.990076
Water,0.984134,0.992417,0.994743,0.978977,0.974872,0.983268,0.983440,0.990241,0.993454,0.988742,...,0.992090,0.990876,0.992951,0.986573,0.976996,0.988862,0.990525,1.000000,0.992396,0.984507
Wikipedia,0.990179,0.983504,0.985114,0.985753,0.986477,0.977733,0.982723,0.979578,0.989806,0.979606,...,0.982957,0.990261,0.982873,0.986947,0.979060,0.985952,0.987425,0.992396,1.000000,0.985818


In [9]:
data_english = english.replace([0], np.nan).replace([1], np.nan)
data_english.describe()

Unnamed: 0,Adolf_Hitler,Africa,Asia,Association_football,Barack_Obama,Bible,Buddha,Buddhism,China,Christianity,...,Silver,South_Africa,South_America,Soviet_Union,Sun,United_Kingdom,United_States,Water,Wikipedia,World_War_II
count,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,...,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0
mean,0.979579,0.983967,0.984568,0.977783,0.966201,0.981943,0.980224,0.982165,0.984906,0.983723,...,0.975879,0.982317,0.982444,0.98516,0.977771,0.982298,0.982286,0.985444,0.982867,0.981059
std,0.009062,0.01094,0.009947,0.008764,0.014191,0.010886,0.010114,0.011317,0.009889,0.01073,...,0.011811,0.011396,0.011778,0.007962,0.010192,0.011008,0.011422,0.008001,0.006715,0.008083
min,0.945645,0.959306,0.962532,0.95295,0.929198,0.941112,0.93056,0.940058,0.96403,0.937826,...,0.950634,0.954617,0.955222,0.961212,0.945135,0.953356,0.953626,0.96291,0.965803,0.963821
25%,0.975405,0.976733,0.977857,0.973527,0.95881,0.977498,0.976611,0.977511,0.979764,0.978592,...,0.966636,0.976105,0.973527,0.979222,0.973283,0.975381,0.976629,0.979873,0.978995,0.976419
50%,0.980227,0.988471,0.987961,0.978592,0.968147,0.983742,0.98183,0.985198,0.986856,0.985716,...,0.977239,0.985219,0.98665,0.986405,0.977862,0.985044,0.983552,0.987836,0.982957,0.98305
75%,0.985782,0.991452,0.992474,0.983742,0.975801,0.989794,0.986054,0.990073,0.992918,0.991393,...,0.984628,0.991803,0.991803,0.991914,0.984725,0.991548,0.990525,0.991774,0.988285,0.987153
max,0.993299,0.997363,0.998255,0.993619,0.996159,0.995271,0.994531,0.997588,0.99863,0.997832,...,0.996065,0.998223,0.998255,0.996613,0.998739,0.996866,0.998223,0.996894,0.992749,0.993895


In [10]:
data_english.min().min()

0.9015785702126288

In [16]:
afrikaans = create_dataframe_by_lang('cosine-sim-per-lang/afrikaans-cosine_sim.txt', topics, topic_mapping)
afrikaans['Earth']

Adolf_Hitler            0.969053
Africa                  0.995073
Asia                    0.991120
Association_football    0.982721
Barack_Obama            0.960238
                          ...   
United_Kingdom          0.984085
United_States           0.987128
Water                   0.978197
Wikipedia               0.969267
World_War_II            0.980926
Name: Earth, Length: 62, dtype: float64

In [20]:
afrikaans['Earth'].idxmin(axis=1)

'Religion'

In [24]:
afrikaans.to_csv('afrikaans.csv', sep=',')

In [15]:
data_afrikaans = afrikaans.replace([0], np.nan).replace([1], np.nan)
data_afrikaans.describe()['Earth']

count    61.000000
mean      0.968987
std       0.053230
min       0.591695
25%       0.970080
50%       0.979808
75%       0.984229
max       0.995429
Name: Earth, dtype: float64

In [13]:
data_afrikaans.min().min()

0.5645573926867273

In [14]:
"""
I know that it is Iran + South_Africa by looking at the file, but I want to 
be able to get the column and row name directly
"""
data_afrikaans.max().max()

0.9991618300614605

In [15]:
data_afrikaans.idxmax(0)

Adolf_Hitler              South_Africa
Africa                           Earth
Asia                            Europe
Association_football      Soviet_Union
Barack_Obama            George_W._Bush
                             ...      
United_Kingdom           United_States
United_States           United_Kingdom
Water                     South_Africa
Wikipedia                         Iron
World_War_II              Soviet_Union
Length: 62, dtype: object

In [16]:
descr = data_afrikaans.describe()
descr.loc['max'].idxmax(0)

'Iran'

In [17]:
print(data_afrikaans.where(data_afrikaans == 0.9991618300614605).dropna().index)
print(data_afrikaans.where(data_afrikaans == 0.9991618300614605).dropna().columns)

Index([], dtype='object')
Index(['Adolf_Hitler', 'Africa', 'Asia', 'Association_football',
       'Barack_Obama', 'Bible', 'Buddha', 'Buddhism', 'China', 'Christianity',
       'Christmas', 'Dog', 'Earth', 'English_Language', 'Europe', 'Eye',
       'George_W._Bush', 'Ghana', 'Gold', 'Hinduism', 'Human', 'India',
       'Internet', 'Iran', 'Iraq', 'Iron', 'Islam', 'Italy', 'Japan', 'Jesus',
       'Judaism', 'Julius_Caesar', 'Koran', 'Maize', 'Milk',
       'Mohandas_Karamchand_Gandhi', 'Money', 'Moon', 'Moses', 'Muhammad',
       'New_York_City', 'Niger', 'Osama_Bin_Laden', 'Paris', 'Periodic_table',
       'Pope_Benedict_XVI', 'Pope_John_Paul_II', 'Religion', 'Rice',
       'Roman_Catholic_Church', 'Rome', 'Russia', 'Silver', 'South_Africa',
       'South_America', 'Soviet_Union', 'Sun', 'United_Kingdom',
       'United_States', 'Water', 'Wikipedia', 'World_War_II'],
      dtype='object')


## Finding data on language families

The following is a very inelegant way to do it, but it works.

In [4]:
import requests
from bs4 import BeautifulSoup

# Attention! I am not getting the distinction 
# between families (e.g. Indo-European languages >
# Indo-Iranian > Bihari languages, Irani, etc.)

url = 'https://omniglot.com/writing/langfam.htm'
prev = requests.get(url).text
htmlParse = BeautifulSoup(prev, 'html.parser')
with open('languages.tmp', 'w') as f:
    for tag in htmlParse.find_all(["h2", "dl"]):
        f.write(str(tag) + '\n')

In [33]:
lang_fam = {}
f = open('languages.tmp', 'r')
for line in f.readlines():
    line = line.strip()
    if line.startswith('<dd'):
        line = line[4:]
        
    if line.startswith('<h2'):
        big_fam = line.split('>')[1].split('<')[0]
        lang_fam[big_fam] = {}
    elif line.startswith('<dt'):
        fam = line.split('>')[1].split('<')[0]
        lang_fam[big_fam][fam] = []
    elif line.startswith('<a'):
        lang = line.split('>')[1].split('<')[0]
        lang_fam[big_fam][fam].append(lang.lower())
f.close()

In [107]:
# lang_fam

In [100]:
only_lang = []
for big_fam, fam in lang_fam.items():
    for lang in fam.values():
        only_lang.extend(lang)

In [44]:
# only_lang

In [108]:
# languages that do not match
not_match = []
for lang in languages:
    if '_' in lang:
        lang = ' '.join(lang.split('_'))
    if lang not in only_lang:
        not_match.append(lang)

In [104]:
my_lang_fam = {}

for lang in list(languages.keys()):
    if '_' in lang:
        lang = ' '.join(lang.split('_'))
    
    for big_fam, family in lang_fam.items():
        for fam, langs in family.items():
            if lang in langs:
                if big_fam not in my_lang_fam:
                    my_lang_fam[big_fam] = {}

                if fam not in my_lang_fam[big_fam]:
                    my_lang_fam[big_fam][fam] = ['_'.join(lang.split(' '))]
                else:
                    my_lang_fam[big_fam][fam].append('_'.join(lang.split(' ')))

# Adding the ones that do not match
my_lang_fam['Afroasiatic languages']['Semitic languages'].append('arabic')
my_lang_fam['Uralic languages']['Sámi languages'] = ['north_sami']
my_lang_fam['Indo-European languages']['Indo-Iranian languages'].append('sanskrit')
my_lang_fam['Sino-Tibetan languages'] = {'Sinitic (Chinese) languages' : ['classical_chinese']}
my_lang_fam['Sino-Tibetan languages']['Sinitic (Chinese) languages'].append('chinese')

In [105]:
my_lang_fam

{'Indo-European languages': {'Germanic languages': ['english',
   'afrikaans',
   'danish',
   'german',
   'gothic',
   'dutch',
   'swedish'],
  'Slavic languages': ['belarusian',
   'bulgarian',
   'czech',
   'croatian',
   'polish',
   'russian',
   'slovak',
   'slovenian',
   'serbian',
   'ukrainian',
   'old_church_slavonic'],
  'Romance languages': ['catalan',
   'spanish',
   'french',
   'galician',
   'italian',
   'portuguese',
   'romanian'],
  'Celtic languages': ['welsh', 'irish', 'scottish_gaelic'],
  'Hellenic languages': ['greek'],
  'Indo-Iranian languages': ['persian',
   'hindi',
   'marathi',
   'urdu',
   'sanskrit'],
  'Armenian languages': ['armenian'],
  'Italic languages': ['latin'],
  'Baltic languages': ['latvian', 'lithuanian']},
 'Uralic languages': {'Finnic languages': ['estonian', 'finnish'],
  'Ugric languages': ['hungarian'],
  'Sámi languages': ['north_sami']},
 'Language isolates': {'Language isolates': ['basque']},
 'Koreanic languages': {'Korean

In [109]:
my_lang_fam.keys()

dict_keys(['Indo-European languages', 'Uralic languages', 'Language isolates', 'Koreanic languages', 'Austronesian languages', 'Afroasiatic languages', 'Turkic languages', 'Japonic / Japanese-Ryukyuan languages', 'Dravidian languages', 'Austroasiatic languages', 'Niger-Congo languages', 'Sino-Tibetan languages'])