# Visualisation of cosine similarity

I want to visualise the cosine similarities in a matrix. Let's start with one topic: Earth.

In [1]:
import os
import pandas as pd
import copy
import numpy as np

In [2]:
def get_lang_and_topics():
    """
    Gives information about the languages and topics
    of the data.
    
    Returns:
      - languages: a dictionary with the number of topics
        available for each language.
      - sorted_lang: a (sorted) list of languages.
      - lang_mapping: a dictionary mapping the sorted
        languages to an integer.
      - topics: a (sorted) list of topics.
    """
    languages = {}
    list_of_dirs = os.listdir('data/')
    
    print(f'There are {len(list_of_dirs)} directories')

    for directory in list_of_dirs:
        path = (f'data/{directory}')
        files = os.listdir(path)
        for f in files:
            lang = f.split('-')[1].split('.')[0]
            if lang in languages:
                languages[lang] += 1
            else:
                languages[lang] = 1
    #     print(f'There are {len(files)} files in {directory}.')
    print(f'There are {len(languages.keys())} languages')
    sorted_lang = sorted(list(languages.keys()))
    lang_mapping = {x:i for i, x in enumerate(sorted_lang)}
    
    topics = sorted(os.listdir('data/'))
    topic_mapping = {x:i for i, x in enumerate(topics)}
    
    return languages, sorted_lang, lang_mapping, topics, topic_mapping

In [3]:
languages, sorted_lang, lang_mapping, topics, topic_mapping = get_lang_and_topics()

There are 62 directories
There are 58 languages


In [7]:
print(languages.keys())

dict_keys(['english', 'afrikaans', 'arabic', 'belarusian', 'bulgarian', 'catalan', 'czech', 'welsh', 'danish', 'north_sami', 'german', 'estonian', 'greek', 'spanish', 'basque', 'persian', 'french', 'irish', 'scottish_gaelic', 'galician', 'gothic', 'korean', 'armenian', 'hindi', 'croatian', 'indonesian', 'italian', 'hebrew', 'kazakh', 'latin', 'latvian', 'lithuanian', 'hungarian', 'maltese', 'marathi', 'dutch', 'japanese', 'polish', 'portuguese', 'romanian', 'russian', 'sanskrit', 'slovak', 'slovenian', 'serbian', 'finnish', 'swedish', 'tamil', 'telugu', 'turkish', 'ukrainian', 'urdu', 'uyghur', 'vietnamese', 'classical_chinese', 'wolof', 'chinese', 'old_church_slavonic'])


In [8]:
print(topics)

['Adolf_Hitler', 'Africa', 'Asia', 'Association_football', 'Barack_Obama', 'Bible', 'Buddha', 'Buddhism', 'China', 'Christianity', 'Christmas', 'Dog', 'Earth', 'English_Language', 'Europe', 'Eye', 'George_W._Bush', 'Ghana', 'Gold', 'Hinduism', 'Human', 'India', 'Internet', 'Iran', 'Iraq', 'Iron', 'Islam', 'Italy', 'Japan', 'Jesus', 'Judaism', 'Julius_Caesar', 'Koran', 'Maize', 'Milk', 'Mohandas_Karamchand_Gandhi', 'Money', 'Moon', 'Moses', 'Muhammad', 'New_York_City', 'Niger', 'Osama_Bin_Laden', 'Paris', 'Periodic_table', 'Pope_Benedict_XVI', 'Pope_John_Paul_II', 'Religion', 'Rice', 'Roman_Catholic_Church', 'Rome', 'Russia', 'Silver', 'South_Africa', 'South_America', 'Soviet_Union', 'Sun', 'United_Kingdom', 'United_States', 'Water', 'Wikipedia', 'World_War_II']


In [4]:
def create_dataframe_by_topic(doc, lang, lang_mapping):
    """
    Creates pandas dataframe by topic
    
    Args:
      - doc: the document with the information.
      - lang: the list of sorted languages.
      - lang_mapping: a dictionary mapping the sorted
        languages to an integer.
    Returns:
      - df: pandas dataframe with information.
    """
    if '/' in doc:
        topic_name = doc.split('/')[-1].split('-cosine-sim')[0]
    else:
        topic_name = doc.split('-cosine-sim')[0]
    topic = {}
    for language in lang:
        topic[language] = [0]*len(lang)
        
    f = open(doc, "r")
    for line in f.readlines():
        if line[0].isdigit():
            cosine_sim = float(line[:-1])
            topic[lang1][lang_mapping[lang2]] = cosine_sim
            topic[lang2][lang_mapping[lang1]] = cosine_sim
            topic[lang1][lang_mapping[lang1]] = 1
            topic[lang2][lang_mapping[lang2]] = 1
        else: 
            files = line[:-1].split(' ')
            lang1 = files[0].split('-')[1].split('.')[0]
            try:
                lang2 = files[1].split('-')[1].split('.')[0]
            except:
                print(files)
    f.close()
    df = pd.DataFrame(topic)
    df.index = lang
    
    df.style.set_caption(f'{topic_name} by language')
    return df

In [18]:
# pd.set_option('display.max_columns', None)

In [20]:
topic_path = 'cosine-sim-per-topic/cosine-sim-plain-data/'
save_path = 'cosine-sim-per-topic/cosine-sim-dataframes/'
topic_files = os.listdir(topic_path)
for file in topic_files:
#     print(file)
    name = file.split("-")[0]
    df = create_dataframe_by_topic(topic_path+file, sorted_lang, lang_mapping)
    df.to_csv(f'{save_path}{name}.csv', sep=',')

In [4]:
def no_underscores(text):
    if '_' in text:
        sep = text.split('_')
        cap = [x.capitalize() for x in sep]
        newtext = ' '.join(cap)
        return newtext
    else:
        return text.capitalize()

In [5]:
import os
import numpy as np
import pandas as pd
topic = {} # topic: [max_sim, min_sim, avg_sim]
path_topic_df = 'cosine-sim-per-topic/cosine-sim-dataframes/'
topic_df = os.listdir(path_topic_df)
for dfname in topic_df:
    name = dfname.split('.')[0]
    more_df = pd.read_csv(path_topic_df+dfname, sep = ',', index_col = 0)
    df = more_df.replace([0], np.nan).replace([1], np.nan) # ignore 1s and 0s
    maxvalue = df.max().max()
    minvalue = df.min().min()
    descr = df.describe()
    max1 = descr.loc['max'].idxmax(0)
    max2 = df.idxmax(0)[max1]
    min1 = descr.loc['min'].idxmin(0)
    min2 = df.idxmin(0)[min1]
    meanvalue = df.mean().mean()
    topic[no_underscores(name)] = [round(maxvalue, 4), (no_underscores(max1) + ', ' + no_underscores(max2)), round(minvalue, 4), (no_underscores(min1) + ', ' + no_underscores(min2)), round(meanvalue, 4)]

In [6]:
topic

{'Adolf Hitler': [0.9928,
  'Catalan, Spanish',
  0.2138,
  'Japanese, Sanskrit',
  0.7383],
 'Africa': [0.996, 'Catalan, Spanish', 0.1238, 'Gothic, Hungarian', 0.7105],
 'Asia': [0.9896,
  'Belarusian, Ukrainian',
  0.1076,
  'Gothic, Hungarian',
  0.7125],
 'Association Football': [0.9968,
  'Catalan, Spanish',
  0.1195,
  'Gothic, Hungarian',
  0.7208],
 'Barack Obama': [0.9887,
  'Catalan, Spanish',
  0.1396,
  'Japanese, Sanskrit',
  0.6949],
 'Bible': [0.9895,
  'Catalan, Spanish',
  0.2033,
  'Classical Chinese, Japanese',
  0.7521],
 'Buddha': [0.9892, 'Czech, Slovak', 0.0926, 'Gothic, Hungarian', 0.7435],
 'Buddhism': [0.989,
  'Belarusian, Ukrainian',
  0.1734,
  'Japanese, Sanskrit',
  0.7623],
 'China': [0.9948,
  'Belarusian, Ukrainian',
  0.1296,
  'Gothic, North Sami',
  0.7045],
 'Christianity': [0.9952,
  'Catalan, Spanish',
  0.1638,
  'Japanese, Sanskrit',
  0.7344],
 'Christmas': [0.987,
  'Catalan, Spanish',
  0.1624,
  'Japanese, Sanskrit',
  0.7299],
 'Dog': [0.9

In [14]:
sum_av_per_topic = []
[sum_av_per_topic.append(x[-1]) for x in topic.values()]
np.average(sum_av_per_topic)
# sum_av_per_topic

0.7212838709677417

In [10]:
df_all_topics = pd.DataFrame.from_dict(topic, orient='index')
df_all_topics.columns = ['Max sim', 'Max lang', 'Min sim', 'Min lang', 'Avg sim']
df_all_topics.to_latex('cosine-sim-per-topic/all_topics_info.csv', caption = 'Cosine imilarity per topic of the Wikipedia', label = 'table:cosinesimtopic')

In [11]:
df_all_topics.describe()

Unnamed: 0,Max sim,Min sim,Avg sim
count,62.0,62.0,62.0
mean,0.992876,0.136866,0.721284
std,0.003682,0.060452,0.023342
min,0.9837,0.0308,0.659
25%,0.990325,0.090725,0.70545
50%,0.9935,0.1288,0.71645
75%,0.9956,0.165675,0.7386
max,0.9983,0.3501,0.7918


In [12]:
max_lang_topic = df_all_topics['Max lang'].tolist()
import collections
counter=collections.Counter(max_lang_topic)
counter

Counter({'Catalan, Spanish': 43,
         'Belarusian, Ukrainian': 9,
         'Czech, Slovak': 7,
         'Croatian, Serbian': 3})

In [127]:
43/62*100

69.35483870967742

In [13]:
min_lang_topic = df_all_topics['Min lang'].tolist()
mincounter=collections.Counter(min_lang_topic)
mincounter

Counter({'Japanese, Sanskrit': 18,
         'Gothic, Hungarian': 23,
         'Classical Chinese, Japanese': 2,
         'Gothic, North Sami': 1,
         'Latin, Sanskrit': 1,
         'Kazakh, Sanskrit': 1,
         'Galician, Sanskrit': 4,
         'Gothic, Kazakh': 1,
         'Gothic, Sanskrit': 1,
         'Hungarian, Japanese': 2,
         'Sanskrit, Wolof': 1,
         'Latin, Uyghur': 1,
         'Japanese, Marathi': 1,
         'Latin, Urdu': 1,
         'Japanese, Telugu': 1,
         'Old Church Slavonic, Uyghur': 1,
         'Gothic, Wolof': 1,
         'Hungarian, Urdu': 1})

In [15]:
import pandas as pd
lang_info = {} # topic: [max_sim, min_sim, avg_sim]
path_lang_df = 'cosine-sim-per-lang/cosine-sim-dataframes/'
lang_df = os.listdir(path_lang_df)
for dfname in lang_df:
    name = dfname.split('.')[0].capitalize()
    more_df = pd.read_csv(path_lang_df+dfname, sep = ',', index_col = 0)
    df = more_df.replace([0], np.nan).replace([1], np.nan) # ignore 1s and 0s
    maxvalue = df.max().max()
    minvalue = df.min().min()
    descr = df.describe()
    max1 = descr.loc['max'].idxmax(0)
    max2 = df.idxmax(0)[max1]
    min1 = descr.loc['min'].idxmin(0)
    min2 = df.idxmin(0)[min1]
    meanvalue = df.mean().mean()
    lang_info[no_underscores(name)] = [round(maxvalue, 4), (no_underscores(max1) + ', ' + no_underscores(max2)), round(minvalue, 4), (no_underscores(min1) + ', ' + no_underscores(min2)), round(meanvalue, 4)]

In [16]:
lang_info

{'Afrikaans': [0.9992,
  'Iran, South Africa',
  0.5646,
  'Association Football, Religion',
  0.954],
 'Arabic': [0.999, 'Gold, Silver', 0.9023, 'Muhammad, Soviet Union', 0.9794],
 'Armenian': [0.9976,
  'Italy, United Kingdom',
  0.7021,
  'George W. Bush, South Africa',
  0.9544],
 'Basque': [0.9974,
  'Europe, United States',
  0.7796,
  'English Language, Osama Bin Laden',
  0.9605],
 'Belarusian': [0.9982,
  'China, South Africa',
  0.8031,
  'Money, Osama Bin Laden',
  0.9596],
 'Bulgarian': [0.9983,
  'South Africa, United States',
  0.8746,
  'Association Football, World War Ii',
  0.9759],
 'Catalan': [0.9986, 'Eye, Milk', 0.9084, 'English Language, Money', 0.979],
 'Chinese': [0.9967, 'Japan, United States', 0.8518, 'China, Moses', 0.9653],
 'Classical Chinese': [0.9967,
  'China, Japan',
  0.679,
  'Human, South Africa',
  0.9437],
 'Croatian': [0.9972,
  'Italy, United States',
  0.8362,
  'Islam, Osama Bin Laden',
  0.9594],
 'Czech': [0.9985,
  'India, Japan',
  0.8859,


In [17]:
sum_av_per_lang = []
[sum_av_per_lang.append(x[-1]) for x in lang_info.values()]
np.average(sum_av_per_lang)
# sum_av_per_topic

0.9428586206896551

In [16]:
df_all_lang = pd.DataFrame.from_dict(lang_info, orient='index')
df_all_lang.columns = ['Max sim', 'Max lang', 'Min sim', 'Min lang', 'Avg sim']
df_all_lang.to_latex('cosine-sim-per-lang/all_lang_info.csv', caption = 'Cosine imilarity per language', label = 'table:cosinesimlang')

In [17]:
df_all_lang.describe()

Unnamed: 0,Max sim,Min sim,Avg sim
count,58.0,58.0,58.0
mean,0.997238,0.740162,0.942859
std,0.002604,0.185428,0.058615
min,0.987,0.2582,0.6791
25%,0.996725,0.649725,0.941525
50%,0.9979,0.7959,0.96145
75%,0.9987,0.88295,0.976725
max,0.9997,0.9458,0.9896


In [18]:
max_topic_lang = df_all_lang['Max lang'].tolist()
max_topic_lang_counter=collections.Counter(max_topic_lang)
max_topic_lang_counter

Counter({'Iran, South Africa': 1,
         'Gold, Silver': 1,
         'Italy, United Kingdom': 1,
         'Europe, United States': 1,
         'China, South Africa': 1,
         'South Africa, United States': 2,
         'Eye, Milk': 1,
         'Japan, United States': 1,
         'China, Japan': 2,
         'Italy, United States': 2,
         'India, Japan': 2,
         'China, Soviet Union': 1,
         'Moon, Sun': 3,
         'India, United States': 2,
         'China, Iran': 1,
         'India, Iran': 2,
         'China, India': 1,
         'Iran, Russia': 1,
         'New York City, United States': 1,
         'China, Russia': 2,
         'Iran, Italy': 1,
         'Africa, South America': 1,
         'Christianity, Judaism': 1,
         'Koran, Sun': 1,
         'Barack Obama, George W. Bush': 1,
         'Human, South America': 1,
         'Russia, United States': 1,
         'India, Paris': 1,
         'India, Russia': 1,
         'Asia, Europe': 2,
         'Bible, Europe':

In [20]:
{k: v for k, v in sorted(max_topic_lang_counter.items(), key=lambda item: item[1], reverse=True)}

{'Moon, Sun': 3,
 'South Africa, United States': 2,
 'China, Japan': 2,
 'Italy, United States': 2,
 'India, Japan': 2,
 'India, United States': 2,
 'India, Iran': 2,
 'China, Russia': 2,
 'Asia, Europe': 2,
 'Italy, Russia': 2,
 'Iran, South Africa': 1,
 'Gold, Silver': 1,
 'Italy, United Kingdom': 1,
 'Europe, United States': 1,
 'China, South Africa': 1,
 'Eye, Milk': 1,
 'Japan, United States': 1,
 'China, Soviet Union': 1,
 'China, Iran': 1,
 'China, India': 1,
 'Iran, Russia': 1,
 'New York City, United States': 1,
 'Iran, Italy': 1,
 'Africa, South America': 1,
 'Christianity, Judaism': 1,
 'Koran, Sun': 1,
 'Barack Obama, George W. Bush': 1,
 'Human, South America': 1,
 'Russia, United States': 1,
 'India, Paris': 1,
 'India, Russia': 1,
 'Bible, Europe': 1,
 'Iran, Soviet Union': 1,
 'Iraq, South Africa': 1,
 'Asia, South America': 1,
 'English Language, Japan': 1,
 'China, United States': 1,
 'China, Italy': 1,
 'Russia, South Africa': 1,
 'Islam, Judaism': 1,
 'Iraq, New Yor

In [19]:
min_topic_lang = df_all_lang['Min lang'].tolist()
min_topic_lang_counter=collections.Counter(min_topic_lang)
min_topic_lang_counter

Counter({'Association Football, Religion': 1,
         'Muhammad, Soviet Union': 1,
         'George W. Bush, South Africa': 1,
         'English Language, Osama Bin Laden': 1,
         'Money, Osama Bin Laden': 1,
         'Association Football, World War Ii': 1,
         'English Language, Money': 1,
         'China, Moses': 1,
         'Human, South Africa': 1,
         'Islam, Osama Bin Laden': 1,
         'Osama Bin Laden, Roman Catholic Church': 1,
         'Asia, Pope Benedict Xvi': 1,
         'Barack Obama, Sun': 1,
         'Julius Caesar, New York City': 1,
         'Pope John Paul Ii, South America': 1,
         'Asia, Osama Bin Laden': 1,
         'George W. Bush, Hinduism': 1,
         'Iran, Moses': 1,
         'Rome, Sun': 1,
         'Jesus, South America': 1,
         'Julius Caesar, Rice': 1,
         'Roman Catholic Church, Silver': 1,
         'Niger, Osama Bin Laden': 1,
         'English Language, Eye': 1,
         'Osama Bin Laden, Rice': 1,
         'Eye, Human

In [21]:
{k: v for k, v in sorted(min_topic_lang_counter.items(), key=lambda item: item[1], reverse=True)}

{'Osama Bin Laden, Water': 2,
 'Asia, Barack Obama': 2,
 'English Language, Pope John Paul Ii': 2,
 'Association Football, Religion': 1,
 'Muhammad, Soviet Union': 1,
 'George W. Bush, South Africa': 1,
 'English Language, Osama Bin Laden': 1,
 'Money, Osama Bin Laden': 1,
 'Association Football, World War Ii': 1,
 'English Language, Money': 1,
 'China, Moses': 1,
 'Human, South Africa': 1,
 'Islam, Osama Bin Laden': 1,
 'Osama Bin Laden, Roman Catholic Church': 1,
 'Asia, Pope Benedict Xvi': 1,
 'Barack Obama, Sun': 1,
 'Julius Caesar, New York City': 1,
 'Pope John Paul Ii, South America': 1,
 'Asia, Osama Bin Laden': 1,
 'George W. Bush, Hinduism': 1,
 'Iran, Moses': 1,
 'Rome, Sun': 1,
 'Jesus, South America': 1,
 'Julius Caesar, Rice': 1,
 'Roman Catholic Church, Silver': 1,
 'Niger, Osama Bin Laden': 1,
 'English Language, Eye': 1,
 'Osama Bin Laden, Rice': 1,
 'Eye, Human': 1,
 'English Language, World War Ii': 1,
 'Human, Osama Bin Laden': 1,
 'Ghana, Osama Bin Laden': 1,
 'Pop

In [15]:
Barack_Obama = create_dataframe_by_topic('cosine-sim-per-topic/cosine-sim-plain-data/Barack_Obama-cosine-sim.txt', sorted_lang, lang_mapping)
Barack_Obama

Unnamed: 0,afrikaans,arabic,armenian,basque,belarusian,bulgarian,catalan,chinese,classical_chinese,croatian,...,swedish,tamil,telugu,turkish,ukrainian,urdu,uyghur,vietnamese,welsh,wolof
afrikaans,1.0,0.740499,0.645476,0.709308,0.764804,0.917652,0.941216,0.636963,0,0.905333,...,0.898306,0.657098,0.650381,0.629815,0.864364,0.822071,0.691817,0.629302,0.896418,0
arabic,0.740499,1.0,0.460484,0.731411,0.813803,0.826204,0.675491,0.717349,0,0.771868,...,0.759872,0.854264,0.490425,0.519632,0.849624,0.740629,0.617805,0.52502,0.829936,0
armenian,0.645476,0.460484,1.0,0.723931,0.662375,0.638839,0.622327,0.617424,0,0.793283,...,0.788443,0.575944,0.690886,0.859562,0.718919,0.583799,0.877069,0.618555,0.71255,0
basque,0.709308,0.731411,0.723931,1.0,0.879942,0.752758,0.678833,0.841968,0,0.831769,...,0.745908,0.900607,0.588648,0.656925,0.884836,0.634647,0.846441,0.574505,0.817934,0
belarusian,0.764804,0.813803,0.662375,0.879942,1.0,0.871272,0.719914,0.798464,0,0.862767,...,0.80059,0.833767,0.544189,0.579708,0.945893,0.703386,0.782668,0.578098,0.880846,0
bulgarian,0.917652,0.826204,0.638839,0.752758,0.871272,1.0,0.860686,0.688282,0,0.935965,...,0.907253,0.710092,0.6158,0.602991,0.925761,0.852656,0.70897,0.633176,0.925746,0
catalan,0.941216,0.675491,0.622327,0.678833,0.719914,0.860686,1.0,0.651665,0,0.855523,...,0.869611,0.58692,0.645379,0.636297,0.805756,0.789681,0.627943,0.671813,0.876653,0
chinese,0.636963,0.717349,0.617424,0.841968,0.798464,0.688282,0.651665,1.0,0,0.733407,...,0.75648,0.846494,0.629613,0.557989,0.83306,0.668203,0.78915,0.738799,0.778571,0
classical_chinese,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
croatian,0.905333,0.771868,0.793283,0.831769,0.862767,0.935965,0.855523,0.733407,0,1.0,...,0.93707,0.75568,0.741359,0.705604,0.93603,0.860361,0.843813,0.709472,0.922124,0


In [25]:
earth = create_dataframe_by_topic('cosine-sim-per-topic/cosine-sim-plain-data/Earth-cosine-sim.txt', sorted_lang, lang_mapping)
earth

Unnamed: 0,afrikaans,arabic,armenian,basque,belarusian,bulgarian,catalan,chinese,classical_chinese,croatian,...,swedish,tamil,telugu,turkish,ukrainian,urdu,uyghur,vietnamese,welsh,wolof
afrikaans,1.0,0.717868,0.737107,0.784911,0.843477,0.854647,0.957855,0.635391,0.54239,0.834255,...,0.910377,0.705331,0.668543,0.729167,0.805735,0.815809,0.633716,0.718369,0.924586,0.786745
arabic,0.717868,1.0,0.572862,0.765315,0.862397,0.870333,0.75008,0.717639,0.545526,0.822724,...,0.784902,0.88406,0.547872,0.6763,0.817848,0.798621,0.63677,0.600479,0.813661,0.582477
armenian,0.737107,0.572862,1.0,0.822031,0.820188,0.74004,0.641944,0.648993,0.501025,0.859671,...,0.819504,0.692977,0.787433,0.868504,0.836587,0.679553,0.758473,0.621737,0.730573,0.735974
basque,0.784911,0.765315,0.822031,1.0,0.889179,0.799527,0.762523,0.859931,0.621538,0.879532,...,0.829046,0.914378,0.809631,0.769343,0.897802,0.733441,0.837458,0.68619,0.872219,0.800168
belarusian,0.843477,0.862397,0.820188,0.889179,1.0,0.941232,0.822175,0.786829,0.58084,0.958669,...,0.908872,0.878818,0.744316,0.786972,0.982175,0.841625,0.783111,0.706854,0.891804,0.702307
bulgarian,0.854647,0.870333,0.74004,0.799527,0.941232,1.0,0.857342,0.720861,0.586022,0.93988,...,0.929925,0.79657,0.654617,0.717805,0.902245,0.914926,0.702696,0.720172,0.893354,0.651316
catalan,0.957855,0.75008,0.641944,0.762523,0.822175,0.857342,1.0,0.661127,0.530181,0.79611,...,0.89873,0.690888,0.632526,0.677148,0.782854,0.825162,0.586426,0.720417,0.94616,0.779693
chinese,0.635391,0.717639,0.648993,0.859931,0.786829,0.720861,0.661127,1.0,0.613016,0.774934,...,0.723389,0.820563,0.752488,0.648826,0.817126,0.682119,0.853253,0.77314,0.741518,0.701533
classical_chinese,0.54239,0.545526,0.501025,0.621538,0.58084,0.586022,0.530181,0.613016,1.0,0.602436,...,0.617902,0.584359,0.593445,0.516433,0.541321,0.535703,0.652453,0.567002,0.601543,0.543926
croatian,0.834255,0.822724,0.859671,0.879532,0.958669,0.93988,0.79611,0.774934,0.602436,1.0,...,0.921881,0.850913,0.745961,0.795785,0.949439,0.83488,0.819868,0.729877,0.870601,0.684122


In [25]:
earth.to_csv('earth.csv', sep=',')

In [26]:
data_earth = earth.replace([0], np.nan).replace([1], np.nan)
data_earth.describe()

Unnamed: 0,afrikaans,arabic,armenian,basque,belarusian,bulgarian,catalan,chinese,classical_chinese,croatian,...,swedish,tamil,telugu,turkish,ukrainian,urdu,uyghur,vietnamese,welsh,wolof
count,57.0,57.0,57.0,57.0,57.0,57.0,57.0,57.0,57.0,57.0,...,57.0,57.0,57.0,57.0,57.0,57.0,57.0,57.0,57.0,57.0
mean,0.782352,0.708874,0.722937,0.7782,0.810312,0.79243,0.768247,0.692261,0.554319,0.807845,...,0.82364,0.727457,0.691386,0.714335,0.79281,0.748568,0.697269,0.679358,0.809801,0.692585
std,0.143641,0.136183,0.141286,0.109804,0.134254,0.141679,0.154872,0.107167,0.09261,0.133295,...,0.124547,0.121447,0.103806,0.115087,0.133987,0.133419,0.112701,0.10565,0.128688,0.103268
min,0.303803,0.313805,0.162614,0.453049,0.392026,0.37812,0.320242,0.408813,0.308191,0.356189,...,0.33278,0.391305,0.274008,0.292557,0.382666,0.341094,0.404694,0.259882,0.42617,0.240545
25%,0.705331,0.604548,0.641944,0.733441,0.753881,0.717805,0.661127,0.640436,0.516433,0.748582,...,0.770729,0.65162,0.654617,0.669593,0.734167,0.673625,0.623005,0.650496,0.737483,0.667674
50%,0.815809,0.744004,0.74004,0.791205,0.812806,0.82503,0.790716,0.717639,0.559483,0.822724,...,0.856376,0.707924,0.713454,0.729167,0.789027,0.781032,0.709409,0.706884,0.855622,0.707868
75%,0.893679,0.822724,0.835795,0.859931,0.908872,0.905889,0.89873,0.77314,0.604127,0.915076,...,0.910377,0.820563,0.752488,0.79314,0.897802,0.84368,0.797151,0.729541,0.89546,0.749092
max,0.970156,0.88406,0.922777,0.948029,0.982175,0.959937,0.998108,0.859931,0.81871,0.979507,...,0.983728,0.928044,0.869905,0.892693,0.982175,0.970134,0.886409,0.921201,0.964445,0.82847


In [27]:
data_earth.min().min()

0.11802990260501298

In [28]:
data_earth.max().max()

0.9981078950602288

In [6]:
def create_dataframe_by_lang(doc, topics, topic_mapping):
    """
    Creates pandas dataframe by language
    
    Args:
      - doc: the document with the information.
      - topics: the list of sorted topics.
      - topic_mapping: a dictionary mapping the sorted
        topics to an integer.
    Returns:
      - df: pandas dataframe with information.
    """
    if '/' in doc:
        language_name = doc.split('/')[-1].split('-cosine_sim')[0]
    else:
        language_name = doc.split('-cosine_sim')[0]
    
    lang = {}
    for topic in topics:
        lang[topic] = [0]*len(topics)
        
    f = open(doc, "r")
    for line in f.readlines():
        if line[0].isdigit():
            cosine_sim = float(line[:-1])
            lang[topic1][topic_mapping[topic2]] = cosine_sim
            lang[topic2][topic_mapping[topic1]] = cosine_sim
            lang[topic1][topic_mapping[topic1]] = 1
            lang[topic2][topic_mapping[topic2]] = 1
        else: 
            languages = line[:-1].split(' ')
            topic1 = languages[0].split('-')[0]
            topic2 = languages[1].split('-')[0]
    f.close()
    df = pd.DataFrame(lang)
    df.index = topics
    df.style.set_caption(f'{language_name} by topic')
    
    return df

In [37]:
lang_path = 'cosine-sim-per-lang/cosine-sim-plain-data/'
lang_save_path = 'cosine-sim-per-lang/cosine-sim-dataframes/'
lang_files = os.listdir(lang_path)
for file in lang_files:
    name = file.split("-")[0]
    df = create_dataframe_by_lang(lang_path+file, topics, topic_mapping)
    df.to_csv(f'{lang_save_path}{name}.csv', sep=',')

In [8]:
english = create_dataframe_by_lang('cosine-sim-per-lang/cosine-sim-plain-data/english-cosine_sim.txt', topics, topic_mapping)
english

Unnamed: 0,Adolf_Hitler,Africa,Asia,Association_football,Barack_Obama,Bible,Buddha,Buddhism,China,Christianity,...,Silver,South_Africa,South_America,Soviet_Union,Sun,United_Kingdom,United_States,Water,Wikipedia,World_War_II
Adolf_Hitler,1.000000,0.975701,0.977857,0.985614,0.985782,0.974498,0.981897,0.972502,0.982869,0.980596,...,0.966793,0.978423,0.971761,0.987888,0.978980,0.978873,0.978302,0.984134,0.990179,0.985624
Africa,0.975701,1.000000,0.994797,0.976733,0.965361,0.988471,0.978266,0.990315,0.996866,0.991179,...,0.985633,0.991365,0.994513,0.990895,0.984339,0.991548,0.991452,0.992417,0.983504,0.987609
Asia,0.977857,0.994797,1.000000,0.977482,0.967870,0.988378,0.981573,0.991312,0.994839,0.992581,...,0.985958,0.990583,0.998255,0.988823,0.978005,0.994719,0.993356,0.994743,0.985114,0.983125
Association_football,0.985614,0.976733,0.977482,1.000000,0.975193,0.983742,0.980402,0.971284,0.980872,0.978592,...,0.964904,0.982058,0.973527,0.993619,0.991644,0.985044,0.979509,0.978977,0.985753,0.984915
Barack_Obama,0.985782,0.965361,0.967870,0.975193,1.000000,0.949647,0.953112,0.948126,0.979764,0.953694,...,0.960973,0.983000,0.964727,0.975057,0.961137,0.979197,0.980888,0.974872,0.986477,0.979243
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
United_Kingdom,0.978873,0.991548,0.994719,0.985044,0.979197,0.984397,0.973701,0.980435,0.994727,0.984588,...,0.977239,0.994979,0.994235,0.992554,0.981753,1.000000,0.996866,0.988862,0.985952,0.989701
United_States,0.978302,0.991452,0.993356,0.979509,0.980888,0.981583,0.974863,0.982548,0.995702,0.981814,...,0.983552,0.998223,0.994783,0.989393,0.976817,0.996866,1.000000,0.990525,0.987425,0.990076
Water,0.984134,0.992417,0.994743,0.978977,0.974872,0.983268,0.983440,0.990241,0.993454,0.988742,...,0.992090,0.990876,0.992951,0.986573,0.976996,0.988862,0.990525,1.000000,0.992396,0.984507
Wikipedia,0.990179,0.983504,0.985114,0.985753,0.986477,0.977733,0.982723,0.979578,0.989806,0.979606,...,0.982957,0.990261,0.982873,0.986947,0.979060,0.985952,0.987425,0.992396,1.000000,0.985818


In [9]:
data_english = english.replace([0], np.nan).replace([1], np.nan)
data_english.describe()

Unnamed: 0,Adolf_Hitler,Africa,Asia,Association_football,Barack_Obama,Bible,Buddha,Buddhism,China,Christianity,...,Silver,South_Africa,South_America,Soviet_Union,Sun,United_Kingdom,United_States,Water,Wikipedia,World_War_II
count,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,...,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0
mean,0.979579,0.983967,0.984568,0.977783,0.966201,0.981943,0.980224,0.982165,0.984906,0.983723,...,0.975879,0.982317,0.982444,0.98516,0.977771,0.982298,0.982286,0.985444,0.982867,0.981059
std,0.009062,0.01094,0.009947,0.008764,0.014191,0.010886,0.010114,0.011317,0.009889,0.01073,...,0.011811,0.011396,0.011778,0.007962,0.010192,0.011008,0.011422,0.008001,0.006715,0.008083
min,0.945645,0.959306,0.962532,0.95295,0.929198,0.941112,0.93056,0.940058,0.96403,0.937826,...,0.950634,0.954617,0.955222,0.961212,0.945135,0.953356,0.953626,0.96291,0.965803,0.963821
25%,0.975405,0.976733,0.977857,0.973527,0.95881,0.977498,0.976611,0.977511,0.979764,0.978592,...,0.966636,0.976105,0.973527,0.979222,0.973283,0.975381,0.976629,0.979873,0.978995,0.976419
50%,0.980227,0.988471,0.987961,0.978592,0.968147,0.983742,0.98183,0.985198,0.986856,0.985716,...,0.977239,0.985219,0.98665,0.986405,0.977862,0.985044,0.983552,0.987836,0.982957,0.98305
75%,0.985782,0.991452,0.992474,0.983742,0.975801,0.989794,0.986054,0.990073,0.992918,0.991393,...,0.984628,0.991803,0.991803,0.991914,0.984725,0.991548,0.990525,0.991774,0.988285,0.987153
max,0.993299,0.997363,0.998255,0.993619,0.996159,0.995271,0.994531,0.997588,0.99863,0.997832,...,0.996065,0.998223,0.998255,0.996613,0.998739,0.996866,0.998223,0.996894,0.992749,0.993895


In [10]:
data_english.min().min()

0.9015785702126288

In [54]:
afrikaans = create_dataframe_by_lang('cosine-sim-per-lang/cosine-sim-plain-data/afrikaans-cosine_sim.txt', topics, topic_mapping)
afrikaans

Unnamed: 0,Adolf_Hitler,Africa,Asia,Association_football,Barack_Obama,Bible,Buddha,Buddhism,China,Christianity,...,Silver,South_Africa,South_America,Soviet_Union,Sun,United_Kingdom,United_States,Water,Wikipedia,World_War_II
Adolf_Hitler,1.000000,0.966598,0.963506,0.972560,0.985633,0.934698,0.923473,0.967446,0.990543,0.954437,...,0.968214,0.993541,0.948863,0.982856,0.968393,0.987178,0.988613,0.990689,0.969033,0.989324
Africa,0.966598,1.000000,0.990798,0.977404,0.952154,0.973128,0.953380,0.983808,0.977767,0.981333,...,0.960753,0.979101,0.976029,0.989032,0.986400,0.983191,0.984779,0.978775,0.969733,0.972836
Asia,0.963506,0.990798,1.000000,0.972934,0.950737,0.985044,0.964672,0.990397,0.977004,0.983630,...,0.973139,0.975508,0.983631,0.987201,0.977663,0.983685,0.981806,0.974005,0.972886,0.964176
Association_football,0.972560,0.977404,0.972934,1.000000,0.970184,0.958496,0.956600,0.978144,0.968198,0.963662,...,0.951906,0.972397,0.972491,0.988485,0.980366,0.976394,0.980823,0.969994,0.978129,0.981297
Barack_Obama,0.985633,0.952154,0.950737,0.970184,1.000000,0.924018,0.902386,0.951621,0.979922,0.939763,...,0.953191,0.983501,0.953140,0.973080,0.960766,0.979335,0.982494,0.972672,0.958564,0.982218
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
United_Kingdom,0.987178,0.983191,0.983685,0.976394,0.979335,0.961420,0.938533,0.982794,0.995752,0.963981,...,0.980077,0.995945,0.976896,0.991142,0.972383,1.000000,0.998601,0.985870,0.972456,0.981751
United_States,0.988613,0.984779,0.981806,0.980823,0.982494,0.958056,0.939177,0.980922,0.993154,0.967610,...,0.972957,0.995174,0.978155,0.994059,0.978277,0.998601,1.000000,0.986707,0.969565,0.988126
Water,0.990689,0.978775,0.974005,0.969994,0.972672,0.943676,0.932885,0.975732,0.991569,0.970932,...,0.974127,0.994073,0.951300,0.983298,0.977264,0.985870,0.986707,1.000000,0.970389,0.984244
Wikipedia,0.969033,0.969733,0.972886,0.978129,0.958564,0.968721,0.953603,0.982972,0.974986,0.955530,...,0.972725,0.974536,0.950857,0.972385,0.962428,0.972456,0.969565,0.970389,1.000000,0.957950


In [39]:
afrikaans['Earth'].idxmin(axis=1)

'Religion'

In [24]:
afrikaans.to_csv('afrikaans.csv', sep=',')

In [53]:
data_afrikaans = afrikaans.replace([1], np.nan)
data_afrikaans.describe()['Earth']

count    61.000000
mean      0.968987
std       0.053230
min       0.591695
25%       0.970080
50%       0.979808
75%       0.984229
max       0.995429
Name: Earth, dtype: float64

In [52]:
data_afrikaans.min().min()

0.5645573926867273

In [14]:
"""
I know that it is Iran + South_Africa by looking at the file, but I want to 
be able to get the column and row name directly
"""
data_afrikaans.max().max()

0.9991618300614605

In [47]:
data_afrikaans.idxmax(0)

Adolf_Hitler              South_Africa
Africa                           Earth
Asia                            Europe
Association_football      Soviet_Union
Barack_Obama            George_W._Bush
                             ...      
United_Kingdom           United_States
United_States           United_Kingdom
Water                     South_Africa
Wikipedia                         Iron
World_War_II              Soviet_Union
Length: 62, dtype: object

In [50]:
descr = data_afrikaans.describe()
one = descr.loc['max'].idxmax(0)
other = data_afrikaans.idxmax(0)[one]
print(one, other)

Iran South_Africa


In [51]:
descr = data_afrikaans.describe()
one = descr.loc['min'].idxmin(0)
other = data_afrikaans.idxmin(0)[one]
print(one, other)

Association_football Religion


In [17]:
print(data_afrikaans.where(data_afrikaans == 0.9991618300614605).dropna().index)
print(data_afrikaans.where(data_afrikaans == 0.9991618300614605).dropna().columns)

Index([], dtype='object')
Index(['Adolf_Hitler', 'Africa', 'Asia', 'Association_football',
       'Barack_Obama', 'Bible', 'Buddha', 'Buddhism', 'China', 'Christianity',
       'Christmas', 'Dog', 'Earth', 'English_Language', 'Europe', 'Eye',
       'George_W._Bush', 'Ghana', 'Gold', 'Hinduism', 'Human', 'India',
       'Internet', 'Iran', 'Iraq', 'Iron', 'Islam', 'Italy', 'Japan', 'Jesus',
       'Judaism', 'Julius_Caesar', 'Koran', 'Maize', 'Milk',
       'Mohandas_Karamchand_Gandhi', 'Money', 'Moon', 'Moses', 'Muhammad',
       'New_York_City', 'Niger', 'Osama_Bin_Laden', 'Paris', 'Periodic_table',
       'Pope_Benedict_XVI', 'Pope_John_Paul_II', 'Religion', 'Rice',
       'Roman_Catholic_Church', 'Rome', 'Russia', 'Silver', 'South_Africa',
       'South_America', 'Soviet_Union', 'Sun', 'United_Kingdom',
       'United_States', 'Water', 'Wikipedia', 'World_War_II'],
      dtype='object')


## Finding data on language families

The following is a very inelegant way to do it, but it works.

In [7]:
import requests
from bs4 import BeautifulSoup

# Attention! I am not getting the distinction 
# between families (e.g. Indo-European languages >
# Indo-Iranian > Bihari languages, Irani, etc.)

url = 'https://omniglot.com/writing/langfam.htm'
prev = requests.get(url).text
htmlParse = BeautifulSoup(prev, 'html.parser')
with open('languages.tmp', 'w') as f:
    for tag in htmlParse.find_all(["h2", "dl"]):
        f.write(str(tag) + '\n')

In [8]:
lang_fam = {}
f = open('languages.tmp', 'r')
for line in f.readlines():
    line = line.strip()
    if line.startswith('<dd'):
        line = line[4:]
        
    if line.startswith('<h2'):
        big_fam = line.split('>')[1].split('<')[0]
        lang_fam[big_fam] = {}
    elif line.startswith('<dt'):
        fam = line.split('>')[1].split('<')[0]
        lang_fam[big_fam][fam] = []
    elif line.startswith('<a'):
        lang = line.split('>')[1].split('<')[0]
        lang_fam[big_fam][fam].append(lang.lower())
f.close()

In [9]:
# lang_fam

In [10]:
only_lang = []
for big_fam, fam in lang_fam.items():
    for lang in fam.values():
        only_lang.extend(lang)

In [11]:
# only_lang

In [12]:
# languages that do not match
not_match = []
for lang in languages:
    if '_' in lang:
        lang = ' '.join(lang.split('_'))
    if lang not in only_lang:
        not_match.append(lang)

In [13]:
my_lang_fam = {}

for lang in list(languages.keys()):
    if '_' in lang:
        lang = ' '.join(lang.split('_'))
    
    for big_fam, family in lang_fam.items():
        for fam, langs in family.items():
            if lang in langs:
                if big_fam not in my_lang_fam:
                    my_lang_fam[big_fam] = {}

                if fam not in my_lang_fam[big_fam]:
                    my_lang_fam[big_fam][fam] = ['_'.join(lang.split(' '))]
                else:
                    my_lang_fam[big_fam][fam].append('_'.join(lang.split(' ')))

# Adding the ones that do not match
my_lang_fam['Afroasiatic languages']['Semitic languages'].append('arabic')
my_lang_fam['Uralic languages']['Sámi languages'] = ['north_sami']
my_lang_fam['Indo-European languages']['Indo-Iranian languages'].append('sanskrit')
my_lang_fam['Sino-Tibetan languages'] = {'Sinitic (Chinese) languages' : ['classical_chinese']}
my_lang_fam['Sino-Tibetan languages']['Sinitic (Chinese) languages'].append('chinese')

In [14]:
my_lang_fam

{'Indo-European languages': {'Germanic languages': ['english',
   'afrikaans',
   'danish',
   'german',
   'gothic',
   'dutch',
   'swedish'],
  'Slavic languages': ['belarusian',
   'bulgarian',
   'czech',
   'croatian',
   'polish',
   'russian',
   'slovak',
   'slovenian',
   'serbian',
   'ukrainian',
   'old_church_slavonic'],
  'Romance languages': ['catalan',
   'spanish',
   'french',
   'galician',
   'italian',
   'portuguese',
   'romanian'],
  'Celtic languages': ['welsh', 'irish', 'scottish_gaelic'],
  'Hellenic languages': ['greek'],
  'Indo-Iranian languages': ['persian',
   'hindi',
   'marathi',
   'urdu',
   'sanskrit'],
  'Armenian languages': ['armenian'],
  'Italic languages': ['latin'],
  'Baltic languages': ['latvian', 'lithuanian']},
 'Uralic languages': {'Finnic languages': ['estonian', 'finnish'],
  'Ugric languages': ['hungarian'],
  'Sámi languages': ['north_sami']},
 'Language isolates': {'Language isolates': ['basque']},
 'Koreanic languages': {'Korean

In [14]:
# saving it into a file so I do not have to run it many times
with open('my_lang_fam.txt', 'w') as f:
    for big_family, fam_dict in my_lang_fam.items():
        f.write(f'{big_family}\n')
        for fam, languages in fam_dict.items():
            f.write(f'\t{fam}\n')
            for lang in languages:
                f.write(f'\t\t{lang}\n')

In [16]:
# saving it into a file so I do not have to run it many times
import pickle
a_file = open("ignore/my_lang_fam.pkl", "wb")
pickle.dump(my_lang_fam, a_file)
a_file.close()

In [12]:
my_lang_fam.keys()

dict_keys(['Indo-European languages', 'Uralic languages', 'Language isolates', 'Koreanic languages', 'Austronesian languages', 'Afroasiatic languages', 'Turkic languages', 'Japonic / Japanese-Ryukyuan languages', 'Dravidian languages', 'Austroasiatic languages', 'Niger-Congo languages', 'Sino-Tibetan languages'])