# Visualisation of cosine similarity

I want to visualise the cosine similarities in a matrix. Let's start with one topic: Earth.

In [7]:
import os
import pandas as pd
import copy
import numpy as np

In [2]:
def get_lang_and_topics():
    """
    Gives information about the languages and topics
    of the data.
    
    Returns:
      - languages: a dictionary with the number of topics
        available for each language.
      - sorted_lang: a (sorted) list of languages.
      - lang_mapping: a dictionary mapping the sorted
        languages to an integer.
      - topics: a (sorted) list of topics.
    """
    languages = {}
    list_of_dirs = os.listdir('data/')
    
    print(f'There are {len(list_of_dirs)} directories')

    for directory in list_of_dirs:
        path = (f'data/{directory}')
        files = os.listdir(path)
        for f in files:
            lang = f.split('-')[1].split('.')[0]
            if lang in languages:
                languages[lang] += 1
            else:
                languages[lang] = 1
    #     print(f'There are {len(files)} files in {directory}.')
    print(f'There are {len(languages.keys())} languages')
    sorted_lang = sorted(list(languages.keys()))
    lang_mapping = {x:i for i, x in enumerate(sorted_lang)}
    
    topics = sorted(os.listdir('data/'))
    topic_mapping = {x:i for i, x in enumerate(topics)}
    
    return languages, sorted_lang, lang_mapping, topics, topic_mapping

In [3]:
languages, sorted_lang, lang_mapping, topics, topic_mapping = get_lang_and_topics()

There are 62 directories
There are 58 languages


In [4]:
def create_dataframe_by_topic(doc, lang, lang_mapping):
    """
    Creates pandas dataframe by topic
    
    Args:
      - doc: the document with the information.
      - lang: the list of sorted languages.
      - lang_mapping: a dictionary mapping the sorted
        languages to an integer.
    Returns:
      - df: pandas dataframe with information.
    """
    if '/' in doc:
        topic_name = doc.split('/')[-1].split('-cosine-sim')[0]
    else:
        topic_name = doc.split('-cosine-sim')[0]
    topic = {}
    for language in lang:
        topic[language] = [0]*len(lang)
        
    f = open(doc, "r")
    for line in f.readlines():
        if line.startswith('0'):
            cosine_sim = float(line[:-1])
            topic[lang1][lang_mapping[lang2]] = cosine_sim
            topic[lang2][lang_mapping[lang1]] = cosine_sim
            topic[lang1][lang_mapping[lang1]] = 1
            topic[lang2][lang_mapping[lang2]] = 1
        else: 
            files = line[:-1].split(' ')
            lang1 = files[0].split('-')[1].split('.')[0]
            lang2 = files[1].split('-')[1].split('.')[0]
    f.close()
    df = pd.DataFrame(topic)
    df.index = lang
    
    df.style.set_caption(f'{topic_name} by language')
    return df

In [5]:
earth = create_dataframe_by_topic('cosine-sim-per-topic/Earth-cosine-sim.txt', sorted_lang, lang_mapping)
earth

Unnamed: 0,afrikaans,arabic,armenian,basque,belarusian,bulgarian,catalan,chinese,classical_chinese,croatian,...,swedish,tamil,telugu,turkish,ukrainian,urdu,uyghur,vietnamese,welsh,wolof
afrikaans,1.0,0.717868,0.0,0.784911,0.0,0.0,0.0,0.635391,0.0,0.0,...,0.0,0.705331,0.0,0.0,0.0,0.0,0.0,0.718369,0.0,0.0
arabic,0.717868,1.0,0.0,0.765315,0.0,0.0,0.0,0.717639,0.0,0.0,...,0.0,0.88406,0.0,0.0,0.0,0.0,0.0,0.600479,0.0,0.0
armenian,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.679553,0.0,0.0,0.0,0.0
basque,0.784911,0.765315,0.0,1.0,0.0,0.0,0.0,0.859931,0.0,0.0,...,0.0,0.914378,0.0,0.0,0.0,0.0,0.0,0.68619,0.0,0.0
belarusian,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.744316,0.0,0.0,0.0,0.0,0.0,0.0,0.0
bulgarian,0.0,0.0,0.0,0.0,0.0,1.0,0.857342,0.0,0.586022,0.93988,...,0.929925,0.0,0.0,0.717805,0.902245,0.0,0.0,0.0,0.893354,0.0
catalan,0.0,0.0,0.0,0.0,0.0,0.857342,1.0,0.0,0.530181,0.79611,...,0.89873,0.0,0.0,0.677148,0.782854,0.0,0.0,0.0,0.94616,0.0
chinese,0.635391,0.717639,0.0,0.859931,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.820563,0.0,0.0,0.0,0.0,0.0,0.77314,0.0,0.0
classical_chinese,0.0,0.0,0.0,0.0,0.0,0.586022,0.530181,0.0,1.0,0.602436,...,0.617902,0.0,0.0,0.516433,0.541321,0.0,0.0,0.0,0.601543,0.0
croatian,0.0,0.0,0.0,0.0,0.0,0.93988,0.79611,0.0,0.602436,1.0,...,0.921881,0.0,0.0,0.795785,0.949439,0.0,0.0,0.0,0.870601,0.0


In [10]:
data_earth = earth.replace([0], np.nan).replace([1], np.nan)
data_earth.describe()

Unnamed: 0,afrikaans,arabic,armenian,basque,belarusian,bulgarian,catalan,chinese,classical_chinese,croatian,...,swedish,tamil,telugu,turkish,ukrainian,urdu,uyghur,vietnamese,welsh,wolof
count,19.0,19.0,4.0,19.0,7.0,19.0,19.0,19.0,19.0,19.0,...,19.0,19.0,7.0,19.0,19.0,4.0,4.0,19.0,19.0,4.0
mean,0.804369,0.741432,0.789566,0.76805,0.78415,0.807042,0.75457,0.70363,0.553191,0.839038,...,0.838135,0.729141,0.675299,0.752427,0.820766,0.704518,0.57951,0.70289,0.816097,0.55883
std,0.117396,0.130646,0.126364,0.104429,0.182235,0.133445,0.140435,0.101875,0.089463,0.117715,...,0.105673,0.136769,0.096046,0.0901,0.11836,0.117379,0.119536,0.094221,0.119838,0.242191
min,0.598216,0.313805,0.679553,0.56473,0.422579,0.575302,0.482671,0.466186,0.308191,0.602436,...,0.617902,0.391305,0.529034,0.516433,0.541321,0.600607,0.404694,0.484769,0.570531,0.240545
25%,0.718119,0.717754,0.682435,0.70771,0.754866,0.711308,0.648565,0.639683,0.523307,0.768482,...,0.815146,0.672569,0.620216,0.702702,0.760423,0.648912,0.553307,0.656892,0.747109,0.482733
50%,0.784911,0.765315,0.778479,0.765315,0.80921,0.857342,0.79611,0.720053,0.586022,0.870601,...,0.86866,0.718392,0.718204,0.792606,0.86343,0.672284,0.623048,0.709367,0.870601,0.583153
75%,0.906951,0.833908,0.88561,0.857701,0.88322,0.916987,0.847343,0.769214,0.609553,0.943374,...,0.914891,0.837152,0.737711,0.798303,0.924483,0.72789,0.649251,0.762304,0.893899,0.659251
max,0.960789,0.88406,0.921753,0.919311,0.981091,0.956683,0.95375,0.859931,0.660153,0.979507,...,0.939373,0.914378,0.764002,0.892693,0.949439,0.872899,0.667252,0.921201,0.94616,0.82847


In [25]:
data_earth.min().min()

0.24054482879856956

In [24]:
data_earth.max().max()

0.9947763899865332

In [11]:
def create_dataframe_by_lang(doc, topics, topic_mapping):
    """
    Creates pandas dataframe by language
    
    Args:
      - doc: the document with the information.
      - topics: the list of sorted topics.
      - topic_mapping: a dictionary mapping the sorted
        topics to an integer.
    Returns:
      - df: pandas dataframe with information.
    """
    if '/' in doc:
        language_name = doc.split('/')[-1].split('-cosine_sim')[0]
    else:
        language_name = doc.split('-cosine_sim')[0]
    
    lang = {}
    for topic in topics:
        lang[topic] = [0]*len(topics)
        
    f = open(doc, "r")
    for line in f.readlines():
        if line.startswith('0'):
            cosine_sim = float(line[:-1])
            lang[topic1][topic_mapping[topic2]] = cosine_sim
            lang[topic2][topic_mapping[topic1]] = cosine_sim
            lang[topic1][topic_mapping[topic1]] = 1
            lang[topic2][topic_mapping[topic2]] = 1
        else: 
            languages = line[:-1].split(' ')
            topic1 = languages[0].split('-')[0]
            topic2 = languages[1].split('-')[0]
    f.close()
    df = pd.DataFrame(lang)
    df.index = topics
    df.style.set_caption(f'{language_name} by topic')
    
    return df

In [12]:
afrikaans = create_dataframe_by_lang('cosine-sim-per-lang/afrikaans-cosine_sim.txt', topics, topic_mapping)
afrikaans

Unnamed: 0,Adolf_Hitler,Africa,Asia,Association_football,Barack_Obama,Bible,Buddha,Buddhism,China,Christianity,...,Silver,South_Africa,South_America,Soviet_Union,Sun,United_Kingdom,United_States,Water,Wikipedia,World_War_II
Adolf_Hitler,1.000000,0.966598,0.963506,0.972560,0.985633,0.934698,0.923473,0.967446,0.990543,0.954437,...,0.968214,0.993541,0.948863,0.982856,0.968393,0.987178,0.988613,0.990689,0.969033,0.989324
Africa,0.966598,1.000000,0.990798,0.977404,0.952154,0.973128,0.953380,0.983808,0.977767,0.981333,...,0.960753,0.979101,0.976029,0.989032,0.986400,0.983191,0.984779,0.978775,0.969733,0.972836
Asia,0.963506,0.990798,1.000000,0.972934,0.950737,0.985044,0.964672,0.990397,0.977004,0.983630,...,0.973139,0.975508,0.983631,0.987201,0.977663,0.983685,0.981806,0.974005,0.972886,0.964176
Association_football,0.972560,0.977404,0.972934,1.000000,0.970184,0.958496,0.956600,0.978144,0.968198,0.963662,...,0.951906,0.972397,0.972491,0.988485,0.980366,0.976394,0.980823,0.969994,0.978129,0.981297
Barack_Obama,0.985633,0.952154,0.950737,0.970184,1.000000,0.924018,0.902386,0.951621,0.979922,0.939763,...,0.953191,0.983501,0.953140,0.973080,0.960766,0.979335,0.982494,0.972672,0.958564,0.982218
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
United_Kingdom,0.987178,0.983191,0.983685,0.976394,0.979335,0.961420,0.938533,0.982794,0.995752,0.963981,...,0.980077,0.995945,0.976896,0.991142,0.972383,1.000000,0.998601,0.985870,0.972456,0.981751
United_States,0.988613,0.984779,0.981806,0.980823,0.982494,0.958056,0.939177,0.980922,0.993154,0.967610,...,0.972957,0.995174,0.978155,0.994059,0.978277,0.998601,1.000000,0.986707,0.969565,0.988126
Water,0.990689,0.978775,0.974005,0.969994,0.972672,0.943676,0.932885,0.975732,0.991569,0.970932,...,0.974127,0.994073,0.951300,0.983298,0.977264,0.985870,0.986707,1.000000,0.970389,0.984244
Wikipedia,0.969033,0.969733,0.972886,0.978129,0.958564,0.968721,0.953603,0.982972,0.974986,0.955530,...,0.972725,0.974536,0.950857,0.972385,0.962428,0.972456,0.969565,0.970389,1.000000,0.957950


In [13]:
data_afrikaans = afrikaans.replace([0], np.nan).replace([1], np.nan)
data_afrikaans.describe()

Unnamed: 0,Adolf_Hitler,Africa,Asia,Association_football,Barack_Obama,Bible,Buddha,Buddhism,China,Christianity,...,Silver,South_Africa,South_America,Soviet_Union,Sun,United_Kingdom,United_States,Water,Wikipedia,World_War_II
count,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,...,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0
mean,0.961067,0.967118,0.969178,0.962527,0.95086,0.95552,0.940662,0.970649,0.967301,0.957449,...,0.958617,0.968334,0.954096,0.97137,0.958614,0.970095,0.96975,0.964333,0.963129,0.959614
std,0.051589,0.050411,0.045045,0.054544,0.056043,0.04047,0.042536,0.04797,0.048607,0.046277,...,0.045954,0.051064,0.05325,0.053008,0.055187,0.051263,0.053666,0.050251,0.045009,0.057126
min,0.617726,0.610595,0.649098,0.564557,0.577278,0.675978,0.646094,0.626065,0.640312,0.634323,...,0.646123,0.620611,0.579452,0.596029,0.57684,0.613234,0.594113,0.627215,0.636215,0.567365
25%,0.958584,0.969296,0.967964,0.9631,0.944395,0.95072,0.934225,0.971066,0.963908,0.95553,...,0.953245,0.963574,0.951922,0.972385,0.960766,0.972383,0.969565,0.962196,0.963436,0.957953
50%,0.97256,0.977767,0.978157,0.972491,0.962759,0.963477,0.946157,0.980448,0.977767,0.964384,...,0.968214,0.98027,0.966005,0.983298,0.970766,0.981909,0.981862,0.976464,0.972456,0.972836
75%,0.98467,0.983618,0.984447,0.979988,0.978258,0.971469,0.959955,0.987016,0.987126,0.9727,...,0.977818,0.988876,0.974018,0.987919,0.978016,0.987388,0.988484,0.984244,0.977708,0.982128
max,0.993541,0.995073,0.992186,0.988485,0.994664,0.985044,0.974667,0.99473,0.998571,0.986137,...,0.990452,0.999162,0.984023,0.995429,0.994289,0.998601,0.998601,0.994073,0.988344,0.991379


In [26]:
data_afrikaans.min().min()

0.5645573926867273

In [14]:
"""
I know that it is Iran + South_Africa by looking at the file, but I want to 
be able to get the column and row name directly
"""
data_afrikaans.max().max()

0.9991618300614605

In [29]:
data_afrikaans.idxmax(0)

Adolf_Hitler              South_Africa
Africa                           Earth
Asia                            Europe
Association_football      Soviet_Union
Barack_Obama            George_W._Bush
                             ...      
United_Kingdom           United_States
United_States           United_Kingdom
Water                     South_Africa
Wikipedia                         Iron
World_War_II              Soviet_Union
Length: 62, dtype: object

In [33]:
descr = data_afrikaans.describe()
descr.loc['max'].idxmax(0)

'Iran'

In [27]:
print(data_afrikaans.where(data_afrikaans == 0.9991618300614605).dropna().index)
print(data_afrikaans.where(data_afrikaans == 0.9991618300614605).dropna().columns)

Index([], dtype='object')
Index(['Adolf_Hitler', 'Africa', 'Asia', 'Association_football',
       'Barack_Obama', 'Bible', 'Buddha', 'Buddhism', 'China', 'Christianity',
       'Christmas', 'Dog', 'Earth', 'English_Language', 'Europe', 'Eye',
       'George_W._Bush', 'Ghana', 'Gold', 'Hinduism', 'Human', 'India',
       'Internet', 'Iran', 'Iraq', 'Iron', 'Islam', 'Italy', 'Japan', 'Jesus',
       'Judaism', 'Julius_Caesar', 'Koran', 'Maize', 'Milk',
       'Mohandas_Karamchand_Gandhi', 'Money', 'Moon', 'Moses', 'Muhammad',
       'New_York_City', 'Niger', 'Osama_Bin_Laden', 'Paris', 'Periodic_table',
       'Pope_Benedict_XVI', 'Pope_John_Paul_II', 'Religion', 'Rice',
       'Roman_Catholic_Church', 'Rome', 'Russia', 'Silver', 'South_Africa',
       'South_America', 'Soviet_Union', 'Sun', 'United_Kingdom',
       'United_States', 'Water', 'Wikipedia', 'World_War_II'],
      dtype='object')
