# Measuring particularity and similarity in archaic Greek alphabets with NLP


In [1]:
import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import preprocessing
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import defaultdict
from utils import heatmap, annotate_heatmap, get_top_abs_correlations

In [2]:
#uncomment this line if you would like to see all rows in the dataframes
#pd.set_option("display.max_rows", None)

In [3]:
#uncomment this line if you would like to see all columns in the dataframes
# pd.set_option("display.max_columns", None)

In [4]:
# tf-idf per document. Careful with tokenizer! Punctuation = new token! token_pattern = r'\S+'
vectorizer = TfidfVectorizer(token_pattern=r'\S+')


## Sanity check example

A simple example to test TF-IDF and cosine similarity on three sample sentences.

In [5]:
ex_docs = ['The Cretan alphabet is a green alphabet', 'The Euboean alphabet is red', 'Cretan is a Greek dialect']

We parse the sentences with the TF-IDF vectorizer and show the results in a dataframe.

In [6]:
ex_matrix = vectorizer.fit_transform(ex_docs)

In [7]:
docs_df = pd.DataFrame(ex_matrix.toarray(),columns=vectorizer.get_feature_names())
docs_df

Unnamed: 0,a,alphabet,cretan,dialect,euboean,greek,green,is,red,the
0,0.32735,0.6547,0.32735,0.0,0.0,0.0,0.430426,0.254217,0.0,0.32735
1,0.0,0.406192,0.0,0.0,0.534093,0.0,0.0,0.315444,0.534093,0.406192
2,0.406192,0.0,0.406192,0.534093,0.0,0.534093,0.0,0.315444,0.0,0.0


We calculate the cosine similarity of each pair of sentences and put the results in a dataframe.

In [8]:
ex_similarity = cosine_similarity(ex_matrix, ex_matrix)
ex_similarity

array([[1.        , 0.47909206, 0.34612509],
       [0.47909206, 1.        , 0.09950501],
       [0.34612509, 0.09950501, 1.        ]])

In [9]:
sent_comp_df = pd.DataFrame(ex_similarity,columns=ex_docs, index=ex_docs)
sent_comp_df

Unnamed: 0,The Cretan alphabet is a green alphabet,The Euboean alphabet is red,Cretan is a Greek dialect
The Cretan alphabet is a green alphabet,1.0,0.479092,0.346125
The Euboean alphabet is red,0.479092,1.0,0.099505
Cretan is a Greek dialect,0.346125,0.099505,1.0


## Data pre-processing

We load the dataset, which in this case consists on a record of all Greek alphabetic inscriptions from the 8th and 7th centuries BC.

In [10]:
# read data
with open('data/dataset.json') as f:
  data = json.load(f)

The total number of inscriptions in the dataset:

In [11]:
len(data)

714

The fields found in each inscription: 

In [12]:
data[0].keys()

dict_keys(['Reference', 'Date', 'Origin', 'Lat', 'Long', 'Context', 'Material: object', 'Writing method', 'Writing direction', 'Content', 'a/a:', 'e', 'i/i:', 'o', 'u/u:/y/y:', '3:', '0:', 'e:', 'o:', 'm', 'n', 'l', 'r', 'w', 'h', 's', 'b', 'd', 'g', 'p', 't', 'k', 'ko/ku', 'ph', 'th', 'kh', 'ps', 'ks', 'dz', 'Division', 'Uncertain'])

We create a dictionary where each key represents an archaeological site or region and each value is a list with all grapheme-phoneme pairs recorded in the dataset for that site or region.

In [25]:
def build_documents(data):
  # build a dictionary with all documents to parse through the vectoriser
  # {origin: [list of grapheme-phoneme tokens]}
  documents = defaultdict(list)
  
  # build a dictionary with all grapheme-phoneme pairs seen for each place of origin
  # {origin: {phoneme: [list of graphemes]}}
  alphabets = {}
  
  checking_keys = ['a/a:', 'e', 'i/i:', 'o', 'u/u:/y/y:', '3:', '0:', 'e:', 'o:', 'm', 'n', 'l', 'r', 'w', 'h', 's', 'b', 'd', 'g', 'p', 't', 'k', 'ko/ku', 'ph', 'th', 'kh', 'ps', 'ks', 'dz', 'Division']

  # in each inscription, add each grapheme-phoneme pair to origin
  for inscription in data:
    origin = inscription['Origin']
   
    if '?' not in origin: #exclude uncertain

      # add origin to alphabets dict
      if origin not in alphabets.keys():
        alphabets[origin] = {key: [] for key in checking_keys}

      # add values to documents and alphabets dict
      for key, values in inscription.items():
        if key in checking_keys:
          for value in values:
            if '?' not in value: #exclude uncertain
              token = key+value
              documents[origin].append(token)
              alphabets[origin][key].append(value)

    else:
      continue
  
  # we turn the lists into strings, filter empty values and put in alphabetical order
  documents = {k: ' '.join(v) for k, v in documents.items() if v}
  documents = sorted(documents.items())

  for alphabet in alphabets.keys():
    for k,v in alphabets[alphabet].items():
          alphabets[alphabet][k] = ', '.join(list(set(v)))
  alphabets = {k:v for k,v in sorted(alphabets.items()) if v}
  
  return documents,alphabets


In [26]:
documents,alphabets = build_documents(data)

We order them alphabetically and show them in a dataframe:

In [27]:
documents_df = pd.DataFrame(documents, columns=['Origin', 'Text'])
documents_df

Unnamed: 0,Origin,Text
0,Achaia,a/a:1 e15 o43 m37 s38 d13 k33
1,Acrocorinth,a/a:1 e3 i/i:30* o43 0:43 e:15 n40 n40* l10 w1...
2,Aegina,e15 i/i:28 o43 n40 s32 p11 a/a:1 e15 m38 n40 l...
3,Aeolian Larisa,a/a:1 e15 e21 3:21 n40 d13 t57 a/a:1 e15 e21 3...
4,Aetos,a/a:1 e15 i/i:54 o43 n40 l34 w18 h23 s38 t57 p...
...,...,...
62,Thebes,a/a:1 e15 i/i:28 o43 u/u:/y/y:58 3:15 0:43 o:4...
63,Thera,i/i:31 0:43 n40 r52 kh33+23 a/a:1 e15 i/i:31 i...
64,Thermon,a/a:1 e15 i/i:31 o43 n40 l10 r52 w18 s38 d14 t...
65,Vari,a/a:1 e15 i/i:28 o:43 r52 d13 p47 kh41


Here is the relation of all graphematic relationships recorded for each site/region:

In [29]:
alphabets_df = pd.DataFrame(alphabets).T
alphabets_df

Unnamed: 0,a/a:,e,i/i:,o,u/u:/y/y:,3:,0:,e:,o:,m,...,t,k,ko/ku,ph,th,kh,ps,ks,dz,Division
Achaia,1,15,,43,,,,,,37,...,,33,,,,,,,,
Acrocorinth,1,3,30*,43,,,43,15,,,...,57,,,,,,,,,
Aegina,1,15,28,43,,15,,,,"38, 37",...,,33,,,,,,,,
Aeolian Larisa,1,"15, 21",28,43,,21,,,,,...,57,,,,,,,,,66
Aetos,1,15,"54, 32",43,,,,,,,...,57,33,,60,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Thebes,1,15,28,43,58,15,43,,43,36,...,57,33,51,,25,,,,,
Thera,1,15,"32*, 31*, 55, 30, 30*, 31, 54*","43, 44",58,"15, 23","43, 44","15, 15+31",43,"36, 37",...,57,33,51,50+23,"25+23, 27, 25","33+23, 51+23",,33+38,42,68
Thermon,1,15,31,43,,,,,,,...,57,,,,,41,,,,
Vari,1,15,28,,,,,,43,,...,,,,,,41,,,,


In [30]:
alphabets_df.to_excel('output/all_alphabets.xlsx')

## Particularity: TF-IDF 

We measure the relevance of grapheme-phoneme pairs for each site/region using TF-IDF.

In [None]:
matrix = vectorizer.fit_transform(documents_df['Text'])

In [None]:
matrix_df = pd.DataFrame(matrix.toarray(),columns=vectorizer.get_feature_names())
matrix_df

These are the 2 tokens with highest TF-IDF score per site/region:

In [None]:

values_df = pd.DataFrame(documents_df['Origin'])
values_df['1token'] = matrix_df.columns[matrix_df.values.argsort(1)[:, -1]]
values_df['1score'] = matrix_df.apply(lambda row: sorted(row)[-1], axis=1)
values_df['2token'] = matrix_df.columns[matrix_df.values.argsort(1)[:, -2]]
values_df['2score'] = matrix_df.apply(lambda row: sorted(row)[-2], axis=1)
values_df

These results turned out not to be a good representation of the most idiosyncratic graphematic relationships per alphabet, as those sites with very low numbers of evidence gave higher scores to all of their features, irrespective of whether these are common across alphabets or not. That is what happens with Al Mina or Zagora.



In [None]:
# how many tokens in Al Mina and Zagora as opposed to Athens or Penteskouphia
print(len(documents['Al Mina'].split()))
print(len(documents['Zagora'].split()))

print(len(documents['Penteskouphia'].split()))
print(len(documents['Athens'].split()))

Run the following cell if you wish to save the the results as an excel file:

In [None]:
values_df.to_excel('output/most_characteristic_not_normalized.xlsx')

### Normalising the results

To get more plausible results we normalise the results of TF-IDF using the L1 norm, which will neutralise the broad differences in the length of these documents.

In [None]:
l1_matrix = preprocessing.normalize(matrix, norm='l1',axis=0)

In [None]:
l1_matrix_df = pd.DataFrame(l1_matrix.toarray(),columns=vectorizer.get_feature_names())
l1_values_df = pd.DataFrame(documents_df['Origin'])

Here is a table with the 10 most representative graphematic relationships per site/region after L1 normalisation.

In [None]:
value = 0
while value > -10:
  value -= 1
  token = 'token'+str(value)
  score = 'score'+str(value)
  l1_values_df[token] = l1_matrix_df.columns[l1_matrix_df.values.argsort(1)[:, value]]
  l1_values_df[score] = l1_matrix_df.apply(lambda row: sorted(row)[value], axis=1)

l1_values_df

Run the following cell if you wish to save the the results as an excel file:

In [None]:
l1_values_df.to_excel('output/most_characteristic_per_alph_l1.xlsx')

## Similarity: cosine similarity

Now we will calculate the cosine similarity of all pairs of alphabets using their non-normalised TF-IDF weights.

In [None]:
similarity = cosine_similarity(matrix, matrix)
l1_similarity = cosine_similarity(l1_matrix,l1_matrix)

In [None]:
plt.rcParams['figure.figsize'] = [30, 26]
fig, ax = plt.subplots()

im = heatmap(similarity, documents_df['Origin'], documents_df['Origin'], ax=ax,
                   cmap="YlGn")
texts = annotate_heatmap(im, valfmt="{x:.1f}")

fig.tight_layout()
plt.show()

In [None]:
plt.savefig('output/heatmap.png')

Low scores not necessarily very different alphabets, but alphabets with low evidence (e.g. Paros).

In [None]:
similarity_df = pd.DataFrame(similarity, columns = documents_df['Origin'])

In [None]:
similarity_df.corr()

In [None]:
top_pairs = get_top_abs_correlations(similarity_df, min_val=0.8)
top_pairs

In [None]:
top_pairs.to_excel('output/top_alph_pairs.xlsx')