# Preamble
___

In [1]:
# Import modules:
import pandas as pd
import numpy as np
import os
from pandarallel import pandarallel
from tabulate import tabulate
import itertools
from tqdm import tqdm
import swifter
import warnings
from sklearn.metrics.pairwise import cosine_similarity

# Suppress pandas' PerformanceWarning. The filter is installed in the global
# warnings configuration, so it stays active for every cell run afterwards:
warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)

In [2]:
# Working directory:
# Defaults to the original author's local replication folder. Set the
# JACCARD_REPLICATION_DIR environment variable to point the notebook at a
# different checkout, so it also runs on machines where that absolute path
# does not exist. All relative paths used later are resolved from here.
directorio = os.environ.get(
    "JACCARD_REPLICATION_DIR",
    "/Users/nicolasdecamino/Documents/Investigacion/Paper Polities/Replicacion Jaccard",
)
os.chdir(directorio)

# Functions
___

In [2]:
def jaccard_similarity(set_1, set_2):
    """Return the Jaccard similarity of two sets.

    The similarity is the size of the intersection divided by the size of the
    union.

    Parameters
    ----------
    set_1 : set
        First collection; must provide ``intersection`` and ``union``.
    set_2 : set or iterable
        Second collection, passed to ``set_1.intersection`` / ``set_1.union``.

    Returns
    -------
    float or int
        ``len(intersection) / len(union)``, or ``0`` when the union is empty
        (i.e. both inputs are empty).
    """
    merged = set_1.union(set_2)

    # Empty union: avoid dividing by zero and report no similarity.
    if len(merged) == 0:
        return 0

    shared = set_1.intersection(set_2)
    return len(shared) / len(merged)

In [6]:
def mean_distance(row, type):
    """Mean Jaccard distance between the descriptions of two cultures.

    Parameters
    ----------
    row : mapping
        A row of the culture-pairs dataframe (anything indexable by key).
        It must provide ``'culture_1'`` and ``'culture_2'`` and, unless both
        cultures are equal, ``f'{type}_1'`` and ``f'{type}_2'``. Each of the
        latter is iterated and its items are passed to ``jaccard_similarity``.
    type : str
        Prefix of the description columns to compare. The name shadows the
        builtin ``type`` inside this function; it is kept because callers pass
        it by keyword.

    Returns
    -------
    float
        ``0.0`` when both cultures are the same. Otherwise the mean of
        ``1 - jaccard_similarity(d1, d2)`` over every pair ``(d1, d2)`` taken
        from the two description collections. ``nan`` when there is no such
        pair (either collection is empty).
    """
    # Return distance == 0 if the culture is the same:
    if row['culture_1'] == row['culture_2']:
        return 0.0

    # Get descriptions (as lists of sets of words or shingles) for each culture:
    descriptions_1, descriptions_2 = row[f'{type}_1'], row[f'{type}_2']

    # Distance (1 - similarity) for every cross pair of descriptions:
    distances = [
        1 - jaccard_similarity(description_1, description_2)
        for description_1 in descriptions_1
        for description_2 in descriptions_2
    ]

    # np.mean([]) also yields nan, but only after emitting a RuntimeWarning
    # ("Mean of empty slice"); return the missing value explicitly instead.
    if not distances:
        return np.nan

    return np.mean(distances)

In [40]:
def cultures_distances(df, type, length_filter, by_period):
    """Build the culture-pairs dataframe and add a 'mean_distance' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Descriptions table; must contain a 'lem_desc_length' column.
    type : str
        Description representation forwarded to ``mean_distance``.
    length_filter : number
        Only rows with ``lem_desc_length`` strictly greater than this value
        are kept.
    by_period : bool
        Forwarded unchanged to ``pairs``.

    Returns
    -------
    The object returned by ``pairs``, with an added 'mean_distance' column.

    NOTE(review): ``pairs`` is not defined in the visible cells of this
    notebook. It presumably returns one row per culture pair with the
    'culture_*' and f'{type}_*' columns that ``mean_distance`` reads - confirm.
    """
    # Keep only the descriptions that are longer than the threshold:
    is_long_enough = df['lem_desc_length'] > length_filter
    long_descriptions = df.loc[is_long_enough]

    # Build the culture pairs from the filtered descriptions:
    culture_pairs = pairs(long_descriptions, by_period=by_period)

    # Compute the mean distance of every pair, one row at a time, in parallel:
    pandarallel.initialize(verbose=0, progress_bar=False)
    culture_pairs['mean_distance'] = culture_pairs.parallel_apply(
        lambda pair_row: mean_distance(pair_row, type=type),
        axis=1,
    )

    return culture_pairs

In [8]:
def _pivot_distances(pair_distances):
    """Pivot long-format pair distances into a culture_1 x culture_2 table."""
    table = pair_distances.pivot(index='culture_1', columns='culture_2', values='mean_distance')
    # Cells missing from the pivot are taken from the transposed table:
    return table.fillna(table.T)


def _print_table(table):
    """Print a pivoted distance table with four decimals."""
    column_headers = [''] + table.columns.tolist()
    print(tabulate(table, headers=column_headers, floatfmt=".4f"))


def distance_matrix(mean_distances_df):
    """Print the mean-distance matrix (or one matrix per period).

    Parameters
    ----------
    mean_distances_df : pandas.DataFrame
        Long-format table with 'culture_1', 'culture_2' and 'mean_distance'
        columns and, optionally, a 'period' column.

    Returns
    -------
    None
        The matrices are printed, not returned. When a 'period' column is
        present, one matrix is printed per distinct period, in order of first
        appearance, each preceded by a "Periodo: ..." line.
    """
    # No 'period' column: a single matrix covers all rows.
    if 'period' not in mean_distances_df.columns:
        _print_table(_pivot_distances(mean_distances_df))
        return

    # Otherwise print one matrix per period:
    for period in mean_distances_df['period'].unique().tolist():
        rows_in_period = mean_distances_df[mean_distances_df['period'] == period]
        table = _pivot_distances(rows_in_period)
        print(f"Periodo: {period}")
        _print_table(table)
        print()

# Jaccard similarity
___

In [23]:
# Load data:
# Semicolon-separated CSV; the relative path is resolved against the current
# working directory (set with os.chdir in the preamble).
df = pd.read_csv('Data/base-n150/base-n150.csv', sep=';')

## Bag of words similarity

## Shingles similarity

In [None]:
# Shingle length selection based on the mean word length:
# For each description-length filter, compute the mean number of characters per
# whitespace-separated word in the remaining lemmatized descriptions and print
# the results as a table.
table = [['Length filter', 'Mean word length']]
for length_filter in [0, 1, 2, 3]:
        # Keep rows whose 'lem_desc_length' is strictly greater than the filter:
        filtered_df = df.loc[df['lem_desc_length'] > length_filter]
        # Join all remaining descriptions into one string and split on whitespace:
        words = filtered_df['lemmatized_description'].str.cat(sep=' ').split()
        word_lengths = [len(word) for word in words]
        # Fall back to 0 when no words are left, to avoid dividing by zero:
        mean_length = sum(word_lengths) / len(word_lengths) if len(word_lengths) > 0 else 0
        table.append([length_filter, mean_length])
# The first row of `table` holds the column headers:
print(tabulate(table, headers="firstrow"))

# Distances between cultures
___

In [26]:
# Transform descriptions to sets of tokens and shingles:
# Both new columns are added to `df` in place.
# 'words': the set of whitespace-separated tokens of each description.
df['words'] = df['lemmatized_description'].swifter.progress_bar(False).apply(
    lambda row: set(row.split())  # `row` is one element of the column, not a dataframe row
)
# NOTE(review): `shingles` is not defined in the visible cells of this notebook;
# confirm it is defined before this cell when running on a fresh kernel.
df['shingles'] = df['lemmatized_description'].swifter.progress_bar(False).apply(shingles)

In [41]:
# Parameters to iterate over:
types = ['words', 'shingles']
length_filters = [0, 1, 2, 3]
by_periods = [False, True]

# Create dictionary of results for each combination of parameters:
# Keys are (type, length_filter, by_period) tuples in itertools.product order
# and every value starts as None. dict.fromkeys is used instead of a for loop
# so that no notebook-level variable named `type` is created, which would
# overwrite the builtin `type` for the rest of the session.
results = dict.fromkeys(itertools.product(types, length_filters, by_periods), None)

In [None]:
# Get mean distances:
# Each key is a (type, length_filter, by_period) tuple. Its first element is
# unpacked as `distance_type` rather than `type` so that the builtin `type` is
# not overwritten at notebook level.
for key in tqdm(results.keys()):
    distance_type, length_filter, by_period = key
    results[key] = cultures_distances(df, distance_type, length_filter, by_period)

In [32]:
# Mean-distance matrix for type='words', length_filter=0, by_period=False:
distance_matrix(results['words', 0, False])

              Cajamarca    Chancay    Chimu    Cupisnique    Moche    Nasca    Recuay    Salinar    Sican    Tiahuanaco    Vicus    Wari
----------  -----------  ---------  -------  ------------  -------  -------  --------  ---------  -------  ------------  -------  ------
Cajamarca        0.0000     0.8736   0.9745        0.9235   0.9702   0.9190    0.9530     0.9528   0.9782        0.8285   0.9770  0.9409
Chancay          0.8736     0.0000   0.9426        0.9345   0.9326   0.9138    0.9180     0.9211   0.9496        0.8503   0.9377  0.8950
Chimu            0.9745     0.9426   0.0000        0.8967   0.8732   0.9374    0.9137     0.8738   0.8454        0.9499   0.8171  0.8964
Cupisnique       0.9235     0.9345   0.8967        0.0000   0.9101   0.9471    0.9427     0.8966   0.9065        0.9317   0.8819  0.9313
Moche            0.9702     0.9326   0.8732        0.9101   0.0000   0.9333    0.9093     0.8897   0.8848        0.9334   0.8547  0.9000
Nasca            0.9190     0.9138   0.93

In [34]:
# Mean-distance matrix for type='shingles', length_filter=0, by_period=False:
distance_matrix(results['shingles', 0, False])

              Cajamarca    Chancay    Chimu    Cupisnique    Moche    Nasca    Recuay    Salinar    Sican    Tiahuanaco    Vicus    Wari
----------  -----------  ---------  -------  ------------  -------  -------  --------  ---------  -------  ------------  -------  ------
Cajamarca        0.0000     0.9394   0.9849        0.9798   0.9752   0.9614    0.9643     0.9626   0.9812        0.9108   0.9806  0.9576
Chancay          0.9394     0.0000   0.9594        0.9667   0.9511   0.9369    0.9382     0.9442   0.9602        0.8928   0.9522  0.9270
Chimu            0.9849     0.9594   0.0000        0.9301   0.9020   0.9442    0.9311     0.9151   0.8901        0.9615   0.8628  0.9246
Cupisnique       0.9798     0.9667   0.9301        0.0000   0.9339   0.9646    0.9604     0.9356   0.9449        0.9598   0.9209  0.9548
Moche            0.9752     0.9511   0.9020        0.9339   0.0000   0.9423    0.9291     0.9251   0.9188        0.9470   0.8879  0.9278
Nasca            0.9614     0.9369   0.94

# TF-IDF similarity
___

# BERT similarity
___

In [None]:
# Cosine similarity between two texts
def calculate_similarity(text1, text2):
    """Return the cosine similarity between the embeddings of two texts.

    Each text is encoded with ``tokenizer`` and passed through ``model``. Its
    embedding is the mean of ``last_hidden_state`` over ``dim=1`` (presumably
    the token axis - TODO confirm). The two embeddings are then compared with
    ``cosine_similarity``, and its result is returned as is.

    NOTE(review): ``tokenizer``, ``model`` and ``torch`` are not defined or
    imported in the visible cells of this notebook; confirm they exist before
    this function is called.
    """
    # Tokenize and encode both texts with the same settings:
    encoded_texts = [
        tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
        for text in (text1, text2)
    ]

    # Compute the embeddings without tracking gradients:
    with torch.no_grad():
        pooled = [
            model(**encoded).last_hidden_state.mean(dim=1)
            for encoded in encoded_texts
        ]

    # Cosine similarity between the two embeddings:
    return cosine_similarity(pooled[0], pooled[1])

In [None]:
def get_positive_positions(v1, v2):
    """Return the indices at which both vectors are non-zero.

    Despite the name, the check is ``!= 0.0``, so negative entries count too.

    The positions are taken from the vectors themselves, not from the length
    of ``df.embeddingCLS[0]``. The function therefore no longer depends on the
    global dataframe, and it works for vectors of any length. When the two
    lengths differ, only the common prefix is inspected.

    Parameters
    ----------
    v1, v2 : sequence of numbers
        Vectors to compare element by element.

    Returns
    -------
    list of int
        Positions ``i`` such that ``v1[i] != 0.0`` and ``v2[i] != 0.0``.
    """
    return [
        position
        for position, (value_1, value_2) in enumerate(zip(v1, v2))
        if value_1 != 0.0 and value_2 != 0.0
    ]

def _cosine_similarity(v1, v2):
    """Return the cosine similarity of two vectors.

    Returns ``0`` when ``get_positive_positions`` finds no position at which
    both vectors are non-zero. Otherwise returns
    ``dot(v1, v2) / (norm(v1) * norm(v2))``.
    """
    # No shared non-zero position: report 0 without dividing by the norms.
    if len(get_positive_positions(v1, v2)) == 0:
        return 0

    scalar_product = np.dot(v1, v2)
    length_1 = np.linalg.norm(v1)
    length_2 = np.linalg.norm(v2)
    return scalar_product / (length_1 * length_2)