In [40]:
import ollama
import csv

## Prompt
You are a linguistic expert evaluating the interpretability of two datasets. 

Evaluate the interpretability of the two sets of words below. A set is more interpretable if it reflects clear semantic patterns or categories, has meaningful diversity, and maintains contextual coherence across the words. Consider repetition only if it highlights important linguistic patterns. Choose the set that better meets these criteria.


Dataset 1: {dataset1} 

Dataset 2: {dataset2}

Which dataset is more interpretable?

Respond ONLY with the dataset number "1" or "2".

In [56]:
def more_interpretable(dataset1, dataset2, model='gemma2:9b', temperature=0.001):
    """Ask an Ollama model which of two word sets is more interpretable.

    Args:
        dataset1: Text of the first word set; interpolated after "Dataset 1:".
        dataset2: Text of the second word set; interpolated after "Dataset 2:".
        model: Name of the Ollama model to query. The default is the model the
            notebook was originally run with.
        temperature: Sampling temperature forwarded in ``options``. The default
            is the near-zero value the notebook originally used (presumably to
            keep answers as repeatable as possible).

    Returns:
        The ``'response'`` field of the generation result, unmodified. The
        prompt asks for "1" or "2", but the reply is not validated here, so
        callers must be prepared for arbitrary text.
    """
    # The system/prompt texts are kept exactly as originally written
    # (including their indentation) so the model sees the same input.
    return ollama.generate(
        model=model,
        options={'temperature': temperature},
        system='''You are a linguistic expert evaluating the interpretability of two datasets. 
        Evaluate the interpretability of the two sets of words below. A set is more interpretable if it reflects clear semantic patterns or categories, has meaningful diversity, and maintains contextual coherence across the words. Consider repetition only if it highlights important linguistic patterns. Choose the set that better meets these criteria
        ''',
        prompt=f'''        
        Dataset 1: {dataset1}
        Dataset 2: {dataset2}

        Which dataset is more interpretable?
        Respond ONLY with the dataset number  "1" or "2"
    ''')['response']
     


In [57]:
# Record holding a row's original row number, its weight, and its text.
class Dataset:
    """Plain container for one row of words.

    Attributes:
        Originalindex: Value identifying the row in the source data.
        weight: Score attached to the row.
        text: The row's words as a single string.
    """

    def __init__(self, Originalindex, weight, text):
        # Store the three constructor arguments unchanged, in declaration order.
        self.Originalindex, self.weight, self.text = Originalindex, weight, text

In [58]:
# Read the CSV file with the standard-library csv module (no pandas here).
# newline='' is what the csv documentation requires when opening the file, so
# that line breaks inside quoted fields are handled by the csv parser itself.
with open('output_tokens.csv', newline='') as f:
    reader = csv.reader(f)
    # Turn every CSV row into a Dataset: the 0-based row number becomes
    # Originalindex, every row starts with weight 5, and the row's cells are
    # joined with single spaces to form the text.
    filas = [Dataset(i, 5, ' '.join(row)) for i, row in enumerate(reader)]



In [59]:
import random

def Compara_interpretabilidad(filas):
    """Run one round of pairwise interpretability comparisons over ``filas``.

    The list is shuffled in place, then every adjacent pair (i, i+1) is passed
    to ``more_interpretable``. The row judged less interpretable loses one
    point of ``weight``. Rows are modified in place and nothing is returned.

    A reply that contains neither '1' nor '2' is treated as "no verdict" and
    leaves both rows untouched, instead of silently penalising the second row.

    Args:
        filas: List of objects exposing ``text`` and a numeric ``weight``.
            Lists with fewer than two rows produce no comparisons.
    """
    # Shuffle so that each round pairs up different neighbours.
    random.shuffle(filas)
    # Walk the adjacent pairs (row i, row i+1) of the shuffled list.
    for actual, siguiente in zip(filas, filas[1:]):
        respuesta = more_interpretable(actual.text, siguiente.text)
        if '2' in respuesta:
            # The model preferred the second set: the first one loses a point.
            actual.weight -= 1
        elif '1' in respuesta:
            # The model preferred the first set: the second one loses a point.
            siguiente.weight -= 1
        # Otherwise the reply is unusable; penalising either row would be
        # arbitrary, so this pair is skipped for the round.


In [60]:
def genera_top_k(filas, k):
    """Repeat comparison rounds, pruning rows, until at most ``k`` remain.

    Each round prints the current row count, calls
    ``Compara_interpretabilidad`` on the list, sorts the list in place by
    descending ``weight``, and then keeps a row only if it sits among the
    first ``k`` positions after sorting or its weight is still positive.

    Args:
        filas: List of rows exposing a ``weight`` attribute.
        k: Row count at which pruning stops.

    Returns:
        The list of surviving rows; the input list itself when it already
        holds ``k`` or fewer rows.
    """
    while True:
        total = len(filas)
        if total <= k:
            # Small enough: nothing (more) to prune.
            return filas
        print(total)
        Compara_interpretabilidad(filas)
        # Highest weight first, sorted in place.
        filas.sort(key=lambda fila: fila.weight, reverse=True)
        # Drop rows whose weight is exhausted, except those in the top-k slots.
        supervivientes = []
        for posicion, fila in enumerate(filas):
            if posicion < k or fila.weight > 0:
                supervivientes.append(fila)
        filas = supervivientes

In [61]:
# Run the pruning rounds until at most 40 rows remain; the result replaces
# the full list in `filas`.
filas = genera_top_k(filas, k=40)

1024
1024
1024
751
526
382
263
183
121
84
54
41


In [62]:
# Order the remaining rows from highest to lowest weight (in place).
filas.sort(key=lambda row: row.weight, reverse=True)
# Print the details of every row: original index, weight, and text.
for row in filas:
    print(row.Originalindex, row.weight, row.text)
    

262 5  association  Roman  Roman  naval  communal  communal  Roman  Roman  architectural  urban
224 5  participants  participants  competitors  found  found  puzzles  puzzles  teams  teams  objects
643 5  schools  houses  families  houses  houses  installations  installations  flags  flags  relocate
815 4  killer  killer  character  theme  theme  designer  designer  superhero  superhero  character
64 4  actions  actions  commitment  commitment  destiny  destiny  independence  commitment  achievements  achievements
278 4  Glacier  Glacier water water  Glacier  Glacier land  Springs  Springs  Springs
737 4  theological  political  political  social  social  social  social  social  social  athletic
340 3  stuck  stuck  interaction  interaction  relation  relation  resonate  cooperation  relation  collaborating
215 3  districts  grassroots  grassroots  shelters  shelters  teams  security  security  goals  goals
818 3  tissues  plants  proteins  bacteria  bacteria  proteins  proteins  prote

In [63]:
import pandas as pd
# Destination file for the selected rows.
output_csv_path = "interpretable_tokens.csv"

# Two columns: each row's original index and its text.
df = pd.DataFrame({
    "Indices Originales": [fila.Originalindex for fila in filas],
    "Words": [fila.text for fila in filas],
})

# header=False and index=False: neither the column names nor the DataFrame
# index are written, only the two values of each row.
df.to_csv(output_csv_path, header=False, index=False)