In [31]:
import os
import sys
from collections import Counter

sys.path.append(os.path.abspath("../../shared"))  # Agrega la carpeta al PYTHONPATH
from tokenizer_v2 import tokenizer, loadFileStopWords

In [32]:
# Directory whose files are tokenized by process_dir.
COLLECTION_DIR = '../RI-tknz-data'
# Alternative collection path, kept so the dataset can be switched by uncommenting:
# COLLECTION_DIR = './TestCollection_ER2'
# Stop-words file handed to process_dir (loaded through loadFileStopWords).
STOP_WORDS_FILE = '../../shared/stop-words.txt'

In [33]:
def averageTermLength(terms):
    """Return the mean number of characters per term.

    An empty (falsy) ``terms`` collection yields ``0`` so that no division
    by zero can happen.
    """
    if terms:
        lengths = [len(term) for term in terms]
        return sum(lengths) / len(terms)
    return 0

In [34]:
def shortestAndLargestDoc(docs):
    """Return the entries of the shortest and of the longest document.

    Args:
        docs: Mapping from a document key to a pair whose first element is
            the document's token count. The rest of the pair is returned
            untouched.

    Returns:
        A 2-tuple ``(docs[shortest_key], docs[longest_key])``. When several
        documents share the extreme token count, the one that appears first
        in ``docs`` is chosen.

    Raises:
        ValueError: If ``docs`` is empty, since there is no document to report.
    """
    if not docs:
        raise ValueError("docs is empty: there is no shortest or longest document")

    def token_count(key):
        return docs[key][0]

    # min()/max() return the first extreme item they encounter, and they work
    # even when every document has 0 tokens (a manual scan seeded with
    # max_tokens = 0 and a strict '>' never selects a key in that case).
    shortest_key = min(docs, key=token_count)
    longest_key = max(docs, key=token_count)
    return docs[shortest_key], docs[longest_key]

In [35]:
def obtener_terminos(diccionario, n=10):
    """Return the least and the most frequent entries of a frequency mapping.

    Args:
        diccionario: Mapping from term to its frequency.
        n: How many entries to keep on each side (defaults to 10). ``None``
            keeps every entry.

    Returns:
        A 2-tuple ``(least_frequent, most_frequent)``. Each element is a list
        of ``(term, frequency)`` pairs: the first sorted by ascending
        frequency, the second by descending frequency. Entries with equal
        frequency keep the mapping's iteration order in both lists.

    Raises:
        ValueError: If ``n`` is negative (a negative slice bound would
            silently drop entries from the wrong end).
    """
    if n is not None and n < 0:
        raise ValueError("n must be non-negative")

    def by_frequency(item):
        return item[1]

    items = list(diccionario.items())
    # Two stable sorts are kept on purpose: reversing the ascending list
    # would flip the relative order of tied terms.
    least_frequent = sorted(items, key=by_frequency)[:n]
    most_frequent = sorted(items, key=by_frequency, reverse=True)[:n]
    return least_frequent, most_frequent


In [36]:
def writeTermsToFile(output_file: str, word_freq_total, word_freq_docs):
    """Write one ``term total_frequency document_frequency`` line per term.

    Terms come from ``word_freq_total`` and are written in ascending sorted
    order; the matching value is looked up in ``word_freq_docs``. The output
    file is overwritten and encoded as UTF-8.
    """
    # Sort before opening so a failed sort never truncates the output file.
    ordered_terms = sorted(word_freq_total)
    with open(output_file, 'w', encoding='utf-8') as out:
        out.writelines(
            f"{term} {word_freq_total[term]} {word_freq_docs[term]}\n"
            for term in ordered_terms
        )

In [37]:
def writeStadisticsToFile(output_file: str, total_docs, total_tokens, total_terms, key_list_terms, shortestAndLargestDoc, one_time_terms):
    """Write the collection statistics report, one metric per line.

    Lines written, in order:
        1. ``total_docs``
        2. ``total_tokens total_terms``
        3. tokens and terms, each as a percentage of ``total_tokens``
           (``0.0 0.0`` when there are no tokens)
        4. ``averageTermLength(key_list_terms)``
        5. both elements of the first pair, then both elements of the second
           pair of ``shortestAndLargestDoc``
        6. ``one_time_terms``

    Args:
        output_file: Destination path; overwritten, encoded as UTF-8.
        total_docs: Number of documents.
        total_tokens: Number of tokens in the whole collection.
        total_terms: Number of distinct terms.
        key_list_terms: Terms whose average length is reported.
        shortestAndLargestDoc: Pair of pairs. Note that this parameter
            shadows the module-level function of the same name inside this
            function.
        one_time_terms: Value written verbatim on the last line.
    """
    if total_tokens:
        # NOTE(review): the first value is x * 100 / x, i.e. always 100.0.
        # Kept as-is because the intended metric is not visible here; confirm
        # whether a different figure (e.g. a per-document average) was meant.
        tokens_pct = total_tokens * 100 / total_tokens
        terms_pct = total_terms * 100 / total_tokens
    else:
        # A collection without tokens must not abort the report with
        # ZeroDivisionError.
        tokens_pct = terms_pct = 0.0

    shortest, longest = shortestAndLargestDoc[0], shortestAndLargestDoc[1]
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(f"{total_docs}\n")
        file.write(f"{total_tokens} {total_terms}\n")
        file.write(f"{tokens_pct} {terms_pct}\n")
        file.write(f"{averageTermLength(key_list_terms)}\n")
        file.write(f"{shortest[0]} {shortest[1]} {longest[0]} {longest[1]}\n")
        file.write(f"{one_time_terms}\n")

In [38]:
def writeFrequencyToFile(output_file: str, word_freq_docs, firstTenTerms):
    """Write ``term document_frequency`` lines for two groups of terms.

    ``firstTenTerms[0]`` is written first and ``firstTenTerms[1]`` second.
    Each group is an iterable of 2-element pairs; only the first element
    (the term) is used, and its value is looked up in ``word_freq_docs``.
    The output file is overwritten and encoded as UTF-8.
    """
    with open(output_file, 'w', encoding='utf-8') as out:
        for group_index in (0, 1):
            for pair in firstTenTerms[group_index]:
                term, _unused = pair
                out.write(f"{term} {word_freq_docs[term]}\n")

In [39]:
def process_dir(directory: str, stopWordsFile = ''):
    """Tokenize every file of a directory and accumulate term statistics.

    Args:
        directory: Folder holding the documents. Only regular files directly
            inside it are read (as UTF-8); subdirectories are skipped.
        stopWordsFile: Path handed to ``loadFileStopWords``. When it is the
            empty string, ``None`` is passed to ``tokenizer`` as the
            stop-word list.

    Returns:
        A 5-tuple ``(word_freq_total, word_freq_docs, total_docs,
        total_tokens, docs)`` where:
            * ``word_freq_total`` maps term -> occurrences in the collection,
            * ``word_freq_docs`` maps term -> number of documents containing it,
            * ``total_docs`` is the number of files processed,
            * ``total_tokens`` is the number of tokens in the collection,
            * ``docs`` maps file name -> ``(token_count, distinct_term_count)``.
    """
    stop_words_list = loadFileStopWords(stopWordsFile) if stopWordsFile != '' else None
    directory = os.path.abspath(directory)

    # List the directory once and sort it: os.listdir() returns entries in
    # arbitrary order, which would make tie-breaking downstream vary per run.
    # Non-file entries are dropped because open() cannot read them.
    file_names = [
        name
        for name in sorted(os.listdir(directory))
        if os.path.isfile(os.path.join(directory, name))
    ]

    total_tokens = 0
    word_freq_total = {}
    word_freq_docs = {}
    docs = {}
    for name in file_names:
        with open(os.path.join(directory, name), 'r', encoding='utf8') as f:
            content = f.read()
        # The file is closed before tokenizing; only its text is needed.
        # tokenizer is expected to return a sized iterable of hashable tokens.
        content_tokenized = tokenizer(content, stop_words_list)
        total_tokens += len(content_tokenized)

        # A single counting pass replaces one list.count() scan per distinct word.
        term_counts = Counter(content_tokenized)
        for word, occurrences in term_counts.items():
            word_freq_total[word] = word_freq_total.get(word, 0) + occurrences
            word_freq_docs[word] = word_freq_docs.get(word, 0) + 1
        docs[name] = (len(content_tokenized), len(term_counts))
    return word_freq_total, word_freq_docs, len(file_names), total_tokens, docs

In [40]:
# Process the collection once, then emit the three report files.
word_freq_total, word_freq_docs, total_docs, total_tokens, docs = process_dir(
    COLLECTION_DIR,
    STOP_WORDS_FILE,
)

writeTermsToFile('terminos.txt', word_freq_total, word_freq_docs)

writeStadisticsToFile(
    'estadisticas.txt',
    total_docs,
    total_tokens,
    len(word_freq_total),
    list(word_freq_total),
    shortestAndLargestDoc(docs),
    # Number of terms that occur exactly once in the whole collection.
    sum(1 for frequency in word_freq_total.values() if frequency == 1),
)

writeFrequencyToFile(
    'frequencia.txt',
    word_freq_docs,
    obtener_terminos(word_freq_total),
)