In [1]:
import os
import re
from collections import Counter

In [2]:
# Directory whose files are tokenized; passed to process_dir in the final cell.
COLLECTION_DIR = '../RI-tknz-data'
# Stop-word file read by loadFileStopWords (each stripped line becomes one entry).
STOP_WORDS_FILE = '../stop-words.txt'
# Maximum and minimum number of characters of a word matched by findWords.
MAX_LONG = 20
MIN_LONG = 3

In [3]:
def loadFileStopWords(stop_words_file: str):
    """Read a stop-word file and return its entries as a set.

    Args:
        stop_words_file: Path to a UTF-8 text file; it is resolved to an
            absolute path before opening.

    Returns:
        A set with every line of the file stripped of surrounding whitespace
        (a blank line contributes the empty string).
    """
    path = os.path.abspath(stop_words_file)
    entries = set()
    with open(path, 'r', encoding='utf-8') as handle:
        for line in handle:
            entries.add(line.strip())
    return entries

In [4]:
def removeStopWords(words_list, stop_words_list):
    """Return the tokens of ``words_list`` that are not stop words.

    Args:
        words_list: Iterable of tokens, processed in order.
        stop_words_list: Container supporting ``in`` (a set in this notebook).

    Returns:
        A new list with the surviving tokens in their original order;
        duplicates are kept.
    """
    kept = []
    for token in words_list:
        if token in stop_words_list:
            continue
        kept.append(token)
    return kept

In [5]:
def averageTermLength(terms):
    """Return the mean length of the items in ``terms``.

    Args:
        terms: Sized collection of strings (anything supporting ``len``).

    Returns:
        The sum of the item lengths divided by the number of items, or ``0``
        when ``terms`` is empty or otherwise falsy.
    """
    if terms:
        # Add up every term's length, then divide by how many terms there are.
        return sum(map(len, terms)) / len(terms)
    # Nothing to average.
    return 0


In [6]:
def shortestAndLargestDoc(docs):
    """Return the entries of the documents with the fewest and the most tokens.

    Args:
        docs: Mapping from a document key to a sequence whose first element
            is that document's token count.

    Returns:
        A pair ``(docs[shortest_key], docs[largest_key])``. When several
        documents share the minimum (or maximum) token count, the first one
        in the mapping's iteration order is chosen.

    Raises:
        ValueError: If ``docs`` is empty.
    """
    if not docs:
        raise ValueError("docs must contain at least one document")

    def token_count(key):
        return docs[key][0]

    # min()/max() compare real values instead of starting from a sentinel of
    # 0, so a collection made only of zero-token documents still yields a
    # valid "largest" key.
    shortest_key = min(docs, key=token_count)
    largest_key = max(docs, key=token_count)
    return docs[shortest_key], docs[largest_key]

In [7]:
def obtener_terminos(diccionario, limit=10):
    """Return the least and the most frequent entries of a frequency mapping.

    Args:
        diccionario: Mapping from term to its frequency.
        limit: Maximum number of entries in each returned list (default 10).

    Returns:
        A pair ``(least_frequent, most_frequent)`` of lists of
        ``(term, frequency)`` tuples. The first is sorted by ascending
        frequency, the second by descending frequency. Both sorts are stable,
        so terms with equal frequency keep the mapping's iteration order.

    Raises:
        ValueError: If ``limit`` is negative.
    """
    if limit < 0:
        raise ValueError("limit must be non-negative")

    def by_frequency(item):
        return item[1]

    # Ascending order puts the least frequent terms first.
    least_frequent = sorted(diccionario.items(), key=by_frequency)[:limit]
    # Descending order puts the most frequent terms first.
    most_frequent = sorted(diccionario.items(), key=by_frequency, reverse=True)[:limit]
    return least_frequent, most_frequent


In [8]:
def writeTermsToFile(output_file: str, word_freq_total, word_freq_docs):
    """Write one ``term total_frequency document_frequency`` line per term.

    Args:
        output_file: Path of the file to create or overwrite (UTF-8).
        word_freq_total: Mapping from term to its total frequency.
        word_freq_docs: Mapping from term to its document frequency; it must
            contain every key of ``word_freq_total``.

    The lines are written in ascending order of term.
    """
    # Sort before opening the file, as the terms drive the output order.
    ordered_entries = sorted(word_freq_total.items())
    with open(output_file, 'w', encoding='utf-8') as out:
        for term, total in ordered_entries:
            out.write(f"{term} {total} {word_freq_docs[term]}\n")

In [9]:
def writeStadisticsToFile(output_file: str, total_docs, total_tokens, total_terms, key_list_terms, shortestAndLargestDoc, one_time_terms):
    """Write the collection statistics report, one group of figures per line.

    Lines written, in order:
      1. ``total_docs``
      2. ``total_tokens total_terms``
      3. ``total_tokens`` and ``total_terms`` as a percentage of ``total_tokens``
      4. average length of the items in ``key_list_terms``
      5. the first two values of the shortest document's entry followed by
         the first two values of the largest document's entry
      6. ``one_time_terms``

    Args:
        output_file: Path of the file to create or overwrite (UTF-8).
        total_docs: Number of documents.
        total_tokens: Number of tokens in the collection.
        total_terms: Number of terms in the collection.
        key_list_terms: Collection of terms handed to ``averageTermLength``.
        shortestAndLargestDoc: Pair ``(shortest_entry, largest_entry)``; each
            entry is indexed at positions 0 and 1.
        one_time_terms: Value written on the last line.
    """
    # All figures are computed before the file is opened so that a failure
    # cannot leave a half-written report behind.
    if total_tokens:
        # NOTE(review): the first figure divides total_tokens by itself, so it
        # is always 100.0. A per-document average may have been intended;
        # confirm before changing the report format.
        tokens_pct = total_tokens * 100 / total_tokens
        terms_pct = total_terms * 100 / total_tokens
    else:
        # No tokens at all: report zeros instead of raising ZeroDivisionError.
        tokens_pct = terms_pct = 0.0
    average_length = averageTermLength(key_list_terms)
    shortest_doc = shortestAndLargestDoc[0]
    largest_doc = shortestAndLargestDoc[1]

    report_lines = [
        f"{total_docs}\n",
        f"{total_tokens} {total_terms}\n",
        f"{tokens_pct} {terms_pct}\n",
        f"{average_length}\n",
        f"{shortest_doc[0]} {shortest_doc[1]} {largest_doc[0]} {largest_doc[1]}\n",
        f"{one_time_terms}\n",
    ]
    with open(output_file, 'w', encoding='utf-8') as file:
        file.writelines(report_lines)

In [10]:
def writeFrequencyToFile(output_file: str, word_freq_docs, firstTenTerms):
    """Write ``term document_frequency`` lines for two groups of terms.

    Args:
        output_file: Path of the file to create or overwrite (UTF-8).
        word_freq_docs: Mapping from term to its document frequency.
        firstTenTerms: Sequence whose items 0 and 1 are iterables of
            ``(term, value)`` pairs; only the term of each pair is used.

    Every term of group 0 is written first, followed by every term of group 1.
    """
    with open(output_file, 'w', encoding='utf-8') as out:
        for group_index in (0, 1):
            for pair in firstTenTerms[group_index]:
                term, _ = pair
                out.write(f"{term} {word_freq_docs[term]}\n")

In [11]:
def findAbbreviations(content):
    """Return every abbreviation-like match found in ``content``.

    A match is one ASCII letter at a word boundary, followed by one or more
    lowercase characters from the set ``bcdfgh-np-tvxz``, followed by a dot.

    Args:
        content: Text to scan.

    Returns:
        List of matched strings (trailing dot included), in order of
        appearance.
    """
    # NOTE(review): the `(?![A-Z])` lookahead sits directly before the literal
    # dot, so it can never reject a match; also the `h-n` range admits `i`.
    # Kept as is - confirm the intended pattern before tightening it.
    abbreviation_pattern = re.compile(r'\b[a-zA-Z][bcdfgh-np-tvxz]+(?![A-Z])\.')
    return [match.group(0) for match in abbreviation_pattern.finditer(content)]

In [12]:
def findEmailsAndUrls(content):
    """Return the e-mail addresses and ``www.`` URLs found in ``content``.

    Args:
        content: Text to scan.

    Returns:
        A list with every e-mail match first, then every URL match. A URL is
        returned with its ``http://`` / ``https://`` prefix when the text has
        one, and without it otherwise.
    """
    email_pattern = r'\b[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+'
    url_pattern = r'(https?://)?(www\.[a-z0-9]+(?:\.[a-z0-9]+)+)'

    found = re.findall(email_pattern, content)
    for match in re.finditer(url_pattern, content):
        # The scheme group is optional; treat a missing one as empty text.
        scheme, host = match.groups(default='')
        found.append(scheme + host)
    return found

In [13]:
def findNumbersAndPhones(content):
    """Return phone-like and number matches found in ``content``.

    Args:
        content: Text to scan.

    Returns:
        A list with every phone match first, then every run of digits that
        is preceded by a space (returned without that space). The two scans
        are independent, so the same digits can appear in both parts.
    """
    # The second alternative (`\+?\d{8,}`) can never be selected: the first
    # one already matches any run of six or more digits and is tried earlier.
    phone_pattern = re.compile(r'\+?\d{6,}|\+?\d{8,}|\d{2,}-\d{5,}|\+?\d+-\d{2,}-\d{5,}')
    number_pattern = re.compile(r' (\d+)')

    phones = phone_pattern.findall(content)
    numbers = [match.group(1) for match in number_pattern.finditer(content)]
    return phones + numbers

In [14]:
def findWords(content):
    """Return every run of letters in ``content`` within the length bounds.

    A run is ``MIN_LONG`` to ``MAX_LONG`` consecutive letters, where a letter
    is an ASCII letter or an accented Latin-1 letter (``À``-``ÿ`` without the
    ``×`` and ``÷`` signs).

    The pattern is not anchored to word boundaries, so a run longer than
    ``MAX_LONG`` is returned as consecutive chunks and a run shorter than
    ``MIN_LONG`` is not returned at all.

    Args:
        content: Text to scan.

    Returns:
        List of matched strings in order of appearance.
    """
    # Explicit ranges on purpose: `A-z` would also accept the punctuation
    # between `Z` and `a` ([ \ ] ^ _ `), and `À-ú` would accept × and ÷ while
    # leaving out letters such as `ü`.
    REGEX_WORDS = f'[A-Za-zÀ-ÖØ-öø-ÿ]{{{MIN_LONG},{MAX_LONG}}}'
    return re.findall(REGEX_WORDS, content)

In [15]:
def tokenizer(content, stop_words_list = None):
    """Extract the tokens of a text, optionally dropping stop words.

    The text is scanned independently by four extractors and their results
    are concatenated in this order: abbreviations, e-mails and URLs, numbers
    and phones, words. Because every extractor sees the whole text, the same
    piece of text can produce more than one token.

    Args:
        content: Text to tokenize.
        stop_words_list: Container of tokens to discard, or ``None`` to keep
            every token.

    Returns:
        List of tokens.
    """
    tokens = (
        findAbbreviations(content)
        + findEmailsAndUrls(content)
        + findNumbersAndPhones(content)
        + findWords(content)
    )
    # Identity check: an empty container is still a valid stop-word list.
    if stop_words_list is not None:
        tokens = removeStopWords(tokens, stop_words_list)
    return tokens

In [16]:
def process_dir(directory: str, stopWordsFile = ''):
    """Tokenize every file of a directory and collect frequency statistics.

    Args:
        directory: Directory holding the documents; resolved to an absolute
            path. Only regular files directly inside it are processed, in
            sorted name order, and each is read as UTF-8 text.
        stopWordsFile: Path of the stop-word file. An empty string (the
            default) or ``None`` disables stop-word filtering.

    Returns:
        A tuple ``(word_freq_total, word_freq_docs, total_docs, total_tokens,
        docs)`` where:
          * ``word_freq_total`` maps each term to its number of occurrences
            in the whole collection;
          * ``word_freq_docs`` maps each term to the number of documents
            containing it;
          * ``total_docs`` is the number of files processed;
          * ``total_tokens`` is the number of tokens in the whole collection;
          * ``docs`` maps each file name to
            ``(token_count, distinct_term_count)`` for that file.
    """
    stop_words_list = loadFileStopWords(stopWordsFile) if stopWordsFile else None
    directory = os.path.abspath(directory)

    # List the directory once. Sub-directories (e.g. `.ipynb_checkpoints`)
    # are skipped because they cannot be opened as documents, and sorting
    # makes the processing order reproducible across runs.
    file_names = [
        name for name in sorted(os.listdir(directory))
        if os.path.isfile(os.path.join(directory, name))
    ]
    total_docs = len(file_names)

    total_tokens = 0
    word_freq_total = {}
    word_freq_docs = {}
    docs = {}
    for file_name in file_names:
        complete_route = os.path.join(directory, file_name)
        with open(complete_route, 'r', encoding='utf8') as f:
            text = f.read()

        tokens = tokenizer(text, stop_words_list)
        total_tokens += len(tokens)

        # One pass over the tokens instead of a list.count() per distinct term.
        term_counts = Counter(tokens)
        for term, occurrences in term_counts.items():
            word_freq_total[term] = word_freq_total.get(term, 0) + occurrences
            word_freq_docs[term] = word_freq_docs.get(term, 0) + 1

        # The second figure is the number of distinct terms. Summing the
        # per-term counts would merely reproduce the token count.
        docs[file_name] = (len(tokens), len(term_counts))
    return word_freq_total, word_freq_docs, total_docs, total_tokens, docs

In [17]:
# Process the whole collection with stop-word filtering, then write the three report files.
word_freq_total, word_freq_docs, total_docs, total_tokens, docs = process_dir(COLLECTION_DIR, STOP_WORDS_FILE)
# terminos.txt: one line per term with its total frequency and its document frequency.
writeTermsToFile('terminos.txt', word_freq_total, word_freq_docs)
# estadisticas.txt: collection-level figures; the last argument counts the terms whose total frequency is exactly 1.
writeStadisticsToFile('estadisticas.txt', total_docs, total_tokens, len(word_freq_total), list(word_freq_total.keys()), shortestAndLargestDoc(docs), len([value for value in word_freq_total.values() if value == 1]))
# frequencia.txt: document frequency of the least and the most frequent terms returned by obtener_terminos.
writeFrequencyToFile('frequencia.txt', word_freq_docs, obtener_terminos(word_freq_total))