In [1]:
import pandas as pd
import numpy as np
import os

# **Dataset : Fashion Product Images**

In [2]:
# Columns kept from the raw catalogue; every other column in the CSV is discarded
selected_columns = [
    'gender',
    'masterCategory',
    'subCategory',
    'articleType',
    'baseColour',
    'season',
    'year',
    'usage',
    'productDisplayName',
]

# Load the CSV and narrow it to the selected columns in a single expression
fashion_data = pd.read_csv('csv/fashion_product_images.csv', delimiter=',')[selected_columns]

# Preview the first rows (bare expression so the notebook renders the frame)
fashion_data.head()

Unnamed: 0,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,productDisplayName
0,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011.0,Casual,Turtle Check Men Navy Blue Shirt
1,Men,Apparel,Bottomwear,Jeans,Blue,Summer,2012.0,Casual,Peter England Men Party Blue Jeans
2,Women,Accessories,Watches,Watches,Silver,Winter,2016.0,Casual,Titan Women Silver Watch
3,Men,Apparel,Bottomwear,Track Pants,Black,Fall,2011.0,Casual,Manchester United Men Solid Black Track Pants
4,Men,Apparel,Topwear,Tshirts,Grey,Summer,2012.0,Casual,Puma Men Grey T-shirt


**Ahora concatenamos y creamos los txt's**

In [3]:
# Folder where the per-row text files are stored
output_folder = 'unprocessed_txt'

# Create the folder if it does not exist yet
os.makedirs(output_folder, exist_ok=True)

# Write one text file per row of the DataFrame
for index, row in fashion_data.iterrows():
    # Turn the row into a single string, with the column values separated by one space
    row_str = ' '.join(str(item) for item in row)

    # Files are named doc<N>.txt, where N is the row index plus one
    file_path = os.path.join(output_folder, f'doc{index + 1}.txt')

    # Write explicitly as UTF-8: the pre-processing step reads these files back
    # with encoding="utf-8", and the platform default (e.g. cp1252 on Windows)
    # would otherwise make non-ASCII product names unreadable there.
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(row_str)

## **Pre-procesamiento**

In [4]:
import nltk 
import nltk.downloader
nltk.downloader.download('punkt')
from nltk.corpus import stopwords #Stopwords
from nltk.stem import SnowballStemmer #Stemming
import re


# Download the NLTK stop-word corpus
nltk.download('stopwords')

# The whole corpus is in English, so the stop-word list and the stemmer share that language
corpus_language = 'english'

# NOTE: this rebinds the name `stopwords` from the imported corpus module to a plain list of words
stopwords = stopwords.words(corpus_language)

# Snowball stemmer used to reduce each token to its stem
stemmer = SnowballStemmer(corpus_language)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
directorio_entrada = 'unprocessed_txt'
directorio_salida = 'processed_txt'

# Create the output folder if it does not exist yet
os.makedirs(directorio_salida, exist_ok=True)

# Compiled once outside the loop: accepts tokens made only of ASCII letters,
# which drops numbers, punctuation and mixed tokens such as "2011.0"
patron_alfabetico = re.compile("^[a-zA-Z]+$")

# Number of .txt documents processed
contador_docs = 0

for filename in os.listdir(directorio_entrada):
    # Skip anything that is not one of the text documents
    if not filename.endswith(".txt"):
        continue

    contador_docs += 1

    filepath = os.path.join(directorio_entrada, filename)

    # Read and lowercase the document; the context manager closes the handle
    # immediately instead of leaving one open handle per document to the GC
    with open(filepath, encoding="utf-8") as input_file:
        file_content = input_file.read().lower()

    # Tokenize the text
    tokens = nltk.word_tokenize(file_content)

    # Drop stop words and every token that is not purely alphabetic
    texto_filtrado = [
        word for word in tokens
        if word not in stopwords and patron_alfabetico.match(word)
    ]

    # Stem each remaining token
    texto_filtrado = [stemmer.stem(w) for w in texto_filtrado]

    texto_procesado = " ".join(texto_filtrado)

    # The processed document keeps the same file name in the output folder
    salida_filepath = os.path.join(directorio_salida, filename)

    with open(salida_filepath, 'w', encoding="utf-8") as output_file:
        output_file.write(texto_procesado)


# **Índice Invertido en Memoria Secundaria**

## **Cómo funciona la creación de los bloques:**

In [6]:
import sys
import collections
import os
import heapq
import sys


### **Métodos auxiliares**

In [7]:
def sort_terms(term_postings_list):
    """Build a new dictionary whose terms are inserted in sorted order.

    Args:
        term_postings_list: mapping of term -> list of docIDs, where a docID
            is repeated once per occurrence of the term in that document.

    Returns:
        dict mapping each term to the result of ``calculate_tftd`` for its
        postings list, with keys inserted in ascending ``sorted()`` order.
    """
    print(" -- Sorting terms...")
    # Dicts preserve insertion order, so filling the result in sorted key
    # order yields a dictionary that iterates alphabetically.
    return {
        term: calculate_tftd(list(term_postings_list[term]))
        for term in sorted(term_postings_list)
    }

def calculate_tftd(pl_with_duplicates):
    """Collapse a postings list with repeated docIDs into [docID, frequency] pairs.

    Args:
        pl_with_duplicates: iterable of docIDs, one entry per term occurrence.

    Returns:
        List of two-element lists ``[docID, count]``, ordered by the first
        appearance of each docID in the input.
    """
    frequencies = collections.Counter(pl_with_duplicates)
    return [[doc_id, occurrences] for doc_id, occurrences in frequencies.items()]

In [8]:
def write_block_to_disk(term_postings_list, block_number, output_dir='index_blocks'):
    """Write the index of one block (dictionary + postings lists) to disk.

    The block is stored as ``<output_dir>/block-<block_number>.txt`` with one
    line per term, in the iteration order of ``term_postings_list``::

        term:<postings list>
        e.g. cat:[['doc4.txt', 2], ['doc9.txt', 1]]

    Args:
        term_postings_list: mapping of term -> postings list.
        block_number: number used to build the block file name.
        output_dir: folder that receives the block file; created if missing.
            Defaults to ``'index_blocks'``.
    """
    # Create the output folder if it does not exist yet
    os.makedirs(output_dir, exist_ok=True)

    block_name = 'block-' + str(block_number) + '.txt'
    print(" -- Writing term-positing list block: " + block_name + "...")

    # Open in 'w' mode (not append): re-running the indexing must replace an
    # existing block instead of appending a second, duplicated copy of its
    # terms, which would also break the sorted order within the block.
    # UTF-8 matches the encoding used when the blocks are read back for merging.
    with open(os.path.join(output_dir, block_name), 'w', encoding='utf-8') as block:
        for term, postings in term_postings_list.items():
            block.write(str(term) + ":" + str(postings) + "\n")

### **Creación de los bloques**

In [9]:
# MERGE BLOCKS

def merge_blocks(block_folder, output_file):
    """Merge all sorted block files of a folder into one sorted index file.

    Every file in ``block_folder`` is expected to contain lines of the form
    ``term:<postings list>`` sorted by term. The blocks are merged with a
    k-way heap merge. When the same term appears in several blocks, its
    postings lists are concatenated (in block order) and the term is written
    exactly once.

    Args:
        block_folder: folder containing the block files.
        output_file: path of the merged index file (overwritten).
    """
    # Local import so this function does not depend on the notebook's import cell
    import ast

    def read_entry(block_handle):
        """Read the next ``(term, postings)`` pair, or None at end of block."""
        line = block_handle.readline().strip()
        if not line:
            return None
        term, postings = line.split(':', 1)
        # literal_eval only parses Python literals, unlike eval, which would
        # execute arbitrary code found in the block file
        return term, ast.literal_eval(postings)

    block_files = sorted(
        os.path.join(block_folder, block) for block in os.listdir(block_folder)
    )
    block_handles = []
    try:
        for path in block_files:
            block_handles.append(open(path, 'r', encoding='utf-8'))

        # Heap entries are (term, block_index, postings). Postings travel with
        # their own entry, so equal terms from different blocks never overwrite
        # each other. (term, block_index) is unique in the heap at any time,
        # so tuple comparison never reaches the postings list.
        merge_heap = []
        for block_index, block_handle in enumerate(block_handles):
            entry = read_entry(block_handle)
            if entry is not None:
                heapq.heappush(merge_heap, (entry[0], block_index, entry[1]))

        with open(output_file, 'w', encoding='utf-8') as output_handle:
            current_term = None
            current_postings = []

            while merge_heap:
                term, block_index, postings = heapq.heappop(merge_heap)

                # A new term starts: flush the one accumulated so far
                if term != current_term:
                    if current_term is not None:
                        output_handle.write(f"{current_term}:{str(current_postings)}\n")
                    current_term = term
                    current_postings = []

                current_postings.extend(postings)

                # Refill the heap from the block that just yielded an entry
                entry = read_entry(block_handles[block_index])
                if entry is not None:
                    heapq.heappush(merge_heap, (entry[0], block_index, entry[1]))

            # Flush the last accumulated term
            if current_term is not None:
                output_handle.write(f"{current_term}:{str(current_postings)}\n")
    finally:
        # Close every block handle even if parsing or writing failed
        for block_handle in block_handles:
            block_handle.close()


In [10]:
# SPIMI

def spimi_invert(directory, block_size_limit, output_folder):
    """Build an inverted index on disk with a SPIMI-style block strategy.

    Every ``.txt`` file in ``directory`` is split on whitespace and its terms
    are added to an in-memory dictionary (term -> list of file names, one
    entry per occurrence). Whenever the dictionary exceeds
    ``block_size_limit`` it is sorted, written to ``index_blocks`` as a block
    and reset. After the last document the remaining dictionary is flushed
    too, and all blocks are merged into ``<output_folder>/inverted_index.txt``.

    Args:
        directory: folder containing the pre-processed ``.txt`` documents.
        block_size_limit: threshold compared against
            ``sys.getsizeof(dictionary)`` to decide when to flush a block.
        output_folder: folder that receives ``inverted_index.txt``.
    """
    blocks_folder = 'index_blocks'

    # Remove block files left over from a previous run: the merge step reads
    # every file in this folder, so stale blocks would be merged again.
    os.makedirs(blocks_folder, exist_ok=True)
    for stale_block in os.listdir(blocks_folder):
        if stale_block.startswith('block-') and stale_block.endswith('.txt'):
            os.remove(os.path.join(blocks_folder, stale_block))

    block_number = 0
    dictionary = {}

    for docID in os.listdir(directory):
        if not docID.endswith(".txt"):
            continue

        file_route = os.path.join(directory, docID)

        # Read the document and split it into terms; the context manager
        # closes the handle right away
        with open(file_route, encoding="utf-8") as document:
            terms = document.read().lower().split()

        # One docID entry per occurrence; frequencies are computed on flush
        for term in terms:
            dictionary.setdefault(term, []).append(docID)

        # NOTE: sys.getsizeof reports only the dict object itself, not the
        # postings lists it references, so this is a rough proxy for memory use.
        if sys.getsizeof(dictionary) > block_size_limit:
            write_block_to_disk(sort_terms(dictionary), block_number)
            block_number += 1
            dictionary = {}

    # Flush whatever is left after the last document. The previous
    # "counter == count - 1" check fired one document early and never wrote
    # the final dictionary, so the last document(s) were missing from the index.
    if dictionary:
        write_block_to_disk(sort_terms(dictionary), block_number)

    print("BLOCKS creation complete!")
    os.makedirs(output_folder, exist_ok=True)
    output_file = os.path.join(output_folder, 'inverted_index.txt')
    merge_blocks(blocks_folder, output_file)

# Folder that receives the merged inverted index
output_folder = 'output_folder'
os.makedirs(output_folder, exist_ok=True)

# Threshold compared against sys.getsizeof(dictionary) to trigger a block flush
block_size_limit = 900 * 100

# Build the blocks from the pre-processed documents and merge them
spimi_invert('processed_txt', block_size_limit, output_folder)



 -- Sorting terms...
 -- Writing term-positing list block: block-0.txt...
 -- Sorting terms...
 -- Writing term-positing list block: block-1.txt...
 -- Sorting terms...
 -- Writing term-positing list block: block-2.txt...
 -- Sorting terms...
 -- Writing term-positing list block: block-3.txt...
 -- Sorting terms...
 -- Writing term-positing list block: block-4.txt...
 -- Sorting terms...
 -- Writing term-positing list block: block-5.txt...
BLOCKS creation complete!


In [None]:
# BINARY_SEARCH



In [10]:
# Threshold compared against sys.getsizeof(dictionary) to trigger a block flush
block_size_limit = 900 * 100

# spimi_invert requires the destination folder of the merged index as its
# third argument; calling it with only two arguments raises TypeError.
output_folder = 'output_folder'
os.makedirs(output_folder, exist_ok=True)

# Run SPIMI over the pre-processed documents
spimi_invert('processed_txt', block_size_limit, output_folder)


 -- Sorting terms...
 -- Writing term-positing list block: block-0.txt...
 -- Sorting terms...
 -- Writing term-positing list block: block-1.txt...
 -- Sorting terms...
 -- Writing term-positing list block: block-2.txt...
 -- Sorting terms...
 -- Writing term-positing list block: block-3.txt...
 -- Sorting terms...
 -- Writing term-positing list block: block-4.txt...
 -- Sorting terms...
 -- Writing term-positing list block: block-5.txt...
BLOCKS creation complete!
