In [22]:
import numpy as np
import pandas as pd
import re
import csv
import os

In [23]:
def procesar_reseñas(fichero_csv, carpeta_objetivo):
    """
    Split a sentiment-labelled review CSV into one text file per review.

    The first row of the CSV is treated as a header and skipped. Every
    following row must have exactly two columns, ``review, sentiment``. Each
    review has its ``<br>`` / ``<br />`` tags removed and is written to
    ``<carpeta_objetivo>/<idx><sentiment>.txt``, where ``idx`` is the 1-based
    position of the row after the header.

    Rows that cannot be processed (wrong number of columns, output file that
    cannot be written) are reported in ``log.txt`` in the current working
    directory and skipped. ``log.txt`` is truncated on every call.

    Args:
        fichero_csv (str): Path to the CSV file with the reviews.
        carpeta_objetivo (str): Folder where the per-review files are saved.
            It is created (including parents) when it does not exist.

    Raises:
        OSError: If the CSV, ``log.txt`` or the target folder cannot be
            opened/created.
    """

    # Matches <br>, <br/> and <br />. Tags are dropped, not replaced by a
    # space, so the text on both sides of a tag ends up adjacent.
    regex_br = re.compile(r"<br\s*/?>")

    # Without this, every open() below fails and is only logged when the
    # folder is missing, so the call would silently produce no files.
    os.makedirs(carpeta_objetivo, exist_ok=True)

    # newline="" lets the csv module handle line endings inside quoted fields.
    with open(fichero_csv, "r", newline="", encoding="utf-8") as f_csv, \
            open("log.txt", "w", encoding="utf-8") as f_log:
        reader = csv.reader(f_csv, delimiter=",")
        next(reader, None)  # Skip header row; tolerate a completely empty file

        for idx, row in enumerate(reader, start=1):
            try:
                # Unpacking lives inside the try so one malformed row is
                # logged and skipped and does not abort the whole run.
                review, sentiment = row

                review_sin_br = regex_br.sub("", review)

                # File name: row index, sentiment label and extension.
                filename = os.path.join(carpeta_objetivo, f"{idx}{sentiment}.txt")

                with open(filename, "w", encoding="utf-8") as f_out:
                    f_out.write(review_sin_br)
            except (OSError, ValueError) as e:
                f_log.write(f"Error al procesar reseña {idx}: {e}\n")

# Module-level locations: the CSV to read and the folder for the per-review files.
# NOTE(review): `fichero_csv` is read again by a later cell that indexes its rows
# with line numbers taken from "IMDB_raiz.csv" -- confirm both files line up.
fichero_csv = "IMDB Dataset.csv"
carpeta_objetivo = "script"

# Disabled on purpose: enabling this call writes one file per CSV row into `carpeta_objetivo`.
#procesar_reseñas(fichero_csv, carpeta_objetivo)



def get_word(fichero_csv, palabra1, palabra2):
    """
    Return the line numbers of a text file whose lines contain both words.

    Matching is a plain, case-sensitive substring test on the raw line, so
    ``"prison"`` also matches ``"prisoner"``. The file is read as text line by
    line, not parsed as CSV: numbers refer to physical lines, starting at 1,
    and a header line (if any) is counted like any other line.

    Args:
        fichero_csv (str): Path to the UTF-8 text/CSV file to scan.
        palabra1 (str): First substring that must occur in the line.
        palabra2 (str): Second substring that must occur in the line.

    Returns:
        list[int]: 1-based line numbers, in ascending order, of the lines that
        contain both substrings. Empty when no line matches.
    """
    coincidencias = []

    # A single pass is enough: a line qualifies exactly when both substrings
    # occur in it, so no per-word index or second read of the file is needed.
    with open(fichero_csv, newline='', encoding="utf-8") as csvfile:
        for numero, linea in enumerate(csvfile, start=1):
            if palabra1 in linea and palabra2 in linea:
                coincidencias.append(numero)

    return coincidencias

In [24]:
# 1-based line numbers of "IMDB_raiz.csv" whose text contains both substrings.
coincidences = get_word("IMDB_raiz.csv", "brutal", "prison")
print(f"Coincidences: {coincidences}")

Coincidences: [1, 375, 805, 2349, 5026, 5741, 6151, 6501, 6706, 8167, 10466, 11893, 14363, 15169, 15587, 15794, 19259, 19815, 20062, 20683, 22019, 22556, 25684, 26718, 28854, 29041, 29242, 30115, 31847, 33811, 34394, 34462, 35173, 37005, 37380, 39079, 39334, 39368, 39429, 40193, 40208, 42084, 42868, 43669, 45035, 45222, 45815, 46402, 46562, 46817, 47105, 47287, 47984, 48297, 49668]


In [17]:
def Jaccard(consulta, doc):
    """
    Jaccard similarity between a query and the distinct words of a document.

    The document is tokenised with the pattern ``[\\w']+`` (runs of word
    characters and apostrophes). Comparison is case-sensitive and duplicates
    on either side are ignored because both sides are turned into sets.

    Args:
        consulta: Iterable of query words (anything ``set()`` accepts). Pass a
            list of words, not a single string: a string would be split into
            its individual characters.
        doc (str): Document text.

    Returns:
        float: ``|query ∩ tokens| / |query ∪ tokens|`` in ``[0, 1]``; ``0.0``
        when both the query and the document have no words at all.
    """
    setcon = set(consulta)
    setdoc = set(re.findall(r"[\w']+", doc))

    union = setcon | setdoc
    if not union:
        # Empty query and empty document: avoid dividing by zero.
        return 0.0

    return len(setcon & setdoc) / len(union)

In [44]:
# Score the first matching review against the query terms with Jaccard().
# NOTE(review): `coincidences` holds 1-based line numbers produced from
# "IMDB_raiz.csv", but here they are used as 0-based row indices into the rows
# csv.reader parses from `fichero_csv` (presumably a different file). The
# intended review is only selected if `fichero_csv` has exactly one extra leading
# row (e.g. a header) and rows map one-to-one to lines -- TODO confirm.
# Raises IndexError when `coincidences` is empty.
with open(fichero_csv, newline='', encoding="utf-8") as file:
    reader = csv.reader(file)
    lines = list(reader)  # every parsed row is loaded into memory
    doc = lines[coincidences[0]][0]  # column 0 of the selected row
    consulta = ['brutal', 'prison']
    print(Jaccard(consulta, doc))

0.005050505050505051
