## Importación de bibliotecas y descarga de datos

In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import random
import os

import openai
from openai import OpenAI
from dotenv import load_dotenv

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

from gensim.models import Word2Vec

from scipy.spatial.distance import cosine

In [50]:
# copy kaggle.json to /root/.kaggle/ folder so that kaggle cli can access it.
if not os.path.exists('/.kaggle'):
    !mkdir /.kaggle
    !mv kaggle.json /.kaggle
    !mv /.kaggle /root/
    !chmod 600 ~/.kaggle/kaggle.json

if not os.path.exists('60k-stack-overflow-questions-with-quality-rate.zip'):
  !kaggle datasets download imoore/60k-stack-overflow-questions-with-quality-rate
  !unzip *.zip

df = pd.read_csv("train.csv")
df = df[:200]
print(df.shape)
df.head()

mkdir: /.kaggle: Read-only file system
mv: rename kaggle.json to /.kaggle: No such file or directory
mv: rename /.kaggle to /root/: No such file or directory
(200, 6)


Unnamed: 0,Id,Title,Body,Tags,CreationDate,Y
0,34552656,Java: Repeat Task Every Random Seconds,<p>I'm already familiar with repeating tasks e...,<java><repeat>,2016-01-01 00:21:59,LQ_CLOSE
1,34553034,Why are Java Optionals immutable?,<p>I'd like to understand why Java 8 Optionals...,<java><optional>,2016-01-01 02:03:20,HQ
2,34553174,Text Overlay Image with Darkened Opacity React...,<p>I am attempting to overlay a title over an ...,<javascript><image><overlay><react-native><opa...,2016-01-01 02:48:24,HQ
3,34553318,Why ternary operator in swift is so picky?,"<p>The question is very simple, but I just cou...",<swift><operators><whitespace><ternary-operato...,2016-01-01 03:30:17,HQ
4,34553755,hide/show fab with scale animation,<p>I'm using custom floatingactionmenu. I need...,<android><material-design><floating-action-but...,2016-01-01 05:21:48,HQ


## Uso de OpenAI

In [51]:
# Load environment variables (via python-dotenv)
load_dotenv()

# OpenAI configuration: model name and an API client keyed from the environment
engine = "gpt-3.5-turbo"
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [52]:
# get_embedding requests an embedding for a text from OpenAI
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the OpenAI embedding for ``text``.

    Args:
        text: Input forwarded unchanged to the embeddings endpoint.
        model: Name of the embedding model. Defaults to the model this
            notebook has always used, so existing calls are unaffected.

    Returns:
        The ``embedding`` attribute of the first item in the API response.
    """
    response = client.embeddings.create(model=model, input=text)
    # Only the first item of the response is used.
    return response.data[0].embedding

In [53]:
# Get one embedding per title, reusing the cached pickle when it matches this data.
# Without loading the cache here, `df` has no 'embedding' column whenever the pickle
# already exists (fresh kernel or re-run), and the cells that read it fail.
_cache_path = 'stack_overflow_embeddings.pkl'
# NOTE: unpickling can execute arbitrary code; only load a cache this notebook created.
_cached = pd.read_pickle(_cache_path) if os.path.exists(_cache_path) else None
if (
    _cached is not None
    and {'Id', 'embedding'} <= set(_cached.columns)
    and _cached['Id'].tolist() == df['Id'].tolist()
):
    # Same questions in the same order: reuse the stored vectors, no API calls.
    df['embedding'] = _cached['embedding'].to_numpy()
else:
    df['embedding'] = df['Title'].apply(get_embedding)

In [54]:
# Download the NLTK resources used below: tokenizer data and stop-word lists
for _paquete in ('punkt', 'stopwords'):
    nltk.download(_paquete)

# English stop words, stored as a set
stop_words = set(stopwords.words('english'))

def preprocesar(texto):
    """Tokenize ``texto`` in lowercase, keeping alphabetic tokens that are not stop words.

    Returns the surviving tokens as a list, in their original order.
    """
    tokens_utiles = []
    for token in word_tokenize(texto.lower()):
        if not token.isalpha():
            continue
        if token in stop_words:
            continue
        tokens_utiles.append(token)
    return tokens_utiles

# Tokenized, stop-word-free version of every title
df['Title_preprocesado'] = df['Title'].map(preprocesar)

def limpiar_etiquetas(tags_str):
    """Split a tag string such as ``"<java><repeat>"`` into a list of tag names.

    Every '>' is turned into a space and every '<' is removed; the result is
    then split on whitespace.
    """
    tabla = str.maketrans({'>': ' ', '<': None})
    return tags_str.translate(tabla).split()

# Convert every raw tag string into a list of tag names
df['Tags_clean'] = df['Tags'].map(limpiar_etiquetas)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/n.c.rodriguez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/n.c.rodriguez/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [55]:
# Save the DataFrame so the embeddings do not have to be requested from OpenAI again.
# Write only when the embeddings are present: a run that skipped the API calls would
# otherwise overwrite a good cache with a frame that lacks the 'embedding' column.
if 'embedding' in df.columns:
    df.to_pickle("stack_overflow_embeddings.pkl")

## Uso de Word2Vec

In [56]:
# Train Word2Vec on the tag lists.
# A single worker thread with an explicit seed keeps the tag vectors repeatable between
# runs; multi-threaded training introduces ordering jitter. seed=1 is gensim's documented
# default, stated here explicitly. Reproducibility across interpreter launches also
# requires PYTHONHASHSEED to be set (see the gensim Word2Vec documentation).
modelo_etiquetas = Word2Vec(
    df['Tags_clean'].tolist(),
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
    seed=1,
)

In [57]:
# Reload the DataFrame from the pickle file written earlier in this notebook.
# NOTE(review): unpickling can execute arbitrary code; only load files you created.
df = pd.read_pickle("stack_overflow_embeddings.pkl")

In [58]:
# Average the embeddings of a question's tags
def promedio_embeddings_etiquetas(etiquetas, modelo):
    """Return the element-wise mean of the vectors of the known tags.

    Args:
        etiquetas: Iterable of tag names.
        modelo: Object exposing ``wv`` (supports ``in`` and indexing by tag)
            and ``vector_size``.

    Returns:
        The mean of ``modelo.wv[tag]`` over the tags present in ``modelo.wv``,
        or a zero vector of length ``modelo.vector_size`` when none is present.
    """
    vectores = []
    for tag in etiquetas:
        # Tags missing from the model vocabulary are skipped.
        if tag in modelo.wv:
            vectores.append(modelo.wv[tag])

    if not vectores:
        return np.zeros(modelo.vector_size)
    return np.mean(vectores, axis=0)

# Average tag vector for every question, computed with the trained tag model
df['Tags_embedding'] = df['Tags_clean'].apply(promedio_embeddings_etiquetas, args=(modelo_etiquetas,))

In [59]:
# Preview the first rows, including the columns added by the cells above
df.head()

Unnamed: 0,Id,Title,Body,Tags,CreationDate,Y,embedding,Title_preprocesado,Tags_clean,Tags_embedding
0,34552656,Java: Repeat Task Every Random Seconds,<p>I'm already familiar with repeating tasks e...,<java><repeat>,2016-01-01 00:21:59,LQ_CLOSE,"[-0.02291659079492092, -0.01639697514474392, -...","[java, repeat, task, every, random, seconds]","[java, repeat]","[-0.0022062121, 0.0014641413, -0.0012125303, -..."
1,34553034,Why are Java Optionals immutable?,<p>I'd like to understand why Java 8 Optionals...,<java><optional>,2016-01-01 02:03:20,HQ,"[0.0082218237221241, -0.027794253081083298, -0...","[java, optionals, immutable]","[java, optional]","[-0.0039062311, -0.0026351665, -0.0028454037, ..."
2,34553174,Text Overlay Image with Darkened Opacity React...,<p>I am attempting to overlay a title over an ...,<javascript><image><overlay><react-native><opa...,2016-01-01 02:48:24,HQ,"[-0.0016710858326405287, -0.004290113691240549...","[text, overlay, image, darkened, opacity, reac...","[javascript, image, overlay, react-native, opa...","[-0.0013092578, 0.0007860021, -8.930145e-05, -..."
3,34553318,Why ternary operator in swift is so picky?,"<p>The question is very simple, but I just cou...",<swift><operators><whitespace><ternary-operato...,2016-01-01 03:30:17,HQ,"[-0.007209911942481995, -0.0018732174066826701...","[ternary, operator, swift, picky]","[swift, operators, whitespace, ternary-operato...","[-0.0042868583, -0.0029199016, -0.0023774763, ..."
4,34553755,hide/show fab with scale animation,<p>I'm using custom floatingactionmenu. I need...,<android><material-design><floating-action-but...,2016-01-01 05:21:48,HQ,"[-0.009016715921461582, -0.012110617011785507,...","[fab, scale, animation]","[android, material-design, floating-action-but...","[-0.00092979387, 0.0026591339, 0.0034426488, 0..."


In [60]:
# Concatenate, per row, the question embedding with the tag embedding
def _unir_embeddings(fila):
    """Return the row's 'embedding' followed by its 'Tags_embedding' as one array."""
    return np.concatenate((fila['embedding'], fila['Tags_embedding']))

df['combined_embedding'] = df.apply(_unir_embeddings, axis=1)

In [61]:
def recomendar_preguntas_similares(embedding_nueva_pregunta, embeddings_existentes, top_n=5):
    """Return the positions of the stored embeddings most similar to a new one.

    Similarity is the cosine similarity, computed for all candidates at once.

    Args:
        embedding_nueva_pregunta: 1-D vector of the new question.
        embeddings_existentes: Sequence of 1-D vectors with the same length as
            ``embedding_nueva_pregunta``.
        top_n: Maximum number of positions to return.

    Returns:
        NumPy array of positions into ``embeddings_existentes``, ordered from
        most to least similar, with at most ``top_n`` entries.

    Raises:
        ValueError: If the candidates do not form a 2-D array of vectors.
    """
    consulta = np.asarray(embedding_nueva_pregunta, dtype=float)
    existentes = np.asarray(embeddings_existentes, dtype=float)

    # Nothing to rank: return an empty index array instead of failing below.
    if existentes.size == 0:
        return np.array([], dtype=np.intp)
    if existentes.ndim != 2:
        raise ValueError("embeddings_existentes must be a sequence of 1-D vectors")

    productos = existentes @ consulta
    normas = np.linalg.norm(existentes, axis=1) * np.linalg.norm(consulta)

    # Cosine similarity is undefined for zero-norm vectors. Give those candidates
    # -inf instead of NaN: argsort places NaN last, so after the reversal below
    # NaN entries would wrongly appear as the best matches.
    similitudes = np.full(existentes.shape[0], -np.inf)
    np.divide(productos, normas, out=similitudes, where=normas > 0)

    # Ascending argsort reversed gives descending similarity.
    indices_ordenados = np.argsort(similitudes)[::-1]

    return indices_ordenados[:top_n]

# Usage example: embed the text and the tags of a new question
embedding_nueva_pregunta = get_embedding("What is a tuple in python?")  # embedding from the OpenAI API
tags_nueva_pregunta = "<python> <programming>"  # tags of the new question
tags_nueva_pregunta_embedding = promedio_embeddings_etiquetas(
    limpiar_etiquetas(tags_nueva_pregunta),
    modelo_etiquetas,
)

# Join the question embedding with the tag embedding
combined_embedding_nueva_pregunta = np.concatenate(
    (embedding_nueva_pregunta, tags_nueva_pregunta_embedding)
)

# Find the related questions among the stored ones
indices_recomendados = recomendar_preguntas_similares(
    combined_embedding_nueva_pregunta,
    list(df['combined_embedding']),
)
preguntas_recomendadas = df.iloc[indices_recomendados]

print(preguntas_recomendadas['Title'])

46                  Has set been deprecated in python 2?
127    Why do we use print statements while defining ...
93       Python for loop - to iterate or not to iterate?
165    Print type of variable in python3.5 on django 1.8
193          python 2.7 : find nested keys in dictionary
Name: Title, dtype: object
