In [1]:
# RAG_Pipeline.ipynb
import os
import pandas as pd
import joblib
import re
import numpy as np
from openai import OpenAI
from dotenv import load_dotenv
import nltk


In [None]:
# Configure NLTK: keep its data in ~/nltk_data and make sure the resources
# used below are available.
nltk_data_path = os.path.join(os.path.expanduser("~"), "nltk_data")
os.makedirs(nltk_data_path, exist_ok=True)
nltk.data.path.append(nltk_data_path)

# Check every resource on its own. Previously 'punkt' was only downloaded
# when 'wordnet' was missing, so an environment that already had 'wordnet'
# but not 'punkt' never received the tokenizer data that
# nltk.word_tokenize() needs.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of
# 'punkt' -- confirm against the installed NLTK version.
for _resource_path, _package in (
    ('corpora/wordnet', 'wordnet'),
    ('tokenizers/punkt', 'punkt'),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package, download_dir=nltk_data_path)

from nltk.stem import WordNetLemmatizer

# --------------------------
# Global functions
# --------------------------
# Maps each canonical ingredient name (key) to the alternative spellings
# (values) that preprocess() rewrites to that canonical name.
ingredient_synonyms = {
    'chicken': ['poultry', 'hen', 'chicken breast'],
    'beef': ['ground beef', 'sirloin', 'roast beef'],
    'potato': ['potatoes', 'spuds', 'yukon gold']
}

# Single shared lemmatizer instance, used by preprocess() for every token.
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    """Normalize free-form ingredient text.

    Steps, in order: convert to a lower-cased string, rewrite every known
    synonym to its canonical ingredient name, remove characters other than
    word characters, whitespace, commas and hyphens, tokenize with NLTK and
    lemmatize each token.

    Args:
        text: Any value; it is converted with ``str()`` first.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    normalized = str(text).lower()

    # Whole-word replacement of each alias by its canonical name, in the
    # iteration order of ingredient_synonyms.
    for canonical, aliases in ingredient_synonyms.items():
        for alias in aliases:
            alias_pattern = r'\b' + re.escape(alias) + r'\b'
            normalized = re.sub(alias_pattern, canonical, normalized)

    # Keep only word characters, whitespace, commas and hyphens.
    normalized = re.sub(r'[^\w\s,-]', '', normalized)

    lemmas = map(lemmatizer.lemmatize, nltk.word_tokenize(normalized))
    return ' '.join(lemmas)

# --------------------------
# RAG class
# --------------------------
class RecipeRAG:
    """Recipe recommender combining KNN retrieval with LLM descriptions.

    Retrieval uses the ``'tfidf'`` and ``'knn'`` steps of a pipeline loaded
    from disk; generation asks an OpenAI chat model to describe the
    retrieved recipes.

    Attributes are ``pipeline`` (the loaded model), ``df`` (the recipe
    table) and ``client`` (the OpenAI client). The client is never written
    to disk by ``save``; see ``__getstate__``.
    """

    def __init__(self, model_path, data_path):
        """Load the fitted pipeline, the recipe table and the OpenAI client.

        Args:
            model_path: Path to the joblib-serialized pipeline.
            data_path: Path to the recipe CSV file.
        """
        # NOTE(review): joblib.load unpickles arbitrary objects -- only load
        # model files that come from a trusted source.
        self.pipeline = joblib.load(model_path)
        self.df = pd.read_csv(data_path)
        self.client = self._make_client()

    @staticmethod
    def _make_client():
        """Create an OpenAI client from OPENAI_API_KEY, loading .env first."""
        load_dotenv()
        return OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    def __getstate__(self):
        """Return the picklable state, leaving out the OpenAI client.

        The client carries the API key and live network resources, so it
        must not end up in a file written by ``save``.
        """
        state = self.__dict__.copy()
        state.pop('client', None)
        return state

    def __setstate__(self, state):
        """Restore state and rebuild the client if it was not serialized."""
        self.__dict__.update(state)
        if 'client' not in state:
            self.client = self._make_client()

    def retrieve(self, query, top_k=5):
        """Fetch the recipes closest to ``query`` using the KNN model.

        Args:
            query: Ingredient text; it is normalized with ``preprocess``.
            top_k: Number of neighbors to request from the KNN step.

        Returns:
            The rows of ``self.df`` selected positionally by the neighbor
            indices of the query.
        """
        processed_query = preprocess(query)
        query_vec = self.pipeline['tfidf'].transform([processed_query])
        # Only the neighbor indices are needed; the distances are discarded.
        _, indices = self.pipeline['knn'].kneighbors(query_vec, n_neighbors=top_k)
        return self.df.iloc[indices[0]]

    def generate_description(self, recipes):
        """Generate an LLM-written description for each recipe.

        Args:
            recipes: DataFrame whose rows provide the ``name``,
                ``ingredients`` and ``tag_name`` columns.

        Returns:
            A list with one generated description per row, in row order.
        """
        descriptions = []
        # One chat-completion request is sent per recipe row.
        for _, recipe in recipes.iterrows():
            prompt = f"""Beskriv detta recept på ett lockande sätt:
            Namn: {recipe['name']}
            Ingredienser: {recipe['ingredients']}
            Taggar: {recipe['tag_name']}
            """

            response = self.client.chat.completions.create(
                model="gpt-3.5-turbo-0125",
                messages=[{"role": "user", "content": prompt}],
                max_tokens=150
            )
            descriptions.append(response.choices[0].message.content)
        return descriptions

    def save(self, path):
        """Serialize this instance to ``path`` with joblib.

        The OpenAI client is excluded (see ``__getstate__``) and is rebuilt
        from the environment when the object is loaded again.
        """
        joblib.dump(self, path)

# --------------------------
# Main run
# --------------------------
# Smoke test: build the RAG object, run one retrieval and try to generate
# descriptions for the first two hits.
if __name__ == "__main__":
    # Initialize the RAG object from the saved pipeline and the recipe CSV
    rag = RecipeRAG(
        model_path="models/full_pipeline.pkl",
        data_path="recipes_with_ingredients_and_tags.csv"
    )
    
    # Test retrieval; only the first three hits are printed
    test_query = "chicken, rice, soy sauce"
    results = rag.retrieve(test_query)
    print(f"\n🔍 Rekommendationer för '{test_query}':")
    print(results[['name', 'ingredients']].head(3))
    
    # Test generation on the first two hits
    try:
        descriptions = rag.generate_description(results.head(2))
        print("\n📝 Genererade beskrivningar:")
        for desc in descriptions:
            print(f"- {desc}")
    except Exception as e:
        # Every failure during generation is reported here together with a
        # hint about the API key, whatever the actual cause was.
        print(f"\n❌ Genereringsfel: {str(e)}")
        print("Kontrollera din OpenAI API-nyckel i .env-filen!")
