In [1]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to C:\Users\User/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\User/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
import pandas as pd
import re
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline, make_pipeline
import joblib
import os
from RAG_Pipeline import TextPreprocessor

In [3]:
# # Definiera synonymer
# ingredient_synonyms = {
#     'chicken': ['poultry', 'hen', 'chicken breast'],
#     'beef': ['ground beef', 'sirloin', 'roast beef'],
#     'potato': ['potatoes', 'spuds', 'yukon gold']
# }

# # Initiera lemmatizer
# lemmatizer = WordNetLemmatizer()

# def preprocess(text):
#     text = str(text).lower()
    
#     # Ersätt synonymer
#     for key, synonyms in ingredient_synonyms.items():
#         for synonym in synonyms:
#             text = re.sub(r'\b' + re.escape(synonym) + r'\b', key, text)
    
#     # Ta bort specialtecken och siffror
#     text = re.sub(r'[^\w\s,-]', '', text)
#     text = re.sub(r'\d+', '', text)
    
#     # Lemmatisera
#     tokens = nltk.word_tokenize(text)
#     tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
#     return ' '.join(tokens)

# # Applicera preprocessing på data
# df['processed'] = (
#     df['ingredients'].apply(preprocess) + ' ' + 
#     df['tag_name'].apply(preprocess)
# )

# # Skriv ut resultat
# print("\nProcesserad data:")
# print(df['processed'].head(3))


In [4]:
# # Skapa TF-IDF pipeline
# # Uppdatera TF-IDF med:
# tfidf = TfidfVectorizer(
#     stop_words='english',
#     ngram_range=(1, 2),  # Fånga flerordsuttryck
#     max_features=30000,  # Fler features
#     token_pattern=r'\b[a-z-]+\b'  # Fånga bindestreck
# )


# # Skapa KNN-modell
# knn = NearestNeighbors(
#     n_neighbors=30,
#     metric='cosine',
#     algorithm='brute'
# )

# # Bygg pipeline
# model = make_pipeline(tfidf, knn)
# model.fit(df['processed'])

# # Spara modellen
# os.makedirs("models", exist_ok=True)
# joblib.dump(model, "models/knn_pipeline.pkl")
# print("\n✅ Modell sparad!")


In [5]:
# ---- DATAINLÄSNING ----
print("✅ Laddar data...")
df = pd.read_csv("recipes_with_ingredients_and_tags.csv")
print(f"Data dimensioner: {df.shape}")
print(df[['name', 'ingredients']].head(3))

✅ Laddar data...
Data dimensioner: (4956, 30)
                                         name  \
0  1-Day Noodles (Taiwanese Beef Noodle Soup)   
1                         1-Hour Banana Bread   
2               1-Hour Buffalo  Chicken Wings   

                                         ingredients  
0  bone-in chuck beef short ribs, beef shin bones...  
1  unsalted butter, caster sugar, self-raising fl...  
2  nonstick cooking spray, chicken wings, kosher ...  


In [6]:
# ---- PREPROCESSING ----
print("\n⚙️ Preprocessar data...")
preprocessor = TextPreprocessor()

# KORRIGERA HÄR: Lägg till [x] i transform-anropen
df['processed_ingredients'] = df['ingredients'].apply(lambda x: preprocessor.transform([x])[0])  # ← Lägg till []
df['processed_tags'] = df['tag_name'].apply(lambda x: preprocessor.transform([x])[0])  # ← Lägg till []
df['processed'] = df['processed_ingredients'] + ' ' + df['processed_tags']

# Ta bort tomma rader
initial_count = len(df)
df = df[df['processed'].str.strip() != '']
print(f"\nRader kvar efter rensning: {len(df)}/{initial_count}")

print("\nExempel på processerad data:")
print(df['processed'].head(10).tolist())





⚙️ Preprocessar data...

Rader kvar efter rensning: 4956/4956

Exempel på processerad data:
['bone-in chuck beef short rib beef shin bone oxtail white onion carrot large tomato red apple celery garlic fresh ginger scallion rice wine star anise sichuan peppercorn coriander seed vegetable oil doubanjiang soy sauce dark soy sauce rock sugar kosher salt granulated sugar fresh noodle baby bok choy mustard green dairy-free taiwanese lunar new year big batch special occasion pan fry dinner pyrex measuring spoon liquid measuring cup dry measuring cup cutting board oven mitt chef knife stove top spatula saute pan mixing bowl', 'unsalted butter caster sugar self-raising flour large banana baking soda egg oven easy dessert indulgent sweet', 'nonstick cooking spray chicken wing kosher salt freshly ground black pepper aluminum-free baking powder smoked paprika onion powder garlic powder sour cream mayonnaise buttermilk apple cider vinegar garlic powder onion powder fresh dill kosher salt unsalted 

In [7]:
# Kontrollera antal tomma rader före/efter
print("Tomma 'processed_ingredients':", df['processed_ingredients'].str.strip().eq('').sum())
print("Tomma 'processed_tags':", df['processed_tags'].str.strip().eq('').sum())


Tomma 'processed_ingredients': 0
Tomma 'processed_tags': 0


In [8]:
print("Antal rader med tomma ingredienser:", df['ingredients'].isnull().sum())


Antal rader med tomma ingredienser: 0


In [9]:
# ---- MODELLTRÄNING ----
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=30000,
    token_pattern=r'\b[a-z-]+\b',
    min_df=2
)

knn = NearestNeighbors(
    n_neighbors=30,
    metric='cosine',
    algorithm='brute'
)

main_pipeline = make_pipeline(tfidf, knn)
knn_pipeline = make_pipeline(tfidf, knn)


In [10]:
print("🔍 Tränar huvudpipeline...")
main_pipeline.fit(df['processed'])

print("🔍 Tränar KNN-pipeline...")
knn_pipeline.fit(df['processed'])


🔍 Tränar huvudpipeline...
🔍 Tränar KNN-pipeline...


In [11]:
# Spara med korrekta stegnamn
print("\n💾 Sparar modeller...")
os.makedirs("models", exist_ok=True)
joblib.dump({
    'preprocessor': preprocessor,
    'tfidf': main_pipeline.named_steps['tfidfvectorizer'],
    'knn': main_pipeline.named_steps['nearestneighbors'],
    'df': df
}, "models/full_pipeline.pkl")
joblib.dump(knn_pipeline, "models/knn_pipeline.pkl")



💾 Sparar modeller...


['models/knn_pipeline.pkl']

In [12]:
# ---- TESTFUNKTIONER ----
def get_recommendations(query, pipeline, top_n=5):
    processed_query = preprocessor.transform(query)[0]
    query_vec = pipeline.named_steps['tfidfvectorizer'].transform([processed_query])
    distances, indices = pipeline.named_steps['nearestneighbors'].kneighbors(query_vec, n_neighbors=top_n)
    results = df.iloc[indices[0]].copy()
    results['similarity'] = 1 - distances[0]
    return results[['name', 'ingredients', 'similarity']]


# Testa båda pipelines
print("\n🧪 Testar huvudpipeline:")
test_query = "chicken, rice, soy sauce"
print(get_recommendations(test_query, main_pipeline))

print("\n🧪 Testar KNN-pipeline:")
print(get_recommendations(test_query, knn_pipeline))




🧪 Testar huvudpipeline:
                             name  \
4943         Zucchini Hash Browns   
4942               Zucchini Fries   
4941  Zucchini Enchilada Roll-Ups   
4940               Zucchini Curry   
4939               Zucchini Chips   

                                            ingredients  similarity  
4943  zucchinis, salt, parmesan cheese, fresh chives...         0.0  
4942  zucchinis, panko breadcrumbs, grated parmesan ...         0.0  
4941  shredded chicken, salt, onion, red bell pepper...         0.0  
4940  oil, red onion, garlic, ginger, coriander, smo...         0.0  
4939  large zucchini, olive oil, salt, pepper, garli...         0.0  

🧪 Testar KNN-pipeline:
                             name  \
4943         Zucchini Hash Browns   
4942               Zucchini Fries   
4941  Zucchini Enchilada Roll-Ups   
4940               Zucchini Curry   
4939               Zucchini Chips   

                                            ingredients  similarity  
4943  zucchinis

In [13]:
# Testa med exempeldata
# Testa med exempeldata
sample_data = pd.DataFrame({
    'ingredients': ['chicken breast, rice, soy sauce'],
    'tag_name': ['asian, quick']
})

preprocessor = TextPreprocessor()
sample_data['processed_ingredients'] = sample_data['ingredients'].apply(lambda x: preprocessor.transform(x)[0])
sample_data['processed_tags'] = sample_data['tag_name'].apply(lambda x: preprocessor.transform(x)[0])
sample_data['processed'] = sample_data['processed_ingredients'] + ' ' + sample_data['processed_tags']

print(sample_data['processed'].iloc[0])
# Output: 'chicken rice soy sauce asian quick'


 


In [14]:
print("Stegnamn i huvudpipelinen:", main_pipeline.named_steps.keys())
# Output: ['tfidfvectorizer', 'nearestneighbors']



Stegnamn i huvudpipelinen: dict_keys(['tfidfvectorizer', 'nearestneighbors'])


In [15]:
import joblib

loaded = joblib.load("models/full_pipeline.pkl")
print(hasattr(loaded['preprocessor'], 'transform'))  # Ska vara True


True


In [16]:
# Lägg till denna kod i din notebook efter preprocessing
print("\nExempel på processerade ingredienser:")
print(df['processed'].sample(5).values)



Exempel på processerade ingredienser:
['chicken green onion ginger sesame oil canola oil panko breadcrumb sesame seed soy sauce rice vinegar sake sugar bamboo skewer dairy-free high-protein low-carb snack kid-friendly stove top casual party date night special occasion pan fry appetizer easy japanese'
 'chuck roast all-purpose flour salt pepper olive oil red onion celery stalk carrot garlic can plum tomato red wine bay leaf fresh parsley chopped fresh sage contains alcohol walmart holiday bundle pyrex tongs chef knife cutting board measuring spoon liquid measuring cup dry measuring cup big batch kid-friendly weeknight slow cooker easy comfort food american dinner winter fall stove top casual party'
 'honey soy sauce hoisin sauce sriracha rice vinegar sesame oil garlic ginger salt pepper baby back rib oil soy sauce hoisin sauce sesame oil rice vinegar honey toasted sesame seed green onion bbq chef knife oven mitt wooden spoon saute pan tongs baking pan whisk measuring spoon liquid measu

In [17]:
from RAG_Pipeline import RecipeRAG

rag = RecipeRAG(
    model_path="models/full_pipeline.pkl", 
    data_path="recipes_with_ingredients_and_tags.csv"
)

test_query = "chicken, rice, soy sauce"
results = rag.retrieve(test_query, top_k=5)
print(results[['name', 'ingredients', 'similarity']].head())


NameError: name 'load_dotenv' is not defined

In [None]:
# I din notebook efter träning
print(rag.pipeline['tfidf'].get_feature_names_out()[:100])


In [None]:
from RAG_Pipeline import RecipeChatbot

chatbot = RecipeChatbot(rag)
response = chatbot.handle_message("Vad kan jag laga med kyckling och ris?")
print(f"ChefBot: {response}")


In [None]:
preprocessor = TextPreprocessor()
test_text = "Chicken breast, 2 cups rice, soy sauce"
print(preprocessor.transform([test_text]))  # Förväntat: ['chicken 2 cup rice soy sauce']

