In [1]:
import pandas as pd
import re
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import make_pipeline
import joblib
import os

nltk.download('wordnet')
nltk.download('punkt')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Kontrollera att filen finns
assert os.path.exists("recipes_with_ingredients_and_tags.csv"), "Ladda upp CSV-filen till notebook-miljön!"

# Ladda data
df = pd.read_csv("recipes_with_ingredients_and_tags.csv")
print("Data laddad:")
print(df[['name', 'ingredients']].head(3))


Data laddad:
                                         name  \
0  1-Day Noodles (Taiwanese Beef Noodle Soup)   
1                         1-Hour Banana Bread   
2               1-Hour Buffalo  Chicken Wings   

                                         ingredients  
0  bone-in chuck beef short ribs, beef shin bones...  
1  unsalted butter, caster sugar, self-raising fl...  
2  nonstick cooking spray, chicken wings, kosher ...  


In [3]:
# Definiera synonymer
ingredient_synonyms = {
    'chicken': ['poultry', 'hen', 'chicken breast'],
    'beef': ['ground beef', 'sirloin', 'roast beef'],
    'potato': ['potatoes', 'spuds', 'yukon gold']
}

# Initiera lemmatizer
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    text = str(text).lower()
    
    # Ersätt synonymer
    for key, synonyms in ingredient_synonyms.items():
        for synonym in synonyms:
            text = re.sub(r'\b' + re.escape(synonym) + r'\b', key, text)
    
    # Ta bort specialtecken och siffror
    text = re.sub(r'[^\w\s,-]', '', text)
    text = re.sub(r'\d+', '', text)
    
    # Lemmatisera
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    return ' '.join(tokens)

# Applicera på data
df['processed'] = df['ingredients'].apply(preprocess) + ' ' + df['tag_name'].apply(preprocess)
print("\nProcesserad data:")
print(df['processed'].head(3))



Processerad data:
0    bone-in chuck beef short rib , beef shin bone ...
1    unsalted butter , caster sugar , self-raising ...
2    nonstick cooking spray , chicken wing , kosher...
Name: processed, dtype: object


In [4]:
# Skapa TF-IDF pipeline
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=10000,
    token_pattern=r'\b[a-z-]+\b'
)

# Skapa KNN-modell
knn = NearestNeighbors(
    n_neighbors=20,
    metric='cosine',
    algorithm='brute'
)

# Bygg pipeline
model = make_pipeline(tfidf, knn)
model.fit(df['processed'])

# Spara modellen
os.makedirs("models", exist_ok=True)
joblib.dump(model, "models/knn_pipeline.pkl")
print("\n✅ Modell sparad!")



✅ Modell sparad!


In [10]:
def get_recommendations(query, top_n=5):
    # Förbearbeta query
    processed_query = preprocess(query)
    
    # Hämta rekommendationer
    distances, indices = model.named_steps['nearestneighbors'].kneighbors(
        model.named_steps['tfidfvectorizer'].transform([processed_query]),
        n_neighbors=top_n
    )
    
    # Formatera resultat
    results = df.iloc[indices[0]]
    results['similarity'] = 1 - distances[0]
    return results[['name', 'ingredients', 'similarity']]

# Testa
test_query = "chicken, rice, soy sauce"
print("\nTestresultat:")
print(get_recommendations(test_query))



Testresultat:
                                             name  \
3240           One-Pot Chicken Teriyaki With Rice   
2770                        Loco Moco Rice Burger   
1997             Fried Rice: Soy, Soy Revolution!   
1998  Fried Rice: Soy, Spice, and Everything Nice   
3579              Pork Red Na (Noodles And Gravy)   

                                            ingredients  similarity  
3240  olive oil, chicken breasts, salt, pepper, garl...    0.281681  
2770  rice, soy sauce, ground beef, onion, garlic, p...    0.259637  
1997  firm tofu, teriyaki sauce, vegetable oil, kimc...    0.255591  
1998  firm tofu, teriyaki sauce, vegetable oil, kimc...    0.255252  
3579  pork shoulder, rice noodle, dark soy sauce, ra...    0.249048  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results['similarity'] = 1 - distances[0]



Query: chicken rice soy sauce
Träffar: 0/2
Top 3 rekommendationer:
                                    name  similarity
3240  One-Pot Chicken Teriyaki With Rice    0.281681
2770               Loco Moco Rice Burger    0.259637
1997    Fried Rice: Soy, Soy Revolution!    0.255591

Query: beef potato
Träffar: 0/2
Top 3 rekommendationer:
                                            name  similarity
2629  Kenyan Beef And Potato Pilau By Kiano Moju    0.231859
902             Cheesesteak Stew In A Bread Bowl    0.224979
479                              Beef Stroganoff    0.211501

Sammanfattning: 0/4 träffar (0%)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results['similarity'] = 1 - distances[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results['similarity'] = 1 - distances[0]


In [7]:
# Sparar allt i en enda cell för enkel återanvändning
final_pipeline = {
    'preprocessor': preprocess,
    'model': model,
    'df': df
}

joblib.dump(final_pipeline, "models/full_pipeline.pkl")


['models/full_pipeline.pkl']

### Användning i Framtiden


In [11]:
def recommend(query, top_n=5):
    # Ladda pipeline och data
    loaded_pipeline = joblib.load("models/full_pipeline.pkl")
    df = pd.read_csv("recipes_with_ingredients_and_tags.csv")
    
    # Förbearbeta query
    processed_query = loaded_pipeline['preprocessor'](query)
    
    # Hämta närmaste grannar
    tfidf = loaded_pipeline['tfidf']
    knn = loaded_pipeline['knn']
    query_vec = tfidf.transform([processed_query])
    distances, indices = knn.kneighbors(query_vec, n_neighbors=top_n)
    
    # Returnera resultat
    return df.iloc[indices[0]][['name', 'ingredients']]


In [12]:
# Uppdatera sparandet
final_pipeline = {
    'preprocessor': preprocess,
    'tfidf': tfidf,
    'knn': knn
}

joblib.dump(final_pipeline, "models/full_pipeline.pkl")


['models/full_pipeline.pkl']

In [13]:
# Ladda pipeline
loaded_pipeline = joblib.load("models/full_pipeline.pkl")

# Testa med ny funktion
print(recommend("beef and potatoes", top_n=3))


                                            name  \
2629  Kenyan Beef And Potato Pilau By Kiano Moju   
902             Cheesesteak Stew In A Bread Bowl   
479                              Beef Stroganoff   

                                            ingredients  
2629  ground cumin, paprika, ground cardamom, black ...  
902   beef sirloin, salt, pepper, onion powder, all-...  
479   beef sirloin, salt, freshly ground black peppe...  


In [14]:
# Importera bibliotek
import pandas as pd
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# Ladda data och modeller
df = pd.read_csv("recipes_with_ingredients_and_tags.csv")
loaded_pipeline = joblib.load("models/full_pipeline.pkl")

def get_recommendations(query, top_n=5):
    # Förbearbeta
    processed_query = loaded_pipeline['preprocessor'](query)
    
    # Transformera och hämta grannar
    query_vec = loaded_pipeline['tfidf'].transform([processed_query])
    distances, indices = loaded_pipeline['knn'].kneighbors(query_vec, n_neighbors=top_n)
    
    # Presentera resultat
    return df.iloc[indices[0]][['name', 'ingredients', 'tag_name']]

# Testa
print(get_recommendations("chicken, garlic, soy sauce"))


                                             name  \
2592                               Japanese Curry   
3459                 Peking Duck-Inspired Burrito   
4869  Weekday Meal-prep Chicken Teriyaki Stir-fry   
3579              Pork Red Na (Noodles And Gravy)   
3355                     Pan-Roasted Chicken Rice   

                                            ingredients  \
2592  kobe beef, garlic, soy sauce, ginger, black pe...   
3459  duck breasts, oil, salt, garlic, soy sauce, ri...   
4869  chicken breasts, salt, pepper, garlic, soy sau...   
3579  pork shoulder, rice noodle, dark soy sauce, ra...   
3355  chicken breast, greek yogurt, lemon juice, veg...   

                                               tag_name  
2592  Dairy-Free, Low-Fat, Low-Sugar, Low-Calorie, C...  
3459  Tongs, Oven Mitts, Baking Pan, Mixing Bowl, St...  
4869  McCormick Easy Dinner, Pyrex, Dry Measuring Cu...  
3579   Dairy-Free, Weeknight, Dinner, Stove Top, Fusion  
3355         High-Protein, Weeknight