In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge, RidgeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack, csr_matrix
from collections import Counter

food_df = pd.read_csv("FINAL_COMBINED.csv")
disease_df = pd.read_csv("Disease.csv")

# === Breed Size Classification ===
def classify_breed_size(row):
    weight = (row['min_weight'] + row['max_weight']) / 2
    if weight <= 10:
        return 'Small Breed'
    elif 10 < weight <= 25:
        return 'Medium Breed'
    else:
        return 'Large Breed'

disease_df['breed_size_category'] = disease_df.apply(classify_breed_size, axis=1)


selected_nutrients = [
    'protein', 'fat', 'carbohydrate (nfe)', 'crude fibre', 'calcium',
    'phospohorus', 'potassium', 'sodium', 'magnesium', 'vitamin e',
    'vitamin c', 'omega-3-fatty acids', 'omega-6-fatty acids'
]

for col in selected_nutrients:
    food_df[col] = food_df[col].astype(str).str.replace('%', '').str.replace('IU/kg', '').str.extract(r'([\d.]+)').astype(float)

#Combine Text Fields 
food_df['combined_text'] = (
    food_df['ingredients'].fillna('') * 3 + ' ' +
    food_df['key benefits'].fillna('') * 2 + ' ' +
    food_df['product title'].fillna('') + ' ' +
    food_df['product description'].fillna('') + ' ' +
    food_df['helpful tips'].fillna('') + ' ' +
    food_df['need/preference'].fillna('') + ' ' +
    food_df['alternate product recommendation'].fillna('')
)

# === TF-IDF + SVD ===
vectorizer = TfidfVectorizer(stop_words='english')
X_text = vectorizer.fit_transform(food_df['combined_text'])
svd = TruncatedSVD(n_components=300, random_state=42)
X_text_reduced = svd.fit_transform(X_text)

# === Categorical Encoding ===
encoder = OneHotEncoder(sparse_output=True)
X_categorical = encoder.fit_transform(food_df[['breed size', 'lifestage']].fillna('Unknown'))
X_combined = hstack([csr_matrix(X_text_reduced), X_categorical])

# === Nutrients Scaling ===
scale_nutrients = ['sodium', 'omega-3-fatty acids', 'omega-6-fatty acids', 'calcium', 'phospohorus', 'potassium', 'magnesium']
scalers = {}
scaled_targets = {}
for nutrient in selected_nutrients:
    y = food_df[nutrient].fillna(food_df[nutrient].median())
    if nutrient in scale_nutrients:
        scaler = StandardScaler()
        y_scaled = scaler.fit_transform(y.values.reshape(-1, 1)).flatten()
        scalers[nutrient] = scaler
        scaled_targets[nutrient] = y_scaled
    else:
        scaled_targets[nutrient] = y

# === Train Ridge Models for Nutrients ===
def train_ridge(X, y):
    ridge = Ridge()
    grid = GridSearchCV(ridge, {'alpha': [0.1, 1.0, 10.0]}, scoring='r2', cv=3)
    grid.fit(X, y)
    return grid.best_estimator_

ridge_models = {}
for nutrient in selected_nutrients:
    y = scaled_targets[nutrient]
    X_train, _, y_train, _ = train_test_split(X_combined, y, test_size=0.2, random_state=42)
    ridge_models[nutrient] = train_ridge(X_train, y_train)

# === Train Ridge Classifiers for Ingredients= include all with freq >= 5)
all_ingredients_flat = []
for ing_list in food_df['ingredients'].dropna():
    ings = [i.strip().lower() for i in ing_list.split(',')]
    all_ingredients_flat.extend(ings)

ingredient_counter = Counter(all_ingredients_flat)
frequent_ingredients = [ingredient for ingredient, count in ingredient_counter.items() if count >= 5]

ingredient_targets = {}
for ing in frequent_ingredients:
    ingredient_targets[ing] = food_df['ingredients'].fillna('').apply(lambda x: int(ing in x.lower()))

ingredient_models = {}
for ing in frequent_ingredients:
    y = ingredient_targets[ing]
    model = RidgeClassifier()
    model.fit(X_combined, y)
    ingredient_models[ing] = model

# === Disorder Keywords ===
disorder_keywords = {
    "Inherited musculoskeletal disorders": "joint mobility glucosamine arthritis cartilage flexibility",
    "Inherited gastrointestinal disorders": "digest stomach bowel sensitive diarrhea gut ibs",
    "Inherited endocrine disorders": "thyroid metabolism weight diabetes insulin hormone glucose",
    "Inherited eye disorders": "vision eye retina cataract antioxidant sight ocular",
    "Inherited nervous system disorders": "brain seizure cognitive nerve neuro neurological cognition",
    "Inherited cardiovascular disorders": "heart cardiac circulation omega-3 blood pressure vascular",
    "Inherited skin disorders": "skin allergy itch coat omega-6 dermatitis eczema flaky",
    "Inherited immune disorders": "immune defense resistance inflammatory autoimmune",
    "Inherited urinary and reproductive disorders": "urinary bladder kidney renal urine reproductive",
    "Inherited respiratory disorders": "breath respiratory airway lung cough breathing nasal",
    "Inherited blood disorders": "anemia blood iron hemoglobin platelets clotting hemophilia"
}

# User Interaction 
user_breed = input("Enter dog breed: ").strip().lower()
breed_info = disease_df[disease_df['Breed'].str.lower() == user_breed]

if breed_info.empty:
    print("Breed not found in disease dataset.")
else:
    breed_size = breed_info['breed_size_category'].values[0]
    disorder_options = breed_info['Disease'].unique()
    print(f"\nDisorders for {user_breed.title()} ({breed_size}):")
    for idx, dis in enumerate(disorder_options):
        print(f"{idx + 1}. {dis}")

    selection = int(input("\nSelect disorder (enter number): ")) - 1
    selected_disorder = disorder_options[selection]
    disorder_type = breed_info[breed_info['Disease'] == selected_disorder]['Disorder'].values[0]

    keyword_string = disorder_keywords.get(disorder_type, selected_disorder)
    keyword_vec = vectorizer.transform([keyword_string])
    keyword_reduced = svd.transform(keyword_vec)
    keyword_combined = hstack([csr_matrix(keyword_reduced), encoder.transform([[breed_size, 'Adult']])])

    # === Predict Nutrients ===
    nutrient_forecast = {}
    for nutrient, model in ridge_models.items():
        pred = model.predict(keyword_combined)[0]
        if nutrient in scalers:
            pred = scalers[nutrient].inverse_transform([[pred]])[0][0]
        nutrient_forecast[nutrient] = round(pred, 2)

    # === Predict Ingredients ===
    ingredient_scores = {}
    for ing, model in ingredient_models.items():
        score = model.decision_function(keyword_combined)[0]
        ingredient_scores[ing] = score

    top_ingredients = sorted(ingredient_scores.items(), key=lambda x: x[1], reverse=True)[:10]
    final_ingredients = [ing.title() for ing, score in top_ingredients]

    # === Find Example Products ===
    filtered_products = food_df[
        (food_df['breed size'].str.lower() == breed_size.lower()) |
        (food_df['breed size'].str.lower() == 'unknown')
    ]
    similarities = cosine_similarity(keyword_vec, vectorizer.transform(filtered_products['combined_text'])).flatten()
    top_indices = similarities.argsort()[-3:][::-1]
    recommended_products = filtered_products.iloc[top_indices]['product title'].dropna().tolist()

    # Output ===
    print("\n============================================")
    print(f"\U0001F372 Recommended Diet for {user_breed.title()} ({breed_size}):")
    print(f"Selected Disorder: {selected_disorder} ({disorder_type})\n")
    
    print("Ingredients:")
    for ing in final_ingredients:
        print(f"- {ing}")

    print("\nForecasted Nutrients (% of dry matter):")
    for nutrient, value in nutrient_forecast.items():
        print(f"- {nutrient}: {value}%")

    print("\nExample Commercial Products:")
    for product in recommended_products:
        print(f"- {product}")

    print("============================================\n")



Disorders for Chihuahua (Small Breed):
1. Hemophilia
2. Mitral valve dysplasia
3. Patent ductus arteriosus (PDA)
4. Pulmonic stenosis
5. Cryptorchidism, retained testicle
6. Keratoconjunctivitis sicca (KCS) - "dry eye"
7. Corneal dystrophy
8. Glaucoma
9. Progressive retinal atrophy
10. Patellar luxation
11. Chiari-like malformation (CM) and syringomyelia (SM)
12. Hydrocephalus
13. Neuroaxonal dystrophy
14. Spina bifida
15. colour dilution alopecia
16. Demodicosis/ Demodectic mange
17. pattern baldness

🍲 Recommended Diet for Chihuahua (Small Breed):
Selected Disorder: Patent ductus arteriosus (PDA) (Inherited cardiovascular disorders)

Ingredients:
- Minerals
- Beta-Carotene
- Chicken
- Digest
- Wheat
- Maize
- Beta-Carotene 1.5Mg
- With Natural Antioxidant.
- Turkey
- Rice

Forecasted Nutrients (% of dry matter):
- protein: 18.08%
- fat: 11.91%
- carbohydrate (nfe): 52.66%
- crude fibre: 4.07%
- calcium: 0.74%
- phospohorus: 0.52%
- potassium: 0.79%
- sodium: 0.25%
- magnesium: 0.11%



УСТИНДЕГИСИ КОНЕЧНЫЙ

In [6]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge, RidgeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack, csr_matrix
from collections import Counter

# === Load Data ===
food_df = pd.read_csv("FINAL_COMBINED.csv")
disease_df = pd.read_csv("Disease.csv")

# === Breed Size Classification ===
def classify_breed_size(row):
    weight = (row['min_weight'] + row['max_weight']) / 2
    if weight <= 10:
        return 'Small Breed'
    elif 10 < weight <= 25:
        return 'Medium Breed'
    else:
        return 'Large Breed'

disease_df['breed_size_category'] = disease_df.apply(classify_breed_size, axis=1)

# === Nutrients Cleaning ===
selected_nutrients = [
    'protein', 'fat', 'carbohydrate (nfe)', 'crude fibre', 'calcium',
    'phospohorus', 'potassium', 'sodium', 'magnesium', 'vitamin e',
    'vitamin c', 'omega-3-fatty acids', 'omega-6-fatty acids'
]

for col in selected_nutrients:
    food_df[col] = food_df[col].astype(str).str.replace('%', '').str.replace('IU/kg', '').str.extract(r'([\d.]+)').astype(float)

# === Combined Text for TF-IDF ===
food_df['combined_text'] = (
    food_df['product title'].fillna('') + ' ' +
    food_df['product description'].fillna('') + ' ' +
    food_df['key benefits'].fillna('') + ' ' +
    food_df['ingredients'].fillna('')
)

vectorizer = TfidfVectorizer(stop_words='english')
X_text = vectorizer.fit_transform(food_df['combined_text'])
svd = TruncatedSVD(n_components=300, random_state=42)
X_text_reduced = svd.fit_transform(X_text)

# === Categorical Features ===
encoder = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
X_cat = encoder.fit_transform(food_df[['breed size', 'lifestage']].fillna('Unknown'))

# === Ingredient Classifiers from Text + Category ===
all_ingredients_flat = []
for ing_list in food_df['ingredients'].dropna():
    ings = [i.strip().lower() for i in ing_list.split(',')]
    all_ingredients_flat.extend(ings)

ingredient_counter = Counter(all_ingredients_flat)
frequent_ingredients = [ingredient for ingredient, count in ingredient_counter.items() if count >= 5]

ingredient_targets = {}
for ing in frequent_ingredients:
    ingredient_targets[ing] = food_df['ingredients'].fillna('').apply(lambda x: int(ing in x.lower()))

X_ing_full = hstack([csr_matrix(X_text_reduced), X_cat])
ingredient_models = {}
for ing in frequent_ingredients:
    y = ingredient_targets[ing]
    model = RidgeClassifier()
    model.fit(X_ing_full, y)
    ingredient_models[ing] = model

# === Nutrient Regression from Nutrient Columns ===
scale_nutrients = ['sodium', 'omega-3-fatty acids', 'omega-6-fatty acids', 'calcium', 'phospohorus', 'potassium', 'magnesium']
ridge_models = {}
scalers = {}
print("\nEvaluating Nutrient Prediction Models:")
for nutrient in selected_nutrients:
    feature_cols = selected_nutrients.copy()
    feature_cols.remove(nutrient)

    X = food_df[feature_cols].fillna(0)
    y = food_df[nutrient].fillna(food_df[nutrient].median())

    scaler_X = StandardScaler()
    X_scaled = scaler_X.fit_transform(X)

    if nutrient in scale_nutrients:
        scaler_y = StandardScaler()
        y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1)).flatten()
        scalers[nutrient] = scaler_y
    else:
        y_scaled = y
        scalers[nutrient] = None

    model = Ridge()
    grid = GridSearchCV(model, {'alpha': [0.01, 0.1, 1.0]}, scoring='r2', cv=3)
    grid.fit(X_scaled, y_scaled)
    ridge_models[nutrient] = (grid.best_estimator_, scaler_X)

# === Disorder Keywords ===
disorder_keywords = {
    "Inherited musculoskeletal disorders": "joint mobility glucosamine arthritis cartilage flexibility",
    "Inherited gastrointestinal disorders": "digest stomach bowel sensitive diarrhea gut ibs",
    "Inherited endocrine disorders": "thyroid metabolism weight diabetes insulin hormone glucose",
    "Inherited eye disorders": "vision eye retina cataract antioxidant sight ocular",
    "Inherited nervous system disorders": "brain seizure cognitive nerve neuro neurological cognition",
    "Inherited cardiovascular disorders": "heart cardiac circulation omega-3 blood pressure vascular",
    "Inherited skin disorders": "skin allergy itch coat omega-6 dermatitis eczema flaky",
    "Inherited immune disorders": "immune defense resistance inflammatory autoimmune",
    "Inherited urinary and reproductive disorders": "urinary bladder kidney renal urine reproductive",
    "Inherited respiratory disorders": "breath respiratory airway lung cough breathing nasal",
    "Inherited blood disorders": "anemia blood iron hemoglobin platelets clotting hemophilia"
}

# === User Interaction ===
user_breed = input("Enter dog breed: ").strip().lower()
breed_info = disease_df[disease_df['Breed'].str.lower() == user_breed]

if breed_info.empty:
    print("Breed not found in disease dataset.")
else:
    breed_size = breed_info['breed_size_category'].values[0]
    disorder_options = breed_info['Disease'].unique()
    print(f"\nDisorders for {user_breed.title()} ({breed_size}):")
    for idx, dis in enumerate(disorder_options):
        print(f"{idx + 1}. {dis}")

    selection = int(input("\nSelect disorder (enter number): ")) - 1
    selected_disorder = disorder_options[selection]
    disorder_type = breed_info[breed_info['Disease'] == selected_disorder]['Disorder'].values[0]

    keyword_string = disorder_keywords.get(disorder_type, selected_disorder)
    print(f"\nSelected Disorder: {selected_disorder} ({disorder_type})")

    keyword_vec = vectorizer.transform([keyword_string])
    keyword_reduced = svd.transform(keyword_vec)
    keyword_combined = hstack([csr_matrix(keyword_reduced), encoder.transform([[breed_size, 'Adult']])])

    # === Predict Ingredients (text-based hybrid model) ===
    ingredient_scores = {}
    for ing, model in ingredient_models.items():
        score = model.decision_function(keyword_combined)[0]
        ingredient_scores[ing] = score
    top_ingredients = sorted(ingredient_scores.items(), key=lambda x: x[1], reverse=True)[:10]
    final_ingredients = [ing.title() for ing, score in top_ingredients]

    # === Predict Nutrients (ridge-based model) ===
    dummy_input = food_df[selected_nutrients].fillna(0).mean().to_frame().T[selected_nutrients].copy()
    nutrient_forecast = {}
    for nutrient, (model, scaler_X) in ridge_models.items():
        X_input = dummy_input[selected_nutrients].copy()
        X_input.drop(columns=[nutrient], inplace=True)
        X_input_scaled = scaler_X.transform(X_input)
        pred = model.predict(X_input_scaled)[0]
        if scalers[nutrient]:
            pred = scalers[nutrient].inverse_transform([[pred]])[0][0]
        nutrient_forecast[nutrient] = round(pred, 2)

    # === Find Example Commercial Products ===
    filtered_products = food_df[
        (food_df['breed size'].str.lower() == breed_size.lower()) |
        (food_df['breed size'].str.lower() == 'unknown')
    ]
    similarities = cosine_similarity(keyword_vec, vectorizer.transform(filtered_products['combined_text'])).flatten()
    top_indices = similarities.argsort()[-3:][::-1]
    recommended_products = filtered_products.iloc[top_indices]['product title'].dropna().tolist()

    # === Final Output ===
    print("Example Commercial Products:")
    for product in recommended_products:
        print(f"- {product}")

    print("============================================")
    print(f"\U0001F372 Recommended Diet for {user_breed.title()} ({breed_size}):")
    print(f"Selected Disorder: {selected_disorder} ({disorder_type})\n")

    print("Ingredients:")
    for ing in final_ingredients:
        print(f"- {ing}")

    print("\nForecasted Nutrients (% of dry matter):")
    for nutrient, value in nutrient_forecast.items():
        print(f"- {nutrient}: {value}%")

    print("\n============================================\n")



Evaluating Nutrient Prediction Models:

Disorders for Afghan Hound (Medium Breed):
1. Hypothyroidism
2. Cataracts
3. Retinal dysplasia
4. Corneal dystrophy
5. Glaucoma
6. Progressive retinal atrophy
7. Hip dysplasia
8. Demodicosis/ Demodectic mange

Selected Disorder: Retinal dysplasia (Inherited eye disorders)
Example Commercial Products:
- Puppy Medium Breed Dry Dog Food with Lamb & Rice
- Adult Medium Breed Dry Dog Food with Chicken
- Medium Puppy Food
🍲 Recommended Diet for Afghan Hound (Medium Breed):
Selected Disorder: Retinal dysplasia (Inherited eye disorders)

Ingredients:
- Beta-Carotene
- With Natural Antioxidant.
- Minerals
- Digest
- Flaxseed
- Chicken
- Maize
- Animal Fat
- Vegetable Oil
- Chicken And Turkey Meal

Forecasted Nutrients (% of dry matter):
- protein: 20.4%
- fat: 13.62%
- carbohydrate (nfe): 44.27%
- crude fibre: 4.18%
- calcium: 0.79%
- phospohorus: 0.58%
- potassium: 0.73%
- sodium: 0.3%
- magnesium: 0.1%
- vitamin e: 638.81%
- vitamin c: 130.16%
- omega-

