Ridge Regression для предсказания нутриентов

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.sparse import hstack, csr_matrix

# Load both source tables. Only food_df is used in the rest of this cell.
disease_df = pd.read_csv("Disease.csv")
food_df = pd.read_csv("FINAL_COMBINED.csv")

# Regression targets. The names are used verbatim as food_df column keys,
# so they must match the CSV headers exactly (including 'phospohorus').
nutrient_columns = [
    'moisture', 'protein', 'fat', 'crude fibre', 'calcium',
    'phospohorus', 'potassium', 'sodium', 'magnesium', 'vitamin a',
    'vitamin d', 'vitamin e', 'omega-3-fatty acids', 'glucosamine', 'iron'
]

# Turn free-text nutrient cells into floats: drop the unit markers, then keep
# the first run of digits/dots. Cells without such a run become NaN.
for col in nutrient_columns:
    as_text = food_df[col].astype(str)
    unit_free = as_text.str.replace('%', '').str.replace('IU/kg', '')
    food_df[col] = unit_free.str.extract(r'([\d.]+)').astype(float)

# One text document per product: title, key benefits and ingredients,
# separated by single spaces (missing fields contribute an empty string).
title_text = food_df['product title'].fillna('')
benefit_text = food_df['key benefits'].fillna('')
ingredient_text = food_df['ingredients'].fillna('')
food_df['combined_text'] = title_text + ' ' + benefit_text + ' ' + ingredient_text

# Text features: TF-IDF followed by a 300-component truncated SVD.
vectorizer = TfidfVectorizer(stop_words='english')
X_text = vectorizer.fit_transform(food_df['combined_text'])

svd = TruncatedSVD(n_components=300, random_state=42)
X_text_reduced = svd.fit_transform(X_text)

# Categorical features: one-hot encoding with missing values mapped to 'Unknown'.
categorical_frame = food_df[['breed size', 'lifestage', 'food form']].fillna('Unknown')
encoder = OneHotEncoder(sparse_output=True)
X_categorical = encoder.fit_transform(categorical_frame)

# Final design matrix: reduced text features next to the one-hot block.
feature_blocks = [csr_matrix(X_text_reduced), X_categorical]
X_combined_base = hstack(feature_blocks)

# Ridge Regression with hyperparameter tuning
#
# One Ridge model is tuned per nutrient. Rows whose nutrient value is missing
# are left out of both fitting and scoring. Previously they were filled with
# the column median (computed over train and test rows together), so the
# hold-out set contained fabricated labels; for sparsely reported nutrients
# that makes y_test almost constant and R2 meaningless.
#
# Floor on labelled rows per nutrient. With 10 rows the 20% hold-out and each
# of the 3 CV folds still hold at least two samples (R2 is undefined on one).
# Nutrients below the floor are skipped and reported.
MIN_LABELLED_ROWS = 10

ridge_models = {}      # nutrient -> best Ridge estimator (refitted on its training split)
ridge_holdouts = {}    # nutrient -> (X_test, y_test) kept for the evaluation loop below
param_grid = {'alpha': [0.1, 1.0, 10.0]}

# hstack() yields a COO matrix by default, which does not support row selection.
X_all_rows = X_combined_base.tocsr()

for nutrient in nutrient_columns:
    labelled = food_df[nutrient].notna().to_numpy()
    n_labelled = int(labelled.sum())
    if n_labelled < MIN_LABELLED_ROWS:
        print(f"Skipping {nutrient}: only {n_labelled} labelled rows")
        continue

    X_labelled = X_all_rows[labelled]
    y_labelled = food_df.loc[labelled, nutrient]
    X_train, X_test, y_train, y_test = train_test_split(
        X_labelled, y_labelled, test_size=0.2, random_state=42
    )

    grid = GridSearchCV(Ridge(), param_grid, scoring='r2', cv=3, verbose=0)
    grid.fit(X_train, y_train)
    ridge_models[nutrient] = grid.best_estimator_
    # Keep the exact hold-out instead of re-creating it with a second split.
    ridge_holdouts[nutrient] = (X_test, y_test)

# Score every fitted model on the hold-out it never saw during tuning.
ridge_evaluation_results = []
for nutrient, model in ridge_models.items():
    X_test, y_test = ridge_holdouts[nutrient]
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    ridge_evaluation_results.append({
        'Nutrient': nutrient,
        'MAE': round(mae, 4),
        'MSE': round(mse, 4),
        'R2': round(r2, 4)
    })
ridge_eval_df = pd.DataFrame(ridge_evaluation_results)
print(ridge_eval_df)

               Nutrient        MAE           MSE       R2
0              moisture     6.3390  1.387557e+02   0.6341
1               protein     2.5070  1.760170e+01   0.7280
2                   fat     2.0261  1.146390e+01   0.7173
3           crude fibre     1.1195  2.217500e+00   0.8744
4               calcium     0.1068  2.020000e-02   0.7975
5           phospohorus     0.0749  1.330000e-02   0.8058
6             potassium     0.0853  1.350000e-02   0.7908
7                sodium     0.0600  1.940000e-02   0.5038
8             magnesium     0.0171  6.000000e-04   0.5747
9             vitamin a  2293.0658  3.885510e+07   0.0987
10            vitamin d   101.1204  5.765429e+04   0.1699
11            vitamin e   122.9976  2.522485e+04   0.7048
12  omega-3-fatty acids     0.3062  1.776000e-01   0.6051
13          glucosamine    54.5636  8.829000e+03  -0.0578
14                 iron     0.9841  1.770700e+00 -18.2696


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.sparse import hstack, csr_matrix

# Load the product catalogue.
food_df = pd.read_csv("FINAL_COMBINED.csv")

# Regression targets; used verbatim as food_df column keys.
selected_nutrients = [
    'protein', 'fat', 'carbohydrate (nfe)', 'crude fibre', 'calcium',
    'phospohorus', 'potassium', 'sodium', 'magnesium', 'vitamin e',
    'vitamin c', 'omega-3-fatty acids', 'omega-6-fatty acids'
]

# Strip unit markers and keep the first run of digits/dots as a float;
# cells without such a run become NaN.
for col in selected_nutrients:
    as_text = food_df[col].astype(str)
    unit_free = as_text.str.replace('%', '').str.replace('IU/kg', '')
    food_df[col] = unit_free.str.extract(r'([\d.]+)').astype(float)

# One space-separated document per product (missing fields -> empty string).
title_text = food_df['product title'].fillna('')
benefit_text = food_df['key benefits'].fillna('')
ingredient_text = food_df['ingredients'].fillna('')
food_df['combined_text'] = title_text + ' ' + benefit_text + ' ' + ingredient_text

vectorizer = TfidfVectorizer(stop_words='english')
X_text = vectorizer.fit_transform(food_df['combined_text'])

# Dimensionality reduction (SVD)
svd = TruncatedSVD(n_components=300, random_state=42)
X_text_reduced = svd.fit_transform(X_text)

# One-hot encode the categorical columns, with missing values as 'Unknown'.
categorical_frame = food_df[['breed size', 'lifestage', 'food form']].fillna('Unknown')
encoder = OneHotEncoder(sparse_output=True)
X_categorical = encoder.fit_transform(categorical_frame)

# Design matrix: reduced text features next to the one-hot block.
feature_blocks = [csr_matrix(X_text_reduced), X_categorical]
X_combined = hstack(feature_blocks)

def train_ridge_models(X, y, param_grid=None):
    """Tune a Ridge regressor with 3-fold grid search and return the best one.

    Args:
        X: Feature matrix passed straight to ``GridSearchCV.fit``.
        y: Target values aligned with the rows of ``X``.
        param_grid: Grid handed to ``GridSearchCV``. When omitted, the grid
            ``{'alpha': [0.1, 1.0, 10.0]}`` is used.

    Returns:
        The ``best_estimator_`` of the search, selected by R2.
    """
    # Build the default per call rather than sharing one dict object across
    # calls through a mutable default argument.
    if param_grid is None:
        param_grid = {'alpha': [0.1, 1.0, 10.0]}
    grid = GridSearchCV(Ridge(), param_grid, scoring='r2', cv=3, verbose=0)
    grid.fit(X, y)
    return grid.best_estimator_

# Fit one tuned Ridge model per nutrient and keep its hold-out split with it.
# NOTE(review): missing targets are median-filled before the split, so the
# hold-out may contain imputed labels - confirm this is intended.
ridge_models = {}
for nutrient in selected_nutrients:
    target = food_df[nutrient]
    y = target.fillna(target.median())
    X_train, X_test, y_train, y_test = train_test_split(
        X_combined, y, test_size=0.2, random_state=42
    )
    best_ridge = train_ridge_models(X_train, y_train)
    ridge_models[nutrient] = (best_ridge, X_test, y_test)

# Score each model on its stored hold-out; every metric is rounded to 4 decimals.
metric_functions = (
    ('MAE', mean_absolute_error),
    ('MSE', mean_squared_error),
    ('R2', r2_score),
)
evaluation_results = []
for nutrient, (model, X_test, y_test) in ridge_models.items():
    y_pred = model.predict(X_test)
    row = {'Nutrient': nutrient}
    for metric_name, metric in metric_functions:
        row[metric_name] = round(metric(y_test, y_pred), 4)
    evaluation_results.append(row)

results_df = pd.DataFrame(evaluation_results)
print(results_df)

               Nutrient       MAE         MSE      R2
0               protein    2.5070     17.6017  0.7280
1                   fat    2.0261     11.4639  0.7173
2    carbohydrate (nfe)    4.2482     52.8212  0.7997
3           crude fibre    1.1195      2.2175  0.8744
4               calcium    0.1068      0.0202  0.7975
5           phospohorus    0.0749      0.0133  0.8058
6             potassium    0.0853      0.0135  0.7908
7                sodium    0.0600      0.0194  0.5038
8             magnesium    0.0171      0.0006  0.5747
9             vitamin e  122.9976  25224.8524  0.7048
10            vitamin c   40.5666   3300.1392  0.5198
11  omega-3-fatty acids    0.3062      0.1776  0.6051
12  omega-6-fatty acids    0.4582      0.5730  0.6872


Ridge + XGBoost для разных групп нутриентов
 (фиксированные параметры для XGBoost)

XGBoost is trained only for these 5 nutrients: sodium, magnesium, vitamin C, omega-3 and omega-6 fatty acids.

Ridge is used for the other 8 nutrients.

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.sparse import hstack, csr_matrix
import xgboost as xgb

# Load the product catalogue.
food_df = pd.read_csv("FINAL_COMBINED.csv")

# Nutrients handled by Ridge and by XGBoost in this cell's training loop.
ridge_nutrients = [
    'protein', 'fat', 'carbohydrate (nfe)', 'crude fibre', 'calcium',
    'phospohorus', 'potassium', 'vitamin e'
]
xgb_nutrients = ['sodium', 'magnesium', 'vitamin c', 'omega-3-fatty acids', 'omega-6-fatty acids']
all_nutrients = ridge_nutrients + xgb_nutrients

# Strip unit markers and keep the first run of digits/dots as a float;
# cells without such a run become NaN.
for col in all_nutrients:
    as_text = food_df[col].astype(str)
    unit_free = as_text.str.replace('%', '').str.replace('IU/kg', '')
    food_df[col] = unit_free.str.extract(r'([\d.]+)').astype(float)

# One space-separated document per product (missing fields -> empty string).
title_text = food_df['product title'].fillna('')
benefit_text = food_df['key benefits'].fillna('')
ingredient_text = food_df['ingredients'].fillna('')
food_df['combined_text'] = title_text + ' ' + benefit_text + ' ' + ingredient_text

# Disorder keywords: disorder category -> space-separated search terms.
disorder_keywords = {
    "Inherited musculoskeletal disorders": "joint mobility glucosamine arthritis cartilage flexibility",
    "Inherited gastrointestinal disorders": "digest stomach bowel sensitive diarrhea gut ibs",
    "Inherited endocrine disorders": "thyroid metabolism weight diabetes insulin hormone glucose",
    "Inherited eye disorders": "vision eye retina cataract antioxidant sight ocular",
    "Inherited nervous system disorders": "brain seizure cognitive nerve neuro neurological cognition",
    "Inherited cardiovascular disorders": "heart cardiac circulation omega-3 blood pressure vascular",
    "Inherited skin disorders": "skin allergy itch coat omega-6 dermatitis eczema flaky",
    "Inherited immune disorders": "immune defense resistance inflammatory autoimmune",
    "Inherited urinary and reproductive disorders": "urinary bladder kidney renal urine reproductive",
    "Inherited respiratory disorders": "breath respiratory airway lung cough breathing nasal",
    "Inherited blood disorders": "anemia blood iron hemoglobin platelets clotting hemophilia"
}

# Text features: TF-IDF followed by a 300-component truncated SVD.
vectorizer = TfidfVectorizer(stop_words='english')
X_text = vectorizer.fit_transform(food_df['combined_text'])

svd = TruncatedSVD(n_components=300, random_state=42)
X_text_reduced = svd.fit_transform(X_text)

# One-hot encode the categorical columns, with missing values as 'Unknown'.
categorical_frame = food_df[['breed size', 'lifestage', 'food form']].fillna('Unknown')
encoder = OneHotEncoder(sparse_output=True)
X_categorical = encoder.fit_transform(categorical_frame)

def get_disorder_vector(disorder_series):
    """Return TF-IDF vectors of the keyword text for each disorder label.

    Each label is looked up in the module-level ``disorder_keywords``; labels
    that are not keys of that dict map to an empty string. The keyword texts
    are then transformed with the already fitted module-level ``vectorizer``.
    """
    def _keywords_for(disorder):
        return disorder_keywords.get(disorder, '')

    return vectorizer.transform(disorder_series.map(_keywords_for))

# Make sure a 'Disorder' column exists. When it has to be created, every row
# is 'Unknown', which is not a key of disorder_keywords, so all keyword texts
# are empty strings in that case.
if 'Disorder' not in food_df:
    food_df['Disorder'] = 'Unknown'
X_keywords = get_disorder_vector(food_df['Disorder'])

# Design matrix: reduced text features, one-hot block, disorder keyword TF-IDF.
feature_blocks = [csr_matrix(X_text_reduced), X_categorical, X_keywords]
X_combined = hstack(feature_blocks)

def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Note that ``model`` is fitted in place, even if it was fitted before.

    Returns:
        Tuple ``(mae, mse, r2)`` computed from ``y_test`` and the predictions
        for ``X_test``.
    """
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)
    return (
        mean_absolute_error(y_test, predicted),
        mean_squared_error(y_test, predicted),
        r2_score(y_test, predicted),
    )

# Train and score one model per nutrient: Ridge for ridge_nutrients,
# fixed-parameter XGBoost for everything else.
# NOTE(review): missing targets are median-filled before the split, so the
# hold-out may contain imputed labels - confirm this is intended.
results = []
for nutrient in all_nutrients:
    target = food_df[nutrient]
    y = target.fillna(target.median())
    X_train, X_test, y_train, y_test = train_test_split(
        X_combined, y, test_size=0.2, random_state=42
    )

    use_ridge = nutrient in ridge_nutrients
    if use_ridge:
        model = Ridge(alpha=1.0)
    else:  # Use XGBoost
        model = xgb.XGBRegressor(
            objective='reg:squarederror',
            n_estimators=100,
            max_depth=3,
            learning_rate=0.1,
        )

    mae, mse, r2 = evaluate_model(model, X_train, X_test, y_train, y_test)
    results.append({
        'Nutrient': nutrient,
        'Model': 'Ridge' if use_ridge else 'XGBoost',
        'MAE': round(mae, 4),
        'MSE': round(mse, 4),
        'R2': round(r2, 4)
    })

results_df = pd.DataFrame(results)
print(results_df)


               Nutrient    Model       MAE         MSE      R2
0               protein    Ridge    3.1170     23.9689  0.6296
1                   fat    Ridge    2.3540     15.0946  0.6277
2    carbohydrate (nfe)    Ridge    5.4242     68.6259  0.7398
3           crude fibre    Ridge    1.5342      3.6219  0.7948
4               calcium    Ridge    0.1242      0.0249  0.7507
5           phospohorus    Ridge    0.0931      0.0175  0.7440
6             potassium    Ridge    0.1018      0.0173  0.7326
7             vitamin e    Ridge  122.9976  25224.8522  0.7048
8                sodium  XGBoost    0.0808      0.0273  0.3007
9             magnesium  XGBoost    0.0177      0.0005  0.6375
10            vitamin c  XGBoost   36.0898   2982.6328  0.5660
11  omega-3-fatty acids  XGBoost    0.3370      0.2826  0.3718
12  omega-6-fatty acids  XGBoost    0.5779      0.9447  0.4843


Ridge + XGBoost с подбором параметров (GridSearchCV для XGBoost)

XGBoost hyperparameter tuning

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.sparse import hstack, csr_matrix
import xgboost as xgb

# Load the product catalogue.
food_df = pd.read_csv("FINAL_COMBINED.csv")

# Nutrients handled by Ridge and by XGBoost in this cell's training loop.
ridge_nutrients = [
    'protein', 'fat', 'carbohydrate (nfe)', 'crude fibre', 'calcium',
    'phospohorus', 'potassium', 'vitamin e'
]
xgb_nutrients = ['sodium', 'magnesium', 'vitamin c', 'omega-3-fatty acids', 'omega-6-fatty acids']
all_nutrients = ridge_nutrients + xgb_nutrients

# Strip unit markers and keep the first run of digits/dots as a float;
# cells without such a run become NaN.
for col in all_nutrients:
    as_text = food_df[col].astype(str)
    unit_free = as_text.str.replace('%', '').str.replace('IU/kg', '')
    food_df[col] = unit_free.str.extract(r'([\d.]+)').astype(float)

# One space-separated document per product (missing fields -> empty string).
title_text = food_df['product title'].fillna('')
benefit_text = food_df['key benefits'].fillna('')
ingredient_text = food_df['ingredients'].fillna('')
food_df['combined_text'] = title_text + ' ' + benefit_text + ' ' + ingredient_text

# Disorder keywords: disorder category -> space-separated search terms.
disorder_keywords = {
    "Inherited musculoskeletal disorders": "joint mobility glucosamine arthritis cartilage flexibility",
    "Inherited gastrointestinal disorders": "digest stomach bowel sensitive diarrhea gut ibs",
    "Inherited endocrine disorders": "thyroid metabolism weight diabetes insulin hormone glucose",
    "Inherited eye disorders": "vision eye retina cataract antioxidant sight ocular",
    "Inherited nervous system disorders": "brain seizure cognitive nerve neuro neurological cognition",
    "Inherited cardiovascular disorders": "heart cardiac circulation omega-3 blood pressure vascular",
    "Inherited skin disorders": "skin allergy itch coat omega-6 dermatitis eczema flaky",
    "Inherited immune disorders": "immune defense resistance inflammatory autoimmune",
    "Inherited urinary and reproductive disorders": "urinary bladder kidney renal urine reproductive",
    "Inherited respiratory disorders": "breath respiratory airway lung cough breathing nasal",
    "Inherited blood disorders": "anemia blood iron hemoglobin platelets clotting hemophilia"
}

# Text features: TF-IDF followed by a 300-component truncated SVD.
vectorizer = TfidfVectorizer(stop_words='english')
X_text = vectorizer.fit_transform(food_df['combined_text'])

svd = TruncatedSVD(n_components=300, random_state=42)
X_text_reduced = svd.fit_transform(X_text)

# One-hot encode the categorical columns, with missing values as 'Unknown'.
categorical_frame = food_df[['breed size', 'lifestage', 'food form']].fillna('Unknown')
encoder = OneHotEncoder(sparse_output=True)
X_categorical = encoder.fit_transform(categorical_frame)

def get_disorder_vector(disorder_series):
    """Return TF-IDF vectors of the keyword text for each disorder label.

    Each label is looked up in the module-level ``disorder_keywords``; labels
    that are not keys of that dict map to an empty string. The keyword texts
    are then transformed with the already fitted module-level ``vectorizer``.
    """
    def _keywords_for(disorder):
        return disorder_keywords.get(disorder, '')

    return vectorizer.transform(disorder_series.map(_keywords_for))

# Make sure a 'Disorder' column exists. When it has to be created, every row
# is 'Unknown', which is not a key of disorder_keywords, so all keyword texts
# are empty strings in that case.
if 'Disorder' not in food_df:
    food_df['Disorder'] = 'Unknown'
X_keywords = get_disorder_vector(food_df['Disorder'])

# Design matrix: reduced text features, one-hot block, disorder keyword TF-IDF.
feature_blocks = [csr_matrix(X_text_reduced), X_categorical, X_keywords]
X_combined = hstack(feature_blocks)

def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Note that ``model`` is fitted in place, even if it was fitted before.

    Returns:
        Tuple ``(mae, mse, r2)`` computed from ``y_test`` and the predictions
        for ``X_test``.
    """
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)
    return (
        mean_absolute_error(y_test, predicted),
        mean_squared_error(y_test, predicted),
        r2_score(y_test, predicted),
    )

# Train and score one model per nutrient: Ridge for ridge_nutrients,
# grid-searched XGBoost for everything else.
# NOTE(review): missing targets are median-filled before the split, so the
# hold-out may contain imputed labels - confirm this is intended.

# XGBoost search space (3 x 3 x 3 = 27 candidates); identical for every nutrient.
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [50, 100, 200]
}

results = []
for nutrient in all_nutrients:
    target = food_df[nutrient]
    y = target.fillna(target.median())
    X_train, X_test, y_train, y_test = train_test_split(
        X_combined, y, test_size=0.2, random_state=42
    )

    use_ridge = nutrient in ridge_nutrients
    if use_ridge:
        model = Ridge(alpha=1.0)
    else:
        xgb_model = xgb.XGBRegressor(objective='reg:squarederror')
        grid_search = GridSearchCV(xgb_model, param_grid, scoring='r2', cv=3, verbose=1)
        grid_search.fit(X_train, y_train)
        model = grid_search.best_estimator_

    # evaluate_model() calls fit() again, so the selected estimator is fitted
    # once more on the same training split before it is scored.
    mae, mse, r2 = evaluate_model(model, X_train, X_test, y_train, y_test)
    results.append({
        'Nutrient': nutrient,
        'Model': 'Ridge' if use_ridge else 'XGBoost',
        'MAE': round(mae, 4),
        'MSE': round(mse, 4),
        'R2': round(r2, 4)
    })

results_df = pd.DataFrame(results)
print(results_df)


Fitting 3 folds for each of 27 candidates, totalling 81 fits
Fitting 3 folds for each of 27 candidates, totalling 81 fits
Fitting 3 folds for each of 27 candidates, totalling 81 fits
Fitting 3 folds for each of 27 candidates, totalling 81 fits
Fitting 3 folds for each of 27 candidates, totalling 81 fits
               Nutrient    Model       MAE         MSE      R2
0               protein    Ridge    3.1170     23.9689  0.6296
1                   fat    Ridge    2.3540     15.0946  0.6277
2    carbohydrate (nfe)    Ridge    5.4242     68.6259  0.7398
3           crude fibre    Ridge    1.5342      3.6219  0.7948
4               calcium    Ridge    0.1242      0.0249  0.7507
5           phospohorus    Ridge    0.0931      0.0175  0.7440
6             potassium    Ridge    0.1018      0.0173  0.7326
7             vitamin e    Ridge  122.9976  25224.8522  0.7048
8                sodium  XGBoost    0.0798      0.0272  0.3033
9             magnesium  XGBoost    0.0240      0.0010  0.2899
10

Ridge Regression с масштабированием целевых переменных (StandardScaler)

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.sparse import hstack, csr_matrix

# Load the product catalogue.
food_df = pd.read_csv("FINAL_COMBINED.csv")

# Regression targets; used verbatim as food_df column keys.
selected_nutrients = [
    'protein', 'fat', 'carbohydrate (nfe)', 'crude fibre', 'calcium',
    'phospohorus', 'potassium', 'sodium', 'magnesium', 'vitamin e',
    'vitamin c', 'omega-3-fatty acids', 'omega-6-fatty acids'
]

# Strip unit markers and keep the first run of digits/dots as a float;
# cells without such a run become NaN.
for col in selected_nutrients:
    as_text = food_df[col].astype(str)
    unit_free = as_text.str.replace('%', '').str.replace('IU/kg', '')
    food_df[col] = unit_free.str.extract(r'([\d.]+)').astype(float)

# One space-separated document per product (missing fields -> empty string).
title_text = food_df['product title'].fillna('')
benefit_text = food_df['key benefits'].fillna('')
ingredient_text = food_df['ingredients'].fillna('')
food_df['combined_text'] = title_text + ' ' + benefit_text + ' ' + ingredient_text

# Text features: TF-IDF followed by a 300-component truncated SVD.
vectorizer = TfidfVectorizer(stop_words='english')
X_text = vectorizer.fit_transform(food_df['combined_text'])

svd = TruncatedSVD(n_components=300, random_state=42)
X_text_reduced = svd.fit_transform(X_text)

# One-hot encode the categorical columns, with missing values as 'Unknown'.
categorical_frame = food_df[['breed size', 'lifestage', 'food form']].fillna('Unknown')
encoder = OneHotEncoder(sparse_output=True)
X_categorical = encoder.fit_transform(categorical_frame)

feature_blocks = [csr_matrix(X_text_reduced), X_categorical]
X_combined = hstack(feature_blocks)

# StandardScaler
# Targets listed here are standardised before training; all others are used
# as median-filled Series. Each scaler is fitted on every row of its target.
targets_to_standardise = ['sodium', 'omega-3-fatty acids', 'omega-6-fatty acids']
scalers = {}          # nutrient -> fitted StandardScaler (standardised targets only)
scaled_targets = {}   # nutrient -> target values handed to the training loop
for nutrient in selected_nutrients:
    y = food_df[nutrient].fillna(food_df[nutrient].median())
    if nutrient not in targets_to_standardise:
        scaled_targets[nutrient] = y
        continue
    scaler = StandardScaler()
    y_scaled = scaler.fit_transform(y.values.reshape(-1, 1)).flatten()
    scalers[nutrient] = scaler
    scaled_targets[nutrient] = y_scaled

# Ridge models with hyperparameter tuning
def train_ridge_models(X, y, param_grid=None):
    """Tune a Ridge regressor with 3-fold grid search and return the best one.

    Args:
        X: Feature matrix passed straight to ``GridSearchCV.fit``.
        y: Target values aligned with the rows of ``X``.
        param_grid: Grid handed to ``GridSearchCV``. When omitted, the grid
            ``{'alpha': [0.1, 1.0, 10.0]}`` is used.

    Returns:
        The ``best_estimator_`` of the search, selected by R2.
    """
    # Build the default per call rather than sharing one dict object across
    # calls through a mutable default argument.
    if param_grid is None:
        param_grid = {'alpha': [0.1, 1.0, 10.0]}
    grid = GridSearchCV(Ridge(), param_grid, scoring='r2', cv=3, verbose=0)
    grid.fit(X, y)
    return grid.best_estimator_

# Fit one tuned Ridge model per nutrient on the (possibly standardised)
# targets and keep its hold-out split with it.
ridge_models = {}
for nutrient in selected_nutrients:
    y = scaled_targets[nutrient]
    X_train, X_test, y_train, y_test = train_test_split(
        X_combined, y, test_size=0.2, random_state=42
    )
    best_ridge = train_ridge_models(X_train, y_train)
    ridge_models[nutrient] = (best_ridge, X_test, y_test)

# Score each model in the original units: standardised targets and their
# predictions are mapped back through the matching scaler first.
metric_functions = (
    ('MAE', mean_absolute_error),
    ('MSE', mean_squared_error),
    ('R2', r2_score),
)
evaluation_results = []
for nutrient, (model, X_test, y_test) in ridge_models.items():
    y_pred = model.predict(X_test)
    if nutrient in scalers:
        target_scaler = scalers[nutrient]
        y_test = target_scaler.inverse_transform(y_test.reshape(-1, 1)).flatten()
        y_pred = target_scaler.inverse_transform(y_pred.reshape(-1, 1)).flatten()
    row = {'Nutrient': nutrient}
    for metric_name, metric in metric_functions:
        row[metric_name] = round(metric(y_test, y_pred), 4)
    evaluation_results.append(row)

results_df = pd.DataFrame(evaluation_results)
print(results_df)


               Nutrient       MAE         MSE      R2
0               protein    2.5070     17.6017  0.7280
1                   fat    2.0261     11.4639  0.7173
2    carbohydrate (nfe)    4.2482     52.8212  0.7997
3           crude fibre    1.1195      2.2175  0.8744
4               calcium    0.1068      0.0202  0.7975
5           phospohorus    0.0749      0.0133  0.8058
6             potassium    0.0853      0.0135  0.7908
7                sodium    0.0600      0.0194  0.5038
8             magnesium    0.0171      0.0006  0.5747
9             vitamin e  122.9976  25224.8524  0.7048
10            vitamin c   40.5666   3300.1392  0.5198
11  omega-3-fatty acids    0.3062      0.1776  0.6051
12  omega-6-fatty acids    0.4582      0.5730  0.6872


(порода + заболевание + прогноз нутриентов)

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack, csr_matrix

# Load the product catalogue and the breed/disease table.
food_df = pd.read_csv("FINAL_COMBINED.csv")
disease_df = pd.read_csv("Disease.csv")

# Regression targets; used verbatim as food_df column keys.
selected_nutrients = [
    'protein', 'fat', 'carbohydrate (nfe)', 'crude fibre', 'calcium',
    'phospohorus', 'potassium', 'sodium', 'magnesium', 'vitamin e',
    'vitamin c', 'omega-3-fatty acids', 'omega-6-fatty acids'
]

# Strip unit markers and keep the first run of digits/dots as a float;
# cells without such a run become NaN.
for col in selected_nutrients:
    food_df[col] = food_df[col].astype(str).str.replace('%', '').str.replace('IU/kg', '').str.extract(r'([\d.]+)').astype(float)

# One space-separated document per product built from seven text columns
# (missing fields contribute an empty string). Every column except the last
# is followed by "+ ' ' +"; the earlier version dropped two of these joiners
# and left a dangling '+', which made the whole cell a SyntaxError.
food_df['combined_text'] = (
    food_df['product title'].fillna('') + ' ' +
    food_df['product description'].fillna('') + ' ' +
    food_df['key benefits'].fillna('') + ' ' +
    food_df['ingredients'].fillna('') + ' ' +
    food_df['helpful tips'].fillna('') + ' ' +
    food_df['need/preference'].fillna('') + ' ' +
    food_df['alternate product recommendation'].fillna('')
)


# TF-IDF + SVD
vectorizer = TfidfVectorizer(stop_words='english')
X_text = vectorizer.fit_transform(food_df['combined_text'])
svd = TruncatedSVD(n_components=300, random_state=42)
X_text_reduced = svd.fit_transform(X_text)

# One-hot encode the categorical columns, with missing values as 'Unknown'.
categorical_frame = food_df[['breed size', 'lifestage', 'food form']].fillna('Unknown')
encoder = OneHotEncoder(sparse_output=True)
X_categorical = encoder.fit_transform(categorical_frame)

# Design matrix: reduced text features next to the one-hot block.
feature_blocks = [csr_matrix(X_text_reduced), X_categorical]
X_combined = hstack(feature_blocks)


# Targets listed here are standardised before training; all others are used
# as median-filled Series. Each scaler is fitted on every row of its target.
targets_to_standardise = ['sodium', 'omega-3-fatty acids', 'omega-6-fatty acids']
scalers = {}          # nutrient -> fitted StandardScaler (standardised targets only)
scaled_targets = {}   # nutrient -> target values handed to the training loop
for nutrient in selected_nutrients:
    y = food_df[nutrient].fillna(food_df[nutrient].median())
    if nutrient not in targets_to_standardise:
        scaled_targets[nutrient] = y
        continue
    scaler = StandardScaler()
    y_scaled = scaler.fit_transform(y.values.reshape(-1, 1)).flatten()
    scalers[nutrient] = scaler
    scaled_targets[nutrient] = y_scaled

# Train Ridge models
def train_ridge(X, y):
    """Return the Ridge model with the best R2 in a 3-fold search over alpha.

    The candidate alphas are 0.1, 1.0 and 10.0; ``X`` and ``y`` are passed
    unchanged to ``GridSearchCV.fit``.
    """
    alpha_grid = {'alpha': [0.1, 1.0, 10.0]}
    search = GridSearchCV(Ridge(), alpha_grid, scoring='r2', cv=3)
    search.fit(X, y)
    return search.best_estimator_

# One tuned Ridge model per nutrient, fitted on the 80% training part of a
# fixed-seed split; the remaining 20% is not used in this cell.
ridge_models = {}
for nutrient in selected_nutrients:
    y = scaled_targets[nutrient]
    split_parts = train_test_split(X_combined, y, test_size=0.2, random_state=42)
    X_train, y_train = split_parts[0], split_parts[2]
    ridge_models[nutrient] = train_ridge(X_train, y_train)

# Disorder keywords: disorder category -> space-separated search terms.
disorder_keywords = {
    "Inherited musculoskeletal disorders": "joint mobility glucosamine arthritis cartilage flexibility",
    "Inherited gastrointestinal disorders": "digest stomach bowel sensitive diarrhea gut ibs",
    "Inherited endocrine disorders": "thyroid metabolism weight diabetes insulin hormone glucose",
    "Inherited eye disorders": "vision eye retina cataract antioxidant sight ocular",
    "Inherited nervous system disorders": "brain seizure cognitive nerve neuro neurological cognition",
    "Inherited cardiovascular disorders": "heart cardiac circulation omega-3 blood pressure vascular",
    "Inherited skin disorders": "skin allergy itch coat omega-6 dermatitis eczema flaky",
    "Inherited immune disorders": "immune defense resistance inflammatory autoimmune",
    "Inherited urinary and reproductive disorders": "urinary bladder kidney renal urine reproductive",
    "Inherited respiratory disorders": "breath respiratory airway lung cough breathing nasal",
    "Inherited blood disorders": "anemia blood iron hemoglobin platelets clotting hemophilia"
}

# Interactive input
# Ask for a breed and one of its disorders, recommend the catalogue product
# whose text is most similar to the disorder keywords, and forecast nutrient
# values for that keyword query with the trained Ridge models.
user_breed = input("Enter dog breed: ").strip().lower()
breed_disorders = disease_df[disease_df['Breed'].str.lower() == user_breed]

if breed_disorders.empty:
    print("Breed not found in disease dataset.")
else:
    print(f"\nDisorders for {user_breed.title()}:")
    disorder_options = breed_disorders['Disease'].unique()
    for idx, dis in enumerate(disorder_options):
        print(f"{idx + 1}. {dis}")

    # Validate the menu choice explicitly: without this, entering 0 would be
    # turned into index -1 and silently select the last disorder.
    raw_selection = input("\nSelect disorder (enter number): ").strip()
    if not raw_selection.isdecimal() or not 1 <= int(raw_selection) <= len(disorder_options):
        raise ValueError(
            f"Selection must be a number between 1 and {len(disorder_options)}, "
            f"got {raw_selection!r}."
        )
    selection = int(raw_selection) - 1
    selected_disorder = disorder_options[selection]
    disorder_type = breed_disorders.loc[
        breed_disorders['Disease'] == selected_disorder, 'Disorder'
    ].values[0]

    # Fall back to the disease name when the category has no keyword entry;
    # an empty query would score 0 against every product and argmax() would
    # then just return the first row of the catalogue.
    keyword_string = disorder_keywords.get(disorder_type, selected_disorder)
    keyword_vec = vectorizer.transform([keyword_string])
    keyword_reduced = svd.transform(keyword_vec)

    # The encoder was fitted on a DataFrame, so pass the same column names.
    unknown_profile = pd.DataFrame(
        [['Unknown', 'Unknown', 'Unknown']],
        columns=['breed size', 'lifestage', 'food form'],
    )
    keyword_combined = hstack([csr_matrix(keyword_reduced), encoder.transform(unknown_profile)])

    # Recommend recipe
    # X_text already holds the TF-IDF rows of food_df['combined_text'], so the
    # catalogue does not need to be vectorised again.
    similarities = cosine_similarity(keyword_vec, X_text).flatten()
    top_idx = similarities.argmax()
    recommended_product = food_df.iloc[top_idx]['product title']

    # Forecast nutrients
    nutrient_forecast = {}
    for nutrient, model in ridge_models.items():
        pred = model.predict(keyword_combined)[0]
        if nutrient in scalers:
            # Standardised targets are mapped back to their original units.
            pred = scalers[nutrient].inverse_transform([[pred]])[0][0]
        nutrient_forecast[nutrient] = round(pred, 2)

    print(f"\nSelected Disorder: {selected_disorder}")
    print(f"Recommended Recipe: {recommended_product}")
    print("Forecasted Nutrient Values:")
    for nutrient, value in nutrient_forecast.items():
        print(f"{nutrient}: {value}")



Disorders for Australian Shepherd:
1. Cataracts
2. Microphthalmia; ocular dysgenesis
3. Pannus - chronic superficial keratitis
4. Persistent pupillary membranes (PPM)
5. Retinal dysplasia
6. Collie eye anomaly (CEA)
7. Progressive retinal atrophy
8. Hip dysplasia
9. Deafness

Selected Disorder: Deafness
Recommended Recipe: Brain Care + j/d Rice & Turkey Entrée Dog Food
Forecasted Nutrient Values:
protein: 22.17
fat: 14.22
carbohydrate (nfe): 46.72
crude fibre: 1.76
calcium: 0.87
phospohorus: 0.61
potassium: 0.69
sodium: 0.34
magnesium: 0.1
vitamin e: 742.73
vitamin c: 111.5
omega-3-fatty acids: 1.6
omega-6-fatty acids: 2.78




с учетом размера породы (breed size)

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack, csr_matrix

# Load the product catalogue first, then the breed/disease table.
food_df, disease_df = (
    pd.read_csv(csv_path) for csv_path in ("FINAL_COMBINED.csv", "Disease.csv")
)

def classify_breed_size(row):
    """Label a breed by the midpoint of its weight range.

    Args:
        row: Mapping-like record with numeric 'min_weight' and 'max_weight'.

    Returns:
        'Small Breed' for a midpoint up to 10, 'Medium Breed' for a midpoint
        above 10 and up to 25, otherwise 'Large Breed'.
    """
    midpoint = (row['min_weight'] + row['max_weight']) / 2
    if midpoint <= 10:
        return 'Small Breed'
    if midpoint <= 25:
        return 'Medium Breed'
    # NOTE(review): a NaN midpoint fails both comparisons and therefore also
    # ends up here - confirm that missing weights should count as large.
    return 'Large Breed'

# Tag every disease row with the size class derived from its weight range.
disease_df['breed_size_category'] = disease_df.apply(classify_breed_size, axis=1)

# Regression targets; used verbatim as food_df column keys.
selected_nutrients = [
    'protein', 'fat', 'carbohydrate (nfe)', 'crude fibre', 'calcium',
    'phospohorus', 'potassium', 'sodium', 'magnesium', 'vitamin e',
    'vitamin c', 'omega-3-fatty acids', 'omega-6-fatty acids'
]

# Strip unit markers and keep the first run of digits/dots as a float;
# cells without such a run become NaN.
for col in selected_nutrients:
    as_text = food_df[col].astype(str)
    unit_free = as_text.str.replace('%', '').str.replace('IU/kg', '')
    food_df[col] = unit_free.str.extract(r'([\d.]+)').astype(float)

# One space-separated document per product built from seven text columns
# (missing fields contribute an empty string).
title_text = food_df['product title'].fillna('')
description_text = food_df['product description'].fillna('')
benefit_text = food_df['key benefits'].fillna('')
ingredient_text = food_df['ingredients'].fillna('')
tips_text = food_df['helpful tips'].fillna('')
preference_text = food_df['need/preference'].fillna('')
alternate_text = food_df['alternate product recommendation'].fillna('')
food_df['combined_text'] = (
    title_text + ' ' + description_text + ' ' + benefit_text + ' ' +
    ingredient_text + ' ' + tips_text + ' ' + preference_text + ' ' +
    alternate_text
)

# Text features: TF-IDF followed by a 300-component truncated SVD.
vectorizer = TfidfVectorizer(stop_words='english')
X_text = vectorizer.fit_transform(food_df['combined_text'])
svd = TruncatedSVD(n_components=300, random_state=42)
X_text_reduced = svd.fit_transform(X_text)

# One-hot encode the categorical columns, with missing values as 'Unknown'.
categorical_frame = food_df[['breed size', 'lifestage', 'food form']].fillna('Unknown')
encoder = OneHotEncoder(sparse_output=True)
X_categorical = encoder.fit_transform(categorical_frame)

feature_blocks = [csr_matrix(X_text_reduced), X_categorical]
X_combined = hstack(feature_blocks)

# Targets in scale_nutrients are standardised before training; all others are
# used as median-filled Series. Each scaler is fitted on every row of its target.
scale_nutrients = ['sodium', 'omega-3-fatty acids', 'omega-6-fatty acids', 'calcium', 'phospohorus', 'potassium', 'magnesium']
scalers = {}          # nutrient -> fitted StandardScaler (standardised targets only)
scaled_targets = {}   # nutrient -> target values handed to the training loop
for nutrient in selected_nutrients:
    y = food_df[nutrient].fillna(food_df[nutrient].median())
    if nutrient not in scale_nutrients:
        scaled_targets[nutrient] = y
        continue
    scaler = StandardScaler()
    y_scaled = scaler.fit_transform(y.values.reshape(-1, 1)).flatten()
    scalers[nutrient] = scaler
    scaled_targets[nutrient] = y_scaled

def train_ridge(X, y):
    """Return the Ridge model with the best R2 in a 3-fold search over alpha.

    The candidate alphas are 0.1, 1.0 and 10.0; ``X`` and ``y`` are passed
    unchanged to ``GridSearchCV.fit``.
    """
    alpha_grid = {'alpha': [0.1, 1.0, 10.0]}
    search = GridSearchCV(Ridge(), alpha_grid, scoring='r2', cv=3)
    search.fit(X, y)
    return search.best_estimator_

# One tuned Ridge model per nutrient, fitted on the 80% training part of a
# fixed-seed split; the remaining 20% is not used in this cell.
ridge_models = {}
for nutrient in selected_nutrients:
    y = scaled_targets[nutrient]
    split_parts = train_test_split(X_combined, y, test_size=0.2, random_state=42)
    X_train, y_train = split_parts[0], split_parts[2]
    ridge_models[nutrient] = train_ridge(X_train, y_train)

# Disorder category -> space-separated search terms.
disorder_keywords = {
    "Inherited musculoskeletal disorders": "joint mobility glucosamine arthritis cartilage flexibility",
    "Inherited gastrointestinal disorders": "digest stomach bowel sensitive diarrhea gut ibs",
    "Inherited endocrine disorders": "thyroid metabolism weight diabetes insulin hormone glucose",
    "Inherited eye disorders": "vision eye retina cataract antioxidant sight ocular",
    "Inherited nervous system disorders": "brain seizure cognitive nerve neuro neurological cognition",
    "Inherited cardiovascular disorders": "heart cardiac circulation omega-3 blood pressure vascular",
    "Inherited skin disorders": "skin allergy itch coat omega-6 dermatitis eczema flaky",
    "Inherited immune disorders": "immune defense resistance inflammatory autoimmune",
    "Inherited urinary and reproductive disorders": "urinary bladder kidney renal urine reproductive",
    "Inherited respiratory disorders": "breath respiratory airway lung cough breathing nasal",
    "Inherited blood disorders": "anemia blood iron hemoglobin platelets clotting hemophilia"
}

# Ask for a breed and one of its disorders, recommend the best-matching
# product among those suited to the breed's size class, and forecast nutrient
# values for that query with the trained Ridge models.
user_breed = input("Enter dog breed: ").strip().lower()
breed_info = disease_df[disease_df['Breed'].str.lower() == user_breed]

if breed_info.empty:
    print("Breed not found in disease dataset.")
else:
    breed_size = breed_info['breed_size_category'].values[0]
    disorder_options = breed_info['Disease'].unique()
    print(f"\nDisorders for {user_breed.title()} ({breed_size}):")
    for idx, dis in enumerate(disorder_options):
        print(f"{idx + 1}. {dis}")

    # Validate the menu choice explicitly: without this, entering 0 would be
    # turned into index -1 and silently select the last disorder.
    raw_selection = input("\nSelect disorder (enter number): ").strip()
    if not raw_selection.isdecimal() or not 1 <= int(raw_selection) <= len(disorder_options):
        raise ValueError(
            f"Selection must be a number between 1 and {len(disorder_options)}, "
            f"got {raw_selection!r}."
        )
    selection = int(raw_selection) - 1
    selected_disorder = disorder_options[selection]
    disorder_type = breed_info.loc[
        breed_info['Disease'] == selected_disorder, 'Disorder'
    ].values[0]

    # Products for this size class plus products without a size. Missing
    # sizes are NaN in food_df (only the encoder input was filled), so they
    # must be filled here; comparing NaN with 'unknown' never matches.
    product_sizes = food_df['breed size'].fillna('Unknown').str.lower()
    size_mask = product_sizes.isin([breed_size.lower(), 'unknown'])
    if not size_mask.any():
        # argmax() over an empty similarity vector would raise; search the
        # whole catalogue rather than fail.
        print(f"No products tagged for {breed_size}; searching all products instead.")
        size_mask = pd.Series(True, index=food_df.index)
    filtered_products = food_df[size_mask]

    keyword_string = disorder_keywords.get(disorder_type, selected_disorder)
    keyword_vec = vectorizer.transform([keyword_string])
    # X_text already holds the TF-IDF rows of food_df['combined_text']; select
    # the filtered rows instead of vectorising their text a second time.
    similarities = cosine_similarity(keyword_vec, X_text[size_mask.to_numpy()]).flatten()
    top_idx = similarities.argmax()
    recommended_product = filtered_products.iloc[top_idx]['product title']

    keyword_reduced = svd.transform(keyword_vec)
    # The encoder was fitted on a DataFrame, so pass the same column names.
    # NOTE(review): 'Adult' and 'Dry Food' are hard-coded and, like breed_size,
    # are assumed to be categories the encoder saw during fitting - confirm.
    query_profile = pd.DataFrame(
        [[breed_size, 'Adult', 'Dry Food']],
        columns=['breed size', 'lifestage', 'food form'],
    )
    keyword_combined = hstack([csr_matrix(keyword_reduced), encoder.transform(query_profile)])

    nutrient_forecast = {}
    for nutrient, model in ridge_models.items():
        pred = model.predict(keyword_combined)[0]
        if nutrient in scalers:
            # Standardised targets are mapped back to their original units.
            pred = scalers[nutrient].inverse_transform([[pred]])[0][0]
        nutrient_forecast[nutrient] = round(pred, 2)

    print(f"\nSelected Disorder: {selected_disorder} ({disorder_type})")
    print(f"Recommended Recipe: {recommended_product}")
    print("Forecasted Nutrient Values:")
    for nutrient, value in nutrient_forecast.items():
        print(f"{nutrient}: {value}")



Disorders for Affenpinscher (Small Breed):
1. Cataracts
2. Hip dysplasia

Selected Disorder: Hip dysplasia (Inherited musculoskeletal disorders)
Recommended Recipe: Healthy Mobility Adult Small & Mini Breed Dry Dog Food with Chicken
Forecasted Nutrient Values:
protein: 16.32
fat: 13.72
carbohydrate (nfe): 55.26
crude fibre: 2.78
calcium: 0.58
phospohorus: 0.46
potassium: 0.8
sodium: 0.23
magnesium: 0.11
vitamin e: 618.17
vitamin c: 103.81
omega-3-fatty acids: 1.76
omega-6-fatty acids: 3.12


