In [3]:
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score, mean_squared_log_error
from sklearn.preprocessing import StandardScaler,  OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor

# Create the CatBoost submission from the saved .pkl model (optional)

In [5]:
import joblib
from src.utils.feature_engineering import create_features

# --- Training data -----------------------------------------------------------
# The training set is loaded here only to re-fit the scaler that is applied to
# the test set below; no model is trained in this cell.
# create_features is a project helper that adds/transforms features.
df = create_features(pd.read_csv('data/train.csv'))

X = df.drop(columns=["id", "Calories"])
y = np.log(df['Calories'])  # log-transformed target (not used further in this cell)

# "Sex" is the only categorical column; every other column is treated as numerical.
categorical_features = ["Sex"]
numerical_features = [col for col in X.columns if col not in categorical_features]

# Standardise the numerical columns; remainder="passthrough" leaves the
# remaining column(s) untouched so CatBoost receives the categorical as is.
preprocessor = ColumnTransformer(
    transformers=[("num", StandardScaler(), numerical_features)],
    remainder="passthrough",
)
preprocessor.fit(X)

# --- Model -------------------------------------------------------------------
# NOTE: joblib.load unpickles the file, so only load model files you trust.
cat = joblib.load('models/catboost_model.pkl')

# --- Test data and submission -------------------------------------------------
X_submission = pd.read_csv("data/test.csv")
out = X_submission[["id"]].copy()

# Same feature engineering and the same fitted transformations as the training data.
X_submission_features = create_features(X_submission.drop(columns=["id"]))
X_submission = preprocessor.transform(X_submission_features)

# The model output is exponentiated to undo the log transform of the target.
log_calories = cat.predict(X_submission)
out["Calories"] = np.exp(log_calories)
out.to_csv("data/catboost_submission.csv", index=False)


# Optimize the blending weight of each regression model

In [6]:
import numpy as np
from scipy.optimize import minimize
import pandas as pd


# Per-model predictions on the training set, one CSV per model, keyed by `id`.
cat = pd.read_csv('data/catboost_train_pred.csv')
xgb = pd.read_csv('data/xgb_train_pred.csv')
lgb = pd.read_csv('data/lgb_train_pred.csv')

# Inner-join the three files on `id` (rows: samples, columns: models).
# Only ids present in all three files survive the merge.
predictions = pd.merge(cat, pd.merge(xgb, lgb, on='id'), on='id')

# True labels, looked up by the merged ids rather than taken positionally from
# train.csv: the merge may drop or reorder rows, and a positional read would
# then silently pair predictions with the wrong labels. A missing id raises
# KeyError here instead of corrupting the optimisation.
train_truth = pd.read_csv('data/train.csv').set_index('id')['Calories']
true_values = train_truth.loc[predictions['id']].to_numpy()

# Back-transform the predictions with exp so they are on the same scale as the
# raw `Calories` labels (the CatBoost cell likewise exponentiates its output).
predictions = np.exp(predictions.drop(columns='id').to_numpy())

assert len(true_values) == predictions.shape[0], (
    "predictions and true labels have different lengths; check for duplicate ids"
)

# Objective function: minimize weighted error
def objective(weights):
    """Return the mean squared error of the weighted blend of model predictions.

    Reads the module-level ``predictions`` (rows: samples, columns: models)
    and ``true_values`` (one label per sample).

    Parameters
    ----------
    weights : array-like
        One weight per model column of ``predictions``.

    Returns
    -------
    float
        Mean of the squared differences between the blend and ``true_values``.
    """
    blended = np.dot(predictions, weights)
    residuals = blended - true_values
    return np.mean(np.square(residuals))

# Number of models being blended (one column of `predictions` per model).
n_models = predictions.shape[1]

# Constraints: weights sum to 1
constraints = {'type': 'eq', 'fun': lambda w: np.sum(w) - 1}

# Bounds: weights should be between 0 and 1
bounds = [(0, 1)] * n_models

# Initial weights: equal weight for every model
initial_weights = np.ones(n_models) / n_models

# Optimize weights. SLSQP is the solver SciPy selects by default when
# constraints are supplied; it is named explicitly so the choice is visible.
result = minimize(
    objective,
    initial_weights,
    method='SLSQP',
    bounds=bounds,
    constraints=constraints,
)

# Do not silently trust weights from a failed optimisation run.
if not result.success:
    print("WARNING: weight optimization did not converge:", result.message)

optimal_weights = result.x
print("Optimal Weights:", optimal_weights)

# Imported here as well so this cell does not rely on the first cell's imports.
from sklearn.metrics import r2_score

# Compute weighted predictions using optimized weights
optimized_preds = np.dot(predictions, optimal_weights)

# Calculate R² score of the blend on the same data the weights were fitted on
# (an in-sample figure, not a held-out estimate).
r2 = r2_score(true_values, optimized_preds)

print("R² Score:", r2)


Optimal Weights: [0.24409889 0.75590111 0.        ]
R² Score: 0.9970733458972664


In [8]:
import pandas as pd

# Manually chosen blend weights (CatBoost, XGBoost, LightGBM); they sum to 1.
# These are the weights the submission has always been written with: the
# original cell computed a blend from `optimal_weights` and then immediately
# overwrote it with this fixed blend. To submit the optimised blend instead,
# replace these three values with the entries of `optimal_weights`.
CAT_WEIGHT, XGB_WEIGHT, LGB_WEIGHT = 0.3, 0.4, 0.3

cat = pd.read_csv('data/catboost_submission.csv')
xgb = pd.read_csv('data/xgb_submission.csv')
lgb = pd.read_csv('data/lgb_submission.csv')

# Index each model's predictions by id so the blend is aligned on id rather
# than on row position.
# Assumes every submission file has `id` and `Calories` columns, like the
# CatBoost submission — TODO confirm for the XGBoost/LightGBM files.
cat_calories = cat.set_index('id')['Calories']
xgb_calories = xgb.set_index('id')['Calories']
lgb_calories = lgb.set_index('id')['Calories']

blended = (
    CAT_WEIGHT * cat_calories
    + XGB_WEIGHT * xgb_calories
    + LGB_WEIGHT * lgb_calories
)

# .copy() gives `out` its own data, so adding a column does not write into a
# slice of `cat` (which triggers pandas' SettingWithCopyWarning).
out = cat[['id']].copy()
# Emit rows in the CatBoost file's id order; ids missing from any model become NaN.
out['Calories'] = blended.reindex(out['id']).to_numpy()
out.to_csv('submission.csv', index=False)