In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file below the Kaggle input directory,
# then print the paths one per line.
input_files = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
]
for input_file in input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### **Data Loading & Description**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split,cross_val_score,cross_val_predict, GridSearchCV,RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier,VotingClassifier, AdaBoostClassifier,GradientBoostingClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score,confusion_matrix, precision_score, recall_score, f1_score


In [None]:
df= pd.read_csv('/kaggle/input/recipe-for-rating-predict-food-ratings-using-ml/train.csv')


In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.info()


We can notice two key features from this data description:

1) It seems that only **Recipe_Review** column has null values.

2) Most of the columns are **numerical**, although some of the features are ordinal.

Let's explore further to understand the data in a better way.

### **Descriptive Statistics and Exploratory Data Analysis**

In [None]:
df.describe()

**Rating** is the score on a 1 to 5 scale that the user gave to the recipe **(Target Variable)**. The goal of our project is to predict a rating from 1 to 5 based on the features, hence making it a multi-class classification problem.

In [None]:
df['Rating'].value_counts()

We can see that the majority of the ratings given by users are 5, accounting for over 70% of all ratings.

In [None]:
df['RecipeName'].value_counts()

In [None]:
df['UserReputation'].value_counts()

We can notice that **UserReputation** column value= 1 has the maximum count.

In [None]:
df['UserName'].value_counts()

In [None]:
numerical_vars = ['RecipeNumber','UserReputation','ReplyCount','ThumbsUpCount','ThumbsDownCount','Rating','BestScore']

In [None]:
df['ReplyCount'].value_counts()

In [None]:
corr_matrix= df[numerical_vars].corr()

corr_matrix['Rating'].sort_values(ascending=False)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Annotated heatmap of the pairwise correlations stored in corr_matrix.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Matrix Heatmap')
plt.show()

Key Observations:

1) **Rating** column has positive correlation with **UserReputation** and negative correlation with other columns **RecipeNumber, ReplyCount,ThumbsUpCount,BestScore** although the association is very small.

2) We can see some hints of multicollinearity between **BestScore and ThumbsUpCount.**



#### **Univariate and Bivariate Analysis**

In [None]:
# Columns whose value distributions are drawn as bar charts by the plotting loop.
# NOTE(review): this name shadows Python's built-in vars(); renaming it also
# requires updating the loop that iterates over it.
vars = ['ReplyCount','ThumbsUpCount','ThumbsDownCount','Rating']

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Values that occur fewer than this many times are left out of the bar charts
# so that rare values do not clutter the x-axis.
min_frequency = 10

for feature in vars:
    value_counts = df[feature].value_counts(dropna=True)
    filtered_counts = value_counts[value_counts >= min_frequency]

    # Nothing to draw when no value reaches the frequency threshold.
    if filtered_counts.empty:
        continue

    # One small figure per feature, drawn on an explicit Axes.
    fig, ax = plt.subplots(figsize=(4, 4))
    filtered_counts.plot(kind='bar', color='skyblue', ax=ax)

    # Style the tick labels AFTER plotting: pandas applies its own label
    # rotation when it draws a bar chart, which overrides settings made earlier.
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

    ax.set_xlabel(feature)
    ax.set_ylabel('Count')
    ax.set_title(f'Distribution of {feature}')
    fig.tight_layout()
    plt.show()

We can see that the data is highly imbalanced, with 10,000+ cases for a rating of 5 and far fewer for the other ratings.

In [None]:
# Kernel density estimate of the Rating column.
rating_density_ax = df['Rating'].plot(kind='kde', figsize=(4, 4))
plt.show()

**Hypothesis Testing**

In [None]:

# Question: is Rating associated with ThumbsUpCount?
# Null hypothesis: Rating has no association with ThumbsUpCount.

# Contingency table: one row per Rating value, one column per ThumbsUpCount value.
ThumbsUp_Rating_table = pd.crosstab(index=df['Rating'], columns=df['ThumbsUpCount'])
print(ThumbsUp_Rating_table)


In [None]:
# Cross-tabulate Rating against UserReputation under its own name, so that the
# Rating x ThumbsUpCount table stored in ThumbsUp_Rating_table is not overwritten
# before the chi-square test that reads it.
UserReputation_Rating_table = pd.crosstab(df['Rating'], df['UserReputation'])
UserReputation_Rating_table

We can notice that dishes with higher thumbs up count tend to have higher ratings too. We can perform chi-square test to check its statistical significance.

In [None]:
import pandas as pd
from scipy.stats import chi2_contingency

# Chi-square test of independence on the contingency table held in ThumbsUp_Rating_table.
chi2, pval, degrees_of_freedom, expected_counts = chi2_contingency(ThumbsUp_Rating_table.values)

# Report the statistic and its p-value.
print(f"Chi-square statistic: {chi2:.2f}")
print(f"p-value: {pval:.4f}")

# Decide at the 5% significance level.
significant = pval < 0.05
print(
    "Reject null hypothesis: There is a statistically significant association between Rating and ThumbsUpCount."
    if significant
    else "Fail to reject null hypothesis: There is not enough evidence to conclude a statistically significant association between Rating and ThumbsUpCount."
)



Let's check the associations between the other variables and the Rating column too.

In [None]:
# Question: is Rating associated with ThumbsDownCount?
# Null hypothesis: Rating has no association with ThumbsDownCount.

# Contingency table: one row per Rating value, one column per ThumbsDownCount value.
ThumbsDown_Rating_table = pd.crosstab(index=df['Rating'], columns=df['ThumbsDownCount'])
print(ThumbsDown_Rating_table)

# Chi-square test of independence on that table.
chi2, pval, degrees_of_freedom, expected_counts = chi2_contingency(ThumbsDown_Rating_table.values)

print(f"Chi-square statistic: {chi2:.2f}")
print(f"p-value: {pval:.4f}")

# Decide at the 5% significance level.
significant = pval < 0.05
print(
    "Reject null hypothesis: There is a statistically significant association between Rating and ThumbsDownCount."
    if significant
    else "Fail to reject null hypothesis: There is not enough evidence to conclude a statistically significant association between Rating and ThumbsDownCount."
)

In [None]:


# Question: is Rating associated with ReplyCount?
# Null hypothesis: Rating has no association with ReplyCount.

# Contingency table: one row per Rating value, one column per ReplyCount value.
ReplyCount_Rating_table = pd.crosstab(index=df['Rating'], columns=df['ReplyCount'])
print(ReplyCount_Rating_table)

# Chi-square test of independence on that table.
chi2, pval, degrees_of_freedom, expected_counts = chi2_contingency(ReplyCount_Rating_table.values)

print(f"Chi-square statistic: {chi2:.2f}")
print(f"p-value: {pval:.4f}")

# Decide at the 5% significance level.
significant = pval < 0.05
print(
    "Reject null hypothesis: There is a statistically significant association between Rating and ReplyCount."
    if significant
    else "Fail to reject null hypothesis: There is not enough evidence to conclude a statistically significant association between Rating and ReplyCount."
)

In [None]:


# Chi-square test of independence on ThumbsDown_Rating_table.
# NOTE(review): the cell that builds ThumbsDown_Rating_table already runs this
# exact test and prints the same messages - confirm whether this repeat is intended.
chi2, pval, degrees_of_freedom, expected_counts = chi2_contingency(ThumbsDown_Rating_table.values)

print(f"Chi-square statistic: {chi2:.2f}")
print(f"p-value: {pval:.4f}")

# Decide at the 5% significance level.
significant = pval < 0.05
print(
    "Reject null hypothesis: There is a statistically significant association between Rating and ThumbsDownCount."
    if significant
    else "Fail to reject null hypothesis: There is not enough evidence to conclude a statistically significant association between Rating and ThumbsDownCount."
)

### **Feature Engineering**

Data Cleaning

Dropping Features

Handling Categorical Data

Feature Scaling


### **Data Cleaning**

In [None]:
df.isnull().sum()

We can see that **Recipe_Review** has 2 null values.

In [None]:
df.dropna(inplace=True)

In [None]:
df.shape

**Dropping Redundant Columns**

In [None]:
df.info()

In [None]:
# Identifier, user and timestamp columns that are removed before modelling.
redundant_columns = ['ID','RecipeNumber','RecipeCode','CommentID','UserID','UserName','CreationTimestamp']
df = df.drop(columns=redundant_columns)

In [None]:
df.info()

#### **Feature Scaling, Handling Text Data and Applying Column Transformer**

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
# Shared preprocessing: standardise the numeric columns and TF-IDF encode
# each of the two text columns with its own vectoriser.
numeric_columns = ["UserReputation", "ReplyCount", "ThumbsUpCount", "ThumbsDownCount", "BestScore"]

ct = ColumnTransformer(
    transformers=[
        ("Scaler", StandardScaler(), numeric_columns),
        ("vectoriser", TfidfVectorizer(), "RecipeName"),
        ("vectoriser2", TfidfVectorizer(), "Recipe_Review"),
    ]
)

**Train Val Splitting**

In [None]:
y=df[['Rating']]
y

In [None]:
X=  df.drop(columns='Rating')
X

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation, stratified on the target so both
# parts keep the same rating proportions; the seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = split_parts

In [None]:
y_train.value_counts()

In [None]:
y_train

In [None]:
y_train = y_train.values.ravel()

#### **Base Model: Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
lr_clf=LogisticRegression(random_state=42,max_iter=1000)

In [None]:
# Baseline model: shared preprocessing followed by logistic regression.
ml_pipeline1 = Pipeline(steps=[
    ("preprocessing", ct),
    ("lr", lr_clf),
])

In [None]:
print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)

In [None]:
ml_pipeline1.fit(X_train,y_train)

In [None]:
ml_pipeline1.score(X_val,y_val)

**Error Analysis using cross validation**

In [None]:
from sklearn.model_selection import cross_val_score,cross_val_predict,RepeatedStratifiedKFold
from sklearn.metrics import confusion_matrix


In [None]:
y_train_LR_pred= cross_val_predict(ml_pipeline1,X_train,y_train,cv=3)

In [None]:
conf_mx_lr=confusion_matrix(y_train,y_train_LR_pred)

In [None]:
confusion_matrix(y_train,y_train_LR_pred)

**Model Evaluation on Val Set**

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
y_val_predictLR=ml_pipeline1.predict(X_val)

In [None]:
LR_val_acc=accuracy_score(y_val,y_val_predictLR)
LR_val_acc

**HyperParameter Tuning**

In [None]:
from sklearn.model_selection import GridSearchCV


In [None]:
param_grid_lr = {
     'lr__C': [0.01, 0.1, 1],
    'lr__solver': ['lbfgs', 'newton-cg','liblinear'],
}

In [None]:
grid_search_lr=GridSearchCV(estimator=ml_pipeline1,param_grid=param_grid_lr,cv=5,scoring="accuracy")

In [None]:
grid_search_lr.fit(X_train,y_train)

In [None]:
best_params_lr=grid_search_lr.best_params_
best_params_lr

In [None]:
grid_search_lr.best_estimator_

In [None]:
# Print the mean cross-validated accuracy next to each parameter combination tried.
cvres = grid_search_lr.cv_results_
mean_scores, param_sets = cvres["mean_test_score"], cvres["params"]
for accuracy, params in zip(mean_scores, param_sets):
    print(accuracy, params)

In [None]:
lr_clf_tuned=LogisticRegression(random_state=42,C=1,solver='lbfgs',penalty='l2',max_iter=1000)

In [None]:
# Shared preprocessing followed by the logistic regression built with the chosen hyperparameters.
lr_pipeline_tuned = Pipeline(steps=[
    ("preprocessing", ct),
    ("lr", lr_clf_tuned),
])

In [None]:
lr_pipeline_tuned.fit(X_train,y_train)

In [None]:
from sklearn.metrics import f1_score

In [None]:
y_val_predict_lr=lr_pipeline_tuned.predict(X_val)

In [None]:
LR_acc_score_val=accuracy_score(y_val,y_val_predict_lr)
LR_acc_score_val

In [None]:
LR_f1_score_val=f1_score(y_val,y_val_predict_lr,average='weighted')
LR_f1_score_val

In [None]:
conf_matrix_lr=confusion_matrix(y_val,y_val_predict_lr)
conf_matrix_lr

**Applying Ridge Classifier**

In [None]:
from sklearn.linear_model import RidgeClassifierCV

In [None]:
# Ridge classifier that picks its regularisation strength from the listed
# alphas using 3-fold cross-validation, behind the shared preprocessing.
ridge_clf = RidgeClassifierCV(alphas=[0.1, 1.0, 10.0], cv=3)
ridge_pipeline = Pipeline(steps=[
    ("preprocessing", ct),
    ("ridge_clf", ridge_clf),
])

In [None]:
ridge_pipeline.fit(X_train, y_train)

In [None]:
y_pred_ridge = ridge_pipeline.predict(X_val)

In [None]:
accuracy_ridge = accuracy_score(y_val, y_pred_ridge)
print("Ridge Classifier Accuracy:", accuracy_ridge)

In [None]:
conf_matrix_ridge = confusion_matrix(y_val, y_pred_ridge)
print("Confusion Matrix:")
print(conf_matrix_ridge)

**Applying KNN**

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knn = KNeighborsClassifier()

In [None]:
# Shared preprocessing followed by the k-nearest-neighbours classifier bound to `knn`.
knn_pipeline = Pipeline(steps=[
    ("preprocessing", ct),
    ("knn", knn),
])

In [None]:
# param_grid_knn = {'knn__n_neighbors': [5,10,14],'knn__metric': ['euclidean', 'manhattan']}

In [None]:
#param_grid_knn = {'n_neighbors': range(1,20)}

In [None]:
# grid_search_knn = GridSearchCV(estimator=knn_pipeline, param_grid=param_grid_knn, cv=3,scoring="accuracy")

In [None]:
# grid_search_knn.fit(X_train, y_train)

In [None]:
knn = KNeighborsClassifier(n_neighbors=14,weights='distance',metric='manhattan')

In [None]:
# Rebuild the KNN pipeline so that it wraps the `knn` classifier defined just before this cell.
knn_pipeline = Pipeline(steps=[
    ("preprocessing", ct),
    ("knn", knn),
])

In [None]:
knn_pipeline.fit(X_train,y_train)

In [None]:
acc_scores_knn_tuned = cross_val_score(knn_pipeline, X_train, y_train, cv=3, scoring='accuracy')
acc_scores_knn_tuned

In [None]:
y_val_predict_knn=knn_pipeline.predict(X_val)

In [None]:
acc_score_knn_val=accuracy_score(y_val,y_val_predict_knn)
acc_score_knn_val

**Applying SVM Models**

In [None]:
from sklearn.svm import SVC

In [None]:
svm = SVC()

In [None]:
# Shared preprocessing followed by the support-vector classifier bound to `svm`.
svm_pipeline = Pipeline(steps=[
    ("preprocessing", ct),
    ("svm", svm),
])

In [None]:
param_grid_svm = {'svm__C': [0.01,0.1,1]}  #best value for C= 1

In [None]:
grid_search_svm=GridSearchCV(estimator=svm_pipeline, param_grid=param_grid_svm, cv=3,scoring="accuracy")

`

In [None]:
grid_search_svm.fit(X_train,y_train)

In [None]:
grid_search_svm.best_params_

In [None]:
#svm_rbf= SVC(kernel='rbf',C= 1, coef0= 0,gamma=0.1) #f1=0.7705718097220183 mean cv score best score svm_rb

In [None]:
svm= SVC(kernel='linear',C= 1, probability=True)

In [None]:
# Rebuild the SVM pipeline so that it wraps the `svm` classifier defined just before this cell.
svm_pipeline = Pipeline(steps=[
    ("preprocessing", ct),
    ("svm", svm),
])

**Check for other metrics**

In [None]:
acc_scores_svm_tuned = cross_val_score(svm_pipeline, X_train, y_train, cv=3, scoring='accuracy')
acc_scores_svm_tuned

In [None]:
import sklearn

In [None]:
svm_pipeline.fit(X_train,y_train)


In [None]:
y_val_predict_svm=svm_pipeline.predict(X_val)

In [None]:
acc_svm_val=accuracy_score(y_val,y_val_predict_svm)
acc_svm_val

In [None]:
conf_mat_svm_val=confusion_matrix(y_val,y_val_predict_svm)
conf_mat_svm_val

**Applying CART**

In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

In [None]:
cart_model = DecisionTreeClassifier()

In [None]:
# Shared preprocessing followed by the decision tree bound to `cart_model`.
cart_pipeline = Pipeline(steps=[
    ("preprocessing", ct),
    ("cart_model", cart_model),
])

In [None]:
cart_pipeline.fit(X_train, y_train)

In [None]:
# param_grid_cart= {
#     'cart_model__max_depth': [7,5],
#     'cart_model__min_samples_split': [5,10,8],
#     'cart_model__min_samples_leaf': [10,9,8]}

In [None]:
# grid_search_cart = GridSearchCV(estimator=cart_pipeline, cv=3,param_grid=param_grid_cart,scoring="f1_weighted")

In [None]:
# grid_search_cart.fit(X_train,y_train)

In [None]:
# grid_search_cart.best_params_

In [None]:
cart_model = DecisionTreeClassifier(criterion= "entropy", max_depth= 7,min_samples_leaf =9,min_samples_split= 5)

In [None]:
# Rebuild the CART pipeline so that it wraps the `cart_model` defined just before this cell.
cart_pipeline = Pipeline(steps=[
    ("preprocessing", ct),
    ("cart_model", cart_model),
])

In [None]:
cart_pipeline.fit(X_train,y_train)

In [None]:
y_val_predict_cart=cart_pipeline.predict(X_val)

In [None]:
acc_score_cart_val=accuracy_score(y_val,y_val_predict_cart)
acc_score_cart_val

**Bagging and Boosting**

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf_clf = RandomForestClassifier()


In [None]:
# Shared preprocessing followed by the random forest bound to `rf_clf`.
rf_pipeline = Pipeline(steps=[
    ("preprocessing", ct),
    ("rf_clf", rf_clf),
])

In [None]:
# param_grid_rf={'rf_clf__max_depth': [3,5,7],
# 'rf_clf__min_samples_split': [6,10,14 ],
# "rf_clf__n_estimators": [15,25,50,100],
#     'rf_clf__min_samples_leaf': [6,8,10]}

In [None]:
# grid_search_rf= GridSearchCV(estimator=rf_pipeline, param_grid=param_grid_rf, cv=3, scoring="accuracy")


In [None]:
# grid_search_rf.fit(X_train, y_train)

In [None]:
# grid_search_rf.best_params_

In [None]:
rf_clf = RandomForestClassifier(n_estimators=15,max_depth=7,min_samples_split=6,min_samples_leaf=8,random_state=0)

In [None]:
# Rebuild the random-forest pipeline so that it wraps the `rf_clf` defined just before this cell.
rf_pipeline = Pipeline(steps=[
    ("preprocessing", ct),
    ("rf_clf", rf_clf),
])

In [None]:
rf_pipeline.fit(X_train,y_train)

In [None]:
y_val_pred_rf= rf_pipeline.predict(X_val)

In [None]:
acc_random_forest=accuracy_score(y_val,y_val_pred_rf)
acc_random_forest

In [None]:
cm_fr=confusion_matrix(y_val,y_val_pred_rf)
cm_fr

In [None]:
'''feature_importances = rf_pipeline.feature_importances_
feature_importances'''

In [None]:
'''import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
plt.barh(X_train.columns, feature_importances)  # Assuming X_train has feature names
plt.xlabel("Feature Importance")
plt.ylabel("Feature")
plt.title("Feature Importance in Random Forest Model")
plt.gca().invert_yaxis()  # Optional: Invert y-axis for readability
plt.show()'''

**XG Boost**


In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

In [None]:
xgb_model=xgb.XGBRFClassifier(num_class=5)

In [None]:
# Shared preprocessing followed by the XGBoost model bound to `xgb_model`.
xgb_pipeline = Pipeline(steps=[
    ("preprocessing", ct),
    ("xgb_model", xgb_model),
])

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the rating labels as integer codes for the XGBoost pipeline.
le = LabelEncoder()

# Learn the label -> code mapping from the training labels only.
y_train_encoded = le.fit_transform(y_train)

# Apply that SAME mapping to the validation labels. Refitting the encoder here
# could assign different codes if the validation labels differ from the training
# ones, which would make the encoded targets incomparable with the predictions.
# y_val is a single-column frame, so it is flattened to 1-D first.
y_val_encoded = le.transform(y_val.values.ravel())

In [None]:
# param_grid_xgb = {

#     'xgb_pipeline__learning_rate': [0.05, 0.1,1],
#      'xgb_pipeline__max_depth': [3,6,9],
#     'xgb_pipeline__n_estimators': [50, 100, 200],
#     'xgb_pipeline__subsample': [0.6, 0.8,1.0],
#     'xgb_pipeline__colsample_bytree': [0.6, 0.8, 1.0]
# }

In [None]:
# grid_search_xgb= GridSearchCV(
#     estimator=xgb_pipeline,
#     param_grid=param_grid_xgb,
#     cv=3,
#     scoring='accuracy',
#     verbose=2,
#     n_jobs=-1
# )

In [None]:
#grid_search_xgb.fit(X_train,y_train_encoded)

In [None]:
# best_params_xgb = grid_search_xgb.best_params_
# best_params_xgb

In [None]:
# best_model_xgb = grid_search_xgb.best_estimator_
# best_model_xgb

In [None]:
# Settings for the XGBoost random-forest classifier, gathered in one place
# and unpacked into the constructor.
xgb_params = dict(
    max_depth=3,
    num_class=5,
    learning_rate=0.01,
    n_estimators=50,
    subsample=0.5,
    random_state=42,
    colsample_bytree=1.0,
    min_child_weight=8,
    multi_strategy="one_output_per_tree",
    n_jobs=0,
)
xgb_model = xgb.XGBRFClassifier(**xgb_params)

In [None]:
# Rebuild the XGBoost pipeline so that it wraps the `xgb_model` defined just before this cell.
xgb_pipeline = Pipeline(steps=[
    ("preprocessing", ct),
    ("xgb_model", xgb_model),
])

In [None]:
xgb_pipeline.fit(X_train,y_train_encoded)

In [None]:
y_val_xgb_pred =xgb_pipeline.predict(X_val)

In [None]:
accuracy_xgb = accuracy_score(y_val_encoded, y_val_xgb_pred)
print("Accuracy:", accuracy_xgb)

In [None]:
conf_matrix_xgb = confusion_matrix(y_val_encoded, y_val_xgb_pred)
conf_matrix_xgb

In [None]:
from sklearn.ensemble import BaggingClassifier

# Logistic regression used as the base learner of the bagging ensemble.
lr= LogisticRegression(random_state=42,C=1,solver='lbfgs')

# Bagging ensemble of logistic regressions.
# The base learner is passed positionally because its keyword name differs
# between scikit-learn releases (`base_estimator` in older ones, `estimator`
# in newer ones); the first positional argument works with both.
lr_bagging_clf = BaggingClassifier(
    lr,
    n_estimators=20,  # Number of base estimators in the ensemble
    max_samples=0.5,  # Each base estimator is trained on half of the samples
    max_features=1.0,  # Feature size for each base estimator (1.0 means all features)
    bootstrap=True,  # Whether to sample with replacement
    random_state=42,  # Random seed for reproducibility
    n_jobs=-1,  # Number of CPU cores to use (-1 for all cores)
)

# Shared preprocessing followed by the bagging ensemble.
lr_bagging_pipeline= Pipeline(
    [
        ("preprocessing", ct),
        ("lr_bagging_clf", lr_bagging_clf),
    ]
)

# Train on the training split and score on the validation split.
lr_bagging_pipeline.fit(X_train, y_train)
y_val_pred_lr_bag = lr_bagging_pipeline.predict(X_val)

accuracy_lr_baggging= accuracy_score(y_val, y_val_pred_lr_bag)
print("Accuracy_lr_baggging:", accuracy_lr_baggging)


In [None]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Members of the ensemble: the KNN, tuned logistic-regression and SVM pipelines.
voting_members = [
    ('knn', knn_pipeline),
    ('lr', lr_pipeline_tuned),
    ('svm', svm_pipeline),
]

# 'hard' voting: the predicted class is the one most members vote for.
voting_classifier = VotingClassifier(estimators=voting_members, voting='hard')

# Train on the training split, then predict the validation split.
voting_classifier.fit(X_train, y_train)
y_pred_voting = voting_classifier.predict(X_val)

# Validation accuracy of the ensemble.
accuracy_voting = accuracy_score(y_val, y_pred_voting)
print("Voting Classifier Accuracy:", accuracy_voting)


In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with 100 estimators behind the shared preprocessing.
ada_boost_clf = AdaBoostClassifier(n_estimators=100, learning_rate=1.0, random_state=42)
ada_boost_pipeline = Pipeline(steps=[
    ("preprocessing", ct),
    ("ada_boost_clf", ada_boost_clf),
])

# Train on the training split, then predict the validation split.
ada_boost_pipeline.fit(X_train, y_train)
y_pred_ada_boost = ada_boost_pipeline.predict(X_val)

# Validation accuracy.
accuracy_ada_boost = accuracy_score(y_val, y_pred_ada_boost)
print("AdaBoost Classifier Accuracy:", accuracy_ada_boost)


In [None]:

# Candidate hyperparameters for a gradient-boosting grid search.
# 'auto' is not listed for max_features: newer scikit-learn releases reject it
# for gradient boosting, and where it was accepted it selected the same number
# of features as 'sqrt', so no candidate is lost.
# NOTE(review): the keys are bare estimator parameter names; to search over a
# Pipeline they would need the step-name prefix (e.g. "<step>__n_estimators").
param_grid_gb = {
    'n_estimators': [100, 150, 200],
    'learning_rate': [0.01, 0.1, 0.5],
    'max_depth': [5, 7],
    'min_samples_split': [8, 10, 12],
    'min_samples_leaf': [6, 8, 12],
    'subsample': [0.8, 0.9, 1.0],
    'max_features': ['sqrt', 'log2'],
    'random_state': [42],
}

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosting settings, gathered in one place and unpacked into the constructor.
gb_settings = dict(
    n_estimators=150,
    learning_rate=0.1,
    max_depth=7,
    min_samples_split=12,
    min_samples_leaf=10,
    subsample=0.8,
    random_state=42,
)
gradient_boost_clf = GradientBoostingClassifier(**gb_settings)

# Shared preprocessing followed by the gradient-boosting classifier.
gradient_boost_pipeline = Pipeline(steps=[
    ("preprocessing", ct),
    ("gradient_boost_clf", gradient_boost_clf),
])






In [None]:
# Train the gradient-boosting pipeline (fit returns the pipeline itself, so the
# validation prediction is chained onto it), then report validation accuracy.
y_pred_gradient_boost = gradient_boost_pipeline.fit(X_train, y_train).predict(X_val)

accuracy_gradient_boost = accuracy_score(y_val, y_pred_gradient_boost)
print("Gradient Boosting Classifier Accuracy:", accuracy_gradient_boost)

In [None]:

conf_matrix_gradient_boost = confusion_matrix(y_val, y_pred_gradient_boost)
conf_matrix_gradient_boost

Looking at the diagonals of both confusion matrices, which hold the number of correct predictions for each rating, we can observe:

1) Gradient boosting is generalising better for all the ratings, not just the majority class.

**Stacking technique**

In [None]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier

# Base learners of the stack: the SVM, ridge, tuned logistic-regression,
# KNN and random-forest pipelines.
estimators = [
    ('svm', svm_pipeline),
    ('ridge', ridge_pipeline),
    ('lr', lr_pipeline_tuned),
    ('knn', knn_pipeline),
    ('rf', rf_pipeline),
]

# A logistic regression combines the base learners' outputs; cv=3 sets the
# number of folds used to produce the inputs it is trained on.
meta_learner = LogisticRegression(random_state=42, max_iter=1000)
stack_clf = StackingClassifier(estimators=estimators, final_estimator=meta_learner, cv=3)

# Train on the training split, then predict the validation split.
stack_clf.fit(X_train, y_train)
y_pred_val_stack = stack_clf.predict(X_val)

# Validation accuracy of the stack.
accuracy_stack = accuracy_score(y_val, y_pred_val_stack)
print("Accuracy score for stack_clf is:", accuracy_stack)

In [None]:
# The stacking ensemble is already built, fitted and used to predict the
# validation set by the preceding cell, which this cell repeated line for line.
# Refitting retrains every base pipeline again (with cv=3), so the existing
# validation predictions are reused here instead of repeating that work.
accuracy_stack = accuracy_score(y_val, y_pred_val_stack)
print("Accuracy score for stack_clf is:", accuracy_stack)

In [None]:
# from sklearn.svm import LinearSVC
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import StackingClassifier

# estimators = [
#     ('svm',svm_pipeline),
#     ('gb',gradient_boost_pipeline),
# ]
# stack_clf = StackingClassifier(
#     estimators=estimators, final_estimator=LogisticRegression(random_state=42)
# )

# stack_clf.fit(X_train, y_train).score(X_val, y_val)

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Validation metrics (weighted averages) for the tuned logistic regression.
accuracy_lr = accuracy_score(y_val, y_val_predict_lr)
precision_lr, recall_lr, f1_lr = (
    metric(y_val, y_val_predict_lr, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

# The same metrics for the stacking classifier.
accuracy_stack = accuracy_score(y_val, y_pred_val_stack)
precision_stack, recall_stack, f1_stack = (
    metric(y_val, y_pred_val_stack, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

models = ['lr_pipeline_tuned', 'stack_clf']

# Scores per metric, ordered like `models`.
accuracy_scores = [accuracy_lr, accuracy_stack]
precision_scores = [precision_lr, precision_stack]
recall_scores = [recall_lr, recall_stack]
f1_scores = [f1_lr, f1_stack]

# One bar chart per metric on a 2x2 grid, each on a 0-1 scale.
fig, ax = plt.subplots(2, 2, figsize=(8,8))
panels = [
    (ax[0, 0], accuracy_scores, 'skyblue', 'Accuracy Comparison'),
    (ax[0, 1], precision_scores, 'salmon', 'Precision Comparison'),
    (ax[1, 0], recall_scores, 'lightgreen', 'Recall Comparison'),
    (ax[1, 1], f1_scores, 'gold', 'F1 Score Comparison'),
]
for panel, scores, colour, title in panels:
    panel.bar(models, scores, color=colour)
    panel.set_title(title)
    panel.set_ylim([0, 1])

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Confusion matrix of the stacking classifier on the validation set, so the
# right-hand panel shows the model its title names (the gradient-boosting
# matrix was being plotted under the stacking title).
conf_matrix_stack = confusion_matrix(y_val, y_pred_val_stack)

# Side-by-side heatmaps: tuned logistic regression (left), stacking classifier (right).
fig, (ax_lr, ax_stack) = plt.subplots(1, 2, figsize=(10, 6))

sns.heatmap(conf_matrix_lr, annot=True, cmap='Blues', fmt='g', ax=ax_lr)
ax_lr.set_title('Logistic Regression Confusion Matrix')

sns.heatmap(conf_matrix_stack, annot=True, cmap='Greens', fmt='g', ax=ax_stack)
ax_stack.set_title('Stacking Classifier Confusion Matrix')

fig.tight_layout()
plt.show()


**submission for test set**

In [None]:
df_test=pd.read_csv("/kaggle/input/recipe-for-rating-predict-food-ratings-using-ml/test.csv")

In [None]:
# Remove the same identifier, user and timestamp columns from the test set
# that were removed from the training data.
redundant_test_columns = ['ID','RecipeNumber','RecipeCode','CommentID','UserID','UserName','CreationTimestamp']
df_test = df_test.drop(columns=redundant_test_columns)

In [None]:
X_test=df_test.copy()

**Sample Data**

In [None]:
sample=pd.read_csv(r'/kaggle/input/recipe-for-rating-predict-food-ratings-using-ml/sample.csv')

In [None]:
sample.head()

In [None]:
X_test.shape

**Logistic Regression test**


In [None]:
#y_test_predict_lr=ml_pipeline1.predict(X_test)

In [None]:
# submission = pd.DataFrame({
#                  "ID" : range(1,4547),
#                  "Rating" : ml_pipeline1.predict(X_test)
# })

**Stacking test**

In [None]:
y_test_predict_stack=stack_clf.predict(X_test)

In [None]:
# Build the submission from the predictions already stored in
# y_test_predict_stack; calling predict again would rerun every base model.
# IDs run from 1 to the number of predictions, so the frame always matches
# the size of the test set instead of relying on a hard-coded row count.
submission = pd.DataFrame({
    "ID": range(1, len(y_test_predict_stack) + 1),
    "Rating": y_test_predict_stack,
})

In [None]:
submission.head()

In [None]:
submission.to_csv("submission.csv", index=False)