In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import Libraries

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from textblob import TextBlob
import warnings
warnings.filterwarnings('ignore')

# Load the dataset

In [8]:
data = pd.read_csv('/kaggle/input/beer-rating-reviews/train.csv')

# Initial data cleaning

In [9]:
data['review/text'] = data['review/text'].fillna("No review provided")
data['user/gender'] = data['user/gender'].fillna('Unknown')
data = data.drop(['user/ageInSeconds', 'user/birthdayRaw', 'user/birthdayUnix'], axis=1)
data = data.dropna(subset=['user/profileName'])

# Feature Engineering

In [10]:
def create_advanced_features(data):
    features = data.copy()
    
    # 1. Rating-based Features
    rating_cols = ['review/appearance', 'review/aroma', 'review/palate', 'review/taste']
    features['rating_std'] = features[rating_cols].std(axis=1)
    features['rating_range'] = features[rating_cols].max(axis=1) - features[rating_cols].min(axis=1)
    
    # Weighted Ratings
    weights = {
        'review/taste': 0.4,
        'review/aroma': 0.3,
        'review/palate': 0.2,
        'review/appearance': 0.1
    }
    features['weighted_rating'] = sum(features[col] * weight for col, weight in weights.items())
    
    # Rating Ratios
    features['taste_aroma_ratio'] = features['review/taste'] / features['review/aroma']
    features['palate_appearance_ratio'] = features['review/palate'] / features['review/appearance']
    
    # 2. Style and ABV Features
    style_means = features.groupby('beer/style')['review/overall'].mean()
    features['style_avg_rating'] = features['beer/style'].map(style_means)
    features['abv_category'] = pd.qcut(features['beer/ABV'], q=5, labels=['very_low', 'low', 'medium', 'high', 'very_high'])
    features['abv_squared'] = features['beer/ABV'] ** 2
    
    # 3. Time Features
    features['review_hour'] = pd.to_datetime(features['review/timeUnix'], unit='s').dt.hour
    features['review_day'] = pd.to_datetime(features['review/timeUnix'], unit='s').dt.dayofweek
    features['is_weekend'] = features['review_day'].isin([5, 6]).astype(int)
    
    # 4. Text Features
    features['text_length'] = features['review/text'].str.len()
    features['word_count'] = features['review/text'].str.split().str.len()
    
    # Sentiment Analysis
    def get_sentiment(text):
        return TextBlob(str(text)).sentiment.polarity
    features['review_sentiment'] = features['review/text'].apply(get_sentiment)
    
    # Keyword Features
    positive_keywords = ['excellent', 'amazing', 'perfect', 'fantastic', 'great']
    negative_keywords = ['poor', 'bad', 'disappointing', 'awful', 'mediocre']
    
    def count_keywords(text, keyword_list):
        return sum(word.lower() in str(text).lower() for word in keyword_list)
    
    features['positive_keyword_count'] = features['review/text'].apply(lambda x: count_keywords(x, positive_keywords))
    features['negative_keyword_count'] = features['review/text'].apply(lambda x: count_keywords(x, negative_keywords))
    
    # 5. User Features
    user_review_counts = features.groupby('user/profileName').size()
    user_avg_ratings = features.groupby('user/profileName')['review/overall'].mean()
    
    features['user_review_count'] = features['user/profileName'].map(user_review_counts)
    features['user_avg_rating'] = features['user/profileName'].map(user_avg_ratings)
    features['user_experience'] = pd.qcut(features['user_review_count'], q=5, labels=['novice', 'casual', 'regular', 'experienced', 'expert'])
    
    # 6. Brewery Features
    brewery_reviews = features.groupby('beer/brewerId').size()
    brewery_avg_ratings = features.groupby('beer/brewerId')['review/overall'].mean()
    
    features['brewery_review_count'] = features['beer/brewerId'].map(brewery_reviews)
    features['brewery_avg_rating'] = features['beer/brewerId'].map(brewery_avg_ratings)
    
    return features

# Create features
print("Creating features...")
features = create_advanced_features(data)


Creating features...


# Prepare for modeling

In [11]:
print("Preparing for modeling...")
# Convert categorical variables
le = LabelEncoder()
categorical_cols = ['abv_category', 'user_experience', 'beer/style']
for col in categorical_cols:
    features[col] = le.fit_transform(features[col])

# Select features
selected_features = [
    'weighted_rating', 'style_avg_rating', 'beer/ABV', 'abv_squared',
    'text_length', 'word_count', 'review_sentiment',
    'user_review_count', 'user_avg_rating', 'brewery_avg_rating',
    'positive_keyword_count', 'negative_keyword_count',
    'rating_std', 'brewery_review_count', 'is_weekend',
    'abv_category', 'user_experience', 'beer/style',
    'review/taste', 'review/aroma', 'review/appearance', 'review/palate'
]

X = features[selected_features]
y = features['review/overall']


Preparing for modeling...


# Train-test split & Scale features

In [12]:
print("Splitting data...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Scaling features...")
scaler = StandardScaler()
numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

Splitting data...
Scaling features...


# LightGBM & Xgboost Training

In [13]:
from sklearn.model_selection import cross_val_score
import lightgbm as lgb
import xgboost as xgb

# Train LightGBM Model
print("\nTraining LightGBM model...")
lgb_model = lgb.LGBMRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=-1,
    random_state=42
)
lgb_model.fit(X_train, y_train)

# Perform Cross-Validation for LightGBM
print("\nPerforming cross-validation for LightGBM...")
lgb_cv_scores = cross_val_score(lgb_model, X, y, cv=5, scoring='r2')
print(f"Cross-validation R2 scores (LightGBM): {lgb_cv_scores}")
print(f"Mean CV R2 (LightGBM): {lgb_cv_scores.mean():.4f} (+/- {lgb_cv_scores.std() * 2:.4f})")

# Train XGBoost Model
print("\nTraining XGBoost model...")
xgb_model = xgb.XGBRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    objective='reg:squarederror',
    random_state=42
)
xgb_model.fit(X_train, y_train)

# Perform Cross-Validation for XGBoost
print("\nPerforming cross-validation for XGBoost...")
xgb_cv_scores = cross_val_score(xgb_model, X, y, cv=5, scoring='r2')
print(f"Cross-validation R2 scores (XGBoost): {xgb_cv_scores}")
print(f"Mean CV R2 (XGBoost): {xgb_cv_scores.mean():.4f} (+/- {xgb_cv_scores.std() * 2:.4f})")

# Model Evaluation Function
def evaluate_model(model, model_name):
    print(f"\nEvaluating {model_name}...")
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)
    
    metrics = {
        'Train R2': r2_score(y_train, train_pred),
        'Test R2': r2_score(y_test, test_pred),
        'Train RMSE': np.sqrt(mean_squared_error(y_train, train_pred)),
        'Test RMSE': np.sqrt(mean_squared_error(y_test, test_pred)),
        'Train MAE': mean_absolute_error(y_train, train_pred),
        'Test MAE': mean_absolute_error(y_test, test_pred)
    }
    
    print(f"\n{model_name} Performance Metrics:")
    for metric_name, value in metrics.items():
        print(f"{metric_name}: {value:.4f}")
    
    return metrics

# Evaluate both models
lgb_metrics = evaluate_model(lgb_model, "LightGBM")
xgb_metrics = evaluate_model(xgb_model, "XGBoost")



Training LightGBM model...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002794 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1909
[LightGBM] [Info] Number of data points in the train set: 29996, number of used features: 22
[LightGBM] [Info] Start training from score 3.887202

Performing cross-validation for LightGBM...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003197 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1890
[LightGBM] [Info] Number of data points in the train set: 29996, number of used features: 22
[LightGBM] [Info] Start training from score 3.889885
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005703 seconds.
You can set

# Feature Importance for LightGBM & XGboost

In [14]:
lgb_importance = pd.DataFrame({
    'feature': selected_features,
    'importance': lgb_model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features (LightGBM):")
print(lgb_importance.head(10))

xgb_importance = pd.DataFrame({
    'feature': selected_features,
    'importance': xgb_model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features (XGBoost):")
print(xgb_importance.head(10))


Top 10 Most Important Features (LightGBM):
              feature  importance
8     user_avg_rating         873
7   user_review_count         623
0     weighted_rating         540
6    review_sentiment         531
2            beer/ABV         452
5          word_count         402
4         text_length         395
12         rating_std         312
1    style_avg_rating         285
17         beer/style         245

Top 10 Most Important Features (XGBoost):
                   feature  importance
18            review/taste    0.683268
0          weighted_rating    0.143844
21           review/palate    0.043156
8          user_avg_rating    0.025953
2                 beer/ABV    0.013243
7        user_review_count    0.012845
10  positive_keyword_count    0.008100
1         style_avg_rating    0.005877
6         review_sentiment    0.005873
17              beer/style    0.005821


# Catboost Training

In [15]:
from catboost import CatBoostRegressor


# Train CatBoost Model
print("\nTraining CatBoost model...")
catboost_model = CatBoostRegressor(
    iterations=200,
    learning_rate=0.1,
    depth=6,
    loss_function='RMSE',
    random_seed=42,
    verbose=False
)
catboost_model.fit(X_train, y_train)

# Perform Cross-Validation for CatBoost
print("\nPerforming cross-validation for CatBoost...")
catboost_cv_scores = cross_val_score(catboost_model, X, y, cv=5, scoring='r2')
print(f"Cross-validation R2 scores (CatBoost): {catboost_cv_scores}")
print(f"Mean CV R2 (CatBoost): {catboost_cv_scores.mean():.4f} (+/- {catboost_cv_scores.std() * 2:.4f})")

# Evaluate CatBoost model using the existing evaluation function
catboost_metrics = evaluate_model(catboost_model, "CatBoost")



Training CatBoost model...

Performing cross-validation for CatBoost...
Cross-validation R2 scores (CatBoost): [0.73822085 0.75374321 0.74294789 0.73592595 0.73178606]
Mean CV R2 (CatBoost): 0.7405 (+/- 0.0151)

Evaluating CatBoost...

CatBoost Performance Metrics:
Train R2: 0.7584
Test R2: 0.7410
Train RMSE: 0.3449
Test RMSE: 0.3538
Train MAE: 0.2588
Test MAE: 0.2676


# Feature importance for CatBoost

In [16]:
catboost_importance = pd.DataFrame({
    'feature': selected_features,
    'importance': catboost_model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features (CatBoost):")
print(catboost_importance.head(10))


Top 10 Most Important Features (CatBoost):
              feature  importance
18       review/taste   29.231018
0     weighted_rating   25.792891
8     user_avg_rating   17.188756
21      review/palate    7.473979
7   user_review_count    6.144437
2            beer/ABV    2.189009
6    review_sentiment    1.609152
1    style_avg_rating    1.470656
3         abv_squared    1.451591
17         beer/style    1.256453


# Comparison of all three models

In [17]:
print("\nModel Comparison Summary:")
models_comparison = pd.DataFrame({
    'LightGBM': {
        'Test R2': lgb_metrics['Test R2'],
        'Test RMSE': lgb_metrics['Test RMSE'],
        'Test MAE': lgb_metrics['Test MAE'],
        'CV R2 Mean': lgb_cv_scores.mean()
    },
    'XGBoost': {
        'Test R2': xgb_metrics['Test R2'],
        'Test RMSE': xgb_metrics['Test RMSE'],
        'Test MAE': xgb_metrics['Test MAE'],
        'CV R2 Mean': xgb_cv_scores.mean()
    },
    'CatBoost': {
        'Test R2': catboost_metrics['Test R2'],
        'Test RMSE': catboost_metrics['Test RMSE'],
        'Test MAE': catboost_metrics['Test MAE'],
        'CV R2 Mean': catboost_cv_scores.mean()
    }
}).round(4)

print(models_comparison)


Model Comparison Summary:
            LightGBM  XGBoost  CatBoost
Test R2       0.7364   0.7385    0.7410
Test RMSE     0.3570   0.3555    0.3538
Test MAE      0.2698   0.2678    0.2676
CV R2 Mean    0.7378   0.7369    0.7405
