In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler, PowerTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

In [2]:
!pip install catboost


Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [10]:
cat_train = pd.read_csv("/content/cat_train.csv")
cat_test = pd.read_csv("/content/cat_test.csv")
tab_train = pd.read_csv("/content/tab_train.csv")
tab_test = pd.read_csv("/content/tab_test.csv")

In [8]:
def preprocess_data(df, target=None, is_categorical=True):
    df = df.copy()

    # Separate numeric and categorical features
    num_cols = df.select_dtypes(include=["number"]).columns
    cat_cols = df.select_dtypes(exclude=["number"]).columns

    # Handle missing values
    if len(num_cols) > 0:
        num_imputer = SimpleImputer(strategy="mean")
        df[num_cols] = num_imputer.fit_transform(df[num_cols])

    if len(cat_cols) > 0:
        cat_imputer = SimpleImputer(strategy="most_frequent")
        df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

        # Label encode categorical features
        for col in cat_cols:
            df[col] = LabelEncoder().fit_transform(df[col])

    # Scale numerical features
    if not is_categorical and len(num_cols) > 0:
        scaler = StandardScaler()
        df[num_cols] = scaler.fit_transform(df[num_cols])

    return df


In [6]:
# Process categorical dataset
X_cat = cat_train.drop(columns=["target"])
y_cat = cat_train["target"]
X_cat = preprocess_data(X_cat, is_categorical=True)
X_cat_train, X_cat_val, y_cat_train, y_cat_val = train_test_split(X_cat, y_cat, test_size=0.2, random_state=42)


In [11]:
# Process regression dataset
X_tab = tab_train.drop(columns=["price"])
y_tab = tab_train["price"]
X_tab = preprocess_data(X_tab, is_categorical=False)
X_tab_train, X_tab_val, y_tab_train, y_tab_val = train_test_split(X_tab, y_tab, test_size=0.2, random_state=42)


In [12]:
### Model Training & Evaluation ###
def train_evaluate_model(model, X_train, X_val, y_train, y_val, is_categorical=True):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    if is_categorical:
        print("Accuracy:", accuracy_score(y_val, y_pred))
        print(classification_report(y_val, y_pred))
    else:
        print("RMSE:", np.sqrt(mean_squared_error(y_val, y_pred)))

    return model

In [13]:
# Train models for classification
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
print("Random Forest Classifier:")
train_evaluate_model(rf_clf, X_cat_train, X_cat_val, y_cat_train, y_cat_val)
print("XGBoost Classifier:")
train_evaluate_model(xgb_clf, X_cat_train, X_cat_val, y_cat_train, y_cat_val)

Random Forest Classifier:
Accuracy: 0.81605
              precision    recall  f1-score   support

           0       0.82      0.99      0.90     97640
           1       0.58      0.04      0.08     22360

    accuracy                           0.82    120000
   macro avg       0.70      0.52      0.49    120000
weighted avg       0.78      0.82      0.75    120000

XGBoost Classifier:


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.8215583333333333
              precision    recall  f1-score   support

           0       0.83      0.97      0.90     97640
           1       0.58      0.16      0.25     22360

    accuracy                           0.82    120000
   macro avg       0.71      0.57      0.57    120000
weighted avg       0.79      0.82      0.78    120000



In [14]:
# Train models for regression
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
xgb_reg = xgb.XGBRegressor()
print("Random Forest Regressor:")
train_evaluate_model(rf_reg, X_tab_train, X_tab_val, y_tab_train, y_tab_val, is_categorical=False)
print("XGBoost Regressor:")
train_evaluate_model(xgb_reg, X_tab_train, X_tab_val, y_tab_train, y_tab_val, is_categorical=False)

Random Forest Regressor:
RMSE: 603.6396236517471
XGBoost Regressor:
RMSE: 590.8401856678336


In [15]:
### Hyperparameter Tuning ###
param_grid = { 'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 10], 'min_samples_split': [2, 5, 10] }

gs_clf = GridSearchCV(RandomForestClassifier(), param_grid, cv=3, scoring='accuracy', verbose=2, n_jobs=-1)
gs_clf.fit(X_cat_train, y_cat_train)
print("Best Parameters (Classification):", gs_clf.best_params_)

rs_reg = RandomizedSearchCV(RandomForestRegressor(), param_grid, cv=3, scoring='neg_root_mean_squared_error', verbose=2, n_jobs=-1)
rs_reg.fit(X_tab_train, y_tab_train)
print("Best Parameters (Regression):", rs_reg.best_params_)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best Parameters (Classification): {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 50}
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Parameters (Regression): {'n_estimators': 50, 'min_samples_split': 5, 'max_depth': 10}
