In [None]:
pip install pycaret



# Preprocessing

In [None]:
import pandas as pd

# Load the dataset
file_path = '/content/train.csv'  # Replace with the correct file path
data = pd.read_csv(file_path)

# Label column that the modelling cell uses as its target
target_column = 'sii'

# Step 1: Drop identifier and irrelevant columns
data_cleaned = data.drop(columns=['id'])  # Drop 'id'

# Step 1b: Keep only rows whose target is known.
# The target is numeric, so without this step the median imputation in Step 3
# would invent labels for unlabeled rows and the model would train on them.
if target_column in data_cleaned.columns:
    data_cleaned = data_cleaned.dropna(subset=[target_column])

# Step 2: Drop columns with excessive missing values (>50% missing)
threshold = 0.5 * len(data_cleaned)  # Minimum number of non-null values a column must have
# .copy() detaches the result from the intermediate frames so the column
# assignments below never act on (or warn about) a view of another frame.
data_cleaned = data_cleaned.dropna(thresh=threshold, axis=1).copy()

# Step 3: Impute missing values
# For numerical columns, fill missing values with the median.
# include='number' covers every numeric dtype, not just float64/int64.
numeric_columns = data_cleaned.select_dtypes(include='number').columns
data_cleaned[numeric_columns] = data_cleaned[numeric_columns].fillna(data_cleaned[numeric_columns].median())

# For categorical columns, fill missing values with the mode.
# Guarded because .mode().iloc[0] raises IndexError when there are no such columns.
categorical_columns = data_cleaned.select_dtypes(include=['object']).columns
if len(categorical_columns) > 0:
    data_cleaned[categorical_columns] = data_cleaned[categorical_columns].fillna(data_cleaned[categorical_columns].mode().iloc[0])

# Step 4: Encode categorical features with one-hot encoding
# (drop_first=True removes one level per feature to avoid redundant columns)
data_cleaned = pd.get_dummies(data_cleaned, columns=categorical_columns, drop_first=True)

# NOTE(review): several tree models score R2 = 1.0 in the comparison below, which
# suggests a remaining feature encodes the target directly — check for leakage
# (e.g. columns the label is derived from) before trusting any model ranking.

# Step 5: Final check for null values
if data_cleaned.isnull().sum().sum() == 0:
    print("Preprocessing successful: No missing values remain.")
else:
    print("Warning: Missing values remain after preprocessing.")

# Display the first few rows of the cleaned dataset
data_cleaned.head()


Preprocessing successful: No missing values remain.


Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-BMI,Physical-Height,Physical-Weight,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,FGC-FGC_CU,...,BIA-Season_Winter,PCIAT-Season_Spring,PCIAT-Season_Summer,PCIAT-Season_Winter,SDS-Season_Spring,SDS-Season_Summer,SDS-Season_Winter,PreInt_EduHx-Season_Spring,PreInt_EduHx-Season_Summer,PreInt_EduHx-Season_Winter
0,5,0,51.0,16.877316,46.0,50.8,68.0,81.0,114.0,0.0,...,False,False,False,False,True,False,False,False,False,False
1,9,0,65.0,14.03559,48.0,46.0,75.0,70.0,122.0,3.0,...,True,False,False,False,False,False,False,False,True,False
2,10,1,71.0,16.648696,56.5,75.6,65.0,94.0,117.0,20.0,...,False,False,False,False,False,False,False,False,True,False
3,9,0,71.0,18.292347,56.0,81.6,60.0,97.0,117.0,18.0,...,False,False,True,False,False,True,False,False,False,True
4,18,1,65.0,17.937682,55.0,77.0,68.0,81.0,114.0,9.0,...,False,True,False,False,True,False,False,True,False,False


In [None]:
from pycaret.regression import setup, compare_models, add_metric
from sklearn.metrics import make_scorer
import numpy as np

# Define the QWK function
def quadratic_weighted_kappa(y_true, y_pred):
    """
    Calculate the quadratic weighted kappa (QWK) between two sets of ratings.

    Both inputs are rounded to the nearest integer rating first, so continuous
    regression outputs can be scored directly against ordinal labels.

    Parameters:
    - y_true: Array-like of actual ratings.
    - y_pred: Array-like of predicted ratings (may be continuous).
    Returns:
    - kappa: Quadratic weighted kappa score as a float. 1.0 when both inputs
      collapse to the same single rating; NaN for empty input.
    Raises:
    - ValueError: If y_true and y_pred do not contain the same number of values.
    """
    actual = np.rint(np.asarray(y_true, dtype=float).ravel())
    predicted = np.rint(np.asarray(y_pred, dtype=float).ravel())

    if actual.shape != predicted.shape:
        raise ValueError(
            "y_true and y_pred must have the same length, got "
            f"{actual.size} and {predicted.size}"
        )

    count = actual.size
    if count == 0:
        return float('nan')

    # With quadratic weights W[i][j] = (i - j)^2 the confusion-matrix sums
    # reduce to closed forms, so no N x N matrix is needed and ratings do not
    # have to be contiguous, start at zero, or be non-negative:
    #   sum(W * O) = sum_k (true_k - pred_k)^2
    #   sum(W * E) = sum(true^2) + sum(pred^2) - 2 * sum(true) * sum(pred) / n
    # The usual 1 / (N - 1)^2 normalisation cancels in the ratio.
    numerator = np.sum((actual - predicted) ** 2)
    denominator = (
        np.sum(actual ** 2)
        + np.sum(predicted ** 2)
        - 2.0 * np.sum(actual) * np.sum(predicted) / count
    )

    # Zero expected disagreement only happens when every true and predicted
    # rating is the same single value, i.e. perfect agreement.
    if denominator == 0:
        return 1.0

    return float(1.0 - numerator / denominator)



# Initialise the PyCaret regression experiment on the preprocessed frame
setup_options = {
    'data': data_cleaned,   # Preprocessed data
    'target': 'sii',        # Target variable
    'session_id': 42,       # Seed for reproducibility
    'verbose': True,        # Display setup summary
}
regression_setup = setup(**setup_options)

# Register QWK as a custom metric; higher scores are treated as better
qwk_metric = dict(
    id='qwk',
    name='Quadratic Weighted Kappa',
    score_func=quadratic_weighted_kappa,
    greater_is_better=True,
)
add_metric(**qwk_metric)

# Rank the candidate models by the custom QWK metric and keep the top one
best_model = compare_models(sort=qwk_metric['id'])


Unnamed: 0,Description,Value
0,Session id,42
1,Target,sii
2,Target type,Regression
3,Original data shape,"(3960, 84)"
4,Transformed data shape,"(3960, 84)"
5,Transformed train set shape,"(2772, 84)"
6,Transformed test set shape,"(1188, 84)"
7,Numeric features,59
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Quadratic Weighted Kappa,TT (Sec)
lr,Linear Regression,0.2202,0.8032,0.523,-0.7804,0.2086,0.1882,0.0,0.583
par,Passive Aggressive Regressor,0.3695,1.1011,0.7053,-1.4289,0.2944,0.4281,0.0,0.083
lightgbm,Light Gradient Boosting Machine,0.0023,0.001,0.0206,0.9978,0.0059,0.0038,0.0,1.507
gbr,Gradient Boosting Regressor,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.108
ada,AdaBoost Regressor,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.085
et,Extra Trees Regressor,0.0292,0.0044,0.0659,0.9907,0.0403,0.0549,0.0,0.892
rf,Random Forest Regressor,0.0003,0.0001,0.0038,0.9998,0.001,0.0004,0.0,0.915
dt,Decision Tree Regressor,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.067
knn,K Neighbors Regressor,0.2696,0.2229,0.4711,0.528,0.2728,0.4475,0.0,0.069
lasso,Lasso Regression,0.3368,0.1973,0.4196,0.5746,0.2677,0.2615,0.0,0.082


Processing:   0%|          | 0/81 [00:00<?, ?it/s]