In [1]:
def round_with_thresholds(raw_preds, thresholds):
    """Bucket continuous predictions into the integer classes 0-3.

    Each prediction receives class ``k`` for the first ``k`` in (0, 1, 2)
    where ``raw_preds < thresholds[k]`` holds. Predictions that satisfy none
    of the three comparisons (including NaN, which compares False) get 3.
    """
    below_cutoff = [
        raw_preds < thresholds[0],
        raw_preds < thresholds[1],
        raw_preds < thresholds[2],
    ]
    # np.select takes the first condition that holds per element, i.e. an
    # element-wise if / elif / elif / else.
    return np.select(below_cutoff, [0, 1, 2], default=3)

def optimize_thresholds(y_true, raw_preds, start_vals=(0.5, 1.5, 2.5)):
    """Search for class cut-offs that maximise quadratic weighted kappa.

    Args:
        y_true: Ground-truth class labels.
        raw_preds: Continuous predictions, bucketed with
            ``round_with_thresholds`` at every optimiser step.
        start_vals: Initial guess for the three cut-offs passed to the
            optimiser as ``x0``.

    Returns:
        The optimised thresholds (``res.x`` of ``scipy.optimize.minimize``).

    Raises:
        AssertionError: If the optimiser does not report success.
    """
    def negative_kappa(thresholds, y_true, raw_preds):
        # minimize() minimises, so negate kappa in order to maximise it.
        rounded_preds = round_with_thresholds(raw_preds, thresholds)
        return -cohen_kappa_score(y_true, rounded_preds, weights='quadratic')

    res = minimize(negative_kappa, x0=start_vals, args=(y_true, raw_preds), method='Powell')
    # Explicit raise instead of a bare `assert`: assert statements are removed
    # under `python -O`, which would let unconverged thresholds through
    # silently. AssertionError is kept so existing handlers still match.
    if not res.success:
        raise AssertionError(f"Threshold optimisation failed: {res.message}")
    return res.x

In [2]:
def calculate_weights(series):
    """Return inverse-frequency sample weights for a numeric target Series.

    The values are cut into 10 bins; every sample is weighted by the
    reciprocal of the number of samples sharing its bin, and the weights are
    then rescaled so that their mean is 1.
    """
    bin_ids = pd.cut(series, bins=10, labels=False)
    # Sparse bins receive large weights, crowded bins small ones.
    inverse_freq = {
        bin_id: 1 / count
        for bin_id, count in bin_ids.value_counts().items()
    }
    raw_weights = bin_ids.map(inverse_freq)
    return raw_weights / raw_weights.mean()

In [3]:
def cross_validate(model_, data, features, score_col, index_col, cv, sample_weights=False, verbose=False):
    """
    Perform cross-validation with a given model and compute the out-of-fold
    predictions and quadratic weighted Cohen's Kappa score for each fold.

    For every fold the model is fitted on ``score_col``; class thresholds are
    optimised on the training-fold predictions only and then applied to the
    validation-fold predictions before scoring against ``index_col``.

    Args:
        model_: Estimator exposing ``fit(X, y)`` (and ``sample_weight=`` when
            ``sample_weights`` is True) and ``predict(X)``. The same object is
            refitted on every fold.
        data: DataFrame containing the feature, score and index columns.
        features: Column labels used as model inputs.
        score_col: Name of the continuous target the model is trained on.
        index_col: Name of the discrete label column used for Kappa; also
            passed as ``y`` to ``cv.split``.
        cv: Splitter whose ``split(X, y)`` yields positional
            (train_idx, val_idx) pairs.
        sample_weights: If True, fit with weights from ``calculate_weights``
            computed on the training-fold scores.
        verbose: If True, print per-fold and aggregate Kappa scores.

    Returns:
        float: Mean Kappa score across all folds.
        array: Out-of-fold raw (un-thresholded) score predictions for the
            entire dataset.
        list: Optimised thresholds, one entry per fold.

    Note:
        Reads the module-level ``base_thresholds`` as the starting point for
        the threshold optimisation.
    """
    kappa_scores = []
    oof_score_predictions = np.zeros(len(data))
    thresholds = []

    for fold_idx, (train_idx, val_idx) in enumerate(cv.split(data, data[index_col])):
        X_train, X_val = data[features].iloc[train_idx], data[features].iloc[val_idx]
        y_train_score = data[score_col].iloc[train_idx]
        y_train_index = data[index_col].iloc[train_idx]
        y_val_index = data[index_col].iloc[val_idx]

        # Train model with sample weights if requested
        if sample_weights:
            weights = calculate_weights(y_train_score)
            model_.fit(X_train, y_train_score, sample_weight=weights)
        else:
            model_.fit(X_train, y_train_score)

        # Predict on train and val
        y_pred_train_score = model_.predict(X_train)
        y_pred_val_score = model_.predict(X_val)

        oof_score_predictions[val_idx] = y_pred_val_score

        # Fit thresholds in-sample so the validation fold stays untouched
        t_1 = optimize_thresholds(y_train_index, y_pred_train_score, start_vals=base_thresholds)
        thresholds.append(t_1)

        y_pred_val_index = round_with_thresholds(y_pred_val_score, t_1)

        kappa_score = cohen_kappa_score(y_val_index, y_pred_val_index, weights='quadratic')
        kappa_scores.append(kappa_score)

        if verbose:
            print(f"Fold {fold_idx}: Optimized Kappa Score = {kappa_score}")

    if verbose:
        print(f"## Mean CV Kappa Score: {np.mean(kappa_scores)} ##")
        print(f"## Std CV: {np.std(kappa_scores)}")

    return np.mean(kappa_scores), oof_score_predictions, thresholds

def n_cross_validate(model_, data, features, score_col, index_col, cv, seeds, sample_weights=False, verbose=False):
    """
    Repeat ``cross_validate`` once per seed and average the resulting scores.

    Args:
        model_, data, features, score_col, index_col: Forwarded unchanged to
            ``cross_validate``.
        cv: Splitter forwarded to ``cross_validate``. Its ``random_state``
            attribute is overwritten with each seed, so the object is mutated.
            NOTE(review): presumably this only changes the splits when the
            splitter shuffles - confirm against the caller's ``cv``.
        seeds: Non-empty iterable of seeds, one cross-validation run each.
        sample_weights: Forwarded to ``cross_validate``.
        verbose: Forwarded to ``cross_validate``.

    Returns:
        float: Mean over all seeds of the per-run mean Kappa score.
        array: Out-of-fold predictions of the last seed's run.

    Raises:
        ValueError: If ``seeds`` is empty.
    """
    scores = []
    oof = None
    for seed in seeds:
        cv.random_state = seed
        # Forward the caller's flags instead of hard-coding them.
        score, oof, _ = cross_validate(
            model_, data, features, score_col, index_col, cv,
            sample_weights=sample_weights, verbose=verbose,
        )
        scores.append(score)

    if not scores:
        raise ValueError("seeds must contain at least one value")

    # Average across seeds rather than reporting only the final run.
    return np.mean(scores), oof