In [None]:
# pip install pandas numpy xgboost scikit-learn pyreadr matplotlib boto3 tqdm catboost

In [3]:
import pandas as pd
import numpy as np
import pyreadr
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_recall_curve, auc, roc_curve
from sklearn.utils import shuffle
from datetime import timedelta
from tqdm import tqdm
import random
import os
import boto3
import tempfile

In [4]:


# Define S3 info
bucket_name = 'kehmisjan2025'
file_key = 'targets_apr23.rds'

# Initialize boto3 client
s3 = boto3.client('s3')

# Download to a temporary file
with tempfile.NamedTemporaryFile(suffix=".rds") as tmp_file:
    s3.download_fileobj(bucket_name, file_key, tmp_file)
    tmp_file.seek(0)  # go back to beginning
    result = pyreadr.read_r(tmp_file.name)  # returns a dictionary

# Extract the data frame
iit_data = next(iter(result.values()))  # assumes only one object inside


In [5]:
# print(iit_data.dtypes) 
# Ensure the 'NAD' column is converted to datetime
iit_data['NAD'] = pd.to_datetime(iit_data['NAD'], format='%Y-%m-%d')
# iit_data['VisitDate'] = pd.to_datetime(iit_data['VisitDate'], format='%Y-%m-%d')

In [6]:
# remove the last quarter of the year
# Define the date range to exclude
start_exclude = pd.Timestamp('2024-10-01')
end_exclude = pd.Timestamp('2024-12-31')

# Filter out records from Sept through Dec 2024
iit_data = iit_data[~((iit_data['NAD'] >= start_exclude) & (iit_data['NAD'] <= end_exclude))]

In [7]:
print(iit_data["Sex"].value_counts(normalize=True))

Sex
Female    0.681077
Male      0.318923
Name: proportion, dtype: float64


In [8]:

# iit_data['is_december'] = iit_data['Month'].apply(lambda x: 1 if x == "December" else 0)
iit_data['is_friday'] = iit_data['Day'].apply(lambda x: 1 if x == "Fri" else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iit_data['is_friday'] = iit_data['Day'].apply(lambda x: 1 if x == "Fri" else 0)


In [9]:
print(iit_data['VisitDate'].min(),iit_data['VisitDate'].max())
print(iit_data['NAD'].min(),iit_data['NAD'].max())

iit_data = iit_data.drop(columns=[
    'OptimizedHIVRegimen', 'Drug', 'VisitDate', 'WHO_Missing', 'Type',
    'most_recent_cd4', 'regimen_switch', 'AHD', 'NAD_Imputation_Flag',
    'BMI_Missing', 'TimeatFacility', 'Adherence_Missing', 'Facility_type_category',
    'Pregnant_Missing', 'Breastfeeding_Missing', 'Month', 'Day'
    # 'lastvd' to 'months_since_restart' would go here
    # 'Month', 'Day' handled below
    
])
# iit_data = iit_data.drop(columns=iit_data.loc[:, 'men_knowledge':'women_sti'].columns)



2021-01-04 2024-09-28
2022-01-01 00:00:00 2024-09-30 00:00:00


In [10]:
selected_columns= ['num_late_last3', 'num_late14_last3', 'num_late30_last3',
       'num_late_last5', 'num_late14_last5', 'num_late30_last5',
       'num_late_last10', 'num_late14_last10', 'num_late30_last10']
iit_data[selected_columns] = iit_data[selected_columns].apply(pd.to_numeric, errors='coerce')

In [11]:

# Pregnant: Yes -> 1, No -> 0, else NA
iit_data['Pregnant'] = iit_data['Pregnant'].map({'Yes': 1, 'No': 0}).astype('Int64')

# Breastfeeding: Yes -> 1, No -> 0, else NA
iit_data['Breastfeeding'] = iit_data['Breastfeeding'].map({'Yes': 1, 'No': 0}).astype('Int64')

# ARTAdherence: good -> 1, poor/fair -> 0, else NA
iit_data['ARTAdherence'] = iit_data['ARTAdherence'].map({
    'good': 1,
    'poor': 0,
    'fair': 0
}).astype('Int64')

# Sex: Male -> 1, else 0
iit_data['Sex'] = (iit_data['Sex'] == 'Male').astype('Int64')

# Emr: KenyaEMR -> 1, else 0
iit_data['Emr'] = (iit_data['Emr'] == 'KenyaEMR').astype('Int64')  # assuming there is an 'Emr' column



In [12]:
def encode_xgboost(dataset):
    # List of categorical variables to be encoded
    categorical_columns = [ 'BMI', 'WHOStage','most_recent_vl', 'MaritalStatus', 'EducationLevel','DifferentiatedCare',
       'Occupation', 'VisitBy','TCAReason', 'cascade_status', 'Kephlevel','Ownertype'] 
    
    # One-hot encoding the categorical columns
    ohe = pd.get_dummies(dataset[categorical_columns], drop_first=True, dtype=int)
    
    # Concatenate the original dataset (excluding categorical columns) with the one-hot encoded columns
    dataset_encoded = pd.concat([dataset.drop(columns=categorical_columns), ohe], axis=1)
    
    return dataset_encoded


Create folds 

In [16]:
from catboost import CatBoostClassifier, Pool

# Identify categorical features if known
categorical_features = ['Kephlevel','DifferentiatedCare','WHOStage','most_recent_vl','MaritalStatus','EducationLevel',
 'Occupation','VisitBy','BMI','TCAReason','cascade_status','Ownertype']  # Replace with actual names

In [45]:
# Fold 1
# ========== Train Data (Jan–May 2023) ==========
train_data1 = iit_data.copy()
train_data1 = train_data1.drop(columns=["SiteCode"])
train_data1 = train_data1[(train_data1["NAD"] >= "2023-01-01") & (train_data1["NAD"] <= "2023-05-31")]
train_data1 = encode_xgboost(train_data1)
train_labels1 = train_data1["iit"]
train_features1 = train_data1.drop(columns=["key", "NAD", "iit"])

train_pool1 = Pool(data=train_features1, label=train_labels1)

# ========== Validation Data (June 2023) ==========
val_data1 = iit_data.copy()
val_data1 = val_data1.drop(columns=["SiteCode"])
val_data1 = val_data1[(val_data1["NAD"] >= "2023-06-01") & (val_data1["NAD"] <= "2023-06-30")]
val_data1 = encode_xgboost(val_data1)
val_labels1 = val_data1["iit"]
val_features1 = val_data1.drop(columns=["key", "NAD", "iit"])

val_pool1 = Pool(data=val_features1, label=val_labels1)

# ========== Test Near Data (July 2023) ==========
testnear_data1 = iit_data.copy()
testnear_data1 = testnear_data1.drop(columns=["SiteCode"])
testnear_data1 = testnear_data1[(testnear_data1["NAD"] >= "2023-07-01") & (testnear_data1["NAD"] <= "2023-07-31")]
testnear_data1 = encode_xgboost(testnear_data1)
testnear_labels1 = testnear_data1["iit"]
testnear_features1 = testnear_data1.drop(columns=["key", "NAD", "iit"])

testnear_pool1 = Pool(data=testnear_features1, label=testnear_labels1)

# ========== Test Data (July–Sept 2023) ==========
test_data1 = iit_data.copy()
test_data1 = test_data1.drop(columns=["SiteCode"])
test_data1 = test_data1[(test_data1["NAD"] >= "2023-07-01") & (test_data1["NAD"] <= "2023-09-30")]
test_data1 = encode_xgboost(test_data1)
test_labels1 = test_data1["iit"]
test_features1 = test_data1.drop(columns=["key", "NAD", "iit"])

test_pool1 = Pool(data=test_features1, label=test_labels1)


In [46]:
# Fold 2
# Train Data (Jan–May 2023)
train_data2 = iit_data.copy()
train_data2 = train_data2.drop(columns=["SiteCode"])
train_data2 = train_data2[(train_data2["NAD"] >= "2023-04-01") & (train_data2["NAD"] <= "2023-08-30")]
train_data2 = encode_xgboost(train_data2)
train_labels2 = train_data2["iit"]
train_features2 = train_data2.drop(columns=["key", "NAD", "iit"])

train_pool2 = Pool(data=train_features2, label=train_labels2)

# Validation Data (June 2023)
val_data2 = iit_data.copy()
val_data2 = val_data2.drop(columns=["SiteCode"])
val_data2 = val_data2[(val_data2["NAD"] >= "2023-09-01") & (val_data2["NAD"] <= "2023-09-30")]
val_data2 = encode_xgboost(val_data2)
val_labels2 = val_data2["iit"]
val_features2 = val_data2.drop(columns=["key", "NAD", "iit"])

val_pool2 = Pool(data=val_features2, label=val_labels2)

# Test Near Data (July 2023)
testnear_data2 = iit_data.copy()
testnear_data2 = testnear_data2.drop(columns=["SiteCode"])
testnear_data2 = testnear_data2[(testnear_data2["NAD"] >= "2023-10-01") & (testnear_data2["NAD"] <= "2023-10-31")]
testnear_data2 = encode_xgboost(testnear_data2)
testnear_labels2 = testnear_data2["iit"]
testnear_features2 = testnear_data2.drop(columns=["key", "NAD", "iit"])

testnear_pool2 = Pool(data=testnear_features2, label=testnear_labels2)
# Test Data (July–Sept 2023)
test_data2 = iit_data.copy()
test_data2 = test_data2.drop(columns=["SiteCode"])
test_data2 = test_data2[(test_data2["NAD"] >= "2023-10-01") & (test_data2["NAD"] <= "2023-12-31")]
test_data2 = encode_xgboost(test_data2)
test_labels2 = test_data2["iit"]
test_features2 = test_data2.drop(columns=["key", "NAD", "iit"])

test_pool2 = Pool(data=test_features2, label=test_labels2)

In [47]:
# Fold 3

train_data3 = iit_data.copy()
train_data3 = train_data3.drop(columns=["SiteCode"])
train_data3 = train_data3[(train_data3["NAD"] >= "2023-06-01") & (train_data3["NAD"] <= "2023-11-30")]
train_data3 = encode_xgboost(train_data3)
train_labels3 = train_data3["iit"]
train_features3 = train_data3.drop(columns=["key", "NAD", "iit"])

train_pool3 = Pool(data=train_features3, label=train_labels3)

# Validation Data (June 3033)
val_data3 = iit_data.copy()
val_data3 = val_data3.drop(columns=["SiteCode"])
val_data3 = val_data3[(val_data3["NAD"] >= "2023-12-01") & (val_data3["NAD"] <= "2023-12-31")]
val_data3 = encode_xgboost(val_data3)
val_labels3 = val_data3["iit"]
val_features3 = val_data3.drop(columns=["key", "NAD", "iit"])

val_pool3 = Pool(data=val_features3, label=val_labels3)

# Test Near Data (July 3033)
testnear_data3 = iit_data.copy()
testnear_data3 = testnear_data3.drop(columns=["SiteCode"])
testnear_data3 = testnear_data3[(testnear_data3["NAD"] >= "2024-01-01") & (testnear_data3["NAD"] <= "2024-01-31")]
testnear_data3 = encode_xgboost(testnear_data3)
testnear_labels3 = testnear_data3["iit"]
testnear_features3 = testnear_data3.drop(columns=["key", "NAD", "iit"])

testnear_pool3 = Pool(data=testnear_features3, label=testnear_labels3)
# Test Data (July–Sept 3033)
test_data3 = iit_data.copy()
test_data3 = test_data3.drop(columns=["SiteCode"])
test_data3 = test_data3[(test_data3["NAD"] >= "2024-01-01") & (test_data3["NAD"] <= "2024-03-31")]
test_data3 = encode_xgboost(test_data3)
test_labels3 = test_data3["iit"]
test_features3 = test_data3.drop(columns=["key", "NAD", "iit"])

test_pool3 = Pool(data=test_features3, label=test_labels3)


In [48]:
# Fold 4
train_data4 = iit_data.copy()
train_data4 = train_data4.drop(columns=["SiteCode"])
train_data4 = train_data4[(train_data4["NAD"] >= "2023-09-01") & (train_data4["NAD"] <= "2024-02-29")]
train_data4= encode_xgboost(train_data4)
train_labels4 = train_data4["iit"]
train_features4 = train_data4.drop(columns=["key", "NAD", "iit"])

train_pool4 = Pool(data=train_features4, label=train_labels4)

# Validation Data (June 4044)
val_data4 = iit_data.copy()
val_data4 = val_data4.drop(columns=["SiteCode"])
val_data4 = val_data4[(val_data4["NAD"] >= "2024-03-01") & (val_data4["NAD"] <= "2024-03-31")]
val_data4= encode_xgboost(val_data4)
val_labels4 = val_data4["iit"]
val_features4 = val_data4.drop(columns=["key", "NAD", "iit"])

val_pool4 = Pool(data=val_features4, label=val_labels4)


# Test Near Data (July 4044)
testnear_data4 = iit_data.copy()
testnear_data4 = testnear_data4.drop(columns=["SiteCode"])
testnear_data4 = testnear_data4[(testnear_data4["NAD"] >= "2024-04-01") & (testnear_data4["NAD"] <= "2024-04-30")]
testnear_data4= encode_xgboost(testnear_data4)
testnear_labels4 = testnear_data4["iit"]
testnear_features4 = testnear_data4.drop(columns=["key", "NAD", "iit"])

testnear_pool4 = Pool(data=testnear_features4, label=testnear_labels4)

# Test Data (July–Sept 4044)
test_data4 = iit_data.copy()
test_data4 = test_data4.drop(columns=["SiteCode"])
test_data4 = test_data4[(test_data4["NAD"] >= "2024-04-01") & (test_data4["NAD"] <= "2024-06-30")]
test_data4=encode_xgboost(test_data4)
test_labels4 = test_data4["iit"]
test_features4 = test_data4.drop(columns=["key", "NAD", "iit"])

test_pool4 = Pool(data=test_features4, label=test_labels4)

In [49]:
# Fold 5

train_data5 = iit_data.copy()
train_data5 = train_data5.drop(columns=["SiteCode"])
train_data5 = train_data5[(train_data5["NAD"] >= "2024-01-01") & (train_data5["NAD"] <= "2024-05-31")]
train_data5= encode_xgboost(train_data5)
train_labels5 = train_data5["iit"]
train_features5 = train_data5.drop(columns=["key", "NAD", "iit"])

train_pool5 = Pool(data=train_features5, label=train_labels5)


# Validation Data (June 5055)
val_data5 = iit_data.copy()
val_data5 = val_data5.drop(columns=["SiteCode"])
val_data5 = val_data5[(val_data5["NAD"] >= "2024-06-01") & (val_data5["NAD"] <= "2024-06-30")]
val_data5= encode_xgboost(val_data5)
val_labels5 = val_data5["iit"]
val_features5 = val_data5.drop(columns=["key", "NAD", "iit"])

val_pool5 = Pool(data=val_features5, label=val_labels5)

# Test Near Data (July 5055)
testnear_data5 = iit_data.copy()
testnear_data5 = testnear_data5.drop(columns=["SiteCode"])
testnear_data5 = testnear_data5[(testnear_data5["NAD"] >= "2024-07-01") & (testnear_data5["NAD"] <= "2024-07-30")]
testnear_data5= encode_xgboost(testnear_data5)
testnear_labels5 = testnear_data5["iit"]
testnear_features5 = testnear_data5.drop(columns=["key", "NAD", "iit"])

testnear_pool5 = Pool(data=testnear_features5, label=testnear_labels5)
# Test Data (July–Sept 5055)
test_data5 = iit_data.copy()
test_data5 = test_data5.drop(columns=["SiteCode"])
test_data5 = test_data5[(test_data5["NAD"] >= "2024-07-01") & (test_data5["NAD"] <= "2024-09-30")]
test_data5= encode_xgboost(test_data5)
test_labels5 = test_data5["iit"]
test_features5 = test_data5.drop(columns=["key", "NAD", "iit"])

test_pool5 = Pool(data=test_features5, label=test_labels5)

In [50]:
# Your existing fold structure using CatBoost Pools and raw test DataFrames
fold_list = {
    "fold1": [train_pool1, val_pool1, testnear_pool1, test_pool1, testnear_data1, test_data1],
    "fold2": [train_pool2, val_pool2, testnear_pool2, test_pool2, testnear_data2, test_data2],
    "fold3": [train_pool3, val_pool3, testnear_pool3, test_pool3, testnear_data3, test_data3],
    "fold4": [train_pool4, val_pool4, testnear_pool4, test_pool4, testnear_data4, test_data4],
    "fold5": [train_pool5, val_pool5, testnear_pool5, test_pool5, testnear_data5, test_data5]
}


In [51]:

from itertools import product
from sklearn.metrics import precision_recall_curve, average_precision_score

# Step 1: Create the grid
params_grid = list(product(
    [0.05, 0.1],                  # eta
    [6, 8],               # max_depth
    [0.5, 0.8],           # subsample
    [0.6],           # colsample_bytree
    [1, 10],                     # lambda
    [50]                  # scale_pos_weight
))

# Build DataFrame
grid_sparse = pd.DataFrame(params_grid, columns=[
    "eta", "max_depth", "subsample", "col_sample", "lambda_", "scale_pos_weight"
])
# Add empty columns for PR AUCs
for k in range(1, 6):
    grid_sparse[f"val_pr_auc_near_{k}"] = np.nan
    grid_sparse[f"val_pr_auc_{k}"] = np.nan



In [26]:
grid_sparse

Unnamed: 0,eta,max_depth,subsample,col_sample,lambda_,scale_pos_weight,val_pr_auc_near_1,val_pr_auc_1,val_pr_auc_near_2,val_pr_auc_2,val_pr_auc_near_3,val_pr_auc_3,val_pr_auc_near_4,val_pr_auc_4,val_pr_auc_near_5,val_pr_auc_5
0,0.05,6,0.5,0.6,1,50,,,,,,,,,,
1,0.05,6,0.5,0.6,10,50,,,,,,,,,,
2,0.05,6,0.8,0.6,1,50,,,,,,,,,,
3,0.05,6,0.8,0.6,10,50,,,,,,,,,,
4,0.05,8,0.5,0.6,1,50,,,,,,,,,,
5,0.05,8,0.5,0.6,10,50,,,,,,,,,,
6,0.05,8,0.8,0.6,1,50,,,,,,,,,,
7,0.05,8,0.8,0.6,10,50,,,,,,,,,,
8,0.1,6,0.5,0.6,1,50,,,,,,,,,,
9,0.1,6,0.5,0.6,10,50,,,,,,,,,,


In [52]:
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import average_precision_score
from tqdm import tqdm


# Grid search loop
for i in tqdm(range(len(grid_sparse)), desc="Grid Search"):

    row = grid_sparse.iloc[i]

    for k in tqdm(range(1, 6), desc=f"Fold {i+1}", leave=False):

        dtrain, dval, dtestnear, dtest, test_near, test_data = fold_list[f"fold{k}"]

        model = CatBoostClassifier(
            iterations=3000,
            learning_rate=row["eta"],
            depth=int(row["max_depth"]),
            subsample=row["subsample"],
            colsample_bylevel=row["col_sample"],
            l2_leaf_reg=row["lambda_"],
            scale_pos_weight=row["scale_pos_weight"],
            eval_metric="AUC",
            loss_function="Logloss",
            verbose=False,
            early_stopping_rounds=100,
            random_seed=42
        )

        # Fit model
        model.fit(dtrain, eval_set=dval)

        # Predict on testnear
        testnear_preds = model.predict_proba(dtestnear)[:, 1]
        ap_near = average_precision_score(test_near["iit"], testnear_preds)
        grid_sparse.at[grid_sparse.index[i], f"val_pr_auc_near_{k}"] = ap_near

        # Predict on test
        test_preds = model.predict_proba(dtest)[:, 1]
        ap = average_precision_score(test_data["iit"], test_preds)
        grid_sparse.at[grid_sparse.index[i], f"val_pr_auc_{k}"] = ap

    print(grid_sparse.iloc[i])




Grid Search:   6%|▋         | 1/16 [13:17<3:19:19, 797.29s/it]

eta                   0.050000
max_depth             6.000000
subsample             0.500000
col_sample            0.600000
lambda_               1.000000
scale_pos_weight     50.000000
val_pr_auc_near_1     0.160344
val_pr_auc_1          0.164269
val_pr_auc_near_2     0.194221
val_pr_auc_2          0.170676
val_pr_auc_near_3     0.143216
val_pr_auc_3          0.123820
val_pr_auc_near_4     0.131218
val_pr_auc_4          0.125694
val_pr_auc_near_5     0.140735
val_pr_auc_5          0.131620
Name: 0, dtype: float64


Grid Search:  12%|█▎        | 2/16 [27:10<3:11:01, 818.69s/it]

eta                   0.050000
max_depth             6.000000
subsample             0.500000
col_sample            0.600000
lambda_              10.000000
scale_pos_weight     50.000000
val_pr_auc_near_1     0.160884
val_pr_auc_1          0.163165
val_pr_auc_near_2     0.190773
val_pr_auc_2          0.168372
val_pr_auc_near_3     0.142813
val_pr_auc_3          0.123439
val_pr_auc_near_4     0.130563
val_pr_auc_4          0.125050
val_pr_auc_near_5     0.142261
val_pr_auc_5          0.131808
Name: 1, dtype: float64


Grid Search:  19%|█▉        | 3/16 [42:35<3:07:53, 867.21s/it]

eta                   0.050000
max_depth             6.000000
subsample             0.800000
col_sample            0.600000
lambda_               1.000000
scale_pos_weight     50.000000
val_pr_auc_near_1     0.158790
val_pr_auc_1          0.163828
val_pr_auc_near_2     0.190346
val_pr_auc_2          0.168443
val_pr_auc_near_3     0.142176
val_pr_auc_3          0.123082
val_pr_auc_near_4     0.130075
val_pr_auc_4          0.124137
val_pr_auc_near_5     0.142039
val_pr_auc_5          0.132118
Name: 2, dtype: float64


Grid Search:  25%|██▌       | 4/16 [58:23<2:59:46, 898.85s/it]

eta                   0.050000
max_depth             6.000000
subsample             0.800000
col_sample            0.600000
lambda_              10.000000
scale_pos_weight     50.000000
val_pr_auc_near_1     0.157938
val_pr_auc_1          0.162837
val_pr_auc_near_2     0.190084
val_pr_auc_2          0.167982
val_pr_auc_near_3     0.141925
val_pr_auc_3          0.123119
val_pr_auc_near_4     0.130242
val_pr_auc_4          0.125001
val_pr_auc_near_5     0.140601
val_pr_auc_5          0.131839
Name: 3, dtype: float64


Grid Search:  31%|███▏      | 5/16 [1:06:52<2:19:02, 758.40s/it]

eta                   0.050000
max_depth             8.000000
subsample             0.500000
col_sample            0.600000
lambda_               1.000000
scale_pos_weight     50.000000
val_pr_auc_near_1     0.160075
val_pr_auc_1          0.162118
val_pr_auc_near_2     0.191238
val_pr_auc_2          0.169153
val_pr_auc_near_3     0.141187
val_pr_auc_3          0.123432
val_pr_auc_near_4     0.133065
val_pr_auc_4          0.126577
val_pr_auc_near_5     0.140374
val_pr_auc_5          0.130755
Name: 4, dtype: float64


Grid Search:  38%|███▊      | 6/16 [1:17:13<1:58:35, 711.57s/it]

eta                   0.050000
max_depth             8.000000
subsample             0.500000
col_sample            0.600000
lambda_              10.000000
scale_pos_weight     50.000000
val_pr_auc_near_1     0.163266
val_pr_auc_1          0.164597
val_pr_auc_near_2     0.193341
val_pr_auc_2          0.170131
val_pr_auc_near_3     0.143173
val_pr_auc_3          0.124383
val_pr_auc_near_4     0.133261
val_pr_auc_4          0.127327
val_pr_auc_near_5     0.141259
val_pr_auc_5          0.131103
Name: 5, dtype: float64


Grid Search:  44%|████▍     | 7/16 [1:26:33<1:39:18, 662.03s/it]

eta                   0.050000
max_depth             8.000000
subsample             0.800000
col_sample            0.600000
lambda_               1.000000
scale_pos_weight     50.000000
val_pr_auc_near_1     0.160277
val_pr_auc_1          0.162724
val_pr_auc_near_2     0.189925
val_pr_auc_2          0.167643
val_pr_auc_near_3     0.142757
val_pr_auc_3          0.124305
val_pr_auc_near_4     0.132593
val_pr_auc_4          0.126327
val_pr_auc_near_5     0.141685
val_pr_auc_5          0.131386
Name: 6, dtype: float64


Grid Search:  50%|█████     | 8/16 [1:39:55<1:34:13, 706.66s/it]

eta                   0.050000
max_depth             8.000000
subsample             0.800000
col_sample            0.600000
lambda_              10.000000
scale_pos_weight     50.000000
val_pr_auc_near_1     0.161401
val_pr_auc_1          0.164569
val_pr_auc_near_2     0.191953
val_pr_auc_2          0.169154
val_pr_auc_near_3     0.144538
val_pr_auc_3          0.124941
val_pr_auc_near_4     0.133889
val_pr_auc_4          0.127963
val_pr_auc_near_5     0.143071
val_pr_auc_5          0.132283
Name: 7, dtype: float64


Grid Search:  56%|█████▋    | 9/16 [1:46:43<1:11:33, 613.30s/it]

eta                   0.100000
max_depth             6.000000
subsample             0.500000
col_sample            0.600000
lambda_               1.000000
scale_pos_weight     50.000000
val_pr_auc_near_1     0.158036
val_pr_auc_1          0.162947
val_pr_auc_near_2     0.189509
val_pr_auc_2          0.166009
val_pr_auc_near_3     0.140842
val_pr_auc_3          0.122617
val_pr_auc_near_4     0.128429
val_pr_auc_4          0.123759
val_pr_auc_near_5     0.141372
val_pr_auc_5          0.131742
Name: 8, dtype: float64


Grid Search:  62%|██████▎   | 10/16 [1:54:59<57:42, 577.16s/it] 

eta                   0.100000
max_depth             6.000000
subsample             0.500000
col_sample            0.600000
lambda_              10.000000
scale_pos_weight     50.000000
val_pr_auc_near_1     0.159461
val_pr_auc_1          0.163085
val_pr_auc_near_2     0.191616
val_pr_auc_2          0.168769
val_pr_auc_near_3     0.140779
val_pr_auc_3          0.121947
val_pr_auc_near_4     0.129759
val_pr_auc_4          0.125277
val_pr_auc_near_5     0.140836
val_pr_auc_5          0.131464
Name: 9, dtype: float64


Grid Search:  69%|██████▉   | 11/16 [2:03:02<45:41, 548.30s/it]

eta                   0.100000
max_depth             6.000000
subsample             0.800000
col_sample            0.600000
lambda_               1.000000
scale_pos_weight     50.000000
val_pr_auc_near_1     0.157535
val_pr_auc_1          0.162137
val_pr_auc_near_2     0.193300
val_pr_auc_2          0.169517
val_pr_auc_near_3     0.140610
val_pr_auc_3          0.122157
val_pr_auc_near_4     0.129993
val_pr_auc_4          0.124670
val_pr_auc_near_5     0.142234
val_pr_auc_5          0.132046
Name: 10, dtype: float64


Grid Search:  75%|███████▌  | 12/16 [2:11:28<35:42, 535.50s/it]

eta                   0.100000
max_depth             6.000000
subsample             0.800000
col_sample            0.600000
lambda_              10.000000
scale_pos_weight     50.000000
val_pr_auc_near_1     0.158048
val_pr_auc_1          0.161828
val_pr_auc_near_2     0.191914
val_pr_auc_2          0.168471
val_pr_auc_near_3     0.141883
val_pr_auc_3          0.122430
val_pr_auc_near_4     0.127209
val_pr_auc_4          0.123386
val_pr_auc_near_5     0.142033
val_pr_auc_5          0.132084
Name: 11, dtype: float64


Grid Search:  81%|████████▏ | 13/16 [2:16:37<23:20, 466.78s/it]

eta                   0.100000
max_depth             8.000000
subsample             0.500000
col_sample            0.600000
lambda_               1.000000
scale_pos_weight     50.000000
val_pr_auc_near_1     0.158522
val_pr_auc_1          0.160548
val_pr_auc_near_2     0.192108
val_pr_auc_2          0.168947
val_pr_auc_near_3     0.142138
val_pr_auc_3          0.123486
val_pr_auc_near_4     0.130762
val_pr_auc_4          0.124594
val_pr_auc_near_5     0.142259
val_pr_auc_5          0.131669
Name: 12, dtype: float64


Grid Search:  88%|████████▊ | 14/16 [2:22:18<14:17, 428.89s/it]

eta                   0.100000
max_depth             8.000000
subsample             0.500000
col_sample            0.600000
lambda_              10.000000
scale_pos_weight     50.000000
val_pr_auc_near_1     0.161723
val_pr_auc_1          0.162474
val_pr_auc_near_2     0.191006
val_pr_auc_2          0.168407
val_pr_auc_near_3     0.142149
val_pr_auc_3          0.122890
val_pr_auc_near_4     0.131219
val_pr_auc_4          0.125406
val_pr_auc_near_5     0.142250
val_pr_auc_5          0.132223
Name: 13, dtype: float64


Grid Search:  94%|█████████▍| 15/16 [2:27:57<06:41, 401.63s/it]

eta                   0.100000
max_depth             8.000000
subsample             0.800000
col_sample            0.600000
lambda_               1.000000
scale_pos_weight     50.000000
val_pr_auc_near_1     0.161750
val_pr_auc_1          0.163322
val_pr_auc_near_2     0.190948
val_pr_auc_2          0.168404
val_pr_auc_near_3     0.139990
val_pr_auc_3          0.122780
val_pr_auc_near_4     0.131664
val_pr_auc_4          0.125794
val_pr_auc_near_5     0.141398
val_pr_auc_5          0.132029
Name: 14, dtype: float64


Grid Search: 100%|██████████| 16/16 [2:34:58<00:00, 581.16s/it]

eta                   0.100000
max_depth             8.000000
subsample             0.800000
col_sample            0.600000
lambda_              10.000000
scale_pos_weight     50.000000
val_pr_auc_near_1     0.160438
val_pr_auc_1          0.162332
val_pr_auc_near_2     0.189557
val_pr_auc_2          0.167908
val_pr_auc_near_3     0.142023
val_pr_auc_3          0.123051
val_pr_auc_near_4     0.131156
val_pr_auc_4          0.126033
val_pr_auc_near_5     0.142284
val_pr_auc_5          0.131420
Name: 15, dtype: float64





In [65]:
grid_sparse

Unnamed: 0,eta,max_depth,subsample,col_sample,lambda_,scale_pos_weight,val_pr_auc_near_1,val_pr_auc_1,val_pr_auc_near_2,val_pr_auc_2,val_pr_auc_near_3,val_pr_auc_3,val_pr_auc_near_4,val_pr_auc_4,val_pr_auc_near_5,val_pr_auc_5
0,0.05,6,0.5,0.6,1,50,0.160344,0.164269,0.194221,0.170676,0.143216,0.12382,0.131218,0.125694,0.140735,0.13162
1,0.05,6,0.5,0.6,10,50,0.160884,0.163165,0.190773,0.168372,0.142813,0.123439,0.130563,0.12505,0.142261,0.131808
2,0.05,6,0.8,0.6,1,50,0.15879,0.163828,0.190346,0.168443,0.142176,0.123082,0.130075,0.124137,0.142039,0.132118
3,0.05,6,0.8,0.6,10,50,0.157938,0.162837,0.190084,0.167982,0.141925,0.123119,0.130242,0.125001,0.140601,0.131839
4,0.05,8,0.5,0.6,1,50,0.160075,0.162118,0.191238,0.169153,0.141187,0.123432,0.133065,0.126577,0.140374,0.130755
5,0.05,8,0.5,0.6,10,50,0.163266,0.164597,0.193341,0.170131,0.143173,0.124383,0.133261,0.127327,0.141259,0.131103
6,0.05,8,0.8,0.6,1,50,0.160277,0.162724,0.189925,0.167643,0.142757,0.124305,0.132593,0.126327,0.141685,0.131386
7,0.05,8,0.8,0.6,10,50,0.161401,0.164569,0.191953,0.169154,0.144538,0.124941,0.133889,0.127963,0.143071,0.132283
8,0.1,6,0.5,0.6,1,50,0.158036,0.162947,0.189509,0.166009,0.140842,0.122617,0.128429,0.123759,0.141372,0.131742
9,0.1,6,0.5,0.6,10,50,0.159461,0.163085,0.191616,0.168769,0.140779,0.121947,0.129759,0.125277,0.140836,0.131464


In [53]:
from io import StringIO
s3 = boto3.client('s3')  # assumes you've run aws configure or have IAM role
# Create a CSV in memory
csv_buffer = StringIO()
grid_sparse.to_csv(csv_buffer, index=False)
s3.put_object(
    Bucket='kehmisjan2025',
    Key='gridseach_catboost_051425.csv',
    Body=csv_buffer.getvalue()
)

{'ResponseMetadata': {'RequestId': '3V6Z85P6VTN8HD0C',
  'HostId': '6/ICDkfMkO+onLpTmPu5GH5qn01YuIg/Cc6In+lo3qhwm9y4rR7W8tsem8zUQhagN+PZ32tAkjM=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '6/ICDkfMkO+onLpTmPu5GH5qn01YuIg/Cc6In+lo3qhwm9y4rR7W8tsem8zUQhagN+PZ32tAkjM=',
   'x-amz-request-id': '3V6Z85P6VTN8HD0C',
   'date': 'Wed, 14 May 2025 12:42:38 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"bb6c8a392cb752e6a260e0384c09a9d2"',
   'x-amz-checksum-crc32': 'pCB1XQ==',
   'x-amz-checksum-type': 'FULL_OBJECT',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"bb6c8a392cb752e6a260e0384c09a9d2"',
 'ChecksumCRC32': 'pCB1XQ==',
 'ChecksumType': 'FULL_OBJECT',
 'ServerSideEncryption': 'AES256'}

Adaboost

In [61]:
# Identify available numeric and categorical columns
numeric_cols = iit_data.select_dtypes(include='number').drop(columns=["iit", "SiteCode"], errors='ignore').columns.tolist()
categorical_cols = iit_data.select_dtypes(include='object').columns.tolist()

In [59]:
import pandas as pd
from collections import Counter
import xgboost as xgb

def custom_mode_imputer(column, exclude="NR"):
    counter = Counter(column.dropna())
    if exclude in counter:
        del counter[exclude]
    return counter.most_common(1)[0][0] if counter else None

def impute_data(df, cat_impute_values=None, num_impute_values=None, fit=False, categorical_cols=None, numeric_cols=None):
    if fit:
        cat_impute_values = {}
        num_impute_values = {}
        for col in categorical_cols:
            cat_impute_values[col] = custom_mode_imputer(df[col], exclude="NR")
        for col in numeric_cols:
            num_impute_values[col] = df[col].mean()
    
    for col in categorical_cols:
        df[col] = df[col].fillna(cat_impute_values.get(col, "Unknown"))
    for col in numeric_cols:
        df[col] = df[col].fillna(num_impute_values.get(col, 0))
        
    return df, cat_impute_values, num_impute_values

In [62]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# ========== Train Data (Jan–May 2023) ==========
train_data1 = iit_data.copy().drop(columns=["SiteCode"])
train_data1 = train_data1[(train_data1["NAD"] >= "2023-01-01") & (train_data1["NAD"] <= "2023-05-31")]
# Impute train data and get imputation values
train_data1, cat_impute_values, num_impute_values = impute_data(
    train_data1,
    fit=True,
    categorical_cols=categorical_cols,
    numeric_cols=numeric_cols
)

train_data1 = encode_xgboost(train_data1)
train_labels1 = train_data1["iit"]
train_features1 = train_data1.drop(columns=["key", "NAD", "iit"])

# ========== Validation Data (June 2023) ==========
val_data1 = iit_data.copy().drop(columns=["SiteCode"])
val_data1 = val_data1[(val_data1["NAD"] >= "2023-06-01") & (val_data1["NAD"] <= "2023-06-30")]
val_data1, _, _ = impute_data(
    val_data1,
    cat_impute_values=cat_impute_values,
    num_impute_values=num_impute_values,
    categorical_cols=categorical_cols,
    numeric_cols=numeric_cols
)

val_data1 = encode_xgboost(val_data1)
val_labels1 = val_data1["iit"]
val_features1 = val_data1.drop(columns=["key", "NAD", "iit"])

# ========== Test Near Data (July 2023) ==========
testnear_data1 = iit_data.copy().drop(columns=["SiteCode"])
testnear_data1 = testnear_data1[(testnear_data1["NAD"] >= "2023-07-01") & (testnear_data1["NAD"] <= "2023-07-31")]

testnear_data1, _, _ = impute_data(
    testnear_data1,
    cat_impute_values=cat_impute_values,
    num_impute_values=num_impute_values,
    categorical_cols=categorical_cols,
    numeric_cols=numeric_cols
)

testnear_data1 = encode_xgboost(testnear_data1)
testnear_labels1 = testnear_data1["iit"]
testnear_features1 = testnear_data1.drop(columns=["key", "NAD", "iit"])

# ========== Test Data (July–Sept 2023) ==========
test_data1 = iit_data.copy().drop(columns=["SiteCode"])
test_data1 = test_data1[(test_data1["NAD"] >= "2023-07-01") & (test_data1["NAD"] <= "2023-09-30")]
test_data1, _, _ = impute_data(
    test_data1,
    cat_impute_values=cat_impute_values,
    num_impute_values=num_impute_values,
    categorical_cols=categorical_cols,
    numeric_cols=numeric_cols
)

test_data1 = encode_xgboost(test_data1)
test_labels1 = test_data1["iit"]
test_features1 = test_data1.drop(columns=["key", "NAD", "iit"])


  df[col] = df[col].fillna(cat_impute_values.get(col, "Unknown"))


TypeError: Invalid value '0.066305066410241' for dtype Int64

In [None]:
# Fold 2
# Train Data (Jan–May 2023)
train_data2 = iit_data.copy()
train_data2 = train_data2.drop(columns=["SiteCode"])
train_data2 = train_data2[(train_data2["NAD"] >= "2023-04-01") & (train_data2["NAD"] <= "2023-08-30")]
train_data2 = encode_xgboost(train_data2)
train_labels2 = train_data2["iit"]
train_features2 = train_data2.drop(columns=["key", "NAD", "iit"])



# Validation Data (June 2023)
val_data2 = iit_data.copy()
val_data2 = val_data2.drop(columns=["SiteCode"])
val_data2 = val_data2[(val_data2["NAD"] >= "2023-09-01") & (val_data2["NAD"] <= "2023-09-30")]
val_data2 = encode_xgboost(val_data2)
val_labels2 = val_data2["iit"]
val_features2 = val_data2.drop(columns=["key", "NAD", "iit"])



# Test Near Data (July 2023)
testnear_data2 = iit_data.copy()
testnear_data2 = testnear_data2.drop(columns=["SiteCode"])
testnear_data2 = testnear_data2[(testnear_data2["NAD"] >= "2023-10-01") & (testnear_data2["NAD"] <= "2023-10-31")]
testnear_data2 = encode_xgboost(testnear_data2)
testnear_labels2 = testnear_data2["iit"]
testnear_features2 = testnear_data2.drop(columns=["key", "NAD", "iit"])


# Test Data (July–Sept 2023)
test_data2 = iit_data.copy()
test_data2 = test_data2.drop(columns=["SiteCode"])
test_data2 = test_data2[(test_data2["NAD"] >= "2023-10-01") & (test_data2["NAD"] <= "2023-12-31")]
test_data2 = encode_xgboost(test_data2)
test_labels2 = test_data2["iit"]
test_features2 = test_data2.drop(columns=["key", "NAD", "iit"])


In [56]:
from sklearn.metrics import roc_auc_score, accuracy_score

# Define and train the model
model = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=3),
    n_estimators=100,
    learning_rate=0.5,
    random_state=42
)
model.fit(train_features1, train_labels1)

# Validation
val_preds_proba = model.predict_proba(val_features1)[:, 1]
val_auc = roc_auc_score(val_labels1, val_preds_proba)
print(f"Validation AUC: {val_auc:.4f}")
val_aucpr = average_precision_score(val_labels1, val_preds_proba)
print(f"Validation AUC-PR: {val_aucpr:.4f}")

# Test Near (July)
testnear_preds_proba = model.predict_proba(testnear_features1)[:, 1]
testnear_auc = roc_auc_score(testnear_labels1, testnear_preds_proba)
print(f"Test Near AUC (July): {testnear_auc:.4f}")

# Test (July–Sept)
test_preds_proba = model.predict_proba(test_features1)[:, 1]
test_auc = roc_auc_score(test_labels1, test_preds_proba)
print(f"Test AUC (July–Sept): {test_auc:.4f}")


  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


ValueError: Input X contains NaN.
AdaBoostClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values