In [None]:
import pandas as pd
import numpy as np
import pyreadr
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_recall_curve, auc, roc_curve
from sklearn.utils import shuffle
from datetime import timedelta
from tqdm import tqdm
import random
import os
import boto3
import tempfile
from settings import settings

In [None]:
# Hand the AWS credentials and region held in the project settings module to
# boto3 through the process environment.
for aws_var in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'AWS_DEFAULT_REGION'):
    os.environ[aws_var] = getattr(settings, aws_var)

# Where the serialized R object lives in S3
bucket_name = 'kehmisjan2025'
file_key = 'targets_apr2.rds'

s3 = boto3.client('s3')

# pyreadr is given a filesystem path, so the object is staged in a named
# temporary file; the file is discarded when the with-block exits.
with tempfile.NamedTemporaryFile(suffix=".rds") as tmp_file:
    s3.download_fileobj(bucket_name, file_key, tmp_file)
    tmp_file.seek(0)  # rewind before the file is re-read by name
    result = pyreadr.read_r(tmp_file.name)  # mapping of object name -> data frame

# Take the first (assumed only) object stored in the file
iit_data = next(iter(result.values()))

In [None]:
# Debug aid (disabled): print(iit_data.dtypes)
# Parse 'NAD' from 'YYYY-MM-DD' values into datetimes so that the date-range
# filters and the .dt accessors used in the following cells work on it.
iit_data['NAD'] = pd.to_datetime(iit_data['NAD'], format='%Y-%m-%d')
# The same conversion for 'VisitDate' is currently disabled:
# iit_data['VisitDate'] = pd.to_datetime(iit_data['VisitDate'], format='%Y-%m-%d')

In [None]:
# Remove the tail of 2024 from the analysis set.
# The excluded window is inclusive on both ends: 2024-10-02 through 2024-12-31.
# NOTE(review): a full last quarter would start on 2024-10-01; the start date is
# left at the 2nd as originally written -- confirm whether 2024-10-01 rows
# should be excluded as well.
start_exclude = pd.Timestamp('2024-10-02')
end_exclude = pd.Timestamp('2024-12-31')

# Rows whose NAD is missing fail both comparisons and are therefore kept.
# .copy() makes the result an independent frame, so the column assignments in
# the following cells do not raise SettingWithCopyWarning on a filtered view.
iit_data = iit_data[~((iit_data['NAD'] >= start_exclude) & (iit_data['NAD'] <= end_exclude))].copy()

In [None]:
# Reference month for the time-period index: January 2022 counts as period 1
start_date = pd.to_datetime('2022-01-01')

# 'tp' is the 1-based number of calendar months from the reference month to
# the month of NAD (days within the month are ignored).
nad_parts = iit_data['NAD'].dt
whole_years = nad_parts.year - start_date.year
extra_months = nad_parts.month - start_date.month
iit_data['tp'] = (whole_years * 12) + extra_months + 1

In [None]:
# Indicator column: 1 when 'Day' equals "Friday", otherwise 0.
# (A matching December indicator built from 'Month' is currently disabled.)
iit_data['is_friday'] = iit_data['Day'].apply(lambda day_name: int(day_name == "Friday"))

In [None]:
# Each clinical column is paired with a *_Missing flag column. Where the value
# is null but the flag is 0, the value is replaced by the label 'NR'; every
# other entry is carried over unchanged.
# NOTE(review): 'NR' presumably means "not recorded" -- confirm.
nr_pairs = [
    ('BMI', 'BMI_Missing'),
    ('ARTAdherence', 'Adherence_Missing'),
    ('Pregnant', 'Pregnant_Missing'),
    ('Breastfeeding', 'Breastfeeding_Missing'),
    ('WHOStage', 'WHO_Missing'),
]
for value_col, flag_col in nr_pairs:
    unflagged_gap = iit_data[value_col].isna() & (iit_data[flag_col] == 0)
    iit_data[value_col] = np.where(unflagged_gap, 'NR', iit_data[value_col])

In [None]:
# Show the observed date coverage (min and max) before 'VisitDate' is discarded
for date_col in ('VisitDate', 'NAD'):
    print(iit_data[date_col].min(), iit_data[date_col].max())

# Columns that are not used as model inputs, including the *_Missing flags
# already consumed by the 'NR' labelling step and the 'Month'/'Day' columns.
columns_to_discard = [
    'OptimizedHIVRegimen', 'Drug', 'VisitDate', 'WHO_Missing', 'Type',
    'most_recent_cd4', 'regimen_switch', 'AHD', 'NAD_Imputation_Flag',
    'BMI_Missing', 'TimeatFacility', 'Adherence_Missing', 'Facility_type_category',
    'Pregnant_Missing', 'Breastfeeding_Missing', 'Month', 'Day',
]
iit_data = iit_data.drop(columns=columns_to_discard)
# Disabled: dropping the column range 'men_knowledge' .. 'women_sti'
# iit_data = iit_data.drop(columns=iit_data.loc[:, 'men_knowledge':'women_sti'].columns)


In [None]:
# Count-style columns ('num_late*' over the last 3, 5 and 10 entries) that must be numeric.
# pd.to_numeric is applied column by column; errors='coerce' turns any value that
# cannot be parsed as a number into NaN instead of raising.
selected_columns= ['num_late_last3', 'num_late14_last3', 'num_late30_last3',
       'num_late_last5', 'num_late14_last5', 'num_late30_last5',
       'num_late_last10', 'num_late14_last10', 'num_late30_last10']
iit_data[selected_columns] = iit_data[selected_columns].apply(pd.to_numeric, errors='coerce')

In [None]:


# Collapse the adherence ratings: 'good' -> 'optimal', 'fair'/'poor' -> 'suboptimal'.
# Values without a mapping (for example 'NR', or nulls) are carried over
# unchanged by back-filling from the original column.
adherence_recode = {
    'good': 'optimal',
    'fair': 'suboptimal',
    'poor': 'suboptimal'
}
adherence_raw = iit_data['ARTAdherence']
iit_data['ARTAdherence'] = adherence_raw.map(adherence_recode).fillna(adherence_raw)

# Binary flags stored as nullable integers: 1 for the named level, 0 for anything else
iit_data['Sex'] = iit_data['Sex'].eq('Male').astype('Int64')
iit_data['Emr'] = iit_data['Emr'].eq('KenyaEMR').astype('Int64')


In [None]:
def encode_xgboost(dataset, reference_columns=None):
    """One-hot encode the categorical model inputs of ``dataset``.

    The fixed list of categorical columns below is replaced by integer (0/1)
    dummy columns; every other column is passed through untouched.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain every column named in ``categorical_columns``.
    reference_columns : sequence of str, optional
        Column layout of an already-encoded frame (typically the encoded
        training split). When given, the result has exactly these columns in
        this order: dummy columns for levels absent from ``dataset`` are added
        and filled with 0, and dummy columns for levels unknown to the
        reference are dropped. Any other reference column missing from
        ``dataset`` is likewise created and filled with 0.

    Returns
    -------
    pandas.DataFrame
        The encoded frame. With ``reference_columns=None`` the first level of
        each categorical column *as observed in ``dataset``* is dropped, so two
        frames encoded independently can end up with different columns when
        their observed levels differ.

    Raises
    ------
    KeyError
        If ``dataset`` lacks one of the categorical columns.
    """
    # List of categorical variables to be encoded
    categorical_columns = ['BMI', 'ARTAdherence', 'Pregnant', 'Breastfeeding', 'DifferentiatedCare', 'WHOStage', 'most_recent_vl', 'MaritalStatus', 'EducationLevel',
       'Occupation', 'VisitBy','TCAReason', 'cascade_status', 'Kephlevel','Ownertype']

    passthrough = dataset.drop(columns=categorical_columns)

    if reference_columns is None:
        # Original behaviour: the baseline level is whichever sorts first in this frame
        ohe = pd.get_dummies(dataset[categorical_columns], drop_first=True, dtype=int)
        return pd.concat([passthrough, ohe], axis=1)

    # Aligned behaviour: build a dummy for every observed level, then let the
    # reference layout decide which ones survive. This keeps the baseline level
    # identical to the reference even if that level is absent from this frame.
    ohe = pd.get_dummies(dataset[categorical_columns], drop_first=False, dtype=int)
    dataset_encoded = pd.concat([passthrough, ohe], axis=1)
    return dataset_encoded.reindex(columns=list(reference_columns), fill_value=0)

In [None]:
# Column names grouped by dtype, as they stand in iit_data at this point.
# The target column "iit" is kept out of the numeric list so it is never imputed.
numeric_cols = [
    col for col in iit_data.select_dtypes(include='number').columns if col != "iit"
]
categorical_cols = list(iit_data.select_dtypes(include='object').columns)

In [None]:
import pandas as pd
from collections import Counter
import xgboost as xgb

def custom_mode_imputer(column, exclude="NR"):
    """Return the most frequent non-null value of ``column``, ignoring ``exclude``.

    Ties are resolved in favour of the value encountered first. Returns None
    when nothing is left after removing nulls and the ``exclude`` label.
    """
    tally = Counter(column.dropna())
    tally.pop(exclude, None)  # the excluded label never competes for the mode
    if not tally:
        return None
    (top_value, _top_count), = tally.most_common(1)
    return top_value

def impute_data(df, cat_impute_values=None, num_impute_values=None, fit=False, categorical_cols=None, numeric_cols=None):
    """Fill missing values in the listed categorical and numeric columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is not modified; a filled copy is returned.
    cat_impute_values, num_impute_values : dict, optional
        Column name -> fill value. Ignored (and recomputed) when ``fit`` is
        True; treated as empty when omitted with ``fit=False``.
    fit : bool
        When True, learn the fill values from ``df``: the most frequent value
        other than "NR" for categorical columns, the mean for numeric columns.
    categorical_cols, numeric_cols : list of str, optional
        Columns to impute; omitted lists are treated as empty.

    Returns
    -------
    tuple
        ``(filled_df, cat_impute_values, num_impute_values)``.

    Notes
    -----
    A categorical column with no stored value, or whose stored value is None
    (no usable mode was found when fitting), is filled with "Unknown". A
    numeric column with no stored value is filled with 0; a stored NaN mean
    leaves the missing values in place.
    """
    categorical_cols = [] if categorical_cols is None else list(categorical_cols)
    numeric_cols = [] if numeric_cols is None else list(numeric_cols)

    # Work on a copy so the caller's frame is never altered as a side effect
    df = df.copy()

    if fit:
        cat_impute_values = {
            col: custom_mode_imputer(df[col], exclude="NR") for col in categorical_cols
        }
        num_impute_values = {col: df[col].mean() for col in numeric_cols}
    else:
        if cat_impute_values is None:
            cat_impute_values = {}
        if num_impute_values is None:
            num_impute_values = {}

    for col in categorical_cols:
        fill_value = cat_impute_values.get(col)
        if fill_value is None:
            # fillna(None) raises, so a missing or None entry falls back to the label
            fill_value = "Unknown"
        df[col] = df[col].fillna(fill_value)

    for col in numeric_cols:
        fill_value = num_impute_values.get(col)
        if fill_value is None:
            fill_value = 0
        df[col] = df[col].fillna(fill_value)

    return df, cat_impute_values, num_impute_values


# Fold 1 -- splits are taken on NAD, both bounds inclusive:
#   train      2023-01-01 .. 2023-05-31   (imputation values are fitted here)
#   validation 2023-06-01 .. 2023-06-30
#   test-near  2023-07-01 .. 2023-07-31
#   test       2023-07-01 .. 2023-09-30   (includes the test-near month)
fold1_windows = {
    "train": ("2023-01-01", "2023-05-31"),
    "val": ("2023-06-01", "2023-06-30"),
    "testnear": ("2023-07-01", "2023-07-31"),
    "test": ("2023-07-01", "2023-09-30"),
}

fold1_frames = {}
for split_name, (first_day, last_day) in fold1_windows.items():
    # SiteCode is removed first; key and NAD go once NAD has been used for filtering
    split_frame = iit_data.drop(columns=["SiteCode"])
    split_frame = split_frame[split_frame["NAD"].between(first_day, last_day)]
    split_frame = split_frame.drop(columns=["key", "NAD"])

    if split_name == "train":
        # The training window defines the fill values reused by the other splits
        split_frame, cat_impute_values, num_impute_values = impute_data(
            split_frame,
            fit=True,
            categorical_cols=categorical_cols,
            numeric_cols=numeric_cols
        )
    else:
        split_frame, _, _ = impute_data(
            split_frame,
            cat_impute_values=cat_impute_values,
            num_impute_values=num_impute_values,
            categorical_cols=categorical_cols,
            numeric_cols=numeric_cols
        )

    fold1_frames[split_name] = encode_xgboost(split_frame)

train_data1 = fold1_frames["train"]
val_data1 = fold1_frames["val"]
testnear_data1 = fold1_frames["testnear"]
test_data1 = fold1_frames["test"]

# One DMatrix per split: every column except the target "iit" is a feature
dtrain1, dval1, dtestnear1, dtest1 = (
    xgb.DMatrix(data=frame.drop(columns=["iit"]), label=frame["iit"])
    for frame in (train_data1, val_data1, testnear_data1, test_data1)
)


In [None]:
# Fold 2 -- splits are taken on NAD, both bounds inclusive:
#   train      2023-04-01 .. 2023-08-31   (imputation values are fitted here)
#   validation 2023-09-01 .. 2023-09-30
#   test-near  2023-10-01 .. 2023-10-31
#   test       2023-10-01 .. 2023-12-31   (includes the test-near month)
fold2_windows = {
    "train": ("2023-04-01", "2023-08-31"),
    "val": ("2023-09-01", "2023-09-30"),
    "testnear": ("2023-10-01", "2023-10-31"),
    "test": ("2023-10-01", "2023-12-31"),
}

fold2_frames = {}
for split_name, (first_day, last_day) in fold2_windows.items():
    # SiteCode is removed first; key and NAD go once NAD has been used for filtering
    split_frame = iit_data.drop(columns=["SiteCode"])
    split_frame = split_frame[split_frame["NAD"].between(first_day, last_day)]
    split_frame = split_frame.drop(columns=["key", "NAD"])

    if split_name == "train":
        # The training window defines the fill values reused by the other splits
        split_frame, cat_impute_values, num_impute_values = impute_data(
            split_frame,
            fit=True,
            categorical_cols=categorical_cols,
            numeric_cols=numeric_cols
        )
    else:
        split_frame, _, _ = impute_data(
            split_frame,
            cat_impute_values=cat_impute_values,
            num_impute_values=num_impute_values,
            categorical_cols=categorical_cols,
            numeric_cols=numeric_cols
        )

    fold2_frames[split_name] = encode_xgboost(split_frame)

train_data2 = fold2_frames["train"]
val_data2 = fold2_frames["val"]
testnear_data2 = fold2_frames["testnear"]
test_data2 = fold2_frames["test"]

# One DMatrix per split: every column except the target "iit" is a feature
dtrain2, dval2, dtestnear2, dtest2 = (
    xgb.DMatrix(data=frame.drop(columns=["iit"]), label=frame["iit"])
    for frame in (train_data2, val_data2, testnear_data2, test_data2)
)

In [None]:
# Fold 3 -- splits are taken on NAD, both bounds inclusive:
#   train      2023-06-01 .. 2023-11-30   (imputation values are fitted here)
#   validation 2023-12-01 .. 2023-12-31
#   test-near  2024-01-01 .. 2024-01-31
#   test       2024-01-01 .. 2024-03-31   (includes the test-near month)
# The test window previously ended on 2024-01-31, which made it identical to
# the test-near window; it now spans three months starting at the test-near
# month, matching the layout of the other folds.
fold3_windows = {
    "train": ("2023-06-01", "2023-11-30"),
    "val": ("2023-12-01", "2023-12-31"),
    "testnear": ("2024-01-01", "2024-01-31"),
    "test": ("2024-01-01", "2024-03-31"),
}

fold3_frames = {}
for split_name, (first_day, last_day) in fold3_windows.items():
    # SiteCode is removed first; key and NAD go once NAD has been used for filtering
    split_frame = iit_data.drop(columns=["SiteCode"])
    split_frame = split_frame[split_frame["NAD"].between(first_day, last_day)]
    split_frame = split_frame.drop(columns=["key", "NAD"])

    if split_name == "train":
        # The training window defines the fill values reused by the other splits
        split_frame, cat_impute_values, num_impute_values = impute_data(
            split_frame,
            fit=True,
            categorical_cols=categorical_cols,
            numeric_cols=numeric_cols
        )
    else:
        split_frame, _, _ = impute_data(
            split_frame,
            cat_impute_values=cat_impute_values,
            num_impute_values=num_impute_values,
            categorical_cols=categorical_cols,
            numeric_cols=numeric_cols
        )

    fold3_frames[split_name] = encode_xgboost(split_frame)

train_data3 = fold3_frames["train"]
val_data3 = fold3_frames["val"]
testnear_data3 = fold3_frames["testnear"]
test_data3 = fold3_frames["test"]

# One DMatrix per split: every column except the target "iit" is a feature
dtrain3, dval3, dtestnear3, dtest3 = (
    xgb.DMatrix(data=frame.drop(columns=["iit"]), label=frame["iit"])
    for frame in (train_data3, val_data3, testnear_data3, test_data3)
)

In [None]:
# Fold 4 -- splits are taken on NAD, both bounds inclusive:
#   train      2023-09-01 .. 2024-02-29   (imputation values are fitted here)
#   validation 2024-03-01 .. 2024-03-31
#   test-near  2024-04-01 .. 2024-04-30
#   test       2024-04-01 .. 2024-06-30   (includes the test-near month)
fold4_windows = {
    "train": ("2023-09-01", "2024-02-29"),
    "val": ("2024-03-01", "2024-03-31"),
    "testnear": ("2024-04-01", "2024-04-30"),
    "test": ("2024-04-01", "2024-06-30"),
}

fold4_frames = {}
for split_name, (first_day, last_day) in fold4_windows.items():
    # SiteCode is removed first; key and NAD go once NAD has been used for filtering
    split_frame = iit_data.drop(columns=["SiteCode"])
    split_frame = split_frame[split_frame["NAD"].between(first_day, last_day)]
    split_frame = split_frame.drop(columns=["key", "NAD"])

    if split_name == "train":
        # The training window defines the fill values reused by the other splits
        split_frame, cat_impute_values, num_impute_values = impute_data(
            split_frame,
            fit=True,
            categorical_cols=categorical_cols,
            numeric_cols=numeric_cols
        )
    else:
        split_frame, _, _ = impute_data(
            split_frame,
            cat_impute_values=cat_impute_values,
            num_impute_values=num_impute_values,
            categorical_cols=categorical_cols,
            numeric_cols=numeric_cols
        )

    fold4_frames[split_name] = encode_xgboost(split_frame)

train_data4 = fold4_frames["train"]
val_data4 = fold4_frames["val"]
testnear_data4 = fold4_frames["testnear"]
test_data4 = fold4_frames["test"]

# One DMatrix per split: every column except the target "iit" is a feature
dtrain4, dval4, dtestnear4, dtest4 = (
    xgb.DMatrix(data=frame.drop(columns=["iit"]), label=frame["iit"])
    for frame in (train_data4, val_data4, testnear_data4, test_data4)
)

In [None]:
# Fold 5 -- splits are taken on NAD, both bounds inclusive:
#   train      2024-01-01 .. 2024-05-31   (imputation values are fitted here)
#   validation 2024-06-01 .. 2024-06-30
#   test-near  2024-07-01 .. 2024-07-31
#   test       2024-07-01 .. 2024-09-30   (includes the test-near month)
fold5_windows = {
    "train": ("2024-01-01", "2024-05-31"),
    "val": ("2024-06-01", "2024-06-30"),
    "testnear": ("2024-07-01", "2024-07-31"),
    "test": ("2024-07-01", "2024-09-30"),
}

fold5_frames = {}
for split_name, (first_day, last_day) in fold5_windows.items():
    # SiteCode is removed first; key and NAD go once NAD has been used for filtering
    split_frame = iit_data.drop(columns=["SiteCode"])
    split_frame = split_frame[split_frame["NAD"].between(first_day, last_day)]
    split_frame = split_frame.drop(columns=["key", "NAD"])

    if split_name == "train":
        # The training window defines the fill values reused by the other splits
        split_frame, cat_impute_values, num_impute_values = impute_data(
            split_frame,
            fit=True,
            categorical_cols=categorical_cols,
            numeric_cols=numeric_cols
        )
    else:
        split_frame, _, _ = impute_data(
            split_frame,
            cat_impute_values=cat_impute_values,
            num_impute_values=num_impute_values,
            categorical_cols=categorical_cols,
            numeric_cols=numeric_cols
        )

    fold5_frames[split_name] = encode_xgboost(split_frame)

train_data5 = fold5_frames["train"]
val_data5 = fold5_frames["val"]
testnear_data5 = fold5_frames["testnear"]
test_data5 = fold5_frames["test"]

# One DMatrix per split: every column except the target "iit" is a feature
dtrain5, dval5, dtestnear5, dtest5 = (
    xgb.DMatrix(data=frame.drop(columns=["iit"]), label=frame["iit"])
    for frame in (train_data5, val_data5, testnear_data5, test_data5)
)

In [None]:
# Per-fold bundle consumed by the grid-search loop. Position matters:
# [train DMatrix, validation DMatrix, test-near DMatrix, test DMatrix,
#  encoded test-near frame, encoded test frame]
fold_list = dict(
    fold1=[dtrain1, dval1, dtestnear1, dtest1, testnear_data1, test_data1],
    fold2=[dtrain2, dval2, dtestnear2, dtest2, testnear_data2, test_data2],
    fold3=[dtrain3, dval3, dtestnear3, dtest3, testnear_data3, test_data3],
    fold4=[dtrain4, dval4, dtestnear4, dtest4, testnear_data4, test_data4],
    fold5=[dtrain5, dval5, dtestnear5, dtest5, testnear_data5, test_data5],
)

In [None]:
from itertools import product
from sklearn.metrics import precision_recall_curve, average_precision_score

# Hyperparameter search space: candidate values per tuned parameter, keyed by
# the column name each parameter gets in the results table.
search_space = {
    "eta": [0.05],
    "max_depth": [6, 8],
    "subsample": [0.5, 0.8],
    "col_sample": [0.5, 0.8],
    "lambda_": [1, 10],
    "scale_pos_weight": [20, 50],
}

# Every combination of the candidate values, in the order the keys are listed
params_grid = list(product(*search_space.values()))

# One row per combination
grid_sparse = pd.DataFrame(params_grid, columns=list(search_space))

# Two empty score columns per fold (1..5), filled in by the grid-search loop
for k in range(1, 6):
    for score_prefix in ("val_pr_auc_near", "val_pr_auc"):
        grid_sparse[f"{score_prefix}_{k}"] = np.nan

In [None]:
# Grid search: for every parameter combination, train one model per fold and
# record its average precision on that fold's test-near and test splits.
# Note: the result columns are named "val_pr_auc_*", but the scores stored in
# them are computed on the test-near / test splits, not on the validation split.
for i in tqdm(range(len(grid_sparse)), desc="Grid Search"):

    row = grid_sparse.iloc[i]

    # The parameters depend only on the grid row, so build them once per row
    params = {
        "eta": row["eta"],
        "max_depth": int(row["max_depth"]),  # the row is read back as floats
        "subsample": row["subsample"],
        "colsample_bytree": row["col_sample"],
        "lambda": row["lambda_"],
        "scale_pos_weight": row["scale_pos_weight"],
        "eval_metric": "aucpr",
        "objective": "binary:logistic"
    }

    for k in tqdm(range(1, 6), desc=f"Fold {i+1}", leave=False):

        dtrain, dval, dtestnear, dtest, test_near, test_data = fold_list[f"fold{k}"]

        # Early stopping monitors the last entry of `evals`, i.e. the validation split
        xgb_model = xgb.train(
            params=params,
            dtrain=dtrain,
            num_boost_round=3000,
            evals=[(dtrain, "train"), (dval, "val")],
            early_stopping_rounds=100,
            verbose_eval=False
        )

        # xgb.train returns the booster from the LAST round, and the native
        # Booster.predict uses every tree by default. Restrict prediction to
        # the rounds up to the best validation score so the trees grown
        # after the optimum do not influence the reported metrics.
        best_rounds = (0, xgb_model.best_iteration + 1)

        # Score the near-term test month
        testnear_preds = xgb_model.predict(dtestnear, iteration_range=best_rounds)
        ap_near = average_precision_score(test_near["iit"], testnear_preds)
        grid_sparse.at[grid_sparse.index[i], f"val_pr_auc_near_{k}"] = ap_near

        # Score the full test window
        test_preds = xgb_model.predict(dtest, iteration_range=best_rounds)
        ap = average_precision_score(test_data["iit"], test_preds)
        grid_sparse.at[grid_sparse.index[i], f"val_pr_auc_{k}"] = ap

    # Progress report: the completed row, including all ten scores
    print(grid_sparse.iloc[i])


In [None]:
from io import StringIO
# Fresh S3 client; credentials are resolved from the environment
s3 = boto3.client('s3')

# Serialize the results table to CSV text in memory (no index column)
csv_buffer = StringIO()
grid_sparse.to_csv(csv_buffer, index=False)

# Upload the CSV text as a single object
upload_request = dict(
    Bucket='kehmisjan2025',
    Key='xgbgrid_sparse_050525.csv',
    Body=csv_buffer.getvalue(),
)
s3.put_object(**upload_request)