In [None]:
import pandas as pd
import numpy as np
import pyreadr
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_recall_curve, auc, roc_curve
from sklearn.utils import shuffle
from datetime import timedelta
from tqdm import tqdm
import random
import os
import boto3
import tempfile
from settings import settings

In [None]:
os.environ['AWS_ACCESS_KEY_ID'] = settings.AWS_ACCESS_KEY_ID
os.environ['AWS_SECRET_ACCESS_KEY'] = settings.AWS_SECRET_ACCESS_KEY
os.environ['AWS_DEFAULT_REGION'] = settings.AWS_DEFAULT_REGION

# Define S3 info
bucket_name = 'kehmisjan2025'
file_key = 'targets_apr23.rds'

# Initialize boto3 client
s3 = boto3.client('s3')

# Download to a temporary file
with tempfile.NamedTemporaryFile(suffix=".rds") as tmp_file:
    s3.download_fileobj(bucket_name, file_key, tmp_file)
    tmp_file.seek(0)  # go back to beginning
    result = pyreadr.read_r(tmp_file.name)  # returns a dictionary

# Extract the data frame
iit_data = next(iter(result.values()))  # assumes only one object inside

In [3]:
# print(iit_data.dtypes) 
# Ensure the 'NAD' column is converted to datetime
iit_data['NAD'] = pd.to_datetime(iit_data['NAD'], format='%Y-%m-%d')
# iit_data['VisitDate'] = pd.to_datetime(iit_data['VisitDate'], format='%Y-%m-%d')

In [4]:
# remove the last quarter of the year
# Define the date range to exclude
start_exclude = pd.Timestamp('2024-10-02')
end_exclude = pd.Timestamp('2024-12-31')

# Filter out records from Sept through Dec 2024
iit_data = iit_data[~((iit_data['NAD'] >= start_exclude) & (iit_data['NAD'] <= end_exclude))]

In [5]:
# iit_data['is_december'] = iit_data['Month'].apply(lambda x: 1 if x == "December" else 0)
iit_data['is_friday'] = iit_data['Day'].apply(lambda x: 1 if x == "Fri" else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iit_data['is_friday'] = iit_data['Day'].apply(lambda x: 1 if x == "Fri" else 0)


In [6]:
iit_data['BMI'] = np.where(iit_data['BMI'].isna() & (iit_data['BMI_Missing'] == 0), 'NR', iit_data['BMI'])
iit_data['ARTAdherence'] = np.where(iit_data['ARTAdherence'].isna() & (iit_data['Adherence_Missing'] == 0), 'NR', iit_data['ARTAdherence'])
iit_data['Pregnant'] = np.where(iit_data['Pregnant'].isna() & (iit_data['Pregnant_Missing'] == 0), 'NR', iit_data['Pregnant'])
iit_data['Breastfeeding'] = np.where(iit_data['Breastfeeding'].isna() & (iit_data['Breastfeeding_Missing'] == 0), 'NR', iit_data['Breastfeeding'])
iit_data['WHOStage'] = np.where(iit_data['WHOStage'].isna() & (iit_data['WHO_Missing'] == 0), 'NR', iit_data['WHOStage'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iit_data['BMI'] = np.where(iit_data['BMI'].isna() & (iit_data['BMI_Missing'] == 0), 'NR', iit_data['BMI'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iit_data['ARTAdherence'] = np.where(iit_data['ARTAdherence'].isna() & (iit_data['Adherence_Missing'] == 0), 'NR', iit_data['ARTAdherence'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/index

In [7]:
print(iit_data['VisitDate'].min(),iit_data['VisitDate'].max())
print(iit_data['NAD'].min(),iit_data['NAD'].max())

iit_data = iit_data.drop(columns=[
    'OptimizedHIVRegimen', 'Drug', 'VisitDate', 'WHO_Missing', 'Type',
    'most_recent_cd4', 'regimen_switch', 'AHD', 'NAD_Imputation_Flag',
    'BMI_Missing', 'TimeatFacility', 'Adherence_Missing', 'Facility_type_category',
    'Pregnant_Missing', 'Breastfeeding_Missing', 'Month', 'Day'
    # 'lastvd' to 'months_since_restart' would go here
    # 'Month', 'Day' handled below
    
])
# iit_data = iit_data.drop(columns=iit_data.loc[:, 'men_knowledge':'women_sti'].columns)

2021-01-04 2024-09-30
2022-01-01 00:00:00 2024-10-01 00:00:00


In [8]:
selected_columns= ['num_late_last3', 'num_late14_last3', 'num_late30_last3',
       'num_late_last5', 'num_late14_last5', 'num_late30_last5',
       'num_late_last10', 'num_late14_last10', 'num_late30_last10']
iit_data[selected_columns] = iit_data[selected_columns].apply(pd.to_numeric, errors='coerce')

In [9]:
iit_data['ARTAdherence'] = iit_data['ARTAdherence'].map({
    'good': 'optimal',
    'fair': 'suboptimal',
    'poor': 'suboptimal'
}).fillna(iit_data['ARTAdherence'])

# Sex: Male -> 1, else 0
iit_data['Sex'] = (iit_data['Sex'] == 'Male').astype('Int64')

# Emr: KenyaEMR -> 1, else 0
iit_data['Emr'] = (iit_data['Emr'] == 'KenyaEMR').astype('Int64')  # assuming there is an 'Emr' colum

In [10]:
def encode_xgboost(dataset):
    # List of categorical variables to be encoded
    categorical_columns = ['BMI', 'ARTAdherence', 'Pregnant', 'Breastfeeding', 'DifferentiatedCare', 'WHOStage', 'most_recent_vl', 'MaritalStatus', 'EducationLevel',
       'Occupation', 'VisitBy','TCAReason', 'cascade_status', 'Kephlevel','Ownertype'] 
    
    # One-hot encoding the categorical columns
    ohe = pd.get_dummies(dataset[categorical_columns], drop_first=True, dtype=int)
    
    # Concatenate the original dataset (excluding categorical columns) with the one-hot encoded columns
    dataset_encoded = pd.concat([dataset.drop(columns=categorical_columns), ohe], axis=1)
    
    return dataset_encoded

In [11]:
# Identify available numeric and categorical columns
numeric_cols = iit_data.select_dtypes(include='number').drop(columns=["iit", "SiteCode"], errors='ignore').columns.tolist()
categorical_cols = iit_data.select_dtypes(include='object').drop(columns=["key"], errors='ignore').columns.tolist()

In [12]:

from collections import Counter

def custom_mode_imputer(column, exclude="NR"):
    counter = Counter(column.dropna())
    if exclude in counter:
        del counter[exclude]
    return counter.most_common(1)[0][0] if counter else None

def impute_data(df, cat_impute_values=None, num_impute_values=None, fit=False, categorical_cols=None, numeric_cols=None):
    if fit:
        cat_impute_values = {}
        num_impute_values = {}
        for col in categorical_cols:
            cat_impute_values[col] = custom_mode_imputer(df[col], exclude="NR")
        for col in numeric_cols:
            num_impute_values[col] = df[col].mean()
    
    for col in categorical_cols:
        df[col] = df[col].fillna(cat_impute_values.get(col, "Unknown"))
    for col in numeric_cols:
        df[col] = df[col].fillna(num_impute_values.get(col, 0))
        
    return df, cat_impute_values, num_impute_values

In [13]:

vif_data= iit_data.copy()
start_exclude = pd.Timestamp('2023-01-01')
end_exclude = pd.Timestamp('2023-12-31')

# Filter out records from Sept through Dec 2024
vif_data = vif_data[~((vif_data['NAD'] >= start_exclude) & (vif_data['NAD'] <= end_exclude))]
vif_data=vif_data.drop(columns=['key', 'SiteCode','NAD','iit'])
vif_data, cat_impute_values, num_impute_values = impute_data(
    vif_data,
    fit=True,
    categorical_cols=categorical_cols,
    numeric_cols=numeric_cols
)


In [19]:
from decimal import Decimal

# Force conversion of all numeric columns to float
X_numeric = vif_data.select_dtypes(include=["number"]).applymap(lambda x: float(x) if isinstance(x, Decimal) else x)

# Ensure dtype is numeric
X_numeric = X_numeric.apply(pd.to_numeric, errors="coerce")

# Drop rows with NaNs (if any result from coercion)
X_numeric = X_numeric.dropna()


  X_numeric = vif_data.select_dtypes(include=["number"]).applymap(lambda x: float(x) if isinstance(x, Decimal) else x)


In [20]:
# Add constant and compute VIF
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

X_with_const = sm.add_constant(X_numeric)

vif_df = pd.DataFrame()
vif_df["Feature"] = X_with_const.columns
vif_df["VIF"] = [variance_inflation_factor(X_with_const.values, i) for i in range(X_with_const.shape[1])]

print(vif_df)


                      Feature            VIF
0                       const  232767.227622
1         StabilityAssessment       1.400736
2                  FirstVisit       1.066132
3                         Emr       1.224877
4                         Sex       1.007300
5                         Age       1.183629
6       DaystoNextAppointment       1.433336
7                   TimeonART       1.199985
8       regimen_switch_visits       1.007879
9            VisitUnscheduled       1.168295
10                     lastvd       2.094608
11             lateness_last3      30.513018
12             num_late_last3       5.591297
13           num_late14_last3      10.486281
14           num_late30_last3      12.382298
15             lateness_last5      59.557907
16             num_late_last5       9.770060
17           num_late14_last5      18.132832
18           num_late30_last5      19.468604
19            lateness_last10      25.050480
20            num_late_last10       5.305961
21        

  return 1 - self.ssr/self.centered_tss


In [None]:
pip install seaborn

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Compute correlation matrix
corr_matrix = X_numeric.corr()

# Visualize
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", vmin=-1, vmax=1)
plt.title("Feature Correlation Matrix")
plt.show()

In [24]:
threshold = 0.8
high_corr = corr_matrix[(abs(corr_matrix) > threshold) & (abs(corr_matrix) < 1.0)]

# Stack and drop NaNs to find pairs
high_corr_pairs = high_corr.stack().reset_index()
high_corr_pairs.columns = ["Variable 1", "Variable 2", "Correlation"]
print(high_corr_pairs.sort_values(by="Correlation", ascending=False))

                   Variable 1                 Variable 2  Correlation
26    rolling_weighted_noshow  rolling_weighted_dayslate     0.987446
27  rolling_weighted_dayslate    rolling_weighted_noshow     0.987446
24          women_highrisksex    women_sexnotwithpartner     0.977454
25    women_sexnotwithpartner          women_highrisksex     0.977454
21      men_sexnotwithpartner            men_highrisksex     0.941056
20            men_highrisksex      men_sexnotwithpartner     0.941056
8              lateness_last5            lateness_last10     0.939083
16            lateness_last10             lateness_last5     0.939083
7              lateness_last5             lateness_last3     0.916185
1              lateness_last3             lateness_last5     0.916185
9              num_late_last5             num_late_last3     0.870091
3              num_late_last3             num_late_last5     0.870091
4            num_late14_last3           num_late14_last5     0.856304
11           num_lat

In [14]:
iit_data= iit_data.drop(columns=['rolling_weighted_dayslate','women_sexnotwithpartner','men_sexnotwithpartner', 'men_nevertested','women_nevertested','lateness_last10',
                                  'lateness_last5','num_late14_last5','num_late14_last10','num_late30_last5','num_late30_last10'])

In [15]:
# Identify available numeric and categorical columns
numeric_cols = iit_data.select_dtypes(include='number').drop(columns=["iit", "SiteCode"], errors='ignore').columns.tolist()
categorical_cols = iit_data.select_dtypes(include='object').drop(columns=["key"], errors='ignore').columns.tolist()

In [16]:
# Fold 1
train_data1 = iit_data.copy()
train_data1 = train_data1.drop(columns=["SiteCode"])
train_data1 = train_data1[(train_data1["NAD"] >= "2023-01-01") & (train_data1["NAD"] <= "2023-05-31")]
train_data1 = train_data1.drop(columns=["key", "NAD"])

train_data1, cat_impute_values, num_impute_values = impute_data(
    train_data1,
    fit=True,
    categorical_cols=categorical_cols,
    numeric_cols=numeric_cols
)

train_data1 = encode_xgboost(train_data1)
X_train1 = train_data1.drop(columns=["iit"])
y_train1 = train_data1["iit"]

# ---------- Prepare Validation Data (June 2023) ----------
val_data1 = iit_data.copy()
val_data1 = val_data1.drop(columns=["SiteCode"])
val_data1 = val_data1[(val_data1["NAD"] >= "2023-06-01") & (val_data1["NAD"] <= "2023-06-30")]
val_data1 = val_data1.drop(columns=["key", "NAD"])

val_data1, _, _ = impute_data(
    val_data1,
    cat_impute_values=cat_impute_values,
    num_impute_values=num_impute_values,
    categorical_cols=categorical_cols,
    numeric_cols=numeric_cols
)

val_data1 = encode_xgboost(val_data1)
X_val1 = val_data1.drop(columns=["iit"])
y_val1 = val_data1["iit"]

# ---------- Prepare Test Near Data (July 2023) ----------
testnear_data1 = iit_data.copy()
testnear_data1 = testnear_data1.drop(columns=["SiteCode"])
testnear_data1 = testnear_data1[(testnear_data1["NAD"] >= "2023-07-01") & (testnear_data1["NAD"] <= "2023-07-31")]
testnear_data1 = testnear_data1.drop(columns=["key", "NAD"])

testnear_data1, _, _ = impute_data(
    testnear_data1,
    cat_impute_values=cat_impute_values,
    num_impute_values=num_impute_values,
    categorical_cols=categorical_cols,
    numeric_cols=numeric_cols
)

testnear_data1 = encode_xgboost(testnear_data1)
X_testnear1 = testnear_data1.drop(columns=["iit"])
y_testnear1 = testnear_data1["iit"]

# ---------- Prepare Test Data (July–Sept 2023) ----------
test_data1 = iit_data.copy()
test_data1 = test_data1.drop(columns=["SiteCode"])
test_data1 = test_data1[(test_data1["NAD"] >= "2023-07-01") & (test_data1["NAD"] <= "2023-09-30")]
test_data1 = test_data1.drop(columns=["key", "NAD"])

test_data1, _, _ = impute_data(
    test_data1,
    cat_impute_values=cat_impute_values,
    num_impute_values=num_impute_values,
    categorical_cols=categorical_cols,
    numeric_cols=numeric_cols
)

test_data1 = encode_xgboost(test_data1)
X_test1 = test_data1.drop(columns=["iit"])
y_test1 = test_data1["iit"]



In [17]:
# Fold 2
# Prepare Train Data (Jan–May 2023)
train_data2 = iit_data.copy()
train_data2 = train_data2.drop(columns=["SiteCode"])
train_data2 = train_data2[(train_data2["NAD"] >= "2023-04-01") & (train_data2["NAD"] <= "2023-08-31")]
train_data2 = train_data2.drop(columns=["key", "NAD"])

# Impute train data and get imputation values
train_data2, cat_impute_values, num_impute_values = impute_data(
    train_data2,
    fit=True,
    categorical_cols=categorical_cols,
    numeric_cols=numeric_cols
)

train_data2 = encode_xgboost(train_data2)

X_train2 = train_data2.drop(columns=["iit"])
y_train2 = train_data2["iit"]

# Validation Data (June 2023)
val_data2 = iit_data.copy()
val_data2 = val_data2.drop(columns=["SiteCode"])
val_data2 = val_data2[(val_data2["NAD"] >= "2023-09-01") & (val_data2["NAD"] <= "2023-09-30")]
val_data2 = val_data2.drop(columns=["key", "NAD"])
val_data2, _, _ = impute_data(
    val_data2,
    cat_impute_values=cat_impute_values,
    num_impute_values=num_impute_values,
    categorical_cols=categorical_cols,
    numeric_cols=numeric_cols
)
val_data2 = encode_xgboost(val_data2)
X_val2 = val_data2.drop(columns=["iit"])
y_val2 = val_data2["iit"]


# Test Near Data (July 2023)
testnear_data2 = iit_data.copy()
testnear_data2 = testnear_data2.drop(columns=["SiteCode"])
testnear_data2 = testnear_data2[(testnear_data2["NAD"] >= "2023-10-01") & (testnear_data2["NAD"] <= "2023-10-31")]
testnear_data2 = testnear_data2.drop(columns=["key", "NAD"])
testnear_data2, _, _ = impute_data(
    testnear_data2,
    cat_impute_values=cat_impute_values,
    num_impute_values=num_impute_values,
    categorical_cols=categorical_cols,
    numeric_cols=numeric_cols
)
testnear_data2 = encode_xgboost(testnear_data2)

X_testnear2 = testnear_data2.drop(columns=["iit"])
y_testnear2 = testnear_data2["iit"]

# Test Data (July–Sept 2023)
test_data2 = iit_data.copy()
test_data2 = test_data2.drop(columns=["SiteCode"])
test_data2 = test_data2[(test_data2["NAD"] >= "2023-10-01") & (test_data2["NAD"] <= "2023-12-31")]
test_data2 = test_data2.drop(columns=["key", "NAD"])
test_data2, _, _ = impute_data(
    test_data2,
    cat_impute_values=cat_impute_values,
    num_impute_values=num_impute_values,
    categorical_cols=categorical_cols,
    numeric_cols=numeric_cols
)
test_data2 = encode_xgboost(test_data2)

X_test2 = test_data2.drop(columns=["iit"])
y_test2 = test_data2["iit"]

In [18]:
# Fold 3
# Prepare Train Data (Jan–May 3033)
train_data3 = iit_data.copy()
train_data3 = train_data3.drop(columns=["SiteCode"])
train_data3 = train_data3[(train_data3["NAD"] >= "2023-06-01") & (train_data3["NAD"] <= "2023-11-30")]
train_data3 = train_data3.drop(columns=["key", "NAD"])

# Impute train data and get imputation values
train_data3, cat_impute_values, num_impute_values = impute_data(
    train_data3,
    fit=True,
    categorical_cols=categorical_cols,
    numeric_cols=numeric_cols
)

train_data3 = encode_xgboost(train_data3)

X_train3 = train_data3.drop(columns=["iit"])
y_train3 = train_data3["iit"]

# Validation Data (June 3033)
val_data3 = iit_data.copy()
val_data3 = val_data3.drop(columns=["SiteCode"])
val_data3 = val_data3[(val_data3["NAD"] >= "2023-12-01") & (val_data3["NAD"] <= "2023-12-31")]
val_data3 = val_data3.drop(columns=["key", "NAD"])
val_data3, _, _ = impute_data(
    val_data3,
    cat_impute_values=cat_impute_values,
    num_impute_values=num_impute_values,
    categorical_cols=categorical_cols,
    numeric_cols=numeric_cols
)
val_data3 = encode_xgboost(val_data3)

X_val3 = val_data3.drop(columns=["iit"])
y_val3 = val_data3["iit"]
# Test Near Data (July 3033)
testnear_data3 = iit_data.copy()
testnear_data3 = testnear_data3.drop(columns=["SiteCode"])
testnear_data3 = testnear_data3[(testnear_data3["NAD"] >= "2024-01-01") & (testnear_data3["NAD"] <= "2024-01-31")]
testnear_data3 = testnear_data3.drop(columns=["key", "NAD"])
testnear_data3, _, _ = impute_data(
    testnear_data3,
    cat_impute_values=cat_impute_values,
    num_impute_values=num_impute_values,
    categorical_cols=categorical_cols,
    numeric_cols=numeric_cols
)
testnear_data3 = encode_xgboost(testnear_data3)

X_testnear3 = testnear_data3.drop(columns=["iit"])
y_testnear3 = testnear_data3["iit"]

# Test Data (July–Sept 3033)
test_data3 = iit_data.copy()
test_data3 = test_data3.drop(columns=["SiteCode"])
test_data3 = test_data3[(test_data3["NAD"] >= "2024-01-01") & (test_data3["NAD"] <= "2024-01-31")]
test_data3 = test_data3.drop(columns=["key", "NAD"])
test_data3, _, _ = impute_data(
    test_data3,
    cat_impute_values=cat_impute_values,
    num_impute_values=num_impute_values,
    categorical_cols=categorical_cols,
    numeric_cols=numeric_cols
)
test_data3 = encode_xgboost(test_data3)

X_test3 = test_data3.drop(columns=["iit"])
y_test3 = test_data3["iit"]

In [19]:
# Fold 4
# Prepare Train Data (Jan–May 4044)
train_data4 = iit_data.copy()
train_data4 = train_data4.drop(columns=["SiteCode"])
train_data4 = train_data4[(train_data4["NAD"] >= "2023-09-01") & (train_data4["NAD"] <= "2024-02-29")]
train_data4 = train_data4.drop(columns=["key", "NAD"])

# Impute train data and get imputation values
train_data4, cat_impute_values, num_impute_values = impute_data(
    train_data4,
    fit=True,
    categorical_cols=categorical_cols,
    numeric_cols=numeric_cols
)

train_data4 = encode_xgboost(train_data4)

X_train4 = train_data4.drop(columns=["iit"])
y_train4 = train_data4["iit"]

# Validation Data (June 4044)
val_data4 = iit_data.copy()
val_data4 = val_data4.drop(columns=["SiteCode"])
val_data4 = val_data4[(val_data4["NAD"] >= "2024-03-01") & (val_data4["NAD"] <= "2024-03-31")]
val_data4 = val_data4.drop(columns=["key", "NAD"])
val_data4, _, _ = impute_data(
    val_data4,
    cat_impute_values=cat_impute_values,
    num_impute_values=num_impute_values,
    categorical_cols=categorical_cols,
    numeric_cols=numeric_cols
)
val_data4 = encode_xgboost(val_data4)

X_val4 = val_data4.drop(columns=["iit"])
y_val4 = val_data4["iit"]
# Test Near Data (July 4044)
testnear_data4 = iit_data.copy()
testnear_data4 = testnear_data4.drop(columns=["SiteCode"])
testnear_data4 = testnear_data4[(testnear_data4["NAD"] >= "2024-04-01") & (testnear_data4["NAD"] <= "2024-04-30")]
testnear_data4 = testnear_data4.drop(columns=["key", "NAD"])
testnear_data4, _, _ = impute_data(
    testnear_data4,
    cat_impute_values=cat_impute_values,
    num_impute_values=num_impute_values,
    categorical_cols=categorical_cols,
    numeric_cols=numeric_cols
)
testnear_data4 = encode_xgboost(testnear_data4)
X_testnear4 = testnear_data4.drop(columns=["iit"])
y_testnear4 = testnear_data4["iit"]


# Test Data (July–Sept 4044)
test_data4 = iit_data.copy()
test_data4 = test_data4.drop(columns=["SiteCode"])
test_data4 = test_data4[(test_data4["NAD"] >= "2024-04-01") & (test_data4["NAD"] <= "2024-06-30")]
test_data4 = test_data4.drop(columns=["key", "NAD"])
test_data4, _, _ = impute_data(
    test_data4,
    cat_impute_values=cat_impute_values,
    num_impute_values=num_impute_values,
    categorical_cols=categorical_cols,
    numeric_cols=numeric_cols
)
test_data4 = encode_xgboost(test_data4)

X_test4 = test_data4.drop(columns=["iit"])
y_test4 = test_data4["iit"]

In [20]:
# Fold 5
# Prepare Train Data (Jan–May 5055)
train_data5 = iit_data.copy()
train_data5 = train_data5.drop(columns=["SiteCode"])
train_data5 = train_data5[(train_data5["NAD"] >= "2024-01-01") & (train_data5["NAD"] <= "2024-05-31")]
train_data5 = train_data5.drop(columns=["key", "NAD"])

# Impute train data and get imputation values
train_data5, cat_impute_values, num_impute_values = impute_data(
    train_data5,
    fit=True,
    categorical_cols=categorical_cols,
    numeric_cols=numeric_cols
)

train_data5 = encode_xgboost(train_data5)
X_train5 = train_data5.drop(columns=["iit"])
y_train5 = train_data5["iit"]


# Validation Data (June 5055)
val_data5 = iit_data.copy()
val_data5 = val_data5.drop(columns=["SiteCode"])
val_data5 = val_data5[(val_data5["NAD"] >= "2024-06-01") & (val_data5["NAD"] <= "2024-06-30")]
val_data5 = val_data5.drop(columns=["key", "NAD"])
val_data5, _, _ = impute_data(
    val_data5,
    cat_impute_values=cat_impute_values,
    num_impute_values=num_impute_values,
    categorical_cols=categorical_cols,
    numeric_cols=numeric_cols
)
val_data5 = encode_xgboost(val_data5)

X_val5 = val_data5.drop(columns=["iit"])
y_val5 = val_data5["iit"]
# Test Near Data (July 5055)
testnear_data5 = iit_data.copy()
testnear_data5 = testnear_data5.drop(columns=["SiteCode"])
testnear_data5 = testnear_data5[(testnear_data5["NAD"] >= "2024-07-01") & (testnear_data5["NAD"] <= "2024-07-31")]
testnear_data5 = testnear_data5.drop(columns=["key", "NAD"])
testnear_data5, _, _ = impute_data(
    testnear_data5,
    cat_impute_values=cat_impute_values,
    num_impute_values=num_impute_values,
    categorical_cols=categorical_cols,
    numeric_cols=numeric_cols
)
testnear_data5 = encode_xgboost(testnear_data5)

X_testnear5 = testnear_data5.drop(columns=["iit"])
y_testnear5 = testnear_data5["iit"]

# Test Data (July–Sept 5055)
test_data5 = iit_data.copy()
test_data5 = test_data5.drop(columns=["SiteCode"])
test_data5 = test_data5[(test_data5["NAD"] >= "2024-07-01") & (test_data5["NAD"] <= "2024-09-30")]
test_data5 = test_data5.drop(columns=["key", "NAD"])
test_data5, _, _ = impute_data(
    test_data5,
    cat_impute_values=cat_impute_values,
    num_impute_values=num_impute_values,
    categorical_cols=categorical_cols,
    numeric_cols=numeric_cols
)
test_data5 = encode_xgboost(test_data5)

X_test5 = test_data5.drop(columns=["iit"])
y_test5 = test_data5["iit"]

In [21]:
fold_list = {
    "fold1": [X_train1, y_train1, X_val1, y_val1, X_testnear1, y_testnear1,X_test1,y_test1],
    "fold2": [X_train2, y_train2, X_val2, y_val2, X_testnear2, y_testnear2,X_test2,y_test2],
    "fold3": [X_train3, y_train3, X_val3, y_val3, X_testnear3, y_testnear3, X_test3,y_test3],
    "fold4": [X_train4, y_train4, X_val4, y_val4, X_testnear4, y_testnear4,X_test4,y_test4],
    "fold5": [X_train5, y_train5, X_val5, y_val5, X_testnear5, y_testnear5, X_test5,y_test5],
}


In [22]:
from itertools import product
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score
from tqdm import tqdm

# Step 1: Create a grid for Logistic Regression hyperparameters
params_grid = list(product(
    ['l2'],             # penalty
    [1.0] ,       # inverse of regularization strength C
    ['liblinear']  # Add solver
))

# Build DataFrame
grid_sparse = pd.DataFrame(params_grid, columns=[
    "penalty", "C","solver"
])

# Step 2: Add empty columns to record validation metrics (PR AUC)
for k in range(1, 6):
    grid_sparse[f"val_pr_auc_near_{k}"] = np.nan
    grid_sparse[f"val_pr_auc_{k}"] = np.nan


In [23]:
grid_sparse

Unnamed: 0,penalty,C,solver,val_pr_auc_near_1,val_pr_auc_1,val_pr_auc_near_2,val_pr_auc_2,val_pr_auc_near_3,val_pr_auc_3,val_pr_auc_near_4,val_pr_auc_4,val_pr_auc_near_5,val_pr_auc_5
0,l2,1.0,liblinear,,,,,,,,,,


In [24]:
for i in tqdm(range(len(grid_sparse)), desc="Grid Search"):

    row = grid_sparse.iloc[i]

    for k in tqdm(range(1, 6), desc=f"Fold {i+1}", leave=False):

        # Unpack fold-specific data
        X_train, y_train, X_val, y_val, X_testnear, y_testnear, X_test, y_test = fold_list[f"fold{k}"]

        try:
            # Initialize Logistic Regression model
            model = LogisticRegression(
                penalty=row["penalty"],
                C=row["C"],
                solver=row["solver"],
                class_weight='balanced',
                max_iter=100,
                random_state=42
            )

            # Fit the model
            model.fit(X_train, y_train)

            # Predict on near test set
            testnear_preds = model.predict_proba(X_testnear)[:, 1]
            ap_near = average_precision_score(y_testnear, testnear_preds)
            grid_sparse.at[grid_sparse.index[i], f"val_pr_auc_near_{k}"] = ap_near

            # Predict on main test set
            test_preds = model.predict_proba(X_test)[:, 1]
            ap = average_precision_score(y_test, test_preds)
            grid_sparse.at[grid_sparse.index[i], f"val_pr_auc_{k}"] = ap

        except Exception as e:
            print(f"Failed on grid index {i} fold {k}: {e}")
            continue

    print(grid_sparse.iloc[i])

Grid Search: 100%|██████████| 1/1 [45:16<00:00, 2716.13s/it]

penalty                     l2
C                          1.0
solver               liblinear
val_pr_auc_near_1     0.122133
val_pr_auc_1          0.129589
val_pr_auc_near_2     0.147689
val_pr_auc_2          0.133898
val_pr_auc_near_3     0.123767
val_pr_auc_3          0.123767
val_pr_auc_near_4     0.099695
val_pr_auc_4          0.096408
val_pr_auc_near_5     0.109903
val_pr_auc_5          0.110506
Name: 0, dtype: object





In [25]:
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt

# Assume y_true and y_pred_proba are already defined
test_preds = model.predict_proba(X_test5)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test5, test_preds)
roc_auc = roc_auc_score(y_test5, test_preds)

In [26]:
roc_auc

np.float64(0.7381695188528021)

In [None]:
from io import StringIO
s3 = boto3.client('s3')  # assumes you've run aws configure or have IAM role
# Create a CSV in memory
csv_buffer = StringIO()
test.to_csv(csv_buffer, index=False)
s3.put_object(
    Bucket='kehmisjan2025',
    Key='test_051325_rf.csv',
    Body=csv_buffer.getvalue()
)

{'ResponseMetadata': {'RequestId': 'W0AZSKZV9V2M9NQX',
  'HostId': 'KGoPWPTCrOqXUt41RmncK6nRQdCyIGJAQQOvyV2V2ICDNiB/5FqvrMGN/FgvjY0ZGBs2bFN2hK0=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'KGoPWPTCrOqXUt41RmncK6nRQdCyIGJAQQOvyV2V2ICDNiB/5FqvrMGN/FgvjY0ZGBs2bFN2hK0=',
   'x-amz-request-id': 'W0AZSKZV9V2M9NQX',
   'date': 'Tue, 13 May 2025 12:26:18 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"3af81b0aa122cc8d62dc49c3d36fce7d"',
   'x-amz-checksum-crc32': 'kpOazg==',
   'x-amz-checksum-type': 'FULL_OBJECT',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"3af81b0aa122cc8d62dc49c3d36fce7d"',
 'ChecksumCRC32': 'kpOazg==',
 'ChecksumType': 'FULL_OBJECT',
 'ServerSideEncryption': 'AES256'}