In [1]:
from datetime import datetime

import numpy as np
import pandas as pd
import plotly.express as px
import regex as re

from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [2]:
# Load the raw survey export; the path is relative to the notebook's working directory.
imonitor = pd.read_csv('data/imonitor_1703.csv')
# Preview the first rows to sanity-check the load.
imonitor.head()

  imonitor = pd.read_csv('data/imonitor_1703.csv')


Unnamed: 0,Survey ID,Created Date,Facility name and MFL Code if applicable,Facility ownership,Please specify,County,What is your month; and year of birth,How do you consider yourself?,What is the highest level of education you completed?,Please specify.1,...,how long do you wait on average to get a service; which service was that?,Do you consider the waiting time for lab test results long?,how long do you wait on average to get your lab test result?,Does the facility offer support groups?,Specify the support group you belong to,In your opinion are the services offered at this facility youth friendly?,What measures have been put in place to create GBV awareness and its harmful effects within the community?,Please Specify,PWD In your opinion are the services offered at this facility persons-with-disability friendly?,What are the top 1-3 things you don’t like about this facility with regards to care and treatment?
0,2390063,04-Dec-23,BABA DOGO HEALTH CENTRE,GOK,,Nairobi,1977-09-03,Male,Primary school,,...,,No,,Yes,Adults,Yes,Presence of GBV Desk;,Chiefs office,Yes,
1,2390062,04-Dec-23,BABA DOGO HEALTH CENTRE,GOK,,Nairobi,1972-08-12,Female,Secondary school,,...,,No,,No,,Yes,Presence of GBV Desk;,Chiefs office,,
2,2390061,04-Dec-23,BABA DOGO HEALTH CENTRE,GOK,,Nairobi,1984-08-31,Female,Primary school,,...,,Yes,2 hours,No,,Yes,Presence of GBV Desk;,Chiefs office,Yes,
3,2390060,04-Dec-23,BABA DOGO HEALTH CENTRE,GOK,,Nairobi,1977-05-07,Female,Primary school,,...,,No,,No,,Yes,Presence of GBV Desk;,Police station,,
4,2390059,04-Dec-23,BABA DOGO HEALTH CENTRE,GOK,,Nairobi,1987-06-13,Male,Vocational training or technician,,...,1 hour,Yes,2 hours,Yes,Adults,Yes,Presence of GBV Desk;,Police station,,


In [3]:
imonitor.shape

(46549, 85)

In [4]:
# Collect every column whose header contains either capitalisation of the
# free-text follow-up marker, then remove them all in one in-place drop.
SPECIFY_MARKERS = ("Please specify", "Please Specify")
cols_to_drop = [
    col for col in imonitor.columns
    if any(marker in col for marker in SPECIFY_MARKERS)
]

imonitor.drop(columns=cols_to_drop, inplace=True)

In [5]:
imonitor.shape

(46549, 69)

In [6]:
# Trim leading/trailing whitespace from every column header.
imonitor.columns = [name.strip() for name in imonitor.columns]

In [7]:
columns_to_drop = [
    "Survey ID",
    "Facility name and MFL Code if applicable",
    "What is your month; and year of birth",
    "How do you consider yourself?",
    "What is the highest level of education you completed?",
    "What is your current marital status?",
    "Which county do you currently live in?",
    "What are your sources of income?",
    "Facility name",
    "What did you like about the services you received?",
    "What did you not like about the services you received?",
    "In your opinion what would you like to be improved?",
    "In your opinion what can be done to improve access to the services you seek at the facility?",
    "Facility name denied service",
    "Why",
    "Were reasons provided as to why these services were not available?",
    "Were reasons provided as to why these services were not available?.1",
    "What are the barriers to uptake of VMMC by males 25+years and above?",
    "What are some of the current site level practices that community members like and would love to maintain for KP/PP ?",
    "What would you like this facility to change/do better?",
    "Throughout your visit what did you find interesting/pleasing about this facility that should be emulated by other facilities?",
    "What do you think can be improved",
    "Anything else that you would like to mention?",
    "What are the top 1-3 things you like about this facility with regards to care and treatment?"
]

# Remove the columns listed above from imonitor in place
# (raises KeyError if any listed column is absent).
imonitor.drop(columns=columns_to_drop, inplace=True)

In [8]:
column_name_mapping = {
    "Created Date": "Date",
    "Organization name coordinating the feedback from the clients": "OrgFeedbackCoordinator",
    "Facility ownership": "FacilityOwnership",
    "County": "FacilityCounty",
    "For how long have you been accessing services (based on the expected package of services) in this facility?": "ServiceAccessDuration",
    "Are you aware of the package of services that you are entitled to?": "ServicesAwareness",
    "According to you; which HIV related services are you likely to receive in this facility?": "ExpectedHIVServices",
    "Is there a service that you needed that was not provided?": "UnprovidedService",
    "Facility name no service": "UnprovidedServiceFacilityName",
    "For that service that was not provided; were you referred?": "ReferralForUnprovidedService",
    "If referred; did you receive the service where you were referred to?": "ReferralServiceReceived",
    "If Yes which Service/Test/Medicine": "ReceivedServiceDetail",
    "On a scale of 1 to 5; how satisfied are you with the package of services received in this facility? If 1 is VERY UNSATISFIED and 5 is VERY SATISFIED.": "ServiceSatisfaction",
    "Do you face any challenges when accessing the services at the facility?": "AccessChallenges",
    "Common issues that can be added in the drop-down box": "CommonIssuesDropdown",
    "Was confidentiality considered while you were being served?": "Confidentiality",
    "Are there age-appropriate health services for specific groups?": "AgeAppropriateServices",
    "Does the facility allow you to share your concerns with the administration?": "ConcernsSharing",
    "Do you know your health-related rights as a client of this facility?": "RightsAwareness",
    "Have you ever been denied services at this facility?": "ServiceDenial",
    "Are you comfortable with getting services at this facility": "ComfortWithServices",
    "Have you ever been counseled?": "CounselingReceived",
    "Did you identify any gaps in the facility when you tried to access the services": "IdentifiedGaps",
    "Service type": "ServiceGapsType",
    "Are the HIV testing services readily available when required?": "HIVTestingAvailability",
    "Have you ever Interrupted your treatment?": "TreatmentInterruption",
    "Are the PMTCT services readily available when required?": "PMTCTServiceAvailability",
    "Are the HIV prevention; testing; treatment and care services adequate for KPs?": "KPServiceAdequacy",
    "Facility Level": "FacilityLevel",
    "Facility Operation times": "OperationTimes",
    "Facility Operation Days": "OperationDays",
    "What are your preferred days of visiting the facility": "PreferredVisitDays",
    "What are your preferred time of visiting the facility": "PreferredVisitTimes",
    "On a scale of 1-5; how clean do you find the facility?": "FacilityCleanliness",
    "How do you reach this facility?": "FacilityAccessMode",
    "How long does it take to reach this facility?": "FacilityAccessTime",
    "On a scale of 1-5; how accessible do you find this facility?": "FacilityAccessibility",
    "Do you consider the waiting time to be seen at this facility long?": "WaitingTimeOpinion",
    "how long do you wait on average to get a service; which service was that?": "AverageWaitingTime",
    "Do you consider the waiting time for lab test results long?": "LabResultsWaitingTimeOpinion",
    "how long do you wait on average to get your lab test result?": "AverageLabResultsWaitingTime",
    "Does the facility offer support groups?": "SupportGroupAvailability",
    "Specify the support group you belong to": "SpecifySupportGroup",
    "In your opinion are the services offered at this facility youth friendly?": "YouthFriendlyServices",
    "What measures have been put in place to create GBV awareness and its harmful effects within the community?": "GBVAwarenessMeasures",
    "PWD In your opinion are the services offered at this facility persons-with-disability friendly?": "PWDFriendlyServicesOpinion",
    "What are the top 1-3 things you don’t like about this facility with regards to care and treatment?": "TopFacilityDislikes"
}

# Rename the verbose survey-question headers to short identifiers.
# DataFrame.rename ignores mapping keys that are not present in the columns,
# so a header that does not match exactly keeps its original name.
df = imonitor.rename(columns=column_name_mapping)

In [9]:
columns_to_clean1 = [
    'WaitingTimeOpinion',
    'LabResultsWaitingTimeOpinion'
]

def replace_dont_know(df, column):
    """Normalise the answer "Dont Know" to "Do not know" in one column.

    Only cells whose whole value equals "Dont Know" are changed (literal,
    non-regex match). The column is overwritten on ``df`` itself and the same
    DataFrame object is returned.
    """
    normalised = df[column].replace(
        to_replace="Dont Know", value="Do not know", regex=False
    )
    df[column] = normalised
    return df

for column in columns_to_clean1:
    df = replace_dont_know(df, column)

In [10]:
columns_to_clean2 = [
    'FacilityCleanliness',
    'FacilityAccessibility'
    ]

def replace_mixed_with_text(df, column_name):
    """Convert 1-5 ratings stored in mixed forms to their text labels.

    A cell is mapped when it is
      * a string that starts (after leading whitespace) with an integer,
        e.g. ``"4"`` or ``"5 - great"``; the whole leading integer is used,
        so ``"10"`` is not mistaken for rating 1, or
      * an integer-valued number (Python or numpy int/float), e.g. ``3`` or
        ``2.0``.
    Everything else (NaN, empty strings, booleans, free text, ratings outside
    1-5) is returned unchanged.

    Parameters:
    df (pd.DataFrame): Frame holding the column; it is modified in place.
    column_name (str): Name of the rating column.

    Returns:
    pd.DataFrame: The same ``df`` object with the column rewritten.
    """
    satisfaction_map = {
        1: 'Very Unsatisfied',
        2: 'Unsatisfied',
        3: 'Okay',
        4: 'Satisfied',
        5: 'Very Satisfied'
    }

    def replace_value(value):
        # bool is a subclass of int; True/False are not ratings.
        if isinstance(value, bool):
            return value

        if isinstance(value, str):
            # Collect the leading run of ASCII digits; an empty string or a
            # string that does not start with a digit yields no digits.
            digits = ''
            for ch in value.lstrip():
                if ch not in '0123456789':
                    break
                digits += ch
            if not digits:
                return value
            num = int(digits)
        elif isinstance(value, (int, float, np.integer, np.floating)):
            # Columns containing NaN are read as floats, so 4.0 must count as 4.
            if not np.isfinite(value) or value != int(value):
                return value
            num = int(value)
        else:
            return value

        return satisfaction_map.get(num, value)

    df[column_name] = df[column_name].apply(replace_value)
    return df

for column in columns_to_clean2:
    df = replace_mixed_with_text(df, column)

In [11]:
def standardize_satisfaction(df, column_name):
    """Collapse the different spellings of a 1-5 satisfaction score into labels.

    Scores given as digit strings ('1'..'5') or floats (1.0..5.0) become their
    text label, and 'Dissatisfied' becomes 'Unsatisfied'. Other values are left
    as they are. The column is overwritten on ``df`` and ``df`` is returned.
    """
    label_by_score = {
        5: 'Very Satisfied',
        4: 'Satisfied',
        3: 'Okay',
        2: 'Unsatisfied',
        1: 'Very Unsatisfied',
    }

    # Register each score under both representations seen in the data.
    lookup = {}
    for score, label in label_by_score.items():
        lookup[str(score)] = label
        lookup[float(score)] = label
    lookup['Dissatisfied'] = 'Unsatisfied'

    df[column_name] = df[column_name].replace(lookup)
    return df

df = standardize_satisfaction(df, 'ServiceSatisfaction')


In [12]:
print(df['FacilityLevel'].value_counts())

FacilityLevel
4.0    4802
3.0    4515
2.0    2889
5.0    2240
1.0     556
6.0      14
Name: count, dtype: int64


In [13]:
def standardize_facility(df, column_name):
    """Replace numeric facility levels (1.0-6.0) with descriptive names.

    Values that are not one of the six level codes are left unchanged. The
    column is overwritten on ``df`` and ``df`` is returned.
    """
    level_names = [
        'Community Health Unit',
        'Dispensaries and Private Clinics',
        'Health Centers',
        'Sub-County Hospitals',
        'County Referral Hospitals',
        'National Referral Hospitals',
    ]
    # Keys are floats (1.0 .. 6.0), matching how the levels are stored.
    facility_level_map = {
        float(level): name for level, name in enumerate(level_names, start=1)
    }

    df[column_name] = df[column_name].replace(facility_level_map)
    return df

df = standardize_facility(df, 'FacilityLevel')

In [14]:
def replace_symbols_and_words(df, column_name):
    """Spell out comparison symbols and shorten 'minutes' in a text column.

    Applies, in order and as literal (non-regex) substitutions:
    '<' -> 'Less than', '>' -> 'More than', 'minutes' -> 'mins'.
    No space is inserted after the replacement text. The column is overwritten
    on ``df`` and ``df`` is returned.
    """
    substitutions = (
        ('<', 'Less than'),
        ('>', 'More than'),
        ('minutes', 'mins'),
    )
    cleaned = df[column_name]
    for old, new in substitutions:
        cleaned = cleaned.str.replace(old, new, regex=False)
    df[column_name] = cleaned
    return df

df = replace_symbols_and_words(df, 'FacilityAccessTime')

In [15]:
def replace_symbols_and_words2(df, column_name):
    """Fix two specific spacing variants in a travel-time text column.

    Literal substitutions: 'Less than 30mins' -> 'Less than 30 mins' and
    'More than45 mins' -> 'More than 45 mins'. Other spellings are left as
    they are. The column is overwritten on ``df`` and ``df`` is returned.
    """
    df[column_name] = (
        df[column_name]
        .str.replace('Less than 30mins', 'Less than 30 mins', regex=False)
        .str.replace('More than45 mins', 'More than 45 mins', regex=False)
    )
    return df

df = replace_symbols_and_words2(df, 'FacilityAccessTime')

In [16]:
def convert_mixed_dates(date_column):
    """
    Convert a column that mixes date strings and Excel serial dates to datetimes.

    Each element is handled on its own:
      * a string made only of digits (optionally with a decimal part) is read
        as an Excel serial day count,
      * a numeric value (Python or numpy int/float) is read as an Excel serial
        day count,
      * anything else is passed to ``pd.to_datetime(..., errors='coerce')``,
        so unparseable values become NaT.

    Parameters:
    date_column (pd.Series): A pandas Series with mixed date formats and serial dates.

    Returns:
    pd.Series: Converted datetimes carrying the same index as ``date_column``,
    so assigning the result back to the source DataFrame aligns row by row
    even when the index is not a default RangeIndex.
    """
    # Day 0 of Excel's serial date system.
    excel_epoch = pd.Timestamp('1899-12-30')

    def _convert(value):
        if isinstance(value, str) and re.match(r'^\d+(\.\d+)?$', value):
            return excel_epoch + pd.to_timedelta(float(value), unit='D')
        if isinstance(value, (int, float, np.integer, np.floating)):
            # NaN yields NaT here, so missing numeric cells stay missing.
            return excel_epoch + pd.to_timedelta(value, unit='D')
        return pd.to_datetime(value, errors='coerce')

    converted_dates = [_convert(value) for value in date_column]

    # Preserve the caller's index; plain iterables without one get a RangeIndex.
    return pd.Series(converted_dates, index=getattr(date_column, 'index', None))

# Convert the 'Date' column (mixed date strings / Excel serials) to datetimes.
df['Date'] = convert_mixed_dates(df['Date'])

In [17]:
def standardize_gbv_awareness(df, column_name):
    """Rewrite two question-style GBV answer phrases as their short labels.

    Literal (non-regex) substitutions applied anywhere inside each cell:
    'Is there a desk to report GBV as community or individual' -> 'Presence of GBV Desk'
    'Are there training events on GBV for the community' -> 'Community trained on GBV'
    The column is overwritten on ``df`` and ``df`` is returned.
    """
    phrase_to_label = {
        'Is there a desk to report GBV as community or individual': 'Presence of GBV Desk',
        'Are there training events on GBV for the community': 'Community trained on GBV',
    }
    relabelled = df[column_name]
    for phrase, label in phrase_to_label.items():
        relabelled = relabelled.str.replace(phrase, label, regex=False)
    df[column_name] = relabelled
    return df

df = standardize_gbv_awareness(df, 'GBVAwarenessMeasures')

In [18]:
def encode_multi_select(df, columns):
    """One-hot encode ';'-separated multi-select answer columns.

    For every column in ``columns`` all spaces are removed from the answers,
    the text is split on ';' and one 0/1 indicator column is produced per
    distinct option, named ``"<column>_<option>"``. Empty options (for example
    from a trailing ';') and missing answers produce no indicator. ';' is the
    only delimiter: an option that itself contains '|' stays a single option.

    Parameters:
    df (pd.DataFrame): Source frame; it is not modified.
    columns (list[str]): Names of the multi-select columns to encode.

    Returns:
    pd.DataFrame: ``df`` with the indicator columns appended; the original
    multi-select columns are kept.
    """
    for col in columns:
        # Strip spaces so "A; B" and "A;B" yield the same option labels.
        compact = df[col].str.replace(' ', '', regex=False)

        # Split on the real delimiter directly rather than re-joining with '|',
        # which would wrongly break up options that contain a '|' character.
        encoded = compact.str.get_dummies(sep=';')

        # Prefix the encoded column names to indicate their origin
        encoded.columns = [f"{col}_{option}" for option in encoded.columns]

        # join() returns a new frame, so the caller's df is left untouched
        df = df.join(encoded)

    return df

# Specify the columns to encode
columns_to_encode = ['ExpectedHIVServices', 'OperationTimes', 'OperationDays', 'PreferredVisitDays', 'PreferredVisitTimes', 'GBVAwarenessMeasures']

# Apply the function
df2 = encode_multi_select(df, columns_to_encode)

In [19]:
df2.drop(columns=columns_to_encode, axis=1, inplace=True)

In [20]:
# Share of missing values per column, expressed as a percentage.
missing_percentage = df2.isna().mean().mul(100)

# Columns with more than this percentage of missing values are discarded.
threshold = 60

columns_to_drop = list(missing_percentage.index[missing_percentage > threshold])

print(f"Columns to drop: {columns_to_drop}")

print(f"Number of columns to drop: {len(columns_to_drop)}")

df2.drop(columns=columns_to_drop, inplace=True)

print(f"DataFrame shape after dropping columns: {df2.shape}")

Columns to drop: ['ReferralForUnprovidedService', 'ReferralServiceReceived', 'ReceivedServiceDetail', 'CommonIssuesDropdown', 'ServiceGapsType', 'HIVTestingAvailability', 'TreatmentInterruption', 'PMTCTServiceAvailability', 'KPServiceAdequacy', 'FacilityLevel', 'FacilityCleanliness', 'FacilityAccessMode', 'FacilityAccessTime', 'FacilityAccessibility', 'WaitingTimeOpinion', 'AverageWaitingTime', 'LabResultsWaitingTimeOpinion', 'AverageLabResultsWaitingTime', 'SupportGroupAvailability', 'SpecifySupportGroup', 'YouthFriendlyServices', 'PWDFriendlyServicesOpinion', 'TopFacilityDislikes']
Number of columns to drop: 23
DataFrame shape after dropping columns: (46549, 66)


In [21]:
# Minimum percentage of non-null columns a row needs in order to be kept.
threshold_percentage = 100

# dropna(thresh=...) takes a count of required non-null values. Multiply
# before dividing and round up to an integer, so float rounding cannot push
# the threshold slightly above an exact count and discard rows that meet it.
threshold = int(np.ceil(len(df2.columns) * threshold_percentage / 100))

df3 = df2.dropna(thresh=threshold).copy()

print("Original DataFrame shape:", df2.shape)
print("Cleaned DataFrame shape:", df3.shape)

rows_dropped = df2.shape[0] - df3.shape[0]
print("Rows dropped:", rows_dropped)

Original DataFrame shape: (46549, 66)
Cleaned DataFrame shape: (39862, 66)
Rows dropped: 6687


In [22]:
def divide_date_column(df):
    """Add a 'Year' column derived from 'Date' and drop rows with an invalid year.

    'Date' is parsed with ``pd.to_datetime(errors='coerce')``. 'Year' holds the
    year as a string; rows whose year is not 2022, 2023 or 2024 (including
    rows whose date could not be parsed) are counted as errors and removed.

    The input frame is not modified: all work happens on a copy, so calling the
    function again on the same input gives the same result.

    Parameters:
    df (pd.DataFrame): Frame containing a 'Date' column.

    Returns:
    tuple[pd.DataFrame, int]: The new frame (with parsed 'Date' and string
    'Year', error rows removed) and the number of error rows found.
    """
    valid_years = ['2022', '2023', '2024']

    result = df.copy()
    result['Date'] = pd.to_datetime(result['Date'], errors='coerce')

    # Unparseable dates have no year; fill with 0 so they fail the check below.
    year = result['Date'].dt.year.fillna(0).astype(int).astype(str)
    result['Year'] = year.where(year.isin(valid_years), 'error')

    is_error = result['Year'] == 'error'
    error_count = int(is_error.sum())

    if error_count > 0:
        # Copy so the caller gets an independent frame, not a slice.
        result = result.loc[~is_error].copy()

    return result, error_count

# Derive 'Year' from 'Date' and discard rows with an out-of-range year.
data, error_count = divide_date_column(df3)

# Build a new frame instead of dropping 'Date' in place: the returned frame can
# be the same object as df3, and an in-place drop would remove 'Date' from df3
# and make this cell fail when it is run a second time.
data = data.drop(columns=['Date']).assign(
    Year=lambda frame: frame['Year'].astype('object')
)

print('Error count: ', error_count)

Error count:  0


In [23]:
# Cluster the cleaned survey rows into two groups.

# Object-typed columns are one-hot encoded; all remaining columns are passed
# through to the clusterer unchanged.
categorical_features = data.select_dtypes(include=['object']).columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_features)],
    remainder='passthrough')

# KMeans starts from random centroids. Without a fixed seed the cluster ids
# (0 / 1) can swap between runs, which would silently invert the fixed
# id -> satisfaction label mapping applied later in the notebook. n_init is
# set explicitly because its default differs between scikit-learn releases.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('cluster', KMeans(n_clusters=2, n_init=10, random_state=42))  # Adjust n_clusters as needed
])

# Fitting the pipeline to the data
pipeline.fit(data)

# Accessing the cluster labels assigned to each record
cluster_labels = pipeline.named_steps['cluster'].labels_
print(cluster_labels)

[0 0 0 ... 1 1 1]


In [24]:
# Re-apply the fitted preprocessing step to get the matrix KMeans was trained on.
transformed_data = pipeline.named_steps['preprocessor'].transform(data)

# ColumnTransformer can return a SciPy sparse matrix when one-hot encoding is
# involved; densify it because not every scikit-learn PCA accepts sparse input.
if hasattr(transformed_data, 'toarray'):
    transformed_data = transformed_data.toarray()

# Fixed seed so a randomized SVD solver (if selected) gives a repeatable projection.
pca = PCA(n_components=2, random_state=42)
reduced_data = pca.fit_transform(transformed_data)

# Cluster ids are cast to str so Plotly assigns one discrete colour per cluster
# (a continuous colour scale would not apply to string labels).
cluster_labels_str = cluster_labels.astype(str)

fig = px.scatter(
    x=reduced_data[:, 0],
    y=reduced_data[:, 1],
    color=cluster_labels_str,
    labels={'color': 'Cluster Label'},
    title='Clusters after PCA Reduction'
)

fig.update_traces(marker=dict(size=12, line=dict(width=1, color='DarkSlateGrey'), opacity=0.6))
fig.update_layout(xaxis_title='PCA Feature 1', yaxis_title='PCA Feature 2')
fig.show()

In [25]:
# Working assumption used for the rest of the notebook:
#   cluster 0 -> 'Not Satisfied'
#   cluster 1 -> 'Satisfied'
# NOTE(review): nothing in this cell inspects the clusters, so the assignment
# of labels to cluster ids is an assumption to confirm against the data.

# Cluster id assigned to each row by the fitted KMeans step
cluster_labels = pipeline.named_steps['cluster'].labels_

# Translate each cluster id into its satisfaction label (KeyError on any other id)
satisfaction_mapping = {0: 'Not Satisfied', 1: 'Satisfied'}
data['satisfaction_score'] = list(map(satisfaction_mapping.__getitem__, cluster_labels))

# Now 'data' has a new column 'satisfaction_score' with the satisfaction label

In [26]:
data.to_csv('data/cleanednonull.csv', index=False)

In [27]:
# Encode the satisfaction labels as integers for modelling.
recategorization_mapping = {
    'Satisfied': 1,
    'Not Satisfied': 0
}

# Look each label up explicitly instead of using Series.replace, whose silent
# object -> int downcast is deprecated. Values that are not keys of the
# mapping (e.g. already-encoded 0/1 when the cell is re-run) pass through
# unchanged, so the cell can be executed more than once.
data['satisfaction_score'] = (
    data['satisfaction_score']
    .map(lambda label: recategorization_mapping.get(label, label))
    .astype(int)
)

# Verify the changes
print(data['satisfaction_score'].value_counts())

satisfaction_score
1    25809
0    14053
Name: count, dtype: int64



Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



In [28]:
# Balance the two satisfaction classes by undersampling the larger one.

class_1_df = data[data['satisfaction_score'] == 1]
class_0_df = data[data['satisfaction_score'] == 0]

# The target is the size of the smaller class, so sample(n=...) can never ask
# for more rows than a class holds, whichever cluster turns out to be larger.
target_number = min(class_1_df.shape[0], class_0_df.shape[0])

# Only a class larger than the target is sampled; the smaller class is kept
# whole and in its original row order.
class_1_sampled_df = (
    class_1_df.sample(n=target_number, random_state=42)
    if class_1_df.shape[0] > target_number else class_1_df
)
class_0_sampled_df = (
    class_0_df.sample(n=target_number, random_state=42)
    if class_0_df.shape[0] > target_number else class_0_df
)

balanced_df = pd.concat([class_1_sampled_df, class_0_sampled_df])

balanced_df['satisfaction_score'].value_counts()

satisfaction_score
1    14053
0    14053
Name: count, dtype: int64

In [29]:
# Column names that must NOT be one-hot encoded (the target). This has to be a
# list of names: `col not in <Series>` would test the Series' index labels,
# not column names, and therefore would never exclude anything.
ordinal_vars = ['satisfaction_score']
nominal_vars = [
    col for col in balanced_df.columns
    if balanced_df[col].dtype == 'object' and col not in ordinal_vars
]
encoded_data = pd.get_dummies(balanced_df, columns=nominal_vars)

# get_dummies replaces the listed nominal columns with their indicator columns
print("NaN counts after pandas get_dummies:", encoded_data.isnull().sum().sum())

NaN counts after pandas get_dummies: 0


In [30]:
# Target vector and feature matrix (every column except the target)
y = encoded_data['satisfaction_score']
X = encoded_data.drop(columns=['satisfaction_score'])

# Hold out 30% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [31]:
def test_models(X_train, y_train, X_test, y_test):
    """Fit three gradient-boosting classifiers and compare them on the test set.

    CatBoost, LightGBM and XGBoost classifiers are each fitted on the training
    data. For every model the function records ROC AUC (from the second column
    of ``predict_proba``; ``None`` if the model has no ``predict_proba``),
    accuracy, and the weighted-average precision, recall and F1 taken from
    ``classification_report``.

    Returns:
    tuple: (DataFrame with one row of metrics per model, fitted model with the
    highest ROC AUC). On a tie the earlier model is kept; the model is ``None``
    if no ROC AUC could be computed.
    """
    candidates = {
        'CatBoostClassifier': CatBoostClassifier(verbose=0),
        'LGBMClassifier': LGBMClassifier(),
        'XGBClassifier': XGBClassifier(use_label_encoder=False, eval_metric='logloss', enable_categorical=True)
    }

    # Columns with pandas 'category' dtype are declared to CatBoost as categorical.
    category_columns = [
        col for col in X_train.columns if str(X_train[col].dtype) == 'category'
    ]

    rows = []
    best_model, best_score = None, -1
    for name, model in candidates.items():
        if name == 'CatBoostClassifier':
            model.set_params(cat_features=category_columns)

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        if hasattr(model, "predict_proba"):
            roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
        else:
            roc_auc = None

        report = classification_report(y_test, y_pred, output_dict=True)
        weighted = report['weighted avg']
        rows.append({
            'Model': name,
            'ROC AUC': roc_auc,
            'Accuracy': report['accuracy'],
            'Precision': weighted['precision'],
            'Recall': weighted['recall'],
            'F1 Score': weighted['f1-score'],
        })

        # Strict comparison: the first model to reach a score keeps the title.
        if roc_auc is not None and roc_auc > best_score:
            best_score = roc_auc
            best_model = model

    return pd.DataFrame(rows), best_model

# Fit all candidate models and keep the one with the highest test-set ROC AUC.
results_df, best_model = test_models(X_train, y_train, X_test, y_test)
print(results_df)
print("Best model:", best_model)

[LightGBM] [Info] Number of positive: 9824, number of negative: 9850
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007428 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 200
[LightGBM] [Info] Number of data points in the train set: 19674, number of used features: 100
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499339 -> initscore=-0.002643
[LightGBM] [Info] Start training from score -0.002643
                Model   ROC AUC  Accuracy  Precision    Recall  F1 Score
0  CatBoostClassifier  0.999992  0.999288   0.999289  0.999288  0.999288
1      LGBMClassifier  0.999992  0.999407   0.999407  0.999407  0.999407
2       XGBClassifier  0.999995  0.999288   0.999289  0.999288  0.999288
Best model: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              cols

In [32]:
# 5-fold stratified cross-validation of the selected model on the full
# balanced dataset (X, y), scored by ROC AUC.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# cross_val_score fits a fresh clone of best_model on every fold, so the
# best_model instance fitted on X_train is left untouched. Refitting it in
# place here would train it on rows of X_test and contaminate the later
# cells that evaluate best_model on X_test.
roc_auc_scores = cross_val_score(best_model, X, y, cv=kf, scoring='roc_auc')

# Mean and population standard deviation (ddof=0) across the folds
average_roc_auc = roc_auc_scores.mean()
std_dev_roc_auc = roc_auc_scores.std()

print(f"Average ROC AUC: {average_roc_auc:.4f}")
print(f"Standard Deviation of ROC AUC: {std_dev_roc_auc:.4f}")

Average ROC AUC: 1.0000
Standard Deviation of ROC AUC: 0.0000


In [33]:
# Importance score of every feature according to the selected model
feature_importances = best_model.feature_importances_

# Label each importance with its feature name
importances = pd.Series(feature_importances, index=X_train.columns)

# Number of features to show; the plot title is derived from it so the two
# cannot drift apart.
top_n = 10

# Keep the top_n largest, then reverse so the largest bar is drawn at the top
# of the horizontal chart.
top_10_importances = importances.sort_values(ascending=False)[:top_n][::-1]

# x and y are passed as arrays, so the axis labels are keyed by 'x' and 'y'
fig = px.bar(
    x=top_10_importances.values,
    y=top_10_importances.index,
    orientation='h',
    labels={'x': 'Importance', 'y': 'Feature'},
    title=f'Top {top_n} Feature Importances (Highest to Lowest)',
)

# Show the plot
fig.show()

In [34]:
# Probability of the positive class (second predict_proba column) for each test row
y_pred_probs = best_model.predict_proba(X_test)[:, 1]

# Residual = true binary label minus predicted probability
residuals = y_test.sub(y_pred_probs)

# Histogram of the residuals in 20 bins
fig = px.histogram(x=residuals, nbins=20, title='Residual Distribution')
fig.update_layout(xaxis_title='Residuals', yaxis_title='Frequency')
fig.show()