In [64]:
import numpy as np
import pandas as pd
import regex as re
from datetime import datetime

import plotly.express as px
import shap

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [2]:
# Load the survey export from CSV and preview the first five rows.
# NOTE(review): the cell output echoes the read_csv line, which looks like a
# pandas warning (possibly a mixed-dtype warning) — confirm, and consider
# passing explicit dtypes if so.
imonitor = pd.read_csv('data/imonitor_1703.csv')
imonitor.head()

  imonitor = pd.read_csv('data/imonitor_1703.csv')


Unnamed: 0,Survey ID,Created Date,Facility name and MFL Code if applicable,Facility ownership,Please specify,County,What is your month; and year of birth,How do you consider yourself?,What is the highest level of education you completed?,Please specify.1,...,how long do you wait on average to get a service; which service was that?,Do you consider the waiting time for lab test results long?,how long do you wait on average to get your lab test result?,Does the facility offer support groups?,Specify the support group you belong to,In your opinion are the services offered at this facility youth friendly?,What measures have been put in place to create GBV awareness and its harmful effects within the community?,Please Specify,PWD In your opinion are the services offered at this facility persons-with-disability friendly?,What are the top 1-3 things you don’t like about this facility with regards to care and treatment?
0,2390063,04-Dec-23,BABA DOGO HEALTH CENTRE,GOK,,Nairobi,1977-09-03,Male,Primary school,,...,,No,,Yes,Adults,Yes,Presence of GBV Desk;,Chiefs office,Yes,
1,2390062,04-Dec-23,BABA DOGO HEALTH CENTRE,GOK,,Nairobi,1972-08-12,Female,Secondary school,,...,,No,,No,,Yes,Presence of GBV Desk;,Chiefs office,,
2,2390061,04-Dec-23,BABA DOGO HEALTH CENTRE,GOK,,Nairobi,1984-08-31,Female,Primary school,,...,,Yes,2 hours,No,,Yes,Presence of GBV Desk;,Chiefs office,Yes,
3,2390060,04-Dec-23,BABA DOGO HEALTH CENTRE,GOK,,Nairobi,1977-05-07,Female,Primary school,,...,,No,,No,,Yes,Presence of GBV Desk;,Police station,,
4,2390059,04-Dec-23,BABA DOGO HEALTH CENTRE,GOK,,Nairobi,1987-06-13,Male,Vocational training or technician,,...,1 hour,Yes,2 hours,Yes,Adults,Yes,Presence of GBV Desk;,Police station,,


In [3]:
# (rows, columns) of the frame as loaded from the CSV.
imonitor.shape

(46549, 85)

In [4]:
# Collect every header that contains either capitalisation of "Please specify".
cols_to_drop = [
    header
    for header in imonitor.columns
    if any(marker in header for marker in ("Please specify", "Please Specify"))
]

# Remove all of them from the frame in place with one call.
imonitor.drop(columns=cols_to_drop, inplace=True)

In [5]:
# Shape after the "Please specify" columns were removed.
imonitor.shape

(46549, 69)

In [6]:
# Trim leading and trailing whitespace from every column header.
imonitor.columns = imonitor.columns.map(str.strip)

In [7]:
# List the remaining headers, one per line.
for header in imonitor.columns:
    print(header)

Survey ID
Created Date
Facility name and MFL Code if applicable
Facility ownership
County
What is your month; and year of birth
How do you consider yourself?
What is the highest level of education you completed?
What is your current marital status?
Which county do you currently live in?
What are your sources of income?
For how long have you been accessing services (based on the expected package of services) in this facility?
Are you aware of the package of services that you are entitled to?
According to you; which HIV related services are you likely to receive in this facility?
Is there a service that you needed that was not provided?
Facility name
For that service that was not provided; were you referred?
If referred; did you receive the service where you were referred to?
If Yes which Service/Test/Medicine
On a scale of 1 to 5; how satisfied are you with the package of services received in this facility? If 1 is VERY UNSATISFIED and 5 is VERY SATISFIED.
What did you like about the se

In [8]:
# Headers excluded from the rest of the analysis. Judging by their wording they
# are identifiers, respondent demographics and free-text answers — TODO confirm
# against the questionnaire.
columns_to_drop = [
    "Survey ID",
    "Facility name and MFL Code if applicable",
    "What is your month; and year of birth",
    "How do you consider yourself?",
    "What is the highest level of education you completed?",
    "What is your current marital status?",
    "Which county do you currently live in?",
    "What are your sources of income?",
    "Facility name",
    "What did you like about the services you received?",
    "What did you not like about the services you received?",
    "In your opinion what would you like to be improved?",
    "In your opinion what can be done to improve access to the services you seek at the facility?",
    "Facility name denied service",
    "Why",
    "Were reasons provided as to why these services were not available?",
    "Were reasons provided as to why these services were not available?.1",
    "What are the barriers to uptake of VMMC by males 25+years and above?",
    "What are some of the current site level practices that community members like and would love to maintain for KP/PP ?",
    "What would you like this facility to change/do better?",
    "Throughout your visit what did you find interesting/pleasing about this facility that should be emulated by other facilities?",
    "What do you think can be improved",
    "Anything else that you would like to mention?",
    "What are the top 1-3 things you like about this facility with regards to care and treatment?"
]

# Drop them from imonitor in place. `axis=1` is redundant when `columns=` is
# given. Re-running this cell without reloading raises KeyError because the
# headers are already gone.
imonitor.drop(columns=columns_to_drop, axis=1, inplace=True)

In [9]:
# Long questionnaire header -> short CamelCase column name.
column_name_mapping = {
    "Created Date": "Date",
    "Organization name coordinating the feedback from the clients": "OrgFeedbackCoordinator",
    "Facility ownership": "FacilityOwnership",
    "County": "FacilityCounty",
    "For how long have you been accessing services (based on the expected package of services) in this facility?": "ServiceAccessDuration",
    "Are you aware of the package of services that you are entitled to?": "ServicesAwareness",
    "According to you; which HIV related services are you likely to receive in this facility?": "ExpectedHIVServices",
    "Is there a service that you needed that was not provided?": "UnprovidedService",
    "Facility name no service": "UnprovidedServiceFacilityName",
    "For that service that was not provided; were you referred?": "ReferralForUnprovidedService",
    "If referred; did you receive the service where you were referred to?": "ReferralServiceReceived",
    "If Yes which Service/Test/Medicine": "ReceivedServiceDetail",
    "On a scale of 1 to 5; how satisfied are you with the package of services received in this facility? If 1 is VERY UNSATISFIED and 5 is VERY SATISFIED.": "ServiceSatisfaction",
    "Do you face any challenges when accessing the services at the facility?": "AccessChallenges",
    "Common issues that can be added in the drop-down box": "CommonIssuesDropdown",
    "Was confidentiality considered while you were being served?": "Confidentiality",
    "Are there age-appropriate health services for specific groups?": "AgeAppropriateServices",
    "Does the facility allow you to share your concerns with the administration?": "ConcernsSharing",
    "Do you know your health-related rights as a client of this facility?": "RightsAwareness",
    "Have you ever been denied services at this facility?": "ServiceDenial",
    "Are you comfortable with getting services at this facility": "ComfortWithServices",
    "Have you ever been counseled?": "CounselingReceived",
    "Did you identify any gaps in the facility when you tried to access the services": "IdentifiedGaps",
    "Service type": "ServiceGapsType",
    "Are the HIV testing services readily available when required?": "HIVTestingAvailability",
    "Have you ever Interrupted your treatment?": "TreatmentInterruption",
    "Are the PMTCT services readily available when required?": "PMTCTServiceAvailability",
    "Are the HIV prevention; testing; treatment and care services adequate for KPs?": "KPServiceAdequacy",
    "Facility Level": "FacilityLevel",
    "Facility Operation times": "OperationTimes",
    "Facility Operation Days": "OperationDays",
    "What are your preferred days of visiting the facility": "PreferredVisitDays",
    "What are your preferred time of visiting the facility": "PreferredVisitTimes",
    "On a scale of 1-5; how clean do you find the facility?": "FacilityCleanliness",
    "How do you reach this facility?": "FacilityAccessMode",
    "How long does it take to reach this facility?": "FacilityAccessTime",
    "On a scale of 1-5; how accessible do you find this facility?": "FacilityAccessibility",
    "Do you consider the waiting time to be seen at this facility long?": "WaitingTimeOpinion",
    "how long do you wait on average to get a service; which service was that?": "AverageWaitingTime",
    "Do you consider the waiting time for lab test results long?": "LabResultsWaitingTimeOpinion",
    "how long do you wait on average to get your lab test result?": "AverageLabResultsWaitingTime",
    "Does the facility offer support groups?": "SupportGroupAvailability",
    "Specify the support group you belong to": "SpecifySupportGroup",
    "In your opinion are the services offered at this facility youth friendly?": "YouthFriendlyServices",
    "What measures have been put in place to create GBV awareness and its harmful effects within the community?": "GBVAwarenessMeasures",
    "PWD In your opinion are the services offered at this facility persons-with-disability friendly?": "PWDFriendlyServicesOpinion",
    "What are the top 1-3 things you don’t like about this facility with regards to care and treatment?": "TopFacilityDislikes"
}

# rename() returns a new frame (imonitor keeps its long headers); mapping keys
# that are not present among imonitor's columns are silently ignored.
df = imonitor.rename(columns=column_name_mapping)

In [10]:
# List the short column names produced by the rename, one per line.
for c in df.columns:
    print(c)

Date
FacilityOwnership
FacilityCounty
ServiceAccessDuration
ServicesAwareness
ExpectedHIVServices
UnprovidedService
ReferralForUnprovidedService
ReferralServiceReceived
ReceivedServiceDetail
ServiceSatisfaction
AccessChallenges
CommonIssuesDropdown
Confidentiality
AgeAppropriateServices
ConcernsSharing
RightsAwareness
ServiceDenial
ComfortWithServices
CounselingReceived
IdentifiedGaps
ServiceGapsType
HIVTestingAvailability
TreatmentInterruption
PMTCTServiceAvailability
KPServiceAdequacy
FacilityLevel
OperationTimes
OperationDays
PreferredVisitDays
PreferredVisitTimes
FacilityCleanliness
FacilityAccessMode
FacilityAccessTime
FacilityAccessibility
WaitingTimeOpinion
AverageWaitingTime
LabResultsWaitingTimeOpinion
AverageLabResultsWaitingTime
SupportGroupAvailability
SpecifySupportGroup
YouthFriendlyServices
GBVAwarenessMeasures
PWDFriendlyServicesOpinion
TopFacilityDislikes


In [11]:
# Opinion columns in which the "Dont Know" spelling gets normalised.
columns_to_clean1 = [
    'WaitingTimeOpinion',
    'LabResultsWaitingTimeOpinion'
]

def replace_dont_know(df, column):
    """Rewrite exact "Dont Know" entries of one column as "Do not know".

    Only values that equal "Dont Know" in full are changed (no regex, no
    substring matching). The column is reassigned on ``df`` itself and that
    same frame is returned.
    """
    normalised = df[column].replace(
        to_replace="Dont Know",
        value="Do not know",
        regex=False,
    )
    df[column] = normalised
    return df

# Normalise the spelling in each listed column; the helper edits df and
# returns that same frame.
for column in columns_to_clean1:
    df = replace_dont_know(df, column)

In [12]:
# Rating columns whose numeric or digit-prefixed answers are converted to text labels.
columns_to_clean2 = [
    'FacilityCleanliness',
    'FacilityAccessibility'
    ]

def replace_mixed_with_text(df, column_name):
    """Convert 1-5 ratings stored as numbers or digit-prefixed text to labels.

    Each value of ``df[column_name]`` is handled as follows:

    * a string whose first non-blank character is a digit is rated by that
      digit (so ``'4 - Satisfied'`` counts as 4);
    * an integer (Python or numpy) is rated by its value;
    * a float with no fractional part (``4.0``, which is what a numeric
      column containing NaN holds) is rated by its integer value;
    * anything else — empty strings, NaN, booleans, other text — is left
      unchanged.

    Ratings 1..5 become 'Very Unsatisfied' .. 'Very Satisfied'; a rating
    outside that range leaves the original value untouched.

    Parameters:
        df: frame to edit; the column is reassigned on this object.
        column_name: name of the column to convert.

    Returns:
        The same ``df`` object, for call chaining.
    """
    satisfaction_map = {
        1: 'Very Unsatisfied',
        2: 'Unsatisfied',
        3: 'Okay',
        4: 'Satisfied',
        5: 'Very Satisfied'
    }

    def replace_value(value):
        if isinstance(value, str):
            text = value.lstrip()
            # text[:1] is '' for an empty string, so this cannot raise
            # IndexError the way value[0] did.
            if not text[:1].isdecimal():
                return value
            num = int(text[0])
        elif isinstance(value, (bool, np.bool_)):
            # bool is a subclass of int; a True/False answer is not a rating.
            return value
        elif isinstance(value, (int, np.integer)):
            num = int(value)
        elif isinstance(value, (float, np.floating)) and float(value).is_integer():
            # NaN and infinities fail is_integer() and fall through unchanged.
            num = int(value)
        else:
            return value

        return satisfaction_map.get(num, value)

    df[column_name] = df[column_name].apply(replace_value)
    return df

# Convert each listed rating column to text labels; the helper edits df and
# returns that same frame.
for column in columns_to_clean2:
    df = replace_mixed_with_text(df, column)

In [13]:
def standardize_satisfaction(df, column_name):
    """Unify the different spellings of a 1-5 satisfaction answer.

    Scores given as digit strings ('1'..'5') or as floats (1.0..5.0) are
    replaced by their text label, and 'Dissatisfied' is folded into
    'Unsatisfied'. Values without a rule are kept as they are. The column is
    reassigned on ``df`` and the same frame is returned.
    """
    scale_labels = (
        (5, 'Very Satisfied'),
        (4, 'Satisfied'),
        (3, 'Okay'),
        (2, 'Unsatisfied'),
        (1, 'Very Unsatisfied'),
    )

    # Register both the string and the float spelling of every score.
    lookup = {}
    for score, label in scale_labels:
        lookup[str(score)] = label
        lookup[float(score)] = label
    lookup['Dissatisfied'] = 'Unsatisfied'

    df[column_name] = df[column_name].replace(lookup)
    return df

# Unify the satisfaction answers; the helper edits df and returns that same frame.
df = standardize_satisfaction(df, 'ServiceSatisfaction')


In [14]:
# Count how often each value occurs in FacilityLevel (missing values are not counted).
print(df['FacilityLevel'].value_counts())

FacilityLevel
4.0    4802
3.0    4515
2.0    2889
5.0    2240
1.0     556
6.0      14
Name: count, dtype: int64


In [15]:
def standardize_facility(df, column_name):
    """Replace numeric facility-level codes 1.0-6.0 with descriptive names.

    Values without a matching code (including missing values) are kept as
    they are. The column is reassigned on ``df`` and the same frame is
    returned.
    """
    level_names = (
        'Community Health Unit',
        'Dispensaries and Private Clinics',
        'Health Centers',
        'Sub-County Hospitals',
        'County Referral Hospitals',
        'National Referral Hospitals',
    )
    # Codes are floats 1.0 .. 6.0, in the same order as the names above.
    facility_level_names = {
        float(code): name for code, name in enumerate(level_names, start=1)
    }

    df[column_name] = df[column_name].replace(facility_level_names)
    return df

# Turn the facility-level codes into names; the helper edits df and returns that same frame.
df = standardize_facility(df, 'FacilityLevel')

In [16]:
def replace_symbols_and_words(df, column_name):
    """Spell out comparison symbols and shorten 'minutes' in a text column.

    Applied in order as literal (non-regex) substitutions:
    '<' -> 'Less than', '>' -> 'More than', 'minutes' -> 'mins'.
    The column is reassigned on ``df`` and the same frame is returned.
    """
    substitutions = (
        ('<', 'Less than'),
        ('>', 'More than'),
        ('minutes', 'mins'),
    )
    cleaned = df[column_name]
    for old_text, new_text in substitutions:
        cleaned = cleaned.str.replace(old_text, new_text, regex=False)
    df[column_name] = cleaned
    return df

# Spell out '<' / '>' and shorten 'minutes' in the access-time answers.
df = replace_symbols_and_words(df, 'FacilityAccessTime')

In [17]:
def replace_symbols_and_words(df, column_name):
    """Fix the spacing of two access-time labels.

    Literal (non-regex) substitutions, applied in order:
    'Less than 30mins' -> 'Less than 30 mins' and
    'More than45 mins' -> 'More than 45 mins'.
    The column is reassigned on ``df`` and the same frame is returned.

    This definition reuses the name of the earlier symbol-replacing helper,
    so once it has run the earlier version is no longer reachable by name.
    """
    spacing_fixes = (
        ('Less than 30mins', 'Less than 30 mins'),
        ('More than45 mins', 'More than 45 mins'),
    )
    cleaned = df[column_name]
    for old_text, new_text in spacing_fixes:
        cleaned = cleaned.str.replace(old_text, new_text, regex=False)
    df[column_name] = cleaned
    return df

# Fix the spacing of the access-time labels (uses the redefined helper from this cell).
df = replace_symbols_and_words(df, 'FacilityAccessTime')

In [18]:
# def remove_trailing_semicolons(df, column_names):
#     for column in column_names:
#         if column in df.columns:
#             df[column] = df[column].str.rstrip(';')
#     return df

# df = pd.DataFrame(df)

# columns_to_clean = ['ExpectedHIVServices', 'OperationTimes', 'OperationDays', 'PreferredVisitDays', 'PreferredVisitTimes', 'GBVAwarenessMeasures']

# df2 = remove_trailing_semicolons(df, columns_to_clean)

In [19]:
def standardize_gbv_awareness(df, column_name):
    """Replace two question-style GBV answer texts with short labels.

    Literal (non-regex) substitutions, applied in order:
    'Is there a desk to report GBV as community or individual'
        -> 'Presence of GBV Desk'
    'Are there training events on GBV for the community'
        -> 'Community trained on GBV'
    The column is reassigned on ``df`` and the same frame is returned.
    """
    relabelling = (
        ('Is there a desk to report GBV as community or individual', 'Presence of GBV Desk'),
        ('Are there training events on GBV for the community', 'Community trained on GBV'),
    )
    cleaned = df[column_name]
    for question_text, short_label in relabelling:
        cleaned = cleaned.str.replace(question_text, short_label, regex=False)
    df[column_name] = cleaned
    return df

# Shorten the question-style GBV answers; the helper edits df and returns that same frame.
df = standardize_gbv_awareness(df, 'GBVAwarenessMeasures')

In [20]:
# Count the distinct answer strings in GBVAwarenessMeasures after the relabelling.
df['GBVAwarenessMeasures'].value_counts()

GBVAwarenessMeasures
Presence of GBV Desk                                                                                               2250
Presence of GBV Desk;                                                                                              1663
GBV Posters displayed;                                                                                             1605
Facility staff trained on GBV;                                                                                     1591
Other;                                                                                                             1431
Presence of GBV Desk;Hotline for reporting GBV cases;Facility staff trained on GBV;GBV Posters displayed;          1125
Facility staff trained on GBV;GBV Posters displayed;                                                               1015
Community trained on GBV                                                                                            844
Presence of GBV Des

In [21]:
def encode_multi_select(df, columns):
    """Add 0/1 indicator columns for ';'-separated multi-select answers.

    For every name in ``columns`` the answers are stripped of all spaces,
    split on ';', and each distinct non-empty option becomes an indicator
    column named ``"<column>_<option>"``. Rows with a missing answer get 0 in
    every indicator. The source columns are kept.

    Parameters:
        df: input frame; it is not modified.
        columns: names of the multi-select columns to encode.

    Returns:
        A frame with the indicator columns joined on; when ``columns`` is
        empty this is ``df`` itself.
    """
    for source_column in columns:
        # Drop every space so that option names differing only in spacing coincide.
        compact_answers = df[source_column].str.replace(' ', '')
        option_lists = compact_answers.str.split(';')

        # get_dummies() splits on '|', so re-join each option list with that separator.
        indicators = option_lists.str.join('|').str.get_dummies()

        # Prefix with the source column name and attach to the running result.
        df = df.join(indicators.add_prefix(f"{source_column}_"))

    return df

# Multi-select columns (';'-separated answers) to expand into indicator columns.
columns_to_encode = ['ExpectedHIVServices', 'OperationTimes', 'OperationDays', 'PreferredVisitDays', 'PreferredVisitTimes', 'GBVAwarenessMeasures']

# Build a new frame holding the original columns plus one 0/1 column per option.
df2 = encode_multi_select(df, columns_to_encode)

# Plain-text dump of the widened frame.
print(df2)

            Date FacilityOwnership FacilityCounty ServiceAccessDuration  \
0      04-Dec-23               GOK        Nairobi   More than one year    
1      04-Dec-23               GOK        Nairobi   More than one year    
2      04-Dec-23               GOK        Nairobi   More than one year    
3      04-Dec-23               GOK        Nairobi   More than one year    
4      04-Dec-23               GOK        Nairobi   More than one year    
...          ...               ...            ...                   ...   
46544  19-May-22               NaN            NaN                   NaN   
46545  19-May-22               NaN            NaN                   NaN   
46546  19-May-22               NaN            NaN                   NaN   
46547  19-May-22               NaN            NaN                   NaN   
46548  18-May-22               NaN            NaN                   NaN   

      ServicesAwareness                          ExpectedHIVServices  \
0                   Yes  AR

In [22]:
# The indicator columns now carry this information, so remove the raw
# multi-select columns from df2 in place.
df2.drop(columns=columns_to_encode, inplace=True)

In [23]:
# Save the encoded frame to CSV without the index column; no missing values
# have been removed at this point.
df2.to_csv('data/cleanedwnull.csv', index=False)

In [24]:
# Percentage of missing values in each column of df2.
missing_percentage = 100 * df2.isna().mean()

# Columns that are missing in more than this percentage of rows are removed.
threshold = 60

too_sparse = missing_percentage > threshold
columns_to_drop = list(missing_percentage.index[too_sparse])

print(f"Columns to drop: {columns_to_drop}")

print(f"Number of columns to drop: {len(columns_to_drop)}")

# Remove the sparse columns from df2 in place.
df2.drop(columns=columns_to_drop, inplace=True)

print(f"DataFrame shape after dropping columns: {df2.shape}")

Columns to drop: ['ReferralForUnprovidedService', 'ReferralServiceReceived', 'ReceivedServiceDetail', 'CommonIssuesDropdown', 'ServiceGapsType', 'HIVTestingAvailability', 'TreatmentInterruption', 'PMTCTServiceAvailability', 'KPServiceAdequacy', 'FacilityLevel', 'FacilityCleanliness', 'FacilityAccessMode', 'FacilityAccessTime', 'FacilityAccessibility', 'WaitingTimeOpinion', 'AverageWaitingTime', 'LabResultsWaitingTimeOpinion', 'AverageLabResultsWaitingTime', 'SupportGroupAvailability', 'SpecifySupportGroup', 'YouthFriendlyServices', 'PWDFriendlyServicesOpinion', 'TopFacilityDislikes']
Number of columns to drop: 23
DataFrame shape after dropping columns: (46549, 66)


In [25]:
# A row is kept only if at least this percentage of its columns is filled in;
# 100 means rows with any missing value are removed.
threshold_percentage = 100

# Minimum number of non-missing values per row (a float, passed to dropna as is).
threshold = df2.shape[1] * (threshold_percentage / 100)

# Work on an independent copy so later edits to `data` never touch df2.
data = df2.dropna(thresh=threshold).copy()

print(f"Original DataFrame shape: {df2.shape}")
print(f"Cleaned DataFrame shape: {data.shape}")

rows_dropped = len(df2) - len(data)
print(f"Rows dropped: {rows_dropped}")

Original DataFrame shape: (46549, 66)
Cleaned DataFrame shape: (39862, 66)
Rows dropped: 6687


In [26]:
# Save the row-filtered frame. `data` is the copy of df2 with the incomplete
# rows removed; df2 itself still contains those rows, so writing df2 here
# would put missing values into the "no null" file.
data.to_csv('data/cleanednonull.csv', index=False)

In [27]:
# General descriptive statistics
data.describe()

Unnamed: 0,ExpectedHIVServices_ARTmedicine,ExpectedHIVServices_CD4COUNT,ExpectedHIVServices_Cervicalcancerscreening,ExpectedHIVServices_ChestXray(thiscapturesonlyonesectorofclientswithchestissues),ExpectedHIVServices_CondomDistribution,ExpectedHIVServices_Contraceptives,ExpectedHIVServices_Diagnosis,ExpectedHIVServices_DifferentiatedServiceDelivery(DSD),ExpectedHIVServices_HIVTestingandCounseling(HTS),ExpectedHIVServices_KPservices,...,PreferredVisitDays_Tuesday,PreferredVisitDays_Wednesday,PreferredVisitTimes_8to5,PreferredVisitTimes_After5,GBVAwarenessMeasures_CommunitytrainedonGBV,GBVAwarenessMeasures_FacilitystafftrainedonGBV,GBVAwarenessMeasures_GBVPostersdisplayed,GBVAwarenessMeasures_HotlineforreportingGBVcases,GBVAwarenessMeasures_Other,GBVAwarenessMeasures_PresenceofGBVDesk
count,39862.0,39862.0,39862.0,39862.0,39862.0,39862.0,39862.0,39862.0,39862.0,39862.0,...,39862.0,39862.0,39862.0,39862.0,39862.0,39862.0,39862.0,39862.0,39862.0,39862.0
mean,0.747203,0.085169,0.05647,0.017636,0.025187,0.047664,0.041769,0.016758,0.098013,0.004541,...,0.171115,0.170338,0.357433,0.001831,0.019066,0.118484,0.137524,0.057147,0.045081,0.163815
std,0.434621,0.279136,0.23083,0.131625,0.156694,0.213058,0.200064,0.128364,0.297336,0.067232,...,0.376614,0.375934,0.47925,0.042755,0.136758,0.323184,0.344404,0.232126,0.207483,0.370112
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [28]:
# Distribution of the satisfaction labels before they are recoded to integers.
data['ServiceSatisfaction'].value_counts()

ServiceSatisfaction
Satisfied                22585
Very Satisfied           15966
Unsatisfied                584
Okay                       467
Do not know                127
Very Unsatisfied            74
Prefer not to answer        59
Name: count, dtype: int64

In [29]:
# Collapse the satisfaction labels into integer classes:
#   2 = very satisfied, 1 = satisfied / okay, 0 = unsatisfied,
#   99 = no usable answer (these rows are filtered out in the next step).
recategorization_mapping = {
    'Very Satisfied': 2,
    'Satisfied': 1,
    'Okay': 1,
    'Unsatisfied': 0,
    'Very Unsatisfied': 0,
    #'Unknown': 0,
    'Do not know': 99,
    'Prefer not to answer ': 99
}

# Look labels up with surrounding whitespace ignored, so the recode does not
# depend on the stray trailing space in the raw 'Prefer not to answer ' label.
_label_to_class = {
    label.strip(): code for label, code in recategorization_mapping.items()
}


def _recode_satisfaction(value):
    """Return the class for a label; values without a mapping pass through."""
    key = value.strip() if isinstance(value, str) else value
    return _label_to_class.get(key, key)


# Build the recoded column as a new Series and assign it by name. This avoids
# writing integers into the existing text column through `.loc[:, col] = ...`,
# the line the original cell output echoed as a warning. Values that are
# already integers pass through, so the cell can be re-run safely; astype(int)
# still fails loudly if an unmapped label is left over.
data['ServiceSatisfaction'] = (
    data['ServiceSatisfaction'].map(_recode_satisfaction).astype(int)
)

# Verify the changes
print(data['ServiceSatisfaction'].value_counts())

ServiceSatisfaction
1     23052
2     15966
0       658
99      186
Name: count, dtype: int64


  data.loc[:, 'ServiceSatisfaction'] = data['ServiceSatisfaction'].replace(recategorization_mapping)


In [30]:
# Keep only rows whose satisfaction class is not the 99 "no usable answer" code.
model_data = data.loc[data['ServiceSatisfaction'].ne(99)]

In [31]:
# Remove the date and facility descriptor columns; drop() returns a new frame.
model_data = model_data.drop(columns=['Date', 'FacilityCounty', 'FacilityOwnership'])

In [32]:
# Build a class-balanced sample of model_data by undersampling.
# The class_3 / class_2 / class_1 names refer to encoded satisfaction values
# 2 / 1 / 0 respectively.
class_3_df = model_data.loc[model_data['ServiceSatisfaction'].eq(2)]
class_2_df = model_data.loc[model_data['ServiceSatisfaction'].eq(1)]
class_1_df = model_data.loc[model_data['ServiceSatisfaction'].eq(0)]

# Every class is cut down to the number of rows in the value-0 group.
target_number = len(class_1_df)

# Draw that many rows from each of the other two groups, with a fixed seed.
class_3_sampled_df, class_2_sampled_df = (
    group.sample(n=target_number, random_state=42)
    for group in (class_3_df, class_2_df)
)

balanced_df = pd.concat([class_3_sampled_df, class_2_sampled_df, class_1_df])

In [33]:
# Check that the three classes have the same number of rows after undersampling.
balanced_df['ServiceSatisfaction'].value_counts()

ServiceSatisfaction
2    658
1    658
0    658
Name: count, dtype: int64

In [34]:
# ordinal_vars = balanced_df['ServiceSatisfaction']
# nominal_vars = [col for col in balanced_df.columns if balanced_df[col].dtype == 'object' and col not in ordinal_vars]
# encoded_data = pd.get_dummies(balanced_df, columns=nominal_vars)

# # This automatically drops the original nominal columns and adds the one-hot encoded columns
# print("NaN counts after pandas get_dummies:", encoded_data.isnull().sum().sum())

In [35]:


# Re-type every object-dtype column of model_data as pandas 'category'.
for column_name, column_dtype in model_data.dtypes.items():
    if column_dtype == 'object':
        model_data[column_name] = model_data[column_name].astype('category')

# Show the resulting dtype of each column.
print(model_data.dtypes)


ServiceAccessDuration                               category
ServicesAwareness                                   category
UnprovidedService                                   category
ServiceSatisfaction                                    int32
AccessChallenges                                    category
                                                      ...   
GBVAwarenessMeasures_FacilitystafftrainedonGBV         int64
GBVAwarenessMeasures_GBVPostersdisplayed               int64
GBVAwarenessMeasures_HotlineforreportingGBVcases       int64
GBVAwarenessMeasures_Other                             int64
GBVAwarenessMeasures_PresenceofGBVDesk                 int64
Length: 63, dtype: object


In [36]:
# Target: the recoded satisfaction class. Features: every other column of model_data.
y = model_data['ServiceSatisfaction']
X = model_data.drop(columns='ServiceSatisfaction')

# Hold out 30% of the rows for testing, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [40]:
def test_models(X_train, y_train, X_test, y_test):
    """Fit three gradient-boosting classifiers and compare them on the test split.

    CatBoost, LightGBM and XGBoost classifiers are fitted on the training
    data. For each one, weighted precision/recall/F1 and accuracy are taken
    from ``classification_report`` and, when the model exposes
    ``predict_proba``, the weighted one-vs-rest ROC AUC is computed.

    Parameters:
        X_train, y_train: training features and labels. Columns of
            ``X_train`` whose dtype is 'category' are passed to CatBoost as
            categorical features.
        X_test, y_test: features and labels used for scoring.

    Returns:
        ``(results, best_model)``: a DataFrame with one row of metrics per
        model, and the fitted model with the highest ROC AUC (``None`` if no
        model produced a ROC AUC).

    NOTE(review): the best model is chosen using the same test split that
    produces the reported metrics, so the winner's scores are optimistic.
    """
    categorical_features = [col for col in X_train.columns if X_train[col].dtype.name == 'category']
    models = {
        'CatBoostClassifier': CatBoostClassifier(verbose=0, cat_features=categorical_features),
        # NOTE(review): enable_categorical does not appear to be a documented
        # LGBMClassifier parameter — confirm it has any effect here.
        'LGBMClassifier': LGBMClassifier(enable_categorical=True),
        'XGBClassifier': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', enable_categorical=True)
    }

    best_model = None
    best_score = -1
    model_results = []
    for name, model in models.items():
        model.fit(X_train, y_train)
        # Flatten in case an estimator returns predictions as an (n, 1) column.
        y_pred = np.ravel(model.predict(X_test))

        roc_auc = None
        if hasattr(model, "predict_proba"):
            roc_auc = roc_auc_score(
                y_test,
                model.predict_proba(X_test),
                multi_class='ovr',
                average='weighted',
            )
        report = classification_report(y_test, y_pred, output_dict=True)

        model_results.append({
            'Model': name,
            'ROC AUC': roc_auc,
            'Accuracy': report['accuracy'],
            'Precision': report['weighted avg']['precision'],
            'Recall': report['weighted avg']['recall'],
            'F1 Score': report['weighted avg']['f1-score'],
        })

        # A model without a ROC AUC cannot win; comparing None with a number
        # would raise TypeError.
        if roc_auc is not None and roc_auc > best_score:
            best_score = roc_auc
            best_model = model

    return pd.DataFrame(model_results), best_model

# Fit and score the three models on the split, then show the metrics table
# and the model with the highest ROC AUC.
results_df, best_model = test_models(X_train, y_train, X_test, y_test)
print(results_df)
print("Best model:", best_model)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010058 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 136
[LightGBM] [Info] Number of data points in the train set: 27773, number of used features: 62
[LightGBM] [Info] Start training from score -4.131501
[LightGBM] [Info] Start training from score -0.540659
[LightGBM] [Info] Start training from score -0.912356
                Model   ROC AUC  Accuracy  Precision    Recall  F1 Score
0  CatBoostClassifier  0.778288  0.696715   0.702264  0.696715  0.679132
1      LGBMClassifier  0.779008  0.700496   0.708744  0.700496  0.681403
2       XGBClassifier  0.777420  0.697723   0.701008  0.697723  0.682042
Best model: LGBMClassifier(enable_categorical=True)


In [39]:
# Importance scores reported by the selected model, labelled with the
# training column names.
feature_importances = best_model.feature_importances_
importances = pd.Series(feature_importances, index=X_train.columns)

# Ten largest scores, then reversed for plotting.
top_10_importances = (
    importances
    .sort_values(ascending=False)
    .head(10)
    .iloc[::-1]
)

# Horizontal bar chart of those ten features.
fig = px.bar(
    top_10_importances,
    x=top_10_importances.values,
    y=top_10_importances.index,
    orientation='h',
    labels={'x': 'Importance', 'index': 'Feature'},
    title='Top 10 Feature Importances (Highest to Lowest)',
)

# Show the plot
fig.show()