In [262]:
import pandas as pd
import numpy as np
import plotly.express as px
from datetime import datetime
import regex as re
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report


In [263]:
imonitor = pd.read_csv('data/imonitor_1703.csv')
imonitor.head()


Columns (1,4,11,15,24,25,42,56,62,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84) have mixed types. Specify dtype option on import or set low_memory=False.



Unnamed: 0,Survey ID,Created Date,Facility name and MFL Code if applicable,Facility ownership,Please specify,County,What is your month; and year of birth,How do you consider yourself?,What is the highest level of education you completed?,Please specify.1,...,how long do you wait on average to get a service; which service was that?,Do you consider the waiting time for lab test results long?,how long do you wait on average to get your lab test result?,Does the facility offer support groups?,Specify the support group you belong to,In your opinion are the services offered at this facility youth friendly?,What measures have been put in place to create GBV awareness and its harmful effects within the community?,Please Specify,PWD In your opinion are the services offered at this facility persons-with-disability friendly?,What are the top 1-3 things you don’t like about this facility with regards to care and treatment?
0,2390063,04-Dec-23,BABA DOGO HEALTH CENTRE,GOK,,Nairobi,1977-09-03,Male,Primary school,,...,,No,,Yes,Adults,Yes,Presence of GBV Desk;,Chiefs office,Yes,
1,2390062,04-Dec-23,BABA DOGO HEALTH CENTRE,GOK,,Nairobi,1972-08-12,Female,Secondary school,,...,,No,,No,,Yes,Presence of GBV Desk;,Chiefs office,,
2,2390061,04-Dec-23,BABA DOGO HEALTH CENTRE,GOK,,Nairobi,1984-08-31,Female,Primary school,,...,,Yes,2 hours,No,,Yes,Presence of GBV Desk;,Chiefs office,Yes,
3,2390060,04-Dec-23,BABA DOGO HEALTH CENTRE,GOK,,Nairobi,1977-05-07,Female,Primary school,,...,,No,,No,,Yes,Presence of GBV Desk;,Police station,,
4,2390059,04-Dec-23,BABA DOGO HEALTH CENTRE,GOK,,Nairobi,1987-06-13,Male,Vocational training or technician,,...,1 hour,Yes,2 hours,Yes,Adults,Yes,Presence of GBV Desk;,Police station,,


In [264]:
imonitor.shape

(46549, 85)

In [265]:
# Find and drop columns that contain "Please specify" or "Please Specify"
cols_to_drop = [col for col in imonitor.columns if "Please specify" in col or "Please Specify" in col]

# Drop these columns from the DataFrame in a single operation
imonitor.drop(cols_to_drop, axis=1, inplace=True)

In [266]:
imonitor.shape

(46549, 69)

In [267]:
column_name_mapping = {
    "Survey ID": "SurveyID",
    "Organization name coordinating the feedback from the clients": "OrgFeedbackCoordinator",
    "Created Date": "FeedbackDate",
    "Facility name and MFL Code if applicable": "FacilityName",
    "Facility ownership": "FacilityOwnership",
    "County": "FacilityCounty",
    "What is your month; and year of birth": "BirthMonthYear",
    "How do you consider yourself? ": "SelfIdentity",
    "What is the highest level of education you completed?": "EducationLevel",
    "What is your current marital status? ": "MaritalStatus",
    "Which county do you currently live in?": "ResidenceCounty",
    "What are your sources of income?": "IncomeSources",
    "For how long have you been accessing services (based on the expected package of services) in this facility?": "ServiceAccessDuration",
    "Are you aware of the package of services that you are entitled to?": "ServicesAwareness",
    "According to you; which HIV related services are you likely to receive in this facility?": "ExpectedHIVServices",
    "Is there a service that you needed that was not provided?": "UnprovidedService",
    "Facility name no service": "UnprovidedServiceFacilityName",
    "For that service that was not provided; were you referred?": "ReferralForUnprovidedService",
    "If referred; did you receive the service where you were referred to?": "ReferralServiceReceived",
    "If Yes which Service/Test/Medicine ": "ReceivedServiceDetail",
    "On a scale of 1 to 5; how satisfied are you with the package of services received in this facility? If 1 is VERY UNSATISFIED and 5 is VERY SATISFIED.": "ServiceSatisfaction",
    "What did you like about the services you received?": "ServicesLiked",
    "What did you not like about the services you received?": "ServicesDisliked",
    "In your opinion what would you like to be improved?": "ImprovementSuggestions",
    "Do you face any challenges when accessing the services at the facility?": "AccessChallenges",
    "Common issues that can be added in the drop-down box": "CommonIssuesDropdown",
    "In your opinion what can be done to improve access to the services you seek at the facility?": "AccessImprovementSuggestions",
    "Was confidentiality considered while you were being served?": "Confidentiality",
    "Are there age-appropriate health services for specific groups?": "AgeAppropriateServices",
    "Does the facility allow you to share your concerns with the administration?": "ConcernsSharing",
    "Do you know your health-related rights as a client of this facility?": "RightsAwareness",
    "Have you ever been denied services at this facility?": "ServiceDenial",
    "Facility name denied service": "ServiceDenialFacilityName",
    "Why": "ServiceDenialSpecify",
    "Are you comfortable with getting services at this facility": "ComfortWithServices",
    "Have you ever been counseled? ": "CounselingReceived",
    "Did you identify any gaps in the facility when you tried to access the services": "IdentifiedGaps",
    "Service type": "ServiceGapsType",
    "Are the HIV testing services readily available when required? ": "HIVTestingAvailability",
    "Have you ever Interrupted your treatment?": "TreatmentInterruption",
    "Are the PMTCT services readily available when required?": "PMTCTServiceAvailability",
    "Were reasons provided as to why these services were not available?": "PMTCTServiceNonavailabilityReasons",
    "Are the HIV prevention; testing; treatment and care services adequate for KPs? ": "KPServiceAdequacy",
    "Were reasons provided as to why these services were not available?.1": "KPServiceNonavailabilityReasons",
    "What are the barriers to uptake of VMMC by males 25+years and above?": "VMMCBarriers",
    "What are some of the current site level practices that community members like and would love to maintain for KP/PP ?": "KPCommunityPreferredPractices",
    "What would you like this facility to change/do better?": "ChangeSuggestions",
    "Throughout your visit what did you find interesting/pleasing about this facility that should be emulated by other facilities?": "PositiveObservations",
    "What do you think can be improved": "GeneralImprovementSuggestions",
    "Anything else that you would like to mention?": "AdditionalComments",
    "What are the top 1-3 things you like about this facility with regards to care and treatment? ": "TopFacilityFeatures",
    "Facility Level": "FacilityLevel",
    "Facility Operation times": "OperationTimes",
    "Facility Operation Days ": "OperationDays",
    "What are your preferred days of visiting the facility": "PreferredVisitDays",
    "What are your preferred time of visiting the facility": "PreferredVisitTimes",
    "According to you, which HIV related service/tests/medicine are you likely to receive in this facility?": "ExpectedHIVServices2",
    "Is there a service/test/medicine that you needed that was not provided?": "UnprovidedService2",
    "which service/test/medicine?": "UnprovidedServiceDetail",
    "On a scale of 1-5; how clean do you find the facility?": "FacilityCleanliness",
    "How do you reach this facility?": "FacilityAccessMode",
    "How long does it take to reach this facility?": "FacilityAccessTime",
    "On a scale of 1-5; how accessible do you find this facility?": "FacilityAccessibility",
    "Do you consider the waiting time to be seen at this facility long?": "WaitingTimeOpinion",
    "how long do you wait on average to get a service; which service was that?": "AverageWaitingTime",
    "Do you consider the waiting time for lab test results long?": "LabResultsWaitingTimeOpinion",
    "how long do you wait on average to get your lab test result?": "AverageLabResultsWaitingTime",
    "Does the facility offer support groups?": "SupportGroupAvailability",
    "Specify the support group you belong to": "SpecifySupportGroup",
    "In your opinion are the services offered at this facility youth friendly?": "YouthFriendlyServices",
    "What measures have been put in place to create GBV awareness and its harmful effects within the community? ": "GBVAwarenessMeasures",
    "PWD In your opinion are the services offered at this facility persons-with-disability friendly?": "PWDFriendlyServicesOpinion",
    "What are the top 1-3 things you don’t like about this facility with regards to care and treatment?": "TopFacilityDislikes"
}

df = imonitor.rename(columns=column_name_mapping)

In [268]:
for column in df.columns:
    print(column)

SurveyID
FeedbackDate
FacilityName
FacilityOwnership
FacilityCounty
BirthMonthYear
SelfIdentity
EducationLevel
MaritalStatus
ResidenceCounty
IncomeSources
ServiceAccessDuration
ServicesAwareness
ExpectedHIVServices
UnprovidedService
Facility name
ReferralForUnprovidedService
ReferralServiceReceived
ReceivedServiceDetail
ServiceSatisfaction
ServicesLiked
ServicesDisliked
ImprovementSuggestions
AccessChallenges
CommonIssuesDropdown
AccessImprovementSuggestions
Confidentiality
AgeAppropriateServices
ConcernsSharing
RightsAwareness
ServiceDenial
ServiceDenialFacilityName
ServiceDenialSpecify
ComfortWithServices
CounselingReceived
IdentifiedGaps
ServiceGapsType
HIVTestingAvailability
TreatmentInterruption
PMTCTServiceAvailability
PMTCTServiceNonavailabilityReasons
KPServiceAdequacy
KPServiceNonavailabilityReasons
VMMCBarriers
KPCommunityPreferredPractices
ChangeSuggestions
PositiveObservations
GeneralImprovementSuggestions
AdditionalComments
TopFacilityFeatures
FacilityLevel
OperationTimes


In [269]:
columns_to_clean1 = [
    'WaitingTimeOpinion',
    'LabResultsWaitingTimeOpinion'
]

def replace_dont_know(df, column):
    df[column] = df[column].replace("Dont Know", "I don't know", regex=False)
    return df

for column in columns_to_clean1:
    df = replace_dont_know(df, column)

In [270]:
columns_to_clean2 = [
    'FacilityCleanliness',
    'FacilityAccessibility'
    ]

def replace_mixed_with_text(df, column_name):
    def replace_value(value):
        satisfaction_map = {
            1: 'Very Unsatisfied',
            2: 'Unsatisfied',
            3: 'Okay',
            4: 'Satisfied',
            5: 'Very Satisfied'
        }
        if isinstance(value, str) and value[0].isdigit():
            num = int(value[0])
        elif isinstance(value, int):
            num = value
        else:
            return value

        return satisfaction_map.get(num, value)

    df[column_name] = df[column_name].apply(replace_value)
    return df

for column in columns_to_clean2:
    df = replace_mixed_with_text(df, column)

In [271]:
def standardize_satisfaction(df, column_name):
    # Mapping for consolidating variations of satisfaction levels
    satisfaction_map = {
        '5': 'Very Satisfied',
        5.0: 'Very Satisfied',
        '4': 'Satisfied',
        4.0: 'Satisfied',
        '3': 'Okay',
        3.0: 'Okay',
        '2': 'Unsatisfied',
        2.0: 'Unsatisfied',
        '1': 'Very Unsatisfied',
        1.0: 'Very Unsatisfied',
        'Dissatisfied': 'Unsatisfied'
    }
    
    # Replace values based on the map
    df[column_name] = df[column_name].replace(satisfaction_map)
    return df

df = standardize_satisfaction(df, 'ServiceSatisfaction')


In [272]:
print(df['FacilityLevel'].value_counts())

FacilityLevel
4.0    4802
3.0    4515
2.0    2889
5.0    2240
1.0     556
6.0      14
Name: count, dtype: int64


In [273]:
def standardize_facility(df, column_name):
    # Mapping for consolidating variations of satisfaction levels
    satisfaction_map = {
        1.0: 'Community Health Unit',
        2.0: 'Dispensaries and Private Clinics',
        3.0: 'Health Centers',
        4.0: 'Sub-County Hospitals',
        5.0: 'County Referral Hospitals',
        6.0: 'National Referral Hospitals',
    }
    
    # Replace values based on the map
    df[column_name] = df[column_name].replace(satisfaction_map)
    return df

df = standardize_facility(df, 'FacilityLevel')

In [274]:
def replace_symbols_and_words(df, column_name):
    df[column_name] = df[column_name].str.replace('<', 'Less than', regex=False)
    df[column_name] = df[column_name].str.replace('>', 'More than', regex=False)
    df[column_name] = df[column_name].str.replace('minutes', 'mins', regex=False)
    return df

df = replace_symbols_and_words(df, 'FacilityAccessTime')

In [275]:
def replace_symbols_and_words(df, column_name):
    df[column_name] = df[column_name].str.replace('Less than 30mins', 'Less than 30 mins', regex=False)
    df[column_name] = df[column_name].str.replace('More than45 mins', 'More than 45 mins', regex=False)
    return df

df = replace_symbols_and_words(df, 'FacilityAccessTime')

In [276]:
def remove_trailing_semicolons(df, column_names):
    for column in column_names:
        if column in df.columns:
            df[column] = df[column].str.rstrip(';')
    return df

df = pd.DataFrame(df)

columns_to_clean = ['ExpectedHIVServices', 'OperationTimes', 'OperationDays', 'PreferredVisitDays', 'PreferredVisitTimes', 'GBVAwarenessMeasures']

df2 = remove_trailing_semicolons(df, columns_to_clean)

In [277]:
def calculate_age(birth_date_str):
    if isinstance(birth_date_str, str):
        # Parse the birth date string into a datetime object
        birth_date = datetime.strptime(birth_date_str, "%Y-%m-%d")
        
        # Get today's date
        today = datetime.today()
        
        # Calculate age
        age = today.year - birth_date.year - ((today.month, today.day) < (birth_date.month, birth_date.day))
        
        return age
    else:
        # Return None or an appropriate value for missing or invalid input
        return None
    
df2['Age'] = df2['BirthMonthYear'].apply(calculate_age)

In [278]:
def remove_invalid_age_rows(df2, age_column):
    valid_age_condition = (df[age_column] >= 0) & (df[age_column] <= 100)
    
    # Keep only rows with valid age
    cleaned_df = df2[valid_age_condition].copy()
    
    return cleaned_df
df3 = remove_invalid_age_rows(df2, 'Age')

In [279]:
df3.drop(columns=['BirthMonthYear'], inplace=True)

In [280]:
def convert_mixed_dates(date_column):
    """
    This function takes a Pandas Series of mixed dates and Excel serial dates and converts them to datetime objects.
    
    Parameters:
    date_column (pd.Series): A pandas Series with mixed date formats and serial dates.
    
    Returns:
    pd.Series: A pandas Series with all dates converted to datetime objects.
    """
    # Define the epoch start for Excel's serial date format
    excel_epoch = pd.Timestamp('1899-12-30')
    converted_dates = []

    for date in date_column:
        if isinstance(date, str) and re.match(r'^\d+(\.\d+)?$', date):
            # If it's a string that looks like a serial date, convert it
            serial_value = float(date)
            converted_date = excel_epoch + pd.to_timedelta(serial_value, unit='D')
        elif isinstance(date, (int, float)):
            # If it's a numeric type, assume it's a serial date
            converted_date = excel_epoch + pd.to_timedelta(date, unit='D')
        else:
            # Otherwise, try to parse it as a regular date
            converted_date = pd.to_datetime(date, errors='coerce')

        # Append the result, which will be NaT if parsing failed
        converted_dates.append(converted_date)

    return pd.Series(converted_dates)

convert_mixed_dates(df3['FeedbackDate'])


0       2023-12-04
1       2023-12-04
2       2023-12-04
3       2023-12-04
4       2023-12-04
           ...    
41837   2022-10-04
41838   2022-10-03
41839   2022-10-04
41840   2022-10-03
41841   2022-09-28
Length: 41842, dtype: datetime64[ns]

In [281]:
df3.to_csv('data/cleaned.csv', index=False)

In [282]:
columns_to_keep = [
    'SurveyID', 'FeedbackDate', 'FacilityCounty', 'FacilityLevel', 'FacilityOwnership', 'ServicesLiked', 'ServicesDisliked', 'ImprovementSuggestions',
    'AccessImprovementSuggestions', 'PositiveObservations',
    'GeneralImprovementSuggestions', 'AdditionalComments', 'TopFacilityFeatures', 'ServiceSatisfaction', 'TopFacilityDislikes'
]

df_selected = df3[columns_to_keep]

df_selected.to_csv('data/clm_open_ended.csv', index=False)

In [283]:
missing_percentage = df3.isnull().mean() * 100

threshold = 60

columns_to_drop = missing_percentage[missing_percentage > threshold].index.tolist()

print("Columns to drop:", columns_to_drop)

print("Number of columns to drop:", len(columns_to_drop))

df3.drop(columns=columns_to_drop, axis=1, inplace=True)

print("DataFrame shape after dropping columns:", df3.shape)

Columns to drop: ['Facility name', 'ReferralForUnprovidedService', 'ReferralServiceReceived', 'ReceivedServiceDetail', 'CommonIssuesDropdown', 'ServiceDenialFacilityName', 'ServiceDenialSpecify', 'ServiceGapsType', 'HIVTestingAvailability', 'TreatmentInterruption', 'PMTCTServiceAvailability', 'PMTCTServiceNonavailabilityReasons', 'KPServiceAdequacy', 'KPServiceNonavailabilityReasons', 'VMMCBarriers', 'KPCommunityPreferredPractices', 'ChangeSuggestions', 'TopFacilityFeatures', 'FacilityLevel', 'OperationTimes', 'OperationDays', 'PreferredVisitDays', 'PreferredVisitTimes', 'FacilityCleanliness', 'FacilityAccessMode', 'FacilityAccessTime', 'FacilityAccessibility', 'WaitingTimeOpinion', 'AverageWaitingTime', 'LabResultsWaitingTimeOpinion', 'AverageLabResultsWaitingTime', 'SupportGroupAvailability', 'SpecifySupportGroup', 'YouthFriendlyServices', 'GBVAwarenessMeasures', 'PWDFriendlyServicesOpinion', 'TopFacilityDislikes']
Number of columns to drop: 37
DataFrame shape after dropping columns:

In [284]:
threshold_percentage = 50

threshold = len(df3.columns) * (threshold_percentage / 100)

data = df3.dropna(thresh=threshold)

print("Original DataFrame shape:", df3.shape)
print("Cleaned DataFrame shape:", data.shape)

rows_dropped = df3.shape[0] - data.shape[0]
print("Rows dropped:", rows_dropped)

Original DataFrame shape: (41842, 32)
Cleaned DataFrame shape: (41671, 32)
Rows dropped: 171


In [285]:
percent_empty = {}
for column in data.columns:
    # Calculate the number of empty values per column
    num_empty = data[column].isnull().sum()
    # Calculate the percentage of empty values
    percent_empty[column] = (num_empty / len(df3)) * 100
    # if percent_empty[column] > 1:
    print(f"{column}: {percent_empty[column]:.2f}%")

SurveyID: 0.00%
FeedbackDate: 0.00%
FacilityName: 6.97%
FacilityOwnership: 0.23%
FacilityCounty: 0.90%
SelfIdentity: 0.72%
EducationLevel: 0.60%
MaritalStatus: 0.67%
ResidenceCounty: 0.08%
IncomeSources: 0.16%
ServiceAccessDuration: 0.61%
ServicesAwareness: 0.83%
ExpectedHIVServices: 7.06%
UnprovidedService: 0.99%
ServiceSatisfaction: 0.56%
ServicesLiked: 1.02%
ServicesDisliked: 15.31%
ImprovementSuggestions: 8.25%
AccessChallenges: 0.70%
AccessImprovementSuggestions: 9.88%
Confidentiality: 1.49%
AgeAppropriateServices: 1.60%
ConcernsSharing: 1.56%
RightsAwareness: 0.84%
ServiceDenial: 0.91%
ComfortWithServices: 0.94%
CounselingReceived: 0.85%
IdentifiedGaps: 1.11%
PositiveObservations: 2.42%
GeneralImprovementSuggestions: 12.71%
AdditionalComments: 10.95%
Age: 0.00%


In [286]:
columns_null = [
    'FacilityName',
    'FacilityOwnership',
    'FacilityCounty',
    'SelfIdentity',
    'EducationLevel',
    'MaritalStatus',
    'ResidenceCounty',
    'IncomeSources',
    'ServiceAccessDuration',
    'ServicesAwareness',
    'ExpectedHIVServices',
    'UnprovidedService',
    'ServiceSatisfaction',
    'ServicesLiked',
    'ServicesDisliked',
    'ImprovementSuggestions',
    'AccessChallenges',
    'AccessImprovementSuggestions',
    'Confidentiality',
    'AgeAppropriateServices',
    'ConcernsSharing',
    'RightsAwareness',
    'ServiceDenial',
    'ComfortWithServices',
    'CounselingReceived',
    'IdentifiedGaps',
    'PositiveObservations',
    'GeneralImprovementSuggestions',
    'AdditionalComments'
]

for column in columns_null:
    if column in data.columns:
        data.fillna({column:'Unknown'}, inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [287]:
# General descriptive statistics
data.describe()

Unnamed: 0,SurveyID,Age
count,41671.0,41671.0
mean,992428.2,41.091167
std,1045821.0,13.072615
min,179148.0,0.0
25%,192427.0,32.0
50%,204400.0,40.0
75%,2341124.0,49.0
max,2390063.0,100.0


In [288]:
# Descriptive statistics for categorical data
data.describe(include=['object'])

Unnamed: 0,FeedbackDate,FacilityName,FacilityOwnership,FacilityCounty,SelfIdentity,EducationLevel,MaritalStatus,ResidenceCounty,IncomeSources,ServiceAccessDuration,...,AgeAppropriateServices,ConcernsSharing,RightsAwareness,ServiceDenial,ComfortWithServices,CounselingReceived,IdentifiedGaps,PositiveObservations,GeneralImprovementSuggestions,AdditionalComments
count,41671,41671,41671,41671,41671,41671,41671,41671,41671,41671,...,41671,41671,41671,41671,41671,41671,41671,41671,41671,41671
unique,13727,1536,7,11,8,9,10,13,32,5,...,3,3,3,3,3,4,3,14173,10806,11750
top,09-Oct-23,Unknown,GOK,Homabay,Female,Primary school,Married,Homabay,Business;,More than one year,...,Yes,Yes,Yes,No,Yes,Yes,No,Unknown,Unknown,Unknown
freq,2072,2918,33972,15937,28068,20242,24601,15597,8900,36676,...,36345,38846,36427,41132,40739,35017,38720,1011,5318,4582


In [289]:
fig = px.histogram(data, x='ServiceSatisfaction', title='Distribution of Service Satisfaction')
fig.update_layout(bargap=0.2)
fig.show()

In [290]:
recategorization_mapping = {
    'Very Satisfied': 3,
    'Satisfied': 3,
    'Okay': 2,
    'Unsatisfied': 1,
    'Very Unsatisfied': 3,
    'Unknown': 0,
    'Do not know': 0,
    'Prefer not to answer ': 0
}

# Apply the mapping to the 'ServiceSatisfaction' column
data['ServiceSatisfaction'] = data['ServiceSatisfaction'].replace(recategorization_mapping)

# Verify the changes
print(data['ServiceSatisfaction'].value_counts())

ServiceSatisfaction
3    40137
1      611
2      521
0      402
Name: count, dtype: int64



Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [291]:
model_data = data[data.ServiceSatisfaction != 0]

In [292]:
model_data = model_data.drop(['SurveyID', 'FeedbackDate', 'FacilityName', 'ResidenceCounty', 'ServicesLiked', 'ServicesDisliked', 'ImprovementSuggestions', 'AccessImprovementSuggestions', 'PositiveObservations', 'GeneralImprovementSuggestions', 'AdditionalComments'], axis=1)

In [294]:
target_var = 'ServiceSatisfaction'
# ordinal_vars = ['ServiceSatisfaction', 'FacilityCleanliness', 'FacilityAccessibility']
ordinal_vars = ['ServiceSatisfaction']
nominal_vars = [col for col in model_data.columns if model_data[col].dtype == 'object' and col not in ordinal_vars + ['Age']]

# Encode nominal variables using OneHotEncoder without specifying 'sparse' argument
one_hot_encoder = OneHotEncoder()
encoded_nominal_sparse = one_hot_encoder.fit_transform(data[nominal_vars])
# Convert the sparse matrix to a dense array
encoded_nominal = encoded_nominal_sparse.toarray()
# Create a DataFrame with the correct column names
encoded_nominal_data = pd.DataFrame(encoded_nominal, columns=one_hot_encoder.get_feature_names_out())

# Proceed with the rest of your code as before
# Encode ordinal variables using LabelEncoder
label_encoders = {}
for ordinal_var in ordinal_vars:
    label_encoders[ordinal_var] = LabelEncoder()
    data[ordinal_var] = label_encoders[ordinal_var].fit_transform(data[ordinal_var])

# Drop the original nominal columns from df and concatenate the new encoded_nominal_df
model_data = model_data.drop(nominal_vars, axis=1)
model_data = pd.concat([model_data, encoded_nominal_data], axis=1)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [295]:
X = model_data.drop('ServiceSatisfaction', axis=1)
y = model_data['ServiceSatisfaction']

In [296]:
# Split the data into training and testing sets (70% train, 15% validation, 15% test)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [297]:
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Validate the model
y_val_pred = rf_model.predict(X_val)
print("Random Forest Validation Metrics:")
print(classification_report(y_val, y_val_pred))

ValueError: Input y contains NaN.

In [None]:
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)

# Validate the model
y_val_pred = gb_model.predict(X_val)
print("Gradient Boosting Validation Metrics:")
print(classification_report(y_val, y_val_pred))

In [None]:
# Assume rf_model was the best
y_test_pred = rf_model.predict(X_test)
print("Random Forest Test Metrics:")
print(classification_report(y_test, y_test_pred))