In [None]:
import pandas as pd
data = pd.read_csv('data.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Match_ID              100 non-null    int64 
 1   Elderly_ID            100 non-null    object
 2   Caregiver_ID          100 non-null    object
 3   Elderly_Age           100 non-null    int64 
 4   Elderly_Gender        100 non-null    object
 5   Health_Condition      88 non-null     object
 6   Care_Needs_Level      100 non-null    object
 7   Elderly_Location      98 non-null     object
 8   Caregiver_Location    100 non-null    object
 9   Caregiver_Age         100 non-null    int64 
 10  Caregiver_Gender      100 non-null    object
 11  Caregiver_Experience  97 non-null     object
 12  Languages_Spoken      100 non-null    object
 13  Match_Success         100 non-null    int64 
dtypes: int64(4), object(10)
memory usage: 11.1+ KB


In [3]:
# Display the first five rows of the dataset
data.head()

Unnamed: 0,Match_ID,Elderly_ID,Caregiver_ID,Elderly_Age,Elderly_Gender,Health_Condition,Care_Needs_Level,Elderly_Location,Caregiver_Location,Caregiver_Age,Caregiver_Gender,Caregiver_Experience,Languages_Spoken,Match_Success
0,1,E001,C005,78,Female,Diabetes,High,Nairobi,Nairobi,45,Female,10,English;Swahili,1
1,2,E002,C002,82,Male,Hypertension,Medium,Mombasa,Mombasa,38,Male,5,English,0
2,3,E003,C001,65,Female,,Low,Kisumu,Kisumu,30,Female,2,English,1
3,4,E004,C004,90,Female,Alzheimer,High,Nakuru,Nakuru,50,Female,20,English;Swahili,1
4,5,E005,C003,74,Male,Heart Disease,High,Eldoret,Eldoret,42,Male,8,English;Kikuyu,0


DATA CLEANING

In [4]:
# Count duplicate rows
duplicate_count = data.duplicated().sum()

# Display the number of duplicate rows found
print(f"Number of duplicate rows: {duplicate_count}")


Number of duplicate rows: 0


In [5]:
# Loop over all columns of type object (strings) and trim whitespace
for col in data.select_dtypes(include='object').columns:
    data[col] = data[col].str.strip()

# Check the cleaned columns by viewing the first few rows
data.head()


Unnamed: 0,Match_ID,Elderly_ID,Caregiver_ID,Elderly_Age,Elderly_Gender,Health_Condition,Care_Needs_Level,Elderly_Location,Caregiver_Location,Caregiver_Age,Caregiver_Gender,Caregiver_Experience,Languages_Spoken,Match_Success
0,1,E001,C005,78,Female,Diabetes,High,Nairobi,Nairobi,45,Female,10,English;Swahili,1
1,2,E002,C002,82,Male,Hypertension,Medium,Mombasa,Mombasa,38,Male,5,English,0
2,3,E003,C001,65,Female,,Low,Kisumu,Kisumu,30,Female,2,English,1
3,4,E004,C004,90,Female,Alzheimer,High,Nakuru,Nakuru,50,Female,20,English;Swahili,1
4,5,E005,C003,74,Male,Heart Disease,High,Eldoret,Eldoret,42,Male,8,English;Kikuyu,0


In [6]:
# Correct known typos in the Health_Condition column
typo_corrections = {
    'Diabtes': 'Diabetes',
    'Artrhitis': 'Arthritis',
    'Arzheimer': 'Alzheimer'
}
data['Health_Condition'] = data['Health_Condition'].replace(typo_corrections)

# Display the unique values to verify corrections
print("Unique Health_Condition values:", data['Health_Condition'].unique())


Unique Health_Condition values: ['Diabetes' 'Hypertension' nan 'Alzheimer' 'Heart Disease' 'Arthritis'
 "Parkinson's"]


In [7]:
# Replace inconsistent delimiters (-, /, ,) with a semicolon
data['Languages_Spoken'] = data['Languages_Spoken'].replace({'-': ';', '/': ';', ',': ';'}, regex=True)

# Remove any extra semicolons and trim them from the ends
data['Languages_Spoken'] = data['Languages_Spoken'].str.replace(r';+', ';', regex=True).str.strip(';')

# Optional: Standardize language order (so "Swahili;English" becomes "english;swahili")
def standardize_languages(languages):
    langs = sorted([lang.strip().lower() for lang in languages.split(';') if lang.strip()])
    return ';'.join(langs)

data['Languages_Spoken'] = data['Languages_Spoken'].apply(standardize_languages)

# View cleaned languages
data['Languages_Spoken'].head()


0    english;swahili
1            english
2            english
3    english;swahili
4     english;kikuyu
Name: Languages_Spoken, dtype: object

In [10]:

# Convert columns to numeric (non-numeric values become NaN)
data['Elderly_Age'] = pd.to_numeric(data['Elderly_Age'], errors='coerce')
data['Caregiver_Age'] = pd.to_numeric(data['Caregiver_Age'], errors='coerce')
data['Caregiver_Experience'] = pd.to_numeric(data['Caregiver_Experience'], errors='coerce')

# Check conversion results
print(data.dtypes)


Match_ID                  int64
Elderly_ID               object
Caregiver_ID             object
Elderly_Age               int64
Elderly_Gender           object
Health_Condition         object
Care_Needs_Level         object
Elderly_Location         object
Caregiver_Location       object
Caregiver_Age             int64
Caregiver_Gender         object
Caregiver_Experience    float64
Languages_Spoken         object
Match_Success             int64
dtype: object


In [13]:
# check number of null values per column
print(data.isnull().sum())

Match_ID                0
Elderly_ID              0
Caregiver_ID            0
Elderly_Age             0
Elderly_Gender          0
Health_Condition        0
Care_Needs_Level        0
Elderly_Location        0
Caregiver_Location      0
Caregiver_Age           0
Caregiver_Gender        0
Caregiver_Experience    0
Languages_Spoken        0
Match_Success           0
dtype: int64


FEATURE ENGINEERING

In [14]:
# Create a new column 'Location_Match': 1 if Elderly_Location and Caregiver_Location match, else 0
data['Location_Match'] = (data['Elderly_Location'].str.lower() == data['Caregiver_Location'].str.lower()).astype(int)

# Verify the new feature
data[['Elderly_Location', 'Caregiver_Location', 'Location_Match']].head()


Unnamed: 0,Elderly_Location,Caregiver_Location,Location_Match
0,Nairobi,Nairobi,1
1,Mombasa,Mombasa,1
3,Nakuru,Nakuru,1
4,Eldoret,Eldoret,1
5,Thika,Thika,1


 Encode Categorical Variables and Create Features

In [16]:
# Import necessary libraries
import pandas as pd

# Assuming 'data' is your cleaned DataFrame from previous cells
# Drop unnecessary identifier columns and one-hot encode categorical features
# Updated categorical columns list now includes 'Caregiver_Gender'
categorical_cols = ['Elderly_Gender', 'Caregiver_Gender', 'Health_Condition', 
                    'Care_Needs_Level', 'Elderly_Location', 'Caregiver_Location', 'Languages_Spoken']

# One-hot encoding transforms categorical variables into numerical columns that machine learning models can understand.
data_encoded = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

# Define features (X) and target (y). The target is Match_Success.
X = data_encoded.drop(['Match_ID', 'Elderly_ID', 'Caregiver_ID', 'Match_Success'], axis=1)
y = data_encoded['Match_Success']

# Display first few rows of the feature set
X.head()


Unnamed: 0,Elderly_Age,Caregiver_Age,Caregiver_Experience,Location_Match,Elderly_Gender_Male,Caregiver_Gender_Male,Caregiver_Gender_male,Health_Condition_Arthritis,Health_Condition_Diabetes,Health_Condition_Heart Disease,...,Caregiver_Location_Mombasa,Caregiver_Location_Murang'a,Caregiver_Location_Nairobi,Caregiver_Location_Nakuru,Caregiver_Location_Nyeri,Caregiver_Location_Thika,Languages_Spoken_english;kikuyu,Languages_Spoken_english;luhya,Languages_Spoken_english;swahili,Languages_Spoken_english;swahili;swahili
0,78,45,10.0,1,False,False,False,False,True,False,...,False,False,True,False,False,False,False,False,True,False
1,82,38,5.0,1,True,True,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
3,90,50,20.0,1,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,True,False
4,74,42,8.0,1,True,True,False,False,False,True,...,False,False,False,False,False,False,True,False,False,False
5,68,36,4.0,1,False,False,False,False,True,False,...,False,False,False,False,False,True,False,False,False,False


TRAIN TEST SPLIT

In [17]:
from sklearn.model_selection import train_test_split

# Split the dataset into training (80%) and testing (20%) sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Verify the shapes of your splits
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)


X_train shape: (68, 50)
X_test shape: (18, 50)


DECISION TREE CLASSIFIER

In [18]:
from sklearn.tree import DecisionTreeClassifier

# Initialize and train the Decision Tree model.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Make predictions on the test set.
y_pred_dt = dt_model.predict(X_test)

# Display a quick accuracy check.
from sklearn.metrics import accuracy_score
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))


Decision Tree Accuracy: 0.9444444444444444


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix

model = DecisionTreeClassifier()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")

Accuracy: 0.9444444444444444
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.86      0.92         7
           1       0.92      1.00      0.96        11

    accuracy                           0.94        18
   macro avg       0.96      0.93      0.94        18
weighted avg       0.95      0.94      0.94        18



NAIVE BAYES CLASSIFIER

In [22]:
from sklearn.naive_bayes import GaussianNB

# Initialize and train the Naive Bayes model.
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

# Make predictions on the test set.
y_pred_nb = nb_model.predict(X_test)

# Display accuracy for Naive Bayes.
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))


Naive Bayes Accuracy: 0.7222222222222222


EVALUATION

In [23]:
from sklearn.metrics import classification_report

# Evaluate Decision Tree
print("Decision Tree Classification Report:")
print(classification_report(y_test, y_pred_dt))

# Evaluate Naive Bayes
print("Naive Bayes Classification Report:")
print(classification_report(y_test, y_pred_nb))


Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.86      0.92         7
           1       0.92      1.00      0.96        11

    accuracy                           0.94        18
   macro avg       0.96      0.93      0.94        18
weighted avg       0.95      0.94      0.94        18

Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.58      1.00      0.74         7
           1       1.00      0.55      0.71        11

    accuracy                           0.72        18
   macro avg       0.79      0.77      0.72        18
weighted avg       0.84      0.72      0.72        18



save models for deployment

In [24]:
import pickle

# Save the Decision Tree model
with open('dt_model.pkl', 'wb') as dt_file:
    pickle.dump(dt_model, dt_file)

# Save the Naive Bayes model
with open('nb_model.pkl', 'wb') as nb_file:
    pickle.dump(nb_model, nb_file)

print("Models saved as 'dt_model.pkl' and 'nb_model.pkl'")


Models saved as 'dt_model.pkl' and 'nb_model.pkl'


use for new predictions

In [25]:
# Example: Use one of the models to predict a new match
# Note: New data must be preprocessed (cleaned and encoded) similarly to the training data.

# For demonstration, we'll use an existing row from X_test as a new example.
new_example = X_test.iloc[0].values.reshape(1, -1)

# Predictions using both models
dt_prediction = dt_model.predict(new_example)
nb_prediction = nb_model.predict(new_example)

print("Decision Tree Prediction for new example:", dt_prediction[0])
print("Naive Bayes Prediction for new example:", nb_prediction[0])


Decision Tree Prediction for new example: 0
Naive Bayes Prediction for new example: 0


