<a href="https://colab.research.google.com/github/ogundipe/AkinwaleDataScience/blob/master/Dissertation_Modelling_XGBOOST.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install xgboost



Importing the required packages

In [None]:
import xgboost as xgb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import plot_importance
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error

Import dataset

In [None]:
# Load the raw dataset from a CSV file.
# NOTE(review): the path sits under /content/drive, which presumably requires
# Google Drive to be mounted first; no mount step is visible in this notebook.
df = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/peadiatric_update_Final.csv')

In [None]:
df.shape  # (number of rows, number of columns) of the freshly loaded dataset

In [None]:
# Columns that are not useful for modelling
columns_to_drop = ['Unnamed: 22', 'Unnamed: 23', 'Unnamed: 24', 'Unnamed: 25', 'patient_deceased_date']

# errors='ignore' keeps this cell re-runnable: because the result is assigned
# back to `df`, a second execution would otherwise raise KeyError for columns
# that were already removed.
df = df.drop(columns=columns_to_drop, errors='ignore')

In [None]:
# Summarise the remaining columns: names, non-null counts and dtypes
df.info()

**Convert date columns to datetime format**

In [None]:
# Treat the literal string "Null" as a missing value (pd.NA) in every column
df = df.replace(to_replace="Null", value=pd.NA)

In [None]:
# Names of the columns that hold dates
date_columns = [
    'art_start_date',
    'first_confirmed_hiv_test_date',
    'last_drug_pickup_date',
    'last_clinic_visit_date',
    'date_of_current_viral_load',
]

# Parse every date column as DD/MM/YYYY. errors='coerce' turns any value that
# cannot be parsed with that format (missing entries included) into NaT.
df = df.assign(**{
    name: pd.to_datetime(df[name], format='%d/%m/%Y', errors='coerce')
    for name in date_columns
})

In [None]:
# Derive calendar features (year, month, day, day of week, weekend flag) from
# each parsed date column. Rows whose date is missing (NaT) get NaN for every
# derived feature, including the weekend flag.
for column in date_columns:
    dates = df[column]
    day_of_week = dates.dt.dayofweek  # Monday=0 ... Sunday=6, NaN for NaT

    df[f'{column}_year'] = dates.dt.year
    df[f'{column}_month'] = dates.dt.month
    df[f'{column}_day'] = dates.dt.day
    df[f'{column}_day_of_week'] = day_of_week

    # isin() returns False for NaN, so without the mask a missing date would be
    # recorded as a weekday (0). Keep it missing instead, consistent with the
    # other derived columns. 5 and 6 represent Saturday and Sunday.
    is_weekend = day_of_week.isin([5, 6]).astype(float)
    df[f'{column}_is_weekend'] = is_weekend.where(dates.notna())

# Dropping the original date columns now that their features are extracted
df = df.drop(columns=date_columns)

In [None]:
# Boolean frame that is True wherever a cell is missing
missing_mask = df.isna()

# For every column, collect the distinct values found in its missing cells
# (i.e. which missing-value markers the column contains)
missing_values_dict = {
    column: df.loc[missing_mask[column], column].unique()
    for column in df.columns
}

# List only the columns that actually contain missing data
print("Missing Values in Each Column:")
for column, missing_values in missing_values_dict.items():
    if len(missing_values):
        print(f"{column}: {missing_values}")

**Converting categorical data using One-Hot Encoding**

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [None]:
# Create a label encoder object
# NOTE(review): label_encoder is not referenced by any later cell visible in
# this notebook; the target is encoded with an explicit class_mapping dict.
label_encoder = LabelEncoder()

In [None]:
# Features: every column except the target
X = df.drop(columns='currentStatus_28')

# Target: the column the model will learn to predict
y = df['currentStatus_28']

In [None]:
# One-hot encode the categorical feature columns; drop_first=True omits the
# first level of each encoded column to avoid a redundant indicator
X_encoded = pd.get_dummies(data=X, drop_first=True)

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Numeric encoding of the target labels: 'Active' -> 1, 'Inactive' -> 0
class_mapping = dict(Active=1, Inactive=0)

In [None]:
# Apply the mapping to convert the target variable to numeric format
y_train_numeric = y_train.map(class_mapping)
y_test_numeric = y_test.map(class_mapping)

# Series.map leaves NaN for any label that is not a key of class_mapping
# (missing targets included). Stop here with the offending labels instead of
# handing NaN targets to the model and the metric functions.
for split_name, labels, encoded in (
    ("train", y_train, y_train_numeric),
    ("test", y_test, y_test_numeric),
):
    unmapped_labels = labels[encoded.isna()].unique()
    if len(unmapped_labels):
        raise ValueError(
            f"{split_name} target contains labels not covered by class_mapping: "
            f"{list(unmapped_labels)}"
        )

In [None]:
# Build an XGBoost classifier with its default settings and fit it on the
# training features and the numeric training labels
model = xgb.XGBClassifier()
model.fit(X=X_train, y=y_train_numeric)


Evaluate the model

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import seaborn as sns  # Import Seaborn

In [None]:
# Predict class labels for the held-out rows
y_pred = model.predict(X_test)

# Score the predictions against the numeric test labels
accuracy, precision, recall, f1 = (
    scorer(y_test_numeric, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report each metric on its own line
for label, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-score:", f1),
):
    print(label, value)

In [None]:
# Metric names and their values, in matching order
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-score']
scores = [accuracy, precision, recall, f1]

# Bar chart of the four evaluation metrics
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(metrics, scores, color='skyblue')
ax.set_ylim(0, 1)  # all four metrics lie between 0 and 1
ax.set_title('Model Evaluation Metrics')
ax.set_xlabel('Metrics')
ax.set_ylabel('Score')
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Count predictions per (true label, predicted label) pair
conf_matrix = confusion_matrix(y_true=y_test_numeric, y_pred=y_pred)

# Header line followed by the matrix itself
print("Confusion Matrix:", conf_matrix, sep="\n")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Draw the confusion matrix as an annotated heatmap with integer cell labels
ax = sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Reds")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
ax.set_title("Confusion Matrix")
plt.show()

Computing feature importances

In [None]:
# Importance scores exposed by the fitted model; later paired with X_train.columns
feature_importances = model.feature_importances_

In [None]:
# Pair every importance score with the name of its feature
feature_names = X_train.columns
feature_importance_df = pd.DataFrame(
    dict(Feature=feature_names, Importance=feature_importances)
)

In [None]:
import seaborn as sns  # Import Seaborn

In [None]:
# To get the feature names
# NOTE(review): repeats the identical assignment made in the cell that builds
# feature_importance_df.
feature_names = X_train.columns


In [None]:
# Rank the features from most to least important
feature_importance_df = feature_importance_df.sort_values('Importance', ascending=False)

# Keep the ten highest-ranked features
top_10_features = feature_importance_df.iloc[:10]

print(top_10_features)

In [None]:
# Select the top 10 features and plot them on a graph
top_features = feature_importance_df.head(10)

# Horizontal bar plot of importance per feature using Seaborn
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=top_features, palette='viridis', ax=ax)
ax.set_xlabel('Importance')
ax.set_ylabel('Features')
ax.set_title('Top 10 Feature Importance')
fig.tight_layout()

plt.show()