Task: Predict the onset of diabetes based on diagnostic measures.

**Data**

This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. The objective of the dataset is to diagnostically predict whether or not a patient has diabetes, based on certain diagnostic measurements included in the dataset. Several constraints were placed on the selection of these instances from a larger database. 
The dataset consists of several medical predictor variables and one target variable, Outcome. Predictor variables include the number of pregnancies the patient has had, their BMI, insulin level, age, and so on.

### Imports

In [None]:
# !pip install xgboost

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, f1_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import mlflow.xgboost
import mlflow.sklearn
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Read the diabetes CSV into a DataFrame, then show its first rows
# followed by its (rows, columns) shape.
df = pd.read_csv('pima-indians-diabetes.csv')
for preview in (df.head(), df.shape):
    display(preview)

### Exploratory Data Analysis

In [None]:
# Pairwise correlation between all columns, drawn as an annotated heatmap
corr_matrix = df.corr()
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='RdYlGn',
    linewidths=0.9,
)
plt.title('Correlation between variables');

In [None]:
# Count of missing (NaN/None) entries in each column
df.isna().sum()

In [None]:
# Bar chart of how many rows fall into each Outcome class
# (value_counts orders the bars from most to least frequent)
outcome_counts = df['Outcome'].value_counts()
outcome_counts.plot(kind='bar', color=['green', 'red'])
plt.title('Distribution of the target variable');

In [None]:
# Distribution of the features: one histogram (with KDE overlay) per subplot

fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(20, 10))
fig.subplots_adjust(hspace=0.5)
# Colors
sns.set_palette('RdYlGn')
# Pair each subplot with a column. zip stops at the shorter sequence, so only
# the first 8 columns (one per subplot) are drawn.
for ax, column in zip(axes.flatten(), df.columns):
    sns.histplot(df[column], ax=ax, kde=True, fill=True)
    # Title the subplot itself: plt.title() targets the *current* axes, which
    # stays on the last subplot created, so the other panels were left untitled.
    ax.set_title(column)

### Train and Test split

In [None]:
# Columns 0-7 are used as predictors, column 8 as the target
X, y = df.iloc[:, :8], df.iloc[:, 8]

In [None]:
# Show the first target value as a quick sanity check of the selection
y.iloc[:1]

In [None]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# Report the shape of every piece of the split
split_parts = (
    ('X_train shape: ', X_train),
    ('X_test shape: ', X_test),
    ('y_train shape: ', y_train),
    ('y_test shape: ', y_test),
)
for label, part in split_parts:
    print(label, part.shape)

### Instantiating the model

In [None]:
# Baseline classifier with default hyper-parameters.
# 'logloss' is the binary log-loss metric; 'mlogloss' is its multiclass
# counterpart and does not match this task's two-class target. The metric is
# only evaluated when an eval_set is supplied, so the fitted model is unchanged.
model = XGBClassifier(eval_metric='logloss')

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# predict() already returns class labels, so no rounding is needed;
# tolist() keeps `predictions` a plain list for the cells below.
predictions = y_pred.tolist()

In [None]:
# Per-class precision / recall / F1 of the baseline model on the test split
baseline_report = classification_report(y_test, predictions)
print(baseline_report)

In [None]:
# Confusion matrix of the baseline predictions, labelled with the model's classes
cm = confusion_matrix(y_true=y_test, y_pred=predictions)
disp = ConfusionMatrixDisplay(cm, display_labels=model.classes_)
disp.plot();

### Using GridSearch with GPU for hyper-parameter optimisation

In [None]:
# Hyper-parameter combinations explored by the grid search.
# eval_metric is deliberately not part of the grid: it only controls how
# validation data is scored (none is passed to fit here), so varying it would
# double the number of fits without changing any trained model.
param_grid = {
    'max_depth': [3, 5],
    'learning_rate': [0.1, 0.01],
}

# MLFlow tracking: autolog records the search and registers the logged model
# under `model_name`.
model_name = "first_model"
mlflow.set_experiment('Grid Search')
mlflow.sklearn.autolog(log_input_examples=True, log_model_signatures=True,
                       log_models=True, registered_model_name=model_name)


# The context manager ends the run on exit, so no explicit mlflow.end_run()
# is required afterwards.
with mlflow.start_run() as run:
    # device='cuda' requests GPU training (assumes a CUDA-enabled XGBoost
    # build and an available GPU - TODO confirm for the target environment).
    # 'logloss' is the binary-classification log-loss metric.
    xgb = XGBClassifier(device='cuda', verbosity=2, eval_metric='logloss')
    grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=5)
    grid_search.fit(X_train, y_train)

In [None]:
# Winning hyper-parameter combination from the grid search
print("Best parameters found: ", grid_search.best_params_)

# Score the refitted best estimator on the held-out test split
tuned_predictions = grid_search.predict(X_test)
tuned_report = classification_report(y_test, tuned_predictions)
print("Classification report with best parameters: ", tuned_report)

#### Loading the best parameters model

In [None]:
# Load the most recently registered version of the model as a generic pyfunc.
# A hard-coded version number would keep returning the first registration
# even after the grid search is re-run and newer versions are registered.
# NOTE: this rebinds `model`, replacing the baseline classifier defined earlier.
model = mlflow.pyfunc.load_model(model_uri=f"models:/{model_name}/latest")

# Get sample for testing the loaded model
sample = X_test.sample(5)
sample

In [None]:
# Compare the loaded model's output with the true labels of the sampled rows
loaded_predictions = model.predict(sample)
actual_outcomes = y_test[sample.index]
print("Model prediction: ", loaded_predictions)
print("Actual values: \n", actual_outcomes)