In [None]:
import os

import joblib
import mlflow
import pandas as pd
from mlflow.tracking import MlflowClient
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, log_loss
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBClassifier


In [None]:
# MLflow tracking server configuration.
# `gcp_bucket` is a placeholder that no visible cell reads.
gcp_bucket = ''

# Allow the tracking server to be overridden through the MLFLOW_TRACKING_URI
# environment variable; fall back to the local server when it is not set.
MLFLOW_TRACKING_URI = os.environ.get("MLFLOW_TRACKING_URI", "http://localhost:5000")

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [None]:
# Experiment name and the GCS prefix under which its artifacts are stored.
experiment_name = 'stroke-predictor'
gcs_bucket_name = "gcp-mlflow-artifacts"
gcs_artifact_location = f"gs://{gcs_bucket_name}/mlartifacts/"

# Create the experiment on first use so that it gets the GCS artifact
# location; an experiment that already exists is left untouched.
experiment = mlflow.get_experiment_by_name(experiment_name)
experiment_is_missing = experiment is None
if experiment_is_missing:
    mlflow.create_experiment(
        name=experiment_name,
        artifact_location=gcs_artifact_location,
    )

# Make it the active experiment for every run started below.
mlflow.set_experiment(experiment_name)

In [None]:
# Load the stroke dataset from the repository's data directory.
stroke_csv_path = "../data/healthcare-dataset-stroke-data.csv"
df = pd.read_csv(stroke_csv_path)

In [None]:
# Preview the first five rows (the default row count of head()).
df.head()

In [None]:
# Column dtypes; as the cell's last expression it is displayed by the notebook,
# so no print() wrapper is needed.
df.dtypes

In [None]:
# Integer codes for the low-cardinality string columns.
binary_encodings = {
    'gender': {'Male': 0, 'Female': 1, 'Other': -1},
    'ever_married': {'Yes': 1, 'No': 0},
    'Residence_type': {'Urban': 1, 'Rural': 0},
}

for column, codes in binary_encodings.items():
    # Only encode columns that still hold the raw labels. Re-running this cell
    # on already-encoded integers would map every value to NaN, because the
    # integers are not keys of the mapping.
    if not pd.api.types.is_numeric_dtype(df[column]):
        df[column] = df[column].map(codes)

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Impute missing `bmi` values with a linear regression that is fitted on the
# rows where bmi is known, using every other column except the target and id.
feature_cols = [col for col in df.columns if col not in ['bmi', 'stroke', 'id']]

categorical_cols = ['work_type', 'smoking_status']
numerical_cols = [
    col for col in feature_cols
    if df[col].dtype in ['float64', 'int64'] and col not in categorical_cols
]

# Split the rows by whether bmi is present.
missing_bmi = df[df['bmi'].isna()]
not_missing_bmi = df[df['bmi'].notna()]

# Scale the numeric features and one-hot encode the categorical ones.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)

# Preprocessing followed by the regressor that predicts bmi.
bmi_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Fit on the rows with a known bmi.
y_train = not_missing_bmi['bmi']
X_train = not_missing_bmi[feature_cols]

bmi_pipe.fit(X_train, y_train)

# Predict and fill only when there is something to impute. After the first
# execution no bmi is missing any more, and asking scikit-learn to predict on
# zero rows raises a ValueError, which used to break a re-run of this cell.
X_missing = missing_bmi[feature_cols]
if not X_missing.empty:
    predicted_bmi = bmi_pipe.predict(X_missing)
    # The predictions are in the same row order as missing_bmi.index.
    df.loc[missing_bmi.index, 'bmi'] = predicted_bmi

In [None]:
# Re-count missing values per column after the bmi imputation.
df.isna().sum()

In [None]:
# Train the stroke classifier inside one MLflow run, log its metrics and
# artifacts, then register the logged model and tag the new version.
with mlflow.start_run() as run:
    run_id = run.info.run_id

    categorical_cols = ['work_type', 'smoking_status']
    numerical_cols = ['bmi', 'age', 'avg_glucose_level']

    # One-hot encode the categorical features and standardise the numeric ones.
    # NOTE(review): only the five columns listed above are passed to the
    # transformer; every other column of X (e.g. hypertension, heart_disease,
    # gender) is not listed and so does not reach the model with the default
    # ColumnTransformer settings. Confirm that this is intentional.
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
            ('num', StandardScaler(), numerical_cols),
        ]
    )

    # Preprocessing followed by the gradient-boosted classifier.
    xgb_model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', XGBClassifier())
    ])

    # Separate features and target.
    X = df.drop('stroke', axis=1)
    y = df['stroke']

    xgb_model_pipeline.fit(X, y)

    # NOTE(review): both metrics are computed on the same rows the model was
    # fitted on, so they describe training fit, not generalisation.
    y_pred = xgb_model_pipeline.predict(X)
    # log_loss scores predicted probabilities; feeding it hard 0/1 labels
    # yields a clipped, meaningless value.
    y_proba = xgb_model_pipeline.predict_proba(X)
    log_loss_ = log_loss(y, y_proba)
    accuracy = accuracy_score(y, y_pred)

    mlflow.log_metric('log_loss', log_loss_)
    mlflow.log_metric('accuracy', accuracy)

    # Persist the fitted pipeline locally and attach the file to the run.
    os.makedirs("../models", exist_ok=True)
    joblib.dump(xgb_model_pipeline, "../models/model_pipeline.pkl")
    mlflow.log_artifact("../models/model_pipeline.pkl")

    # Log the pipeline with MLflow's sklearn flavor.
    mlflow.sklearn.log_model(
        sk_model=xgb_model_pipeline,
        name="model"
    )

    # Register the logged model as a new version of "xgb_model".
    model_uri = f"runs:/{run_id}/model"
    result = mlflow.register_model(model_uri, "xgb_model")

    # Tag the new model version with its lifecycle stage. The value must not
    # carry surrounding whitespace, otherwise exact-match lookups on the tag
    # will miss it.
    client = MlflowClient()
    client.set_model_version_tag(
        name="xgb_model",
        version=result.version,
        key="stage",
        value="development"  # or "production"
    )


In [None]:
# URI of the model logged by the training run above.
# That run ended when its `with` block exited, so mlflow.active_run() returns
# None here; reuse the run_id captured inside the block instead.
model_uri = f"runs:/{run_id}/model"