In [None]:
# Goal of this notebook: demonstrate ML experiment tracking, model building,
# model selection, and model registration using MLflow, either on a local
# machine or on a hyperscaler (AWS, Azure, GCP).
#
# Environment setup: upgrade pip through the %pip magic rather than a shell
# command, so the upgrade targets the environment of the running kernel.
get_ipython().run_line_magic('pip', 'install --upgrade pip')

Collecting pip
  Downloading pip-25.3-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.3-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.3


In [None]:
# Install the two third-party packages used by this notebook in a single call.
# The %pip magic installs into the environment of the running kernel, whereas a
# shell `pip` may resolve to a different Python installation.
get_ipython().run_line_magic('pip', 'install mlflow scikit-learn')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns



In [None]:
# MLOps Pipeline with MLflow and Docker
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 1. Data preparation
# X (features) and y (labels) are not created in this cell: they must already
# exist in the kernel namespace, otherwise the split below raises NameError.
# A fixed random_state keeps the split identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 2. Start MLflow run
# The context manager ends the run even if training, evaluation, or logging
# raises, so a failed execution cannot leave an active run behind that would
# make the next mlflow.start_run() call fail.
with mlflow.start_run() as run:
    # 3. Train model (seeded so the logged accuracy is reproducible)
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # 4. Evaluate model on the held-out split
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)

    # 5. Log metrics and model
    mlflow.log_metric("accuracy", accuracy)
    mlflow.sklearn.log_model(model, "random_forest_model")

    # 6. Register model for deployment
    # The run id is read from the run object returned by start_run() instead of
    # querying the global active-run state.
    model_uri = f"runs:/{run.info.run_id}/random_forest_model"
    mlflow.register_model(model_uri, "production_model")

Successfully registered model 'production_model'.
Created version '1' of model 'production_model'.


In [None]:
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Select (creating it on first use) the experiment that groups these runs
mlflow.set_experiment("model-comparison")

# Hyperparameters, written once and shared by the logging loop and the estimator
rf_params = {"n_estimators": 100, "max_depth": 10}

with mlflow.start_run():
    # Record every hyperparameter on the run
    for param_name, param_value in rf_params.items():
        mlflow.log_param(param_name, param_value)

    # Fit the forest; X_train / y_train come from an earlier cell
    model = RandomForestClassifier(**rf_params)
    model.fit(X_train, y_train)

    # Score the held-out split and record the accuracy
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    mlflow.log_metric("accuracy", accuracy)

    # Log the fitted model under the name "model"
    mlflow.sklearn.log_model(model, "model")

2025/10/29 05:06:32 INFO mlflow.tracking.fluent: Experiment with name 'model-comparison' does not exist. Creating a new experiment.


In [None]:
# NOTE: mlflow and scikit-learn are already installed by an earlier cell of this
# notebook, so this install is redundant when the notebook is run top to bottom.
# The %pip magic targets the environment of the running kernel.
get_ipython().run_line_magic('pip', 'install mlflow scikit-learn')



In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

# Create dummy data from a seeded generator so that re-running the cell yields
# the same arrays; without a seed the data changes on every execution, which
# makes the seeded split below (random_state=42) pointless for reproducibility.
rng = np.random.default_rng(42)
X = rng.random((100, 10))         # 100 samples, 10 features, uniform in [0, 1)
y = rng.integers(0, 2, size=100)  # 100 samples, binary target (0 or 1)

# Split dummy data into training and testing sets (80% / 20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Create dummy data inside this cell so it does not depend on earlier cells.
# A seeded generator makes the arrays, and therefore the logged accuracy,
# reproducible across executions.
rng = np.random.default_rng(42)
X = rng.random((100, 10))         # 100 samples, 10 features, uniform in [0, 1)
y = rng.integers(0, 2, size=100)  # 100 samples, binary target (0 or 1)

# Split dummy data into training and testing sets (80% / 20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameters are defined once so the values logged to MLflow can never
# drift from the values actually passed to the estimator. random_state is part
# of the dict so the seed is recorded on the run as well.
rf_params = {"n_estimators": 100, "max_depth": 10, "random_state": 42}

# Start MLflow experiment
mlflow.set_experiment("model-comparison")
with mlflow.start_run() as run:
    # Kept as a module-level name: a later cell builds a model URI from run_id
    run_id = run.info.run_id

    # Log parameters (single batched call)
    mlflow.log_params(rf_params)

    # Train model
    model = RandomForestClassifier(**rf_params)
    model.fit(X_train, y_train)

    # Make predictions and log metrics
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    mlflow.log_metric("accuracy", accuracy)

    # Log model
    mlflow.sklearn.log_model(model, "model")

    # Register model for deployment
    model_uri = f"runs:/{run_id}/model"
    mlflow.register_model(model_uri, "production_model")

# The run is ended automatically when the 'with' block exits, even on error

Registered model 'production_model' already exists. Creating a new version of this model...
Created version '2' of model 'production_model'.


In [None]:
# Register the model logged by the run whose id is stored in run_id (set by the
# training cell). No metric-based selection happens here: the "best" model is
# simply the one from that run.
model_uri = "runs:/{}/model".format(run_id)
registered_version = mlflow.register_model(model_uri, "MyMLModel")

# Transition the version that was just created to Production.
# register_model creates a new version on every execution, so a hard-coded
# version=1 would keep promoting the first version when this cell is re-run.
# NOTE(review): MLflow documents model stages as deprecated in favor of model
# version aliases; consider migrating once the target MLflow version is fixed.
client = mlflow.tracking.MlflowClient()
client.transition_model_version_stage(
    name=registered_version.name,
    version=registered_version.version,
    stage="Production",
)

Successfully registered model 'MyMLModel'.
Created version '1' of model 'MyMLModel'.
  client.transition_model_version_stage( name="MyMLModel", version=1, stage="Production" )


<ModelVersion: aliases=[], creation_timestamp=1761714532018, current_stage='Production', deployment_job_state=None, description=None, last_updated_timestamp=1761714532029, metrics=[<Metric: dataset_digest=None, dataset_name=None, key='accuracy', model_id='m-becfaa461f8a468085d44956472efb10', run_id='a19a1981212d4fb88c65333f701afb5d', step=0, timestamp=1761714450415, value=0.8>], model_id='m-becfaa461f8a468085d44956472efb10', name='MyMLModel', params={'max_depth': '10', 'n_estimators': '100'}, run_id='a19a1981212d4fb88c65333f701afb5d', run_link=None, source='models:/m-becfaa461f8a468085d44956472efb10', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# --- 1. Dataset Generation (200,000 Samples) ---
# Simulating a "big" dataset (200,000 rows, 12 features) for a Classification task.
print("--- 1. Generating a Large Synthetic Dataset (200,000 rows) ---")
N_SAMPLES = 200000
N_FEATURES = 12  # 8 numerical + 4 categorical columns

# Seeded generator: every execution of this cell produces the same dataset
rng = np.random.default_rng(42)

# Generate Numerical Features (8 features); feature i has standard deviation i + 1
data_num = {
    f'num_feat_{i}': rng.standard_normal(N_SAMPLES) * (i + 1) for i in range(8)
}

# Generate Categorical Features (4 features)
# The region labels live in an object array so that np.nan stays a real missing
# value. In a plain list of strings numpy coerces np.nan to the literal string
# 'nan', which pandas and the imputer then treat as an ordinary category.
region_choices = np.array(['North', 'South', 'East', 'West', np.nan], dtype=object)
categories = {
    'city_region': rng.choice(region_choices, N_SAMPLES, p=[0.2, 0.3, 0.2, 0.2, 0.1]),
    'product_type': rng.choice(['A', 'B', 'C', 'D'], N_SAMPLES),
    'device_os': rng.choice(['iOS', 'Android', 'Web'], N_SAMPLES, p=[0.4, 0.3, 0.3]),
    'is_prime': rng.choice([True, False], N_SAMPLES, p=[0.15, 0.85])
}

df = pd.DataFrame({**data_num, **categories})

# Introduce some missing values (5% of rows each) into two numerical columns
for col in ['num_feat_1', 'num_feat_5']:
    missing_indices = rng.choice(N_SAMPLES, size=int(0.05 * N_SAMPLES), replace=False)
    df.loc[missing_indices, col] = np.nan

# Generate a Binary Target Variable (y), drawn independently of the features
df['target'] = rng.integers(0, 2, N_SAMPLES)
X = df.drop('target', axis=1)
y = df['target']

# Guard against the feature count drifting away from the documented constant
assert X.shape[1] == N_FEATURES, "unexpected number of feature columns"

print(f"Dataset Shape (X): {X.shape}")
print(f"Target Shape (y): {y.shape}")
print("-" * 60)
print(X.head())
print("-" * 60)


# --- 2. Data Preprocessing Pipeline (Scikit-learn ColumnTransformer) ---

# Partition the columns of X by dtype: numeric versus object/bool
numerical_features = list(X.select_dtypes(include=np.number).columns)
categorical_features = list(X.select_dtypes(include=['object', 'bool']).columns)

print(f"Numerical Features: {numerical_features}")
print(f"Categorical Features: {categorical_features}")
print("-" * 60)

# Numerical branch: fill gaps with the column mean, then standardize (z-score)
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill gaps with the constant 'missing', then one-hot encode.
# Categories not seen during fit are ignored at transform time, and the encoder
# returns a dense array.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its branch; columns in neither group would be
# passed through unchanged (there are none for this dataset)
column_routes = [
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes, remainder='passthrough')

print("Preprocessing Pipeline (ColumnTransformer) defined.")
print("-" * 60)


# --- 3. Apply Preprocessing and Split Data ---

print("--- 3. Applying Preprocessing and Splitting Data ---")

# Decide the train/test membership of every row BEFORE fitting anything.
# Fitting the preprocessor on the full dataset would let test-set statistics
# (imputation means, scaling parameters, one-hot categories) leak into training.
# Splitting row positions with the same test_size and random_state selects the
# same rows as splitting the data itself.
row_positions = np.arange(len(X))
train_positions, test_positions = train_test_split(
    row_positions, test_size=0.2, random_state=42
)

# Learn the preprocessing parameters from the training rows only
preprocessor.fit(X.iloc[train_positions])

# Transform the full dataset with the train-fitted preprocessor
X_processed_array = preprocessor.transform(X)

# Get feature names after one-hot encoding for the final DataFrame
ohe_feature_names = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_features)
final_feature_names = numerical_features + ohe_feature_names.tolist()

# Convert the processed array back to a DataFrame for easy inspection
X_processed = pd.DataFrame(X_processed_array, columns=final_feature_names)

print(f"Processed Dataset Shape: {X_processed.shape}")
print("Processed Dataset (First 5 Rows):")
print(X_processed.head())
print("-" * 60)

# Materialize the training and testing sets from the positions chosen above
X_train = X_processed.iloc[train_positions]
X_test = X_processed.iloc[test_positions]
y_train = y.iloc[train_positions]
y_test = y.iloc[test_positions]

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")
print("-" * 60)

# The 'preprocessor' object, fitted on the training rows only, is the artifact
# needed to clean new data before deployment. It is intended for use in the
# subsequent steps of the MLOps pipeline.

# NOTE: The resulting variables X_train, X_test, y_train, and y_test are ready
# for immediate use in training two different ML models, satisfying the
# requirements of the MLflow Model Building (2.) step.


--- 1. Generating a Large Synthetic Dataset (200,000 rows) ---
Dataset Shape (X): (200000, 12)
Target Shape (y): (200000,)
------------------------------------------------------------
   num_feat_0  num_feat_1  num_feat_2  num_feat_3  num_feat_4  num_feat_5  \
0    0.183776   -1.346512    2.008726    3.091149   -2.056032   -0.222408   
1    0.095921    1.415653   -3.107627    5.236909   -1.426230    1.512554   
2   -0.742375         NaN    0.277130   -3.741007   -1.897122   -9.563048   
3    0.974448    1.877190    3.259789    1.277742    4.171122   -7.728250   
4    0.305433   -3.391127   -1.903347    0.963442   -2.605184   -5.406868   

   num_feat_6  num_feat_7 city_region product_type device_os  is_prime  
0    1.806583    2.807676       South            C       Web     False  
1   -0.726231    8.212578       South            C   Android      True  
2    6.828250    1.566795        East            C       Web     False  
3   -3.006983   11.233883       South            A       Web 