In [0]:
# Notebook 1: Data Ingestion and Model Training with MLflow
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import mlflow
import mlflow.sklearn

# Folder that receives every CSV this notebook writes.
path = "/Workspace/Users/nikesh.kaza@accenture.com/mlops/data"

# Download the Boston Housing dataset and persist an unmodified copy
# (without the pandas index column) under `path`.
source_url = "https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv"
df = pd.read_csv(source_url)
df.to_csv(f"{path}/boston.csv", index=False)

# Split data: hold out 20% of the rows as the test set. The fixed
# random_state makes the partition reproducible across runs.
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Write each partition next to the raw data as <name>.csv (train first,
# then test), dropping the pandas index column.
for split_name, split_frame in (("train", train), ("test", test)):
    split_frame.to_csv(f"{path}/{split_name}.csv", index=False)

# Train model
# Features are every column except the target column "medv".
X_train = train.drop("medv", axis=1)
y_train = train["medv"]

# Use the run as a context manager so it is always closed, even when
# fitting or logging raises. A bare start_run()/end_run() pair leaves the
# run active after a failure, and the next start_run() then errors out.
with mlflow.start_run():
    model = LinearRegression().fit(X_train, y_train)
    mlflow.sklearn.log_model(model, "model")

    # NOTE: this RMSE is measured on the training rows themselves, so it
    # is a fit-quality number, not a held-out generalisation score.
    # The square root is taken explicitly because the `squared=False`
    # keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = mean_squared_error(y_train, model.predict(X_train)) ** 0.5
    mlflow.log_metric("rmse", rmse)

print(f"Model trained done and RMSE logged: {rmse}")
