In [0]:
# Installing snowflake connector
!pip install snowflake

In [0]:
# Restart the Python process after the package install in the previous cell;
# presumably required so the newly installed package is importable — TODO confirm.
%restart_python

In [0]:
# Importing libraries
import os
import sys
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import snowflake.connector
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
import json

In [0]:
# Snowflake connection settings, read from the "sf_connection" secret scope
# via dbutils.secrets (nothing is hard-coded here).
username, password, account, warehouse, database = (
    dbutils.secrets.get(scope="sf_connection", key=secret_key)
    for secret_key in (
        "sf_username",
        "sf_password",
        "sf_account",
        "sf_warehouse",
        "sf_database",
    )
)

# Optional: mirror the values into same-named environment variables.
# NOTE(review): this also places the password in the process environment;
# drop it if nothing reads these variables.
os.environ.update(
    {
        "sf_username": username,
        "sf_password": password,
        "sf_account": account,
        "sf_warehouse": warehouse,
        "sf_database": database,
    }
)

In [0]:

# Connection options handed to the Spark "snowflake" data source.
# NOTE(review): database and warehouse are hard-coded below even though
# `database` / `warehouse` secrets are fetched earlier — confirm which is intended.
sfOptions = dict(
    sfURL=account,
    sfUser=username,
    sfPassword=password,
    sfDatabase="mlops",
    sfSchema="public",
    sfWarehouse="COMPUTE_WH",
)

# Read the "ice_cream" table from Snowflake into a Spark DataFrame.
snowflake_reader = spark.read.format("snowflake").options(**sfOptions)
df = snowflake_reader.option("dbtable", "ice_cream").load()

# Convert the Spark DataFrame to pandas, then render it in the notebook.
pdf = df.toPandas()
display(pdf)

def load_data(data=None):
    """Split the dataset into the model feature matrix and target.

    Args:
        data: Optional pandas DataFrame containing at least the ``TEMP`` and
            ``PRICE`` columns. When omitted, the notebook-level ``pdf`` frame
            is used, so existing ``load_data()`` calls keep working.

    Returns:
        A ``(X, y)`` tuple where ``X`` is a one-column DataFrame (``TEMP``)
        and ``y`` is the ``PRICE`` Series.

    Raises:
        KeyError: If ``TEMP`` or ``PRICE`` is missing from the frame.
    """
    # Fall back to the notebook-level frame only when no frame is passed in.
    frame = pdf if data is None else data

    # Fail early with a message naming every missing column.
    missing = [col for col in ("TEMP", "PRICE") if col not in frame.columns]
    if missing:
        raise KeyError(f"Missing required column(s): {missing}")

    X = frame[["TEMP"]]  # double brackets keep X two-dimensional (DataFrame)
    y = frame["PRICE"]
    return X, y

def split_data(X, y, test_size=0.5, random_state=6):
    """Split features and target into train and test partitions.

    Args:
        X: Feature matrix.
        y: Target values aligned with ``X``.
        test_size: Forwarded to ``train_test_split``; defaults to ``0.5``,
            the value this notebook has always used.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible; defaults to ``6``.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

def model_train(X_train, y_train):
    """Fit a scikit-learn ``LinearRegression`` on the training data.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.

    Returns:
        The fitted ``LinearRegression`` estimator.
    """
    # ``fit`` returns the estimator itself, so it can be returned directly.
    return LinearRegression().fit(X_train, y_train)

   
if __name__ == "__main__":
    # Load the data, split it, and fit the regression model on the training part.
    X, y = load_data()
    X_train, X_test, y_train, y_test = split_data(X, y)
    model = model_train(X_train, y_train)

    # Output directory for the trained model (a Workspace path, not DBFS).
    output_dir = "/Workspace/Users/mahesh2139@gmail.com/Model"
    os.makedirs(output_dir, exist_ok=True)

    # Persist the fitted model with joblib.
    model_path = f"{output_dir}/model.pkl"
    joblib.dump(model, model_path)

    # Report the file that was actually written; only the model is saved here,
    # the test split is not persisted.
    print(f"Model saved to {model_path}")