In [0]:
# Install the required package
%pip install azure-storage-blob


import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import joblib
from azure.storage.blob import BlobServiceClient
import numpy as np

# Define connection string and container name.
# The connection string embeds the storage account key, so it must never be
# committed with the notebook; it is read from the environment instead
# (populate the variable from a secret store such as a Databricks secret scope).
# NOTE(security): the account key that used to be hard-coded here has been
# exposed in source control and should be rotated.
connection_string = os.environ.get("AZURE_STORAGE_CONNECTION_STRING")
if not connection_string:
    raise RuntimeError(
        "AZURE_STORAGE_CONNECTION_STRING is not set; provide the storage "
        "account connection string via the environment before running this notebook."
    )
container_name = "amldata"

# Initialize BlobServiceClient
blob_service_client = BlobServiceClient.from_connection_string(connection_string)

# Ensure the /dbfs/mnt/ directory exists.
# exist_ok=True makes the call idempotent and removes the check-then-create
# race: a directory created by another process between an exists() test and
# makedirs() would otherwise raise FileExistsError.
mnt_dir = "/dbfs/mnt/"
os.makedirs(mnt_dir, exist_ok=True)

def _download_blob_to_file(blob_client, file_path):
    """Stream a blob into a local file and return the local path.

    Args:
        blob_client: Client for the blob to download, as returned by
            ``blob_service_client.get_blob_client``.
        file_path: Local destination path; an existing file is overwritten.

    Returns:
        ``file_path``, so the call can be passed straight to a loader.
    """
    with open(file_path, "wb") as local_file:
        # readinto() writes the download into the open file handle instead of
        # materialising the whole blob in memory first, as readall() would.
        blob_client.download_blob().readinto(local_file)
    return file_path


# 1. Load the Model from Azure Blob Storage
model_blob_path = "models/random_forest_model.joblib"
model_blob_client = blob_service_client.get_blob_client(container=container_name, blob=model_blob_path)
model_file_path = os.path.join(mnt_dir, "random_forest_model.joblib")

# Download and load the model.
# NOTE(security): joblib.load unpickles arbitrary objects, so the container
# must only hold artifacts written by a trusted training job.
rf_loaded = joblib.load(_download_blob_to_file(model_blob_client, model_file_path))

# 2. Fetch the User's Data from Azure Blob Storage
data_blob_path = "raw_data/data.csv"
data_blob_client = blob_service_client.get_blob_client(container=container_name, blob=data_blob_path)
data_file_path = os.path.join(mnt_dir, "data.csv")

# Download the user's data and load it into a DataFrame
new_data = pd.read_csv(_download_blob_to_file(data_blob_client, data_file_path))

# 3. Separate 'From Bank', 'Account', 'To Bank', 'Account.1' into a different DataFrame
bank_info_df = new_data[['From Bank', 'Account', 'To Bank', 'Account.1']]

# Retain only the relevant features for prediction. The explicit copy makes
# new_data an independent frame, so the column assignments performed on it
# later do not raise pandas' SettingWithCopyWarning.
new_data = new_data[['Amount Received', 'Amount Paid', 'Payment Format', 'Year', 'Month']].copy()

# 4. Preprocess the Data
categorical_features = ['Payment Format']  # Update with actual categorical features

# Load the label encoders from storage (same trust caveat as the model:
# the file is unpickled by joblib).
label_encoders_path = "models/label_encoders.joblib"
label_encoders_blob_client = blob_service_client.get_blob_client(container=container_name, blob=label_encoders_path)
label_encoders_file_path = os.path.join(mnt_dir, "label_encoders.joblib")

label_encoders = joblib.load(_download_blob_to_file(label_encoders_blob_client, label_encoders_file_path))

# Encode categorical features, extending each encoder with labels it has not
# seen before so that transform() does not raise on them.
# NOTE(review): codes assigned to unseen labels were never observed by the
# model during training; confirm this fallback is acceptable for scoring.
for feature in categorical_features:
    if feature not in new_data.columns:
        continue

    encoder = label_encoders.get(feature)
    if encoder is None:
        print(f"Label encoder for feature '{feature}' not found.")
        continue

    # Compare and transform the *same* string representation: computing the
    # unseen set on raw values while transforming astype(str) values lets
    # NaN / non-string entries slip through and fail inside transform().
    values = new_data[feature].astype(str)

    # Sort so the codes given to unseen labels are reproducible across runs
    # (set iteration order of strings is not stable between processes).
    new_labels = sorted(set(values.unique()) - set(encoder.classes_))
    if new_labels:
        encoder.classes_ = np.concatenate([encoder.classes_, new_labels])

    new_data[feature] = encoder.transform(values)

# 5. Generate Predictions
# Score the prepared feature frame with the loaded model.
predictions = rf_loaded.predict(new_data)

# 6. Decode the 'Payment Format' Column
# Restore the original labels of every categorical column that has an encoder;
# columns that are absent or have no encoder are left untouched.
for column in categorical_features:
    if column not in new_data.columns:
        continue
    column_encoder = label_encoders.get(column)
    if not column_encoder:
        continue
    new_data[column] = column_encoder.inverse_transform(new_data[column])

# 7. Combine Predictions with the Original DataFrame
# Attach the predictions to the feature frame, then place the bank/account
# columns and the feature columns side by side.
new_data['predictions'] = predictions
final_df = pd.concat(
    [bank_info_df, new_data],
    axis=1,
)

# 8. Save Results to Azure Blob Storage
# Write the combined frame to a local CSV (without the index column) ...
result_file_path = os.path.join(mnt_dir, "predictions_with_results.csv")
final_df.to_csv(result_file_path, index=False)

# ... then upload that file, replacing any blob already stored at this path.
result_blob_path = "predictions/predictions_with_results.csv"
result_blob_client = blob_service_client.get_blob_client(
    container=container_name,
    blob=result_blob_path,
)
with open(result_file_path, "rb") as result_file:
    result_blob_client.upload_blob(result_file, overwrite=True)

print("Predictions have been saved and uploaded successfully.")


Note: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.
Note: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.
Predictions have been saved and uploaded successfully.
