In [0]:
!pip install azure-storage-blob
!pip install azureml-sdk
!pip install pandas
!pip install plotly
!pip install statsmodels


In [0]:
%restart_python

In [0]:
from azure.storage.blob import BlobServiceClient
import pandas as pd
import pickle
from datetime import datetime
from statsmodels.tsa.arima.model import ARIMA

connection_string = "ADL CONNECTION STRING"
container_name = "gold-forecasting-container"
blob_service_client = BlobServiceClient.from_connection_string(connection_string)

# Function to download the files from Azure Data Lake
def download_file_from_blob(container_name, blob_name, local_path):
    blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_name)
    try:
        with open(local_path, "wb") as file:
            file.write(blob_client.download_blob().readall())
        print(f"Downloaded: {blob_name}")
    except Exception as e:
        print(f"Error downloading {blob_name}: {str(e)}")

# Get the current year and month
current_date = datetime.now()
current_year_month = current_date.strftime('%y%m')

# Try to list the blobs in the 'transformed-data' container to find the correct file name
container_client = blob_service_client.get_container_client(container_name)
blobs = container_client.list_blobs(name_starts_with="transformed-data/")

# Find the correct file name by checking if it matches the year-month format
correct_file_name = None
for blob in blobs:
    if f"{current_year_month}_transformed.csv" in blob.name:
        correct_file_name = blob.name
        break

# If no file is found for the current month, fall back to the most recent file
if not correct_file_name:
    blobs = container_client.list_blobs(name_starts_with="transformed-data/")
    for blob in blobs:
        correct_file_name = blob.name
        break

if correct_file_name:
    download_file_from_blob(container_name, correct_file_name, f"{current_year_month}_transformed.csv")
else:
    print("No matching transformed data file found.")

# Download the latest price gold CSV file
download_file_from_blob(container_name, "latest_gold_history/latest_price_gold.csv", "latest_price_gold.csv")

# Loading the CSV files
recent_data = pd.read_csv(f"{current_year_month}_transformed.csv")
latest_gold_data = pd.read_csv("latest_price_gold.csv")

# Merge the two datasets of gold prices
merged_data = pd.concat([latest_gold_data, recent_data], ignore_index=True)

# Convert 'date' column to datetime, removing the strict format
merged_data['date'] = pd.to_datetime(merged_data['date'], errors='coerce')

# Drop rows with invalid dates if any
merged_data = merged_data.dropna(subset=['date'])

# Remove duplicates based on the 'date' column
merged_data = merged_data.drop_duplicates(subset=['date'])

# Sort data by 'date'
merged_data.sort_values(by='date', inplace=True)

# Set 'date' as the index
merged_data.set_index('date', inplace=True)

# Generate a continuous date range
date_range = pd.date_range(start=merged_data.index.min(), end=merged_data.index.max(), freq='D')

# Reindex the data with the continuous date range
merged_data = merged_data.reindex(date_range)

# Interpolate missing data linearly
merged_data['price'] = merged_data['price'].interpolate(method='linear')

# Reset index and rename
merged_data.reset_index(inplace=True)
merged_data.rename(columns={'index': 'date'}, inplace=True)

# Save the merged and cleaned data to a CSV
merged_data.to_csv("updated_latest_price_gold.csv", index=False)

# Upload the merged file back to Azure Data Lake
def upload_file_to_blob(local_path, container_name, blob_name):
    blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_name)
    with open(local_path, "rb") as data:
        blob_client.upload_blob(data, overwrite=True)

# Upload the updated latest gold price CSV
upload_file_to_blob("updated_latest_price_gold.csv", container_name, "latest_gold_history/latest_price_gold.csv")

# Retrain the model with the merged data
train_data = merged_data['price'] 

train_diff = train_data.diff().dropna()
model = ARIMA(endog=train_diff, order=(8, 0, 8)) 
model_fitted = model.fit()

# Save the retrained model
with open("retrained_model.pkl", "wb") as file:
    pickle.dump(model_fitted, file)

# Upload the retrained model to Azure Data Lake
upload_file_to_blob("retrained_model.pkl", container_name, "model/model_pickel.pkl")


Downloaded: transformed-data/202412_transformed.csv
Downloaded: latest_gold_history/latest_price_gold.csv


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
