# In [None]:
import os # https://docs.python.org/3/library/os.html
import pandas as pd # https://pandas.pydata.org/docs/user_guide/10min.html
import h2o # https://docs.h2o.ai/h2o/latest-stable/h2o-docs/automl.html
import matplotlib.pyplot as plt # https://matplotlib.org/
from sklearn.preprocessing import LabelEncoder # https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error # https://scikit-learn.org/stable/api/sklearn.metrics.html
import numpy as np # https://numpy.org/
from h2o.estimators.xgboost import H2OXGBoostEstimator # https://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science/xgboost.html

from h2o.automl import H2OAutoML # AutoML functionality from H2O

# Initialize H2O runtime
h2o.init()

# Load Data
# https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
folder_path = '' # Your file path

# List all CSV files in the folder.
# os.listdir() returns entries in arbitrary order, so sort the names to keep
# the row order of the combined DataFrame reproducible between runs.
csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv')) # https://docs.python.org/3/library/os.html

if not csv_files:
    # Fail early with a clear message; pd.concat() on an empty list would
    # otherwise raise an opaque "No objects to concatenate" ValueError.
    raise FileNotFoundError(f"No CSV files found in folder: {folder_path!r}")

# Load every CSV file into its own DataFrame
dfs = [pd.read_csv(os.path.join(folder_path, csv_file)) for csv_file in csv_files]

# Combine all CSVs into a single DataFrame with a fresh 0..n-1 index
full_df = pd.concat(dfs, ignore_index=True) # https://pandas.pydata.org/docs/reference/api/pandas.concat.html

# Parse the Date column; unparseable values are coerced to NaT and the
# corresponding rows are removed from full_df in place.
# https://pandas.pydata.org/docs/reference/api/pandas.to_datetime.html
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.dropna.html
parsed_dates = pd.to_datetime(full_df['Date'], errors='coerce')
full_df['Date'] = parsed_dates
full_df.dropna(subset=['Date'], inplace=True)

# Encode each Symbol string as an integer id; the fitted encoder is kept so
# the ids can be mapped back to symbol names later.
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
encoder = LabelEncoder()
symbol_ids = encoder.fit_transform(full_df['Symbol'])
full_df['Symbol_Encoded'] = symbol_ids

# Modelling frame: same rows as full_df, without the Symbol and Name columns
df_encoded = full_df.drop(['Symbol', 'Name'], axis=1)

# Result accumulators: one metrics dict per symbol, and one
# actual-vs-predicted dict per evaluated test row
summary_list, actual_predicted_data = [], []

# Distinct encoded symbol ids, i.e. one entry per cryptocurrency
# https://pandas.pydata.org/docs/reference/api/pandas.unique.html
unique_symbols = pd.unique(full_df['Symbol_Encoded'])

# Train and evaluate one H2O XGBoost model per cryptocurrency.
#
# For every encoded symbol the rows are sorted by date and split 80/20
# chronologically; the model is trained on the first 80% and evaluated on the
# first `horizon_days` distinct dates of the remaining 20%. Metrics are
# appended to `summary_list`, per-row actual/predicted values to
# `actual_predicted_data`, and a plot is shown for each symbol.

# Loop-invariant configuration
target_column = 'Close'
time_column = 'Date'
horizon_days = 7  # Number of distinct test dates evaluated per symbol
# Every column except the target and the date is used as a predictor
features = [col for col in df_encoded.columns if col not in (target_column, time_column)] # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.columns.html

for symbol_id in unique_symbols:
    symbol_name = encoder.inverse_transform([symbol_id])[0]
    print(f"Processing: {symbol_name}")

    crypto_df = df_encoded[df_encoded['Symbol_Encoded'] == symbol_id].sort_values(by=time_column) # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sort_values.html

    # Split 80/20 chronologically
    split_index = int(len(crypto_df) * 0.8)
    train_df = crypto_df.iloc[:split_index] # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.iloc.html
    full_test_df = crypto_df.iloc[split_index:]

    # Take only the first `horizon_days` distinct dates from the test set
    test_dates = full_test_df[time_column].unique() # https://pandas.pydata.org/docs/reference/api/pandas.unique.html
    if len(test_dates) < horizon_days:
        print(f"Skipping {symbol_name}, not enough test data.")
        continue

    # crypto_df is sorted by date and unique() keeps first-appearance order,
    # so the leading entries are the earliest test dates.
    first_dates = test_dates[:horizon_days]
    # .copy() so the 'Predicted' column added below is written to an
    # independent frame rather than to a slice of crypto_df.
    test_df = full_test_df[full_test_df[time_column].isin(first_dates)].copy() # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.isin.html

    # Convert to H2OFrame
    train_h2o = h2o.H2OFrame(train_df) # https://docs.h2o.ai/h2o/latest-stable/h2o-py/docs/frame.html
    test_h2o = h2o.H2OFrame(test_df)

    # Train a single XGBoost model with cross-validation
    xgb = H2OXGBoostEstimator( # https://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science/xgboost.html
        nfolds=5,              # 5-fold cross-validation
        seed=42,               # Reproducibility
        max_runtime_secs=3600, # Max training time: 1 hour
    )
    xgb.train(x=features, y=target_column, training_frame=train_h2o)

    # Predict on test data; take the first (only) prediction column as a 1-D
    # array so it is assigned positionally to the test rows.
    test_predictions = xgb.predict(test_h2o)
    test_df['Predicted'] = test_predictions.as_data_frame().iloc[:, 0].to_numpy() # https://docs.h2o.ai/h2o/latest-stable/h2o-py/docs/frame.html

    # Evaluate
    y_true = test_df[target_column].values
    y_pred = test_df['Predicted'].values

    mae = mean_absolute_error(y_true, y_pred) # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_absolute_error.html
    mse = mean_squared_error(y_true, y_pred) # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html
    rmse = np.sqrt(mse) # https://numpy.org/doc/2.1/reference/generated/numpy.sqrt.html
    mape = mean_absolute_percentage_error(y_true, y_pred) * 100 # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_absolute_percentage_error.html

    # Record the metrics for this crypto
    summary_list.append({ # https://docs.python.org/3/tutorial/datastructures.html
        "Symbol": symbol_name,
        "Test MAE": mae,
        "Test MSE": mse,
        "Test RMSE": rmse,
        "Test MAPE (%) 7d H2O XGB": mape,
    })

    # Record one actual-vs-predicted row per evaluated test observation
    actual_predicted_data.extend(
        {
            "Symbol": symbol_name,
            "Date": date,
            "Actual": actual,
            "Predicted": predicted,
        }
        for date, actual, predicted in zip(test_df[time_column], test_df[target_column], test_df['Predicted'])
    )

    # Plot predictions
    # https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.plot.html
    # https://matplotlib.org/stable/tutorials/index.html
    plt.figure(figsize=(12, 6))
    plt.plot(train_df[time_column], train_df[target_column], color='blue', label="Train Data")
    plt.plot(test_df[time_column], test_df[target_column], color='green', label=f"Actual ({horizon_days} days)")
    plt.plot(test_df[time_column], test_df['Predicted'], linestyle='dashed', color='red', label=f"Predicted ({horizon_days} days)")
    plt.xlabel("Date")
    plt.ylabel("Close Price")
    plt.title(f"{symbol_name} - {horizon_days}-Day Prediction")
    plt.legend()
    plt.xticks(rotation=45)
    plt.grid()
    plt.show()

# Turn the accumulated result dicts into DataFrames
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html
summary_df = pd.DataFrame(summary_list)
actual_predicted_df = pd.DataFrame(actual_predicted_data)

# (console heading, frame, output file) for each result table
reports = (
    ("\n Summary Metrics for 7-Day Predictions:", summary_df, "H2O_XGB_7d_metrics.csv"),
    ("\n Actual vs Predicted (First 7 Test Days):", actual_predicted_df, "H2O_XGB_7d_predictions.csv"),
)

# Print both tables first ...
for heading, frame, _ in reports:
    print(heading)
    print(frame)

# ... then write each one to its CSV file, without the index column
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html
for _, frame, csv_name in reports:
    frame.to_csv(csv_name, index=False)

# Shutdown H2O
# The module-level h2o.shutdown() is deprecated; shut the cluster down through
# the cluster handle instead. prompt=False skips the interactive confirmation.
h2o.cluster().shutdown(prompt=False) # https://docs.h2o.ai/h2o/latest-stable/h2o-py/docs/index.html
    