<a href="https://colab.research.google.com/github/luiz-oliveir/lstm_mensal/blob/main/LSTM_VAE_Mensal_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/luiz-oliveir/LSTM_mensal.git

Cloning into 'LSTM_mensal'...
remote: Enumerating objects: 107, done.[K
remote: Counting objects: 100% (107/107), done.[K
remote: Compressing objects: 100% (106/106), done.[K
remote: Total 107 (delta 23), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (107/107), 14.21 MiB | 19.61 MiB/s, done.
Resolving deltas: 100% (23/23), done.


In [6]:
"""
LSTM VAE Model for Monthly Temperature Analysis - Google Colab Version
"""

# Mount Google Drive at /content/drive. The working directories defined later
# in this cell (lstm_root and its sub-folders) are built under
# /content/drive/MyDrive/LSTM, so the mount has to happen before any of them
# is touched.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

import os
import sys
import glob
import shutil
import logging
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Disable GPU for Colab (if causing memory issues).
# This stays best-effort (a failure must not stop the notebook), but the
# failure is now logged instead of being silently discarded. Catching
# Exception rather than using a bare except also lets KeyboardInterrupt and
# SystemExit propagate.
try:
    tf.config.set_visible_devices([], 'GPU')
except Exception as exc:
    logging.warning("Could not hide GPU devices; continuing with defaults: %s", exc)

# Directories (all located under the mounted Google Drive)
lstm_root = '/content/drive/MyDrive/LSTM'
base_dir = os.path.join(lstm_root, 'lstm mensal')
data_dir = os.path.join(lstm_root, 'Convencionais processadas temperaturas')
model_dir = os.path.join(base_dir, 'lstm_vae_model')
images_dir = os.path.join(base_dir, 'lstm_vae_images')
results_dir = os.path.join(base_dir, 'Resumo resultados')

def copy_data_file():
    """Ensure 82024.xlsx is present in the Drive data directory.

    If the file is already in ``data_dir`` nothing is copied. Otherwise a few
    fixed Colab/Drive locations are probed in order and the first hit is
    copied into ``data_dir``.

    Returns:
        bool: True when the file is available in ``data_dir`` (already there
        or just copied), False when it could not be found anywhere.
    """
    target_file = os.path.join(data_dir, '82024.xlsx')

    # Nothing to do when the workbook is already where the pipeline reads it.
    if os.path.exists(target_file):
        print(f"\nData file already exists in Google Drive: {target_file}")
        return True

    # Fallback locations, probed in this order.
    candidates = (
        '/content/82024.xlsx',  # Root Colab directory
        '/content/sample_data/82024.xlsx',  # Sample data directory
        '/content/drive/MyDrive/82024.xlsx',  # Google Drive root
    )
    source_file = next((path for path in candidates if os.path.exists(path)), None)

    if source_file is None:
        # Tell the user how to provide the missing file.
        for message in (
            "\nError: Could not find 82024.xlsx",
            "Please upload the file using one of these methods:",
            "1. Upload directly to Google Drive in the 'Convencionais processadas temperaturas' folder",
            "2. Upload to Colab using the file browser (left sidebar)",
            "\nFile required: 82024.xlsx",
        ):
            print(message)
        return False

    print(f"\nCopying data file from {source_file} to {target_file}")
    shutil.copy2(source_file, target_file)
    return True

# Setup Google Drive directories
def setup_drive_directories():
    """Make sure every working directory exists, reporting each one's status.

    Walks the module-level directory paths in parent-before-child order,
    creating the missing ones and printing whether each was created or was
    already present.
    """
    print("\nSetting up Google Drive directories...")

    required_dirs = (lstm_root, base_dir, data_dir, model_dir, images_dir, results_dir)
    for dir_path in required_dirs:
        if os.path.exists(dir_path):
            print(f"Directory exists: {dir_path}")
            continue
        os.makedirs(dir_path)
        print(f"Created directory: {dir_path}")

# Debug: report which working directories exist before anything is created
print("\nVerifying directories:")
for status_line in (
    f"LSTM root exists: {os.path.exists(lstm_root)}",
    f"Base dir exists: {os.path.exists(base_dir)}",
    f"Data dir exists: {os.path.exists(data_dir)}",
    f"Model dir exists: {os.path.exists(model_dir)}",
    f"Images dir exists: {os.path.exists(images_dir)}",
    f"Results dir exists: {os.path.exists(results_dir)}",
):
    print(status_line)

# Create whatever is missing
setup_drive_directories()

# Bring the workbook into the data directory; stop when it cannot be found
data_file_ready = copy_data_file()
if not data_file_ready:
    sys.exit(1)

# Confirm that the data directory really contains at least one workbook
excel_pattern = os.path.join(data_dir, "*.xlsx")
data_files = glob.glob(excel_pattern)
if len(data_files) == 0:
    print(f"\nError: No Excel files found in the data directory: {data_dir}")
    print("\nPlease ensure 82024.xlsx is in the correct location:")
    print(f"{os.path.join(data_dir, '82024.xlsx')}")
    sys.exit(1)

print(f"\nFound {len(data_files)} Excel file(s):")
for file in data_files:
    print(f"- {os.path.basename(file)}")

# Settings
row_mark = 740  # NOTE(review): not referenced anywhere else in the visible notebook; purpose unclear
batch_size = 128  # Batch size; also used as the minimum number of samples a month needs to be processed
timesteps = 7  # Time window (number of consecutive samples) used for analysis
n_features = 1  # Number of features (temperature)
latent_dim = 32  # Latent-space dimension
epoch_num = 100  # Number of epochs passed to model.fit
threshold = None  # NOTE(review): never read or reassigned in the visible notebook

# Month dictionary: month number -> abbreviated Portuguese month name.
# The abbreviations are embedded in the scaler/model/plot/result file names.
meses = {
    1:'jan', 2:'fev', 3:'mar', 4:'abr', 5:'mai', 6:'jun',
    7:'jul', 8:'ago', 9:'set', 10:'out', 11:'nov', 12:'dez'
}

class ReparameterizationLayer(tf.keras.layers.Layer):
    """Draws a latent sample as ``z_mean + exp(0.5 * z_log_var) * epsilon``.

    ``epsilon`` is standard-normal noise shaped from the first two dimensions
    of ``z_mean``.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, inputs):
        """Return a noisy sample for the ``(z_mean, z_log_var)`` pair in ``inputs``."""
        z_mean, z_log_var = inputs
        mean_shape = tf.shape(z_mean)
        # Noise uses only the first two dimensions of z_mean.
        noise = tf.random.normal(shape=(mean_shape[0], mean_shape[1]))
        sigma = tf.exp(0.5 * z_log_var)
        return z_mean + sigma * noise

class RepeatVectorLayer(tf.keras.layers.Layer):
    """Inserts a new axis 1 and repeats the input ``timesteps`` times along it."""

    def __init__(self, timesteps, **kwargs):
        super().__init__(**kwargs)
        self.timesteps = timesteps

    def call(self, inputs):
        """Return ``inputs`` expanded at axis 1 and repeated along that axis."""
        with_time_axis = tf.expand_dims(inputs, axis=1)
        return tf.repeat(with_time_axis, repeats=self.timesteps, axis=1)

    def get_config(self):
        """Return the base layer config extended with ``timesteps``."""
        base_config = super().get_config()
        return {**base_config, 'timesteps': self.timesteps}

class TemperatureWeightLayer(tf.keras.layers.Layer):
    """Turns each value into a weight that grows with its absolute z-score.

    Statistics are taken over axes 1 and 2 of the input, and the resulting
    weights are rescaled by their mean over the same axes.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, x):
        """Return ``exp(|z-score|)`` weights divided by their mean over axes 1 and 2."""
        K = tf.keras.backend
        reduce_axes = [1, 2]

        center = K.mean(x, axis=reduce_axes, keepdims=True)
        # epsilon keeps the division defined when the spread is zero
        spread = K.std(x, axis=reduce_axes, keepdims=True) + K.epsilon()

        raw_weights = tf.exp(tf.abs((x - center) / spread))
        normalizer = K.mean(raw_weights, axis=reduce_axes, keepdims=True) + K.epsilon()
        return raw_weights / normalizer

class LSTM_VAE(tf.keras.Model):
    """Stacked-LSTM sequence model used for the monthly temperature analysis.

    The forward pass is: LSTM -> three Dense layers -> LSTM -> last timestep
    repeated ``timesteps`` times -> Dense layers (+ dropout, layer norm) ->
    two single-unit Dense heads concatenated on the last axis.

    NOTE(review): despite the class name, ``call`` contains no latent sampling
    step, and ``latent_dim`` / ``n_features`` are only stored and returned by
    ``get_config``; they do not influence any layer built here.
    """

    def __init__(self, timesteps=7, n_features=1, latent_dim=32, **kwargs):
        """Create the layers.

        Args:
            timesteps: Number of repetitions applied by the RepeatVector layer,
                i.e. the length of the output sequence.
            n_features: Stored for ``get_config`` only.
            latent_dim: Stored for ``get_config`` only.
            **kwargs: Forwarded to ``tf.keras.Model``.
        """
        super(LSTM_VAE, self).__init__(**kwargs)
        self.timesteps = timesteps
        self.n_features = n_features
        self.latent_dim = latent_dim

        # First LSTM layer
        self.lstm = tf.keras.layers.LSTM(32, return_sequences=True, name='lstm')

        # First LSTM output processing
        self.dense = tf.keras.layers.Dense(4, name='dense')
        self.dense_1 = tf.keras.layers.Dense(4, name='dense_1')
        self.dense_2 = tf.keras.layers.Dense(32, name='dense_2')

        # Second LSTM layer
        self.lstm_1 = tf.keras.layers.LSTM(32, return_sequences=True, name='lstm_1')

        # Second LSTM output processing
        self.dense_3 = tf.keras.layers.Dense(32, name='dense_3')
        self.dense_4 = tf.keras.layers.Dense(16, name='dense_4')
        self.dense_5 = tf.keras.layers.Dense(1, name='dense_5')

        # Final processing branch
        self.dense_6 = tf.keras.layers.Dense(16, name='dense_6')
        self.dense_7 = tf.keras.layers.Dense(1, name='dense_7')

        # Additional layers
        # Uses the built-in Keras RepeatVector, not the RepeatVectorLayer class
        # defined earlier in this file.
        self.repeat_vector = tf.keras.layers.RepeatVector(timesteps, name='repeat_vector')
        self.dropout = tf.keras.layers.Dropout(0.2, name='dropout')
        self.layer_norm = tf.keras.layers.LayerNormalization(name='layer_normalization')

    def call(self, inputs, training=None):
        """Run the forward pass.

        Args:
            inputs: Input sequences; assumed to be shaped
                (batch, time, features) - TODO confirm against callers.
            training: When truthy, dropout is applied; otherwise the dropout
                layer is skipped entirely.

        Returns:
            Tensor whose last axis has size 2 (the two single-unit heads
            concatenated) and whose time axis has length ``timesteps``.
        """
        # First LSTM processing
        x = self.lstm(inputs)

        # Process first LSTM output
        x = self.dense(x)
        x = self.dense_1(x)
        x = self.dense_2(x)

        # Second LSTM processing
        x = self.lstm_1(x)

        # Get last timestep for repeat vector
        last_timestep = x[:, -1, :]
        x = self.repeat_vector(last_timestep)

        # Process repeated vector
        x = self.dense_3(x)
        x = self.dense_4(x)

        # Apply dropout during training
        if training:
            x = self.dropout(x)

        # Apply layer normalization
        x = self.layer_norm(x)

        # Generate outputs through two branches
        # output1 comes straight from the normalized features; output2 passes
        # through an extra Dense(16) first.
        output1 = self.dense_5(x)
        x = self.dense_6(x)
        output2 = self.dense_7(x)

        # Combine outputs
        outputs = tf.concat([output1, output2], axis=-1)

        return outputs

    def get_config(self):
        """Return the base model config extended with the constructor arguments."""
        config = super(LSTM_VAE, self).get_config()
        config.update({
            'timesteps': self.timesteps,
            'n_features': self.n_features,
            'latent_dim': self.latent_dim
        })
        return config

    @classmethod
    def from_config(cls, config):
        """Rebuild the model by passing every config entry to the constructor."""
        return cls(**config)

def reshape(da, window=None):
    """Slice a sequence into overlapping sliding windows for the LSTM.

    Window ``i`` is ``da[i:i + window]``; consecutive windows are shifted by
    one sample, so ``len(da) - window + 1`` windows are produced.

    Args:
        da: Sequence or array of samples, ordered in time.
        window: Window length. Defaults to the module-level ``timesteps``
            when omitted, which preserves the original one-argument behavior.

    Returns:
        np.ndarray: The stacked windows. When ``da`` is shorter than the
        window, no windows exist and an empty array is returned.
    """
    size = timesteps if window is None else window
    return np.array([da[start:start + size] for start in range(len(da) - size + 1)])

def prepare_training_data(data, batch_size=128):
    """Build a shuffled, batched tf.data pipeline of sliding windows.

    Args:
        data: Samples to window with ``reshape``.
        batch_size: Number of windows per batch.

    Returns:
        tf.data.Dataset yielding batches of windows shaped
        (timesteps, n_features). The windows are shuffled, so their order
        does not follow the order of ``data``.
    """
    windows = reshape(data).reshape(-1, timesteps, n_features)
    return (
        tf.data.Dataset.from_tensor_slices(windows)
        .shuffle(buffer_size=1024)
        .batch(batch_size)
    )

def calculate_advanced_metrics(predictions, originals):
    """Compute error, likelihood and percentile metrics for a set of predictions.

    Args:
        predictions: Predicted values (array-like), aligned element-wise with
            ``originals``.
        originals: Observed values (array-like).

    Returns:
        dict with the keys ``mse``, ``rmse``, ``mae``, ``mape`` (percent),
        ``log_likelihood`` (Gaussian log-likelihood of the residuals, using
        their own standard deviation) and one ``p{N}_error`` entry for each
        percentile N in 1, 5, 25, 50, 75, 95, 99 holding the absolute
        difference between the original and predicted percentile.

    Notes:
        ``mape`` divides by ``originals``, so an original value of zero makes
        it infinite; a zero residual standard deviation makes
        ``log_likelihood`` undefined (NaN).
    """
    predictions = np.asarray(predictions, dtype=float)
    originals = np.asarray(originals, dtype=float)
    residuals = predictions - originals

    mse = np.mean(np.square(residuals))
    rmse = np.sqrt(mse)
    mae = np.mean(np.abs(residuals))
    mape = np.mean(np.abs(residuals / originals)) * 100

    # Gaussian log-likelihood: sum_i [-0.5*log(2*pi*var) - r_i^2 / (2*var)]
    #                        = -0.5 * sum_i [r_i^2 / var + log(2*pi*var)].
    # The squared-residual term is divided by var, not 2*var, because the
    # factor 1/2 is already applied by the leading -0.5.
    variance = np.square(np.std(residuals))
    log_likelihood = -0.5 * np.sum(
        np.square(residuals) / variance + np.log(2 * np.pi * variance)
    )

    # Percentile metrics: each key is named after the percentile itself.
    # Distinct loop names are required here; reusing one name for both the
    # percentile and the predicted value would label the keys with predicted
    # values instead.
    percentiles = [1, 5, 25, 50, 75, 95, 99]
    orig_percentiles = np.percentile(originals, percentiles)
    pred_percentiles = np.percentile(predictions, percentiles)
    percentile_errors = {
        f'p{pct}_error': abs(orig_value - pred_value)
        for pct, orig_value, pred_value in zip(percentiles, orig_percentiles, pred_percentiles)
    }

    metrics = {
        'mse': mse,
        'rmse': rmse,
        'mae': mae,
        'mape': mape,
        'log_likelihood': log_likelihood,
        **percentile_errors
    }

    return metrics

def train_monthly_models(data_files):
    """Train one model per calendar month and save its artifacts.

    Reads the first workbook in ``data_files`` (columns ``Data`` and
    ``Temperatura`` are required) and, for every month that has at least
    ``batch_size`` samples: fits and pickles a MinMaxScaler, trains an
    ``LSTM_VAE`` on sliding windows of the scaled series, checkpoints the best
    model to ``model_dir`` and saves a loss plot to ``images_dir``.

    Args:
        data_files: List of Excel file paths; only the first entry is used.

    Raises:
        Exception: Any error raised while reading or training is logged,
            printed and re-raised.
    """
    print("\nStarting monthly model training...")
    print(f"Model directory: {model_dir}")
    print(f"Processing file: {os.path.basename(data_files[0])}")

    try:
        # Read data
        file_path = data_files[0]  # Only the first file is processed
        df = pd.read_excel(file_path)
        df['Data'] = pd.to_datetime(df['Data'])
        df = df.set_index('Data')

        # Sort index to ensure chronological order
        df = df.sort_index()
        print(f"Data range: {df.index.min().strftime('%Y-%m-%d')} to {df.index.max().strftime('%Y-%m-%d')}")

        # Process each month
        for month in range(1, 13):
            month_name = meses[month]
            print(f"\nProcessing month: {month_name}")

            # Filter data for current month
            month_data = df[df.index.month == month]['Temperatura'].values
            print(f"Number of samples for {month_name}: {len(month_data)}")

            if len(month_data) < batch_size:
                print(f"Warning: Insufficient data for month {month_name}. Minimum required: {batch_size}")
                continue

            # Scale data
            scaler = MinMaxScaler()
            month_data_scaled = scaler.fit_transform(month_data.reshape(-1, 1))

            # Save scaler
            scaler_path = os.path.join(model_dir, f'scaler_{month_name}.pkl')
            with open(scaler_path, 'wb') as f:
                pickle.dump(scaler, f)
            print(f"Saved scaler to: {os.path.basename(scaler_path)}")

            # Prepare training data as in-memory windows.
            # Keras supports validation_split only for array/tensor inputs,
            # not for a tf.data.Dataset, and a dataset that yields inputs
            # alone gives the 'mse' loss no target to compare against.
            windows = reshape(month_data_scaled).reshape(-1, timesteps, n_features)

            # Create and compile model
            model = LSTM_VAE(timesteps=timesteps, n_features=n_features, latent_dim=latent_dim)

            # Build model with input shape
            dummy_input = tf.zeros((1, timesteps, n_features))
            _ = model(dummy_input, training=False)

            # Compile model
            model.compile(
                optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                loss='mse',
                metrics=['mae']
            )

            # Callbacks
            # NOTE(review): saving a subclassed Model as a full HDF5 file is
            # not supported by every Keras version - confirm that this
            # checkpoint can be written and reloaded on the target runtime.
            model_path = os.path.join(model_dir, f'lstm_vae_model_{month_name}.h5')
            checkpoint = tf.keras.callbacks.ModelCheckpoint(
                model_path,
                monitor='val_loss',
                save_best_only=True,
                mode='min',
                verbose=1
            )

            early_stopping = tf.keras.callbacks.EarlyStopping(
                monitor='val_loss',
                patience=10,
                restore_best_weights=True
            )

            reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
                monitor='val_loss',
                factor=0.2,
                patience=5,
                min_lr=0.0001
            )

            # Train model as a reconstruction task: each window is its own
            # target. validation_split holds out the last 20% of the windows
            # (selected before shuffling), which provides the 'val_loss' all
            # three callbacks monitor.
            # NOTE(review): the model emits two channels per timestep while
            # the target has one, so the loss broadcasts the target across
            # both - confirm this is the intended objective.
            print(f"Training model for {month_name}...")
            history = model.fit(
                windows,
                windows,
                batch_size=batch_size,
                epochs=epoch_num,
                validation_split=0.2,
                shuffle=True,
                callbacks=[checkpoint, early_stopping, reduce_lr],
                verbose=1
            )

            # Plot training history
            plt.figure(figsize=(10, 6))
            plt.plot(history.history['loss'], label='Training Loss')
            plt.plot(history.history['val_loss'], label='Validation Loss')
            plt.title(f'Model Loss - {month_name}')
            plt.xlabel('Epoch')
            plt.ylabel('Loss')
            plt.legend()
            history_plot_path = os.path.join(images_dir, f'training_history_{month_name}.png')
            plt.savefig(history_plot_path)
            plt.close()

            print(f"Completed training for {month_name}")
            print(f"Model saved as: lstm_vae_model_{month_name}.h5")
            print(f"Training plot saved as: training_history_{month_name}.png")

    except Exception as e:
        logging.error(f"Error in training process: {str(e)}")
        print(f"Error during training: {str(e)}")
        raise

def _average_overlapping_windows(window_values, n_samples):
    """Collapse per-window values into one value per original sample.

    ``window_values[i, j]`` is taken to describe sample ``i + j``; every
    sample receives the mean of all window entries that describe it.
    """
    window_values = np.asarray(window_values, dtype=float)
    n_windows, window_len = window_values.shape
    totals = np.zeros(n_samples, dtype=float)
    counts = np.zeros(n_samples, dtype=float)
    for offset in range(window_len):
        totals[offset:offset + n_windows] += window_values[:, offset]
        counts[offset:offset + n_windows] += 1
    return totals / counts


def analyze_monthly_data(data_files):
    """Evaluate each monthly model on its month's data and export the results.

    For every month that has enough samples and a saved scaler and model, the
    series is scaled, split into sliding windows, passed through the model,
    and the first output channel is mapped back to one prediction per sample.
    Overall and per-year metrics are written to an Excel workbook in
    ``results_dir``.

    Args:
        data_files: List of Excel file paths; only the first entry is used.

    Raises:
        Exception: Any error raised during the analysis is logged, printed
            and re-raised.
    """
    print("\nStarting monthly data analysis...")
    print(f"Results directory: {results_dir}")
    print(f"Processing file: {os.path.basename(data_files[0])}")

    try:
        # Read data
        file_path = data_files[0]  # Only the first file is processed
        filename = os.path.basename(file_path)
        df = pd.read_excel(file_path)
        df['Data'] = pd.to_datetime(df['Data'])
        # Sort chronologically, as training does, so that each window covers
        # consecutive observations.
        df = df.set_index('Data').sort_index()
        print(f"Analyzing data from {df.index.min().strftime('%Y-%m-%d')} to {df.index.max().strftime('%Y-%m-%d')}")

        # Process each month
        for month in range(1, 13):
            month_name = meses[month]
            print(f"\nAnalyzing month: {month_name}")

            # Filter data for current month
            month_series = df.loc[df.index.month == month, 'Temperatura']
            month_data = month_series.values
            print(f"Number of samples: {len(month_data)}")

            if len(month_data) < batch_size:
                print(f"Warning: Insufficient data for month {month_name}. Skipping...")
                continue

            # Load scaler and model
            scaler_path = os.path.join(model_dir, f'scaler_{month_name}.pkl')
            model_path = os.path.join(model_dir, f'lstm_vae_model_{month_name}.h5')

            if not os.path.exists(scaler_path) or not os.path.exists(model_path):
                print(f"Warning: Model or scaler not found for {month_name}. Skipping...")
                continue

            print(f"Loading model and scaler for {month_name}")
            # SECURITY: pickle.load can execute arbitrary code; only load
            # scaler files written by this notebook's own training step.
            with open(scaler_path, 'rb') as f:
                scaler = pickle.load(f)

            model = tf.keras.models.load_model(
                model_path,
                custom_objects={
                    'LSTM_VAE': LSTM_VAE,
                    'ReparameterizationLayer': ReparameterizationLayer,
                    'RepeatVectorLayer': RepeatVectorLayer,
                    'TemperatureWeightLayer': TemperatureWeightLayer
                }
            )

            # Scale data and make predictions on ordered (unshuffled) windows,
            # so that window i really starts at sample i.
            month_data_scaled = scaler.transform(month_data.reshape(-1, 1))
            windows = reshape(month_data_scaled).reshape(-1, timesteps, n_features)
            window_predictions = model.predict(windows, batch_size=batch_size)

            # The model predicts a value for every timestep of every window,
            # i.e. several values per sample. Average them so there is exactly
            # one prediction per original observation, aligned with month_data.
            predictions_scaled = _average_overlapping_windows(
                window_predictions[:, :, 0], len(month_data)
            )
            predictions = scaler.inverse_transform(
                predictions_scaled.reshape(-1, 1)
            ).ravel()

            # Calculate metrics
            metrics = calculate_advanced_metrics(predictions, month_data)

            # Create yearly analysis
            yearly_metrics = {}
            month_years = month_series.index.year
            years = month_years.unique()
            print(f"Analyzing years: {', '.join(map(str, years))}")

            for year in years:
                year_mask = np.asarray(month_years == year)
                year_data = month_data[year_mask]
                year_predictions = predictions[year_mask]
                if len(year_data) > 0:
                    yearly_metrics[year] = calculate_advanced_metrics(year_predictions, year_data)

            # Save results
            results_filename = f'analise_mensal_{month_name}_{filename}'
            results_path = os.path.join(results_dir, results_filename)

            # The context manager saves and closes the workbook even when one
            # of the sheet writes fails.
            with pd.ExcelWriter(results_path, engine='openpyxl') as writer:
                pd.DataFrame([metrics]).to_excel(writer, sheet_name='Summary')
                pd.DataFrame({
                    'Original': month_data,
                    'Predicted': predictions,
                    'Error': predictions - month_data
                }).to_excel(writer, sheet_name='Detailed Data')

                pd.DataFrame({
                    'Metric': list(metrics.keys()),
                    'Value': list(metrics.values())
                }).to_excel(writer, sheet_name='Statistical Analysis')

                pd.DataFrame(yearly_metrics).T.to_excel(writer, sheet_name='Yearly Analysis')

            print(f"Results saved as: {results_filename}")

    except Exception as e:
        logging.error(f"Error in analysis process: {str(e)}")
        print(f"Error during analysis: {str(e)}")
        raise

if __name__ == "__main__":
    # Get list of data files.
    # glob.glob returns matches in arbitrary order, so the list is sorted to
    # make "the first file" the same file on every run.
    data_files = sorted(glob.glob(os.path.join(data_dir, "*.xlsx")))

    if not data_files:
        print(f"\nError: No Excel files found in the data directory: {data_dir}")
        sys.exit(1)

    if len(data_files) > 1:
        print(f"\nWarning: Multiple Excel files found in {data_dir}")
        print("Using only the first file:", os.path.basename(data_files[0]))

    print("\nInitializing LSTM VAE Monthly Analysis")
    print("="*50)
    print(f"Data file: {os.path.basename(data_files[0])}")
    print(f"Model directory: {model_dir}")
    print(f"Results directory: {results_dir}")
    print("="*50)

    try:
        # Phase 1: train one model per month
        print("\nPhase 1: Model Training")
        print("="*50)
        train_monthly_models(data_files)
        print("\nModel training completed successfully!")

        # Phase 2: evaluate the saved models and export the results
        print("\nPhase 2: Data Analysis")
        print("="*50)
        analyze_monthly_data(data_files)
        print("\nData analysis completed successfully!")

        print("\nProcess completed!")
        print("="*50)
        print("Outputs can be found in:")
        print(f"1. Models: {model_dir}")
        print(f"2. Training plots: {images_dir}")
        print(f"3. Analysis results: {results_dir}")

    except Exception as e:
        # Both phases log and re-raise their own errors; report the failure
        # and stop with a non-zero status.
        print("\nError: Process failed!")
        print(str(e))
        sys.exit(1)


Mounted at /content/drive

Verifying directories:
LSTM root exists: True
Base dir exists: True
Data dir exists: True
Model dir exists: True
Images dir exists: True
Results dir exists: True

Setting up Google Drive directories...
Directory exists: /content/drive/MyDrive/LSTM
Directory exists: /content/drive/MyDrive/LSTM/lstm mensal
Directory exists: /content/drive/MyDrive/LSTM/Convencionais processadas temperaturas
Directory exists: /content/drive/MyDrive/LSTM/lstm mensal/lstm_vae_model
Directory exists: /content/drive/MyDrive/LSTM/lstm mensal/lstm_vae_images
Directory exists: /content/drive/MyDrive/LSTM/lstm mensal/Resumo resultados

Error: Could not find 82024.xlsx
Please upload the file using one of these methods:
1. Upload directly to Google Drive in the 'Convencionais processadas temperaturas' folder
2. Upload to Colab using the file browser (left sidebar)

File required: 82024.xlsx


SystemExit: 1