Data Stacking and Transformation (Concatenating Server Logs)
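Simulate collecting partial server-log statistics (same metrics, different time periods) from two separate servers, then stack them row-wise to analyze the combined distribution. A third server that reports a different metric is used to illustrate column-wise (axis=1) concatenation.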

In [5]:
import pandas as pd
import numpy as np
import warnings

def create_mock_server_logs():
    """Build three small mock server-log DataFrames.

    Returns:
        A tuple ``(df_server_a, df_server_b, df_server_c)``:

        * Server A ('A_East')  -- 09:00-09:45 readings of CPU_Usage and
          Memory_Load.
        * Server B ('B_West')  -- 16:00-16:45 readings of the same two
          metrics, so it can be stacked under Server A.
        * Server C ('C_South') -- 09:00-09:45 readings of
          Network_Latency_ms only, for the axis=1 concatenation example.
    """

    def parse_clock_times(clock_times):
        # An explicit format tells pandas exactly how to read the strings.
        # Only the time of day is supplied, so the parsed values carry the
        # placeholder date 1900-01-01.
        return pd.to_datetime(clock_times, format='%H:%M')

    early_shift = ['09:00', '09:15', '09:30', '09:45']
    late_shift = ['16:00', '16:15', '16:30', '16:45']

    # Server A: early-shift performance metrics.
    logs_a = pd.DataFrame(dict(
        Server_ID='A_East',
        Timestamp=parse_clock_times(early_shift),
        CPU_Usage=[65, 70, 72, 68],
        Memory_Load=[40, 42, 45, 41],
    ))

    # Server B: late-shift performance metrics (same columns as Server A).
    logs_b = pd.DataFrame(dict(
        Server_ID='B_West',
        Timestamp=parse_clock_times(late_shift),
        CPU_Usage=[85, 90, 88, 92],
        Memory_Load=[60, 65, 62, 68],
    ))

    # Server C: a different metric recorded at Server A's timestamps.
    logs_c = pd.DataFrame(dict(
        Server_ID='C_South',
        Timestamp=parse_clock_times(early_shift),
        Network_Latency_ms=[12, 15, 10, 18],
    ))

    return logs_a, logs_b, logs_c


def stack_and_pivot_server_data():
    """Demonstrate row stacking, melting, aggregation and column concatenation.

    Builds the mock logs via ``create_mock_server_logs`` and prints four
    results to stdout:

    1. Servers A and B stacked row-wise into one frame.
    2. The first five rows of that frame melted into long format.
    3. The mean value of each metric over the stacked data.
    4. Server A's metrics joined column-wise with Server C's frame on their
       shared timestamps.

    Returns:
        None. All results are printed.
    """
    server_a, server_b, server_c = create_mock_server_logs()
    metric_columns = ['CPU_Usage', 'Memory_Load']

    print("--- 1. Data Concatenation (Stacking Observations) ---")

    # axis=0 appends B's rows under A's; ignore_index renumbers rows 0..n-1
    # so the two sources do not carry duplicate index labels.
    stacked_logs = pd.concat([server_a, server_b], axis=0, ignore_index=True)

    print("\nGlobal Stacked Logs (Servers A & B):\n", stacked_logs)

    # Unpivot: one row per (server, timestamp, metric) instead of one column
    # per metric, which is easier to group and plot.
    long_logs = pd.melt(
        stacked_logs,
        id_vars=['Server_ID', 'Timestamp'],
        value_vars=metric_columns,
        var_name='Metric',
        value_name='Value',
    )

    print("\n--- 2. Transformed (Long) Format for Analysis ---")
    print(long_logs.head(5))

    # Mean of every reading per metric, pooled over both servers.
    mean_by_metric = long_logs.groupby('Metric')['Value'].mean()

    print("\n--- 3. Aggregated Analysis ---")
    print("Average Load Across All Servers:\n", mean_by_metric)

    # axis=1 places frames side by side, aligned on the index; both frames
    # are therefore re-indexed by Timestamp first. The inner join keeps only
    # timestamps present in both. Server C's Server_ID column is not dropped,
    # so it appears in the combined frame.
    a_metrics_by_time = server_a.set_index('Timestamp')[metric_columns]
    c_by_time = server_c.set_index('Timestamp')
    side_by_side = pd.concat([a_metrics_by_time, c_by_time], axis=1, join='inner')

    print("\n--- 4. Concatenated Attributes (Axis=1, Inner Join) ---")
    print("Server A Performance combined with Server C Latency:\n", side_by_side)


# Run the demo; the function prints each stage's result to stdout.
stack_and_pivot_server_data()

--- 1. Data Concatenation (Stacking Observations) ---

Global Stacked Logs (Servers A & B):
   Server_ID           Timestamp  CPU_Usage  Memory_Load
0    A_East 1900-01-01 09:00:00         65           40
1    A_East 1900-01-01 09:15:00         70           42
2    A_East 1900-01-01 09:30:00         72           45
3    A_East 1900-01-01 09:45:00         68           41
4    B_West 1900-01-01 16:00:00         85           60
5    B_West 1900-01-01 16:15:00         90           65
6    B_West 1900-01-01 16:30:00         88           62
7    B_West 1900-01-01 16:45:00         92           68

--- 2. Transformed (Long) Format for Analysis ---
  Server_ID           Timestamp     Metric  Value
0    A_East 1900-01-01 09:00:00  CPU_Usage     65
1    A_East 1900-01-01 09:15:00  CPU_Usage     70
2    A_East 1900-01-01 09:30:00  CPU_Usage     72
3    A_East 1900-01-01 09:45:00  CPU_Usage     68
4    B_West 1900-01-01 16:00:00  CPU_Usage     85

--- 3. Aggregated Analysis ---
Average Load Across 