In [1]:
# --- Cell 1 (Combined): Install Libraries ---
print("--- Installing necessary libraries ---")
!pip install pandas gdown numpy -q
print("Libraries installed.")
print("-" * 35)

--- Installing necessary libraries ---
Libraries installed.
-----------------------------------


In [2]:
# --- Cell 2 (Combined): Download Data ---

import gdown
import os
import numpy as np
import pandas as pd
import warnings



--- Downloading data file ---


In [4]:

# --- Download the source data file from Google Drive ---
# Fetches the file identified by `file_id` into the working directory as
# `local_file_path`. Any failure stops the notebook with SystemExit so later
# cells never run against a missing (or stale) file.

# File ID extracted from your Google Drive share link:
file_id = '1244-7CXSuv_n_OuYET5QBEqdgjD-DaPt'

# Define the name for the file within the Colab environment
local_file_path = 'SCOM.txt' # This will be the file used by the rest of the script

print(f"Attempting to download file with ID: {file_id} to {local_file_path}")
try:
    # Download the file using its ID
    # Reference: This step obtains the sample data needed for estimation (Ch 5)
    # The return value is kept: a None result means gdown reported a failure,
    # which an os.path.exists() check alone would miss if an older copy of the
    # file is still on disk from a previous run.
    downloaded_path = gdown.download(id=file_id, output=local_file_path, quiet=False, fuzzy=True)

    # Verify that the file was actually downloaded
    if downloaded_path is not None and os.path.exists(local_file_path):
        print(f"File downloaded successfully to {local_file_path}")
    else:
        print("\n--- ERROR: File download appears to have failed. ---")
        print("Please double-check the following:")
        print("1. The File ID is correct.")
        print("2. The file's sharing permissions in Google Drive are set to 'Anyone with the link can view'.")
        print("----------------------------------------------------")
        # SystemExit is not an Exception subclass, so the handler below does not intercept it.
        raise SystemExit("Stopping due to download failure.")

except Exception as e:
    print("\n--- ERROR downloading file ---")
    print(f"An error occurred: {e}")
    # f-string so the actual ID is shown instead of the literal text "{file_id}".
    print(f"Please ensure the File ID ('{file_id}') is correct and the file is shared ('Anyone with the link can view').")
    print("-----------------------------")
    raise SystemExit(f"Stopping due to download error: {e}")

print("-" * 35)

# --- Cell 3 (Combined): Synthetic Data Generation using EDF Sampling ---
# This part loads the downloaded whitespace-delimited file into `real_data`
# and sets `num_synthetic_rows` to the number of rows read. Any load failure
# stops the notebook with SystemExit.

print("\n--- Generating Synthetic Data using EDF Sampling ---")

# Suppress warnings (optional)
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

# --- Configuration ---
# Use the LOCAL path where gdown saved the file
file_path = local_file_path

# --- 1. Load the Data ---
print(f"\nAttempting to load data from local Colab path: {file_path}")
try:
    # Load the original data
    # Reference: Loading the sample data ( Ch 5 Estimation)
    # The separator is a regular expression, so it is written as a raw string:
    # '\s' in a normal string literal is an invalid escape sequence.
    real_data = pd.read_csv(
        file_path,
        sep=r'\s+',
        header=None,
        engine='python',
        na_values=['NaN', 'nan'],
    )

    # Assign generic column names (same features as original)
    real_data.columns = [f'feature_{i}' for i in range(real_data.shape[1])]

    print(f"Successfully loaded real data. Shape: {real_data.shape}")
    print("Real data sample (first 5 rows):\n", real_data.head())
    print(f"\nNumber of NaN values loaded: {real_data.isna().sum().sum()}")

    # *** Automatically set synthetic rows to match original data ***
    num_synthetic_rows = len(real_data)
    print(f"\nSetting number of synthetic rows to generate: {num_synthetic_rows} (matching original data)")

except FileNotFoundError:
    print("\n--- ERROR ---")
    print(f"File not found locally at '{file_path}'. Download might have failed.")
    print("---------------")
    raise SystemExit("Stopping due to file not found locally.")
except Exception as e:
    # Covers parse errors and anything else read_csv raises.
    print("\n--- ERROR loading data ---")
    print(f"An error occurred: {e}")
    print("Check the file format and content.")
    print("--------------------------")
    raise SystemExit(f"Stopping due to loading error: {e}")

# --- 2. Generate Synthetic Data using Independent Column Sampling from EDF ---
# Every column of `real_data` is resampled on its own, with replacement, from
# its observed non-NaN values; the results are assembled into
# `synthetic_data_edf`.
#
# Limitations that follow directly from this approach:
#   * Columns are drawn independently, so relationships between columns in the
#     real data are not carried over to the synthetic data.
#   * NaNs are dropped before sampling, so a synthetic column contains no
#     missing values unless the source column was entirely NaN.
print(f"\nGenerating {num_synthetic_rows} synthetic data rows by sampling from EDF for each column independently...")

# Seeded generator so that Restart & Run All reproduces the same synthetic data.
# Set RANDOM_SEED to None to get a fresh, non-reproducible draw on every run.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

synthetic_columns = {}
num_real_rows = len(real_data) # Keep track of original length if needed elsewhere

for col in real_data.columns:
    # Get the original column data, dropping NaNs for sampling purposes
    # Reference: Using the observed data to represent the distribution (Ch 5.3 - EDF)
    original_col_data = real_data[col].dropna()

    if original_col_data.empty:
        # Handle cases where a column might be entirely NaN
        print(f"Warning: Column '{col}' has no valid data. Filling synthetic column with NaNs.")
        synthetic_columns[col] = np.full(num_synthetic_rows, np.nan)
        continue

    # --- Sampling from the Empirical Distribution Function ---
    # Reference: This uses numpy's Generator.choice (based on Ch 6.1 pseudorandomness)
    #            to sample with replacement directly from the observed column values.
    #            This is equivalent to Inverse Transform Sampling from the EDF (Ch 6.2, Ch 5.3).
    synthetic_columns[col] = rng.choice(
        original_col_data.to_numpy(),
        size=num_synthetic_rows, # Use the determined number of rows
        replace=True,
    )

# Combine the synthetic columns into a DataFrame
synthetic_data_edf = pd.DataFrame(synthetic_columns)

print("\nSynthetic data generation complete using EDF sampling.")

# --- 3. Display Sample Results ---
print("\nSynthetic data (EDF Sampling) sample (first 5 rows):\n", synthetic_data_edf.head())
print(f"\nGenerated synthetic data shape: {synthetic_data_edf.shape} (should match original)")

# --- 4. Visual separator before the save step ---

print(65 * "-")

# --- 5. Write the synthetic frame to a CSV file in the session's working directory ---
# Relative filename, so the file lands wherever the notebook is running.
output_filename_local = 'synthetic_data_edf.csv'

print("\n--- Saving synthetic data locally in Colab session ---")
print("Attempting to save to: {}".format(output_filename_local))
try:
    # index=False: only the feature columns are written, not the row index.
    synthetic_data_edf.to_csv(output_filename_local, index=False)
    print(
        "\nSynthetic data (EDF) saved successfully LOCALLY to:",
        "'{}'".format(output_filename_local),
        sep="\n",
    )
except Exception as save_error:
    # Best-effort: report the failure and let the notebook finish.
    print(
        "\n--- ERROR saving synthetic data (EDF) locally ---",
        "An error occurred: {}".format(save_error),
        "-------------------------------------------------",
        sep="\n",
    )

print("\n--- Code execution finished ---")

Attempting to download file with ID: 1244-7CXSuv_n_OuYET5QBEqdgjD-DaPt to SCOM.txt


Downloading...
From: https://drive.google.com/uc?id=1244-7CXSuv_n_OuYET5QBEqdgjD-DaPt
To: /content/SCOM.txt
100%|██████████| 5.39M/5.39M [00:00<00:00, 235MB/s]


File downloaded successfully to SCOM.txt
-----------------------------------

--- Generating Synthetic Data using EDF Sampling ---

Attempting to load data from local Colab path: SCOM.txt
Successfully loaded real data. Shape: (1567, 590)
Real data sample (first 5 rows):
    feature_0  feature_1  feature_2  feature_3  feature_4  feature_5  \
0    3030.93    2564.00  2187.7333  1411.1265     1.3602      100.0   
1    3095.78    2465.14  2230.4222  1463.6606     0.8294      100.0   
2    2932.61    2559.94  2186.4111  1698.0172     1.5102      100.0   
3    2988.72    2479.90  2199.0333   909.7926     1.3204      100.0   
4    3032.24    2502.87  2233.3667  1326.5200     1.5334      100.0   

   feature_6  feature_7  feature_8  feature_9  ...  feature_580  feature_581  \
0    97.6133     0.1242     1.5005     0.0162  ...          NaN          NaN   
1   102.3433     0.1247     1.4966    -0.0005  ...       0.0060     208.2045   
2    95.4878     0.1241     1.4436     0.0041  ...       0.01