In [1]:
import numpy as np
import pandas as pd
import random


In [13]:
def select_ecc(snr, ber, noise_power, latency, bandwidth, error_burst, data_type, network_type):
    """
    Rule-based stand-in for an AI model: pick an ECC scheme for one channel sample.

    The rules are tried top to bottom and the first one that matches decides
    the result; when none matches the function falls back to "BCH".

    Parameters
    ----------
    snr : numeric, compared against the thresholds 15 and 4.
    ber : numeric, compared against 1e-3.
    noise_power : numeric, compared against 0.35.
    latency : numeric, compared against 350.
    bandwidth : accepted for interface completeness; no rule consults it.
    error_burst : numeric, compared against 6.
    data_type : compared for equality with "Streaming".
    network_type : compared for equality with "5G".

    Returns
    -------
    str
        One of "Hamming", "Reed-Solomon", "LDPC", "Turbo", "Convolutional",
        "Polar" or "BCH".
    """
    # Each predicate is wrapped in a lambda so it is only evaluated when every
    # earlier rule has already failed, exactly like an if/elif chain.
    ordered_rules = (
        (lambda: snr >= 15 and ber <= 1e-3, "Hamming"),      # clean channel -> simple ECC
        (lambda: error_burst >= 6, "Reed-Solomon"),          # long error bursts
        (lambda: noise_power > 0.35 or snr < 4, "LDPC"),     # strong correction needed
        (lambda: latency > 350, "Turbo"),                    # high latency tolerance
        (lambda: data_type == "Streaming", "Convolutional"), # continuous data
        (lambda: network_type == "5G", "Polar"),             # 5G applications
    )

    for rule_matches, scheme in ordered_rules:
        if rule_matches():
            return scheme

    # No specific rule fired: general-purpose correction.
    return "BCH"


In [29]:
# Seed both random sources used below (the stdlib `random` module and NumPy's
# global generator) so that re-running the notebook regenerates the same dataset.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

num_samples = 10000

# All labels select_ecc can return; used when a label is deliberately replaced.
ECC_LABELS = ["Hamming", "Reed-Solomon", "LDPC", "Turbo", "Convolutional", "Polar", "BCH"]

data = []
for _ in range(num_samples):
    # Baseline draws from the normal operating ranges.
    snr = round(random.uniform(0, 30), 2)
    ber = round(random.uniform(1e-7, 1e-2), 8)
    noise_power = round(random.uniform(0.01, 0.5), 3)
    latency = random.randint(10, 500)
    bandwidth = round(random.uniform(1, 100), 2)

    # Error burst length: weights decrease with length, so long bursts are rarer.
    error_burst = random.choices(
        population=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        weights=[18, 15, 13, 12, 10, 8, 7, 5, 3, 2],
        k=1
    )[0]

    data_type = random.choice(["Streaming", "Packet"])
    network_type = random.choice(["Wi-Fi", "5G", "Satellite", "IoT", "Storage"])

    # Outliers: with ~1.5% probability, redraw the numeric features from much
    # wider ranges (error_burst and the categorical features are left as drawn).
    if random.random() < 0.015:
        snr = round(random.uniform(-5, 50), 2)        # includes negative SNR
        ber = round(random.uniform(1e-10, 1), 8)
        noise_power = round(random.uniform(0, 2), 3)
        latency = random.randint(1, 2000)
        bandwidth = round(random.uniform(0.1, 500), 2)

    # Gaussian jitter so the features are not perfectly structured.
    # noise_power is not jittered. After this step snr, ber and bandwidth are
    # no longer rounded to the precision used above.
    snr += np.random.normal(0, 1)
    ber *= (1 + np.random.normal(0, 0.05))
    latency += int(np.random.normal(0, 10))  # int() truncates toward zero, latency stays an int
    bandwidth *= (1 + np.random.normal(0, 0.02))

    # Clip to the same bounds as the outlier ranges above.
    snr = max(-5, min(snr, 50))
    ber = max(1e-10, min(ber, 1))
    noise_power = max(0, min(noise_power, 2))
    latency = max(1, min(latency, 2000))
    bandwidth = max(0.1, min(bandwidth, 500))

    # Label the sample with the rule-based selector (uses the jittered, clipped values).
    ecc = select_ecc(snr, ber, noise_power, latency, bandwidth, error_burst, data_type, network_type)

    # Ambiguous cases: with ~7.5% probability, replace the label with a
    # different ECC chosen uniformly from the remaining labels.
    if random.random() < 0.075:
        alternatives = [label for label in ECC_LABELS if label != ecc]
        ecc = random.choice(alternatives)

    # Append to dataset
    data.append([snr, ber, noise_power, latency, bandwidth, error_burst, data_type, network_type, ecc])

# Create DataFrame
df = pd.DataFrame(data, columns=["SNR", "BER", "Noise Power", "Latency", "Bandwidth",
                                 "Error Burst Length", "Data Type", "Network Type", "Optimal ECC"])

# Save to CSV (written relative to the current working directory, without the index column)
df.to_csv("ecc_dataset.csv", index=False)

print("Dataset with outliers and ambiguity generated and saved as ecc_dataset.csv")

Dataset with outliers and ambiguity generated and saved as ecc_dataset.csv


In [31]:
# Reload the generated dataset from disk and show a quick summary.
df = pd.read_csv("ecc_dataset.csv")
print(df.head())  # Print first few rows
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line after the report).
df.info()  # Check dataset details
print(df["Optimal ECC"].value_counts())  # Number of samples per ECC label

     SNR       BER  Noise Power  Latency  Bandwidth  Error Burst Length  \
0   9.80  0.001990        0.301      414  71.265642                   4   
1   7.84  0.008785        0.036       62  40.257541                   7   
2  16.41  0.000209        0.106      379  67.365008                   1   
3  26.32  0.005471        0.200      175  65.998675                   4   
4  19.84  0.007162        0.383      432  53.129020                   6   

   Data Type Network Type   Optimal ECC  
0  Streaming           5G         Turbo  
1  Streaming    Satellite  Reed-Solomon  
2     Packet      Storage       Hamming  
3     Packet          IoT           BCH  
4     Packet        Wi-Fi  Reed-Solomon  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   SNR                 10000 non-null  float64
 1   BER                 10000 non-null  fl