# Noise Injection Methods

## Libraries

In [1]:
import numpy as np
import pandas as pd

np.set_printoptions(precision=4, suppress=False, linewidth=10000)

## Noise Transition Matrix Generation

### Distributed robust support vector ordinal regression under label noise

In [2]:
import numpy as np

def create_noise_transition_matrix(num_classes, noise_rate, sigma):
    """
    Create a noise transition matrix based on the Gaussian decaying label noise method.

    The off-diagonal entry (i, j) is proportional to exp(-(i - j)^2 / (2 * sigma^2)).
    A single global factor rho rescales all off-diagonal entries so that their total
    mass divided by num_classes equals noise_rate. Each diagonal entry is then set to
    one minus the off-diagonal mass of its row, so every row sums to 1.

    Parameters:
    num_classes (int): Number of classes (Q). Must be at least 2.
    noise_rate (float): Noise rate (tau), the average off-diagonal mass per row. Must lie in [0, 1].
    sigma (float): Sigma parameter controlling the decay rate of the noise. Must be non-zero;
        only its magnitude matters because it is squared.

    Returns:
    np.ndarray: Noise transition matrix of shape (num_classes, num_classes).

    Raises:
    ValueError: If num_classes < 2, noise_rate is outside [0, 1], sigma is zero, sigma is so
        small that every off-diagonal weight underflows to zero, or the requested noise_rate
        would make a diagonal entry negative (rows would no longer be valid distributions).
    """
    if num_classes < 2:
        raise ValueError("num_classes must be at least 2 to define label noise.")
    if not 0 <= noise_rate <= 1:
        raise ValueError("noise_rate must lie in [0, 1].")
    if sigma == 0:
        raise ValueError("sigma must be non-zero.")

    # Unnormalised Gaussian weights for every (i, j) pair, with the diagonal zeroed out
    class_idx = np.arange(num_classes)
    squared_distance = (class_idx[:, None] - class_idx[None, :]) ** 2
    T = np.exp(-squared_distance / (2 * sigma ** 2))
    np.fill_diagonal(T, 0.0)

    # Total off-diagonal weight; zero means exp() underflowed for every pair
    total_weight = T.sum()
    if total_weight <= 0:
        raise ValueError("sigma is too small: all off-diagonal weights underflow to zero.")

    # Global normalization factor rho: makes the mean off-diagonal row mass equal noise_rate
    rho = noise_rate * num_classes / total_weight
    T *= rho

    # The diagonal absorbs whatever probability mass is left in each row
    diagonal = 1.0 - T.sum(axis=1)
    if diagonal.min() < 0:
        # Central rows carry more off-diagonal mass than the average row, so a large
        # noise_rate can push them past 1 even though noise_rate itself is <= 1.
        raise ValueError(
            "noise_rate is too large for this num_classes/sigma: "
            "a diagonal entry would be negative."
        )
    np.fill_diagonal(T, diagonal)

    return T

# Demo: build a small 5-class matrix and sanity-check it.
num_classes, noise_rate, sigma = 5, 0.2, 3

noise_matrix = create_noise_transition_matrix(
    num_classes=num_classes,
    noise_rate=noise_rate,
    sigma=sigma,
)

# Every row should sum to 1, and the mean off-diagonal mass per row should equal noise_rate.
print("Noise Transition Matrix:", noise_matrix, sep="\n")
print("Row sums:", noise_matrix.sum(axis=1))
print(
    "Overall noise rate:",
    (noise_matrix.sum() - np.trace(noise_matrix)) / num_classes,
    '\n',
)

Noise Transition Matrix:
[[0.823  0.0606 0.0513 0.0388 0.0263]
 [0.0606 0.7888 0.0606 0.0513 0.0388]
 [0.0513 0.0606 0.7764 0.0606 0.0513]
 [0.0388 0.0513 0.0606 0.7888 0.0606]
 [0.0263 0.0388 0.0513 0.0606 0.823 ]]
Row sums: [1. 1. 1. 1. 1.]
Overall noise rate: 0.2 



# Create Noisy Versions

In [4]:
def flip_labels_using_noise_matrix(input_csv, transition_matrix, output_csv, seed=None):
    """
    Resample the 'age' labels of a CSV file according to a label noise transition matrix.

    The distinct values of the 'age' column, in sorted order, are treated as classes
    0..K-1. For each row, a new class is drawn from the transition-matrix row of the
    original class, and the drawn class is mapped back to its age value.

    Parameters:
    input_csv (str): Path to the input CSV file (e.g., train.csv); must contain an 'age' column.
    transition_matrix (np.ndarray): The label noise transition matrix; row i is the sampling
        distribution used for the i-th smallest age value.
    output_csv (str): Path to the output CSV file.
    seed (int, optional): Seed for the NumPy random generator, for reproducible flips.

    The output CSV keeps all input columns, with 'age' holding the flipped labels and a new
    trailing 'original_age' column holding the untouched labels.
    """
    # Dedicated generator so the global NumPy random state is not involved
    generator = np.random.default_rng(seed)

    frame = pd.read_csv(input_csv)

    # Sorted distinct ages double as the index -> age lookup table
    ordered_ages = sorted(frame['age'].unique())
    class_count = len(ordered_ages)
    position_of = {age: position for position, age in enumerate(ordered_ages)}

    def resample(age):
        # One draw per row, in row order, from the distribution of the row's true class
        distribution = transition_matrix[position_of[age]]
        drawn = generator.choice(class_count, p=distribution)
        return ordered_ages[drawn]

    # Preserve the clean labels before overwriting 'age'
    frame['original_age'] = frame['age']
    frame['age'] = frame['age'].apply(resample)

    frame.to_csv(output_csv, index=False)
    print(f"Labels flipped and saved to {output_csv}")

In [5]:
def calculate_empirical_noise_matrix(csv_file):
    """
    Estimate the label noise matrix actually realised in a flipped CSV file.

    Parameters:
    csv_file (str): Path to the CSV file containing 'age' (flipped) and 'original_age' columns.

    Returns:
    np.ndarray: The empirical label noise matrix (num_classes x num_classes). Entry (i, j) is
        the fraction of rows with the i-th smallest original age whose flipped age is the
        j-th smallest original age.

    Raises:
    KeyError: If a flipped 'age' value never occurs in 'original_age'.
    """
    frame = pd.read_csv(csv_file)

    # Classes are the sorted distinct original ages; flipped ages are looked up in the same table
    ordered_ages = sorted(frame['original_age'].unique())
    position_of = {age: position for position, age in enumerate(ordered_ages)}
    class_count = len(ordered_ages)

    # Tally (original -> flipped) transitions
    counts = np.zeros((class_count, class_count))
    for clean_age, noisy_age in zip(frame['original_age'], frame['age']):
        counts[position_of[clean_age], position_of[noisy_age]] += 1

    # Turn each row of counts into a probability distribution
    per_row_total = counts.sum(axis=1, keepdims=True)
    return counts / per_row_total

In [6]:
# Noise settings for the AFAD training split.
# NOTE(review): num_classes must equal the number of distinct 'age' values in the input CSV,
# because the flipping routine derives its classes from the data — confirm it is 26.
num_classes = 26
noise_rate = 0.2
sigma = 3
seed = 0

input_csv = "/home/vision/alireza-sm/coral/coral-cnn/datasets/my_afad_train.csv"
# Build the output name from the settings so the file name cannot drift from the
# parameters that actually produced it (yields ..._0.2_3.csv for the values above).
output_csv = f"/home/vision/alireza-sm/coral/coral-cnn/datasets/my_afad_train_{noise_rate}_{sigma}.csv"
noise_matrix = create_noise_transition_matrix(num_classes, noise_rate, sigma)

print(noise_matrix, '\n')

flip_labels_using_noise_matrix(input_csv=input_csv, transition_matrix=noise_matrix, output_csv=output_csv, seed=seed)

[[0.8882 0.0324 0.0275 0.0208 0.0141 0.0085 0.0046 0.0023 0.001  0.0004 0.0001 0.     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.    ]
 [0.0324 0.8558 0.0324 0.0275 0.0208 0.0141 0.0085 0.0046 0.0023 0.001  0.0004 0.0001 0.     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.    ]
 [0.0275 0.0324 0.8284 0.0324 0.0275 0.0208 0.0141 0.0085 0.0046 0.0023 0.001  0.0004 0.0001 0.     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.    ]
 [0.0208 0.0275 0.0324 0.8076 0.0324 0.0275 0.0208 0.0141 0.0085 0.0046 0.0023 0.001  0.0004 0.0001 0.     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.    ]
 [0.0141 0.0208 0.0275 0.0324 0.7935 0.0324 0.0275 0.0208 0.0141 0.0085 0.0046 0.0023 0.001  0.0004 0.0001 0.     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.    ]
 [0.0085 0.0141 0.0208 0.0275 0.0324 0.7849 0.0324 0.0275 0.0208 0.0141 0.0

In [20]:
# Noise settings for the Adience training split.
# NOTE(review): num_classes must equal the number of distinct 'age' values in the input CSV,
# because the flipping routine derives its classes from the data — confirm it is 8.
num_classes = 8
noise_rate = 0.2
sigma = 3
seed = 0

input_csv = "/home/vision/alireza-sm/coral/coral-cnn/datasets/adience/train.csv"
# Build the output name from the settings so the file name cannot drift from the
# parameters that actually produced it (yields train_0.2_3.csv for the values above).
output_csv = f"/home/vision/alireza-sm/coral/coral-cnn/datasets/adience/train_{noise_rate}_{sigma}.csv"
noise_matrix = create_noise_transition_matrix(num_classes, noise_rate, sigma)

print(noise_matrix, '\n')

flip_labels_using_noise_matrix(input_csv=input_csv, transition_matrix=noise_matrix, output_csv=output_csv, seed=seed)

[[0.8504 0.044  0.0373 0.0282 0.0191 0.0116 0.0063 0.0031]
 [0.044  0.8094 0.044  0.0373 0.0282 0.0191 0.0116 0.0063]
 [0.0373 0.044  0.7784 0.044  0.0373 0.0282 0.0191 0.0116]
 [0.0282 0.0373 0.044  0.7618 0.044  0.0373 0.0282 0.0191]
 [0.0191 0.0282 0.0373 0.044  0.7618 0.044  0.0373 0.0282]
 [0.0116 0.0191 0.0282 0.0373 0.044  0.7784 0.044  0.0373]
 [0.0063 0.0116 0.0191 0.0282 0.0373 0.044  0.8094 0.044 ]
 [0.0031 0.0063 0.0116 0.0191 0.0282 0.0373 0.044  0.8504]] 

Labels flipped and saved to /home/vision/alireza-sm/coral/coral-cnn/datasets/adience/train_0.2_3.csv


In [3]:
# Build the 8-class matrix at a higher noise rate and persist it for later use.
num_classes = 8
noise_rate = 0.4
sigma = 3

noise_matrix = create_noise_transition_matrix(num_classes, noise_rate, sigma)

print(noise_matrix, '\n')

# File name encodes the settings (noise_matrix_8_0.4_3.npy for the values above), so it
# always matches the parameters that generated the saved matrix.
np.save(f"noise_matrix_{num_classes}_{noise_rate}_{sigma}.npy", noise_matrix)

[[0.7007 0.0881 0.0745 0.0565 0.0383 0.0232 0.0126 0.0061]
 [0.0881 0.6188 0.0881 0.0745 0.0565 0.0383 0.0232 0.0126]
 [0.0745 0.0881 0.5569 0.0881 0.0745 0.0565 0.0383 0.0232]
 [0.0565 0.0745 0.0881 0.5236 0.0881 0.0745 0.0565 0.0383]
 [0.0383 0.0565 0.0745 0.0881 0.5236 0.0881 0.0745 0.0565]
 [0.0232 0.0383 0.0565 0.0745 0.0881 0.5569 0.0881 0.0745]
 [0.0126 0.0232 0.0383 0.0565 0.0745 0.0881 0.6188 0.0881]
 [0.0061 0.0126 0.0232 0.0383 0.0565 0.0745 0.0881 0.7007]] 



In [4]:

def calculate_noise_transition_matrix(noise_free_csv, noise_injected_csv, output_file=None):
    """
    Calculate the noise transition matrix based on differences between noise-free and noise-injected labels.

    Both files are read without a header row. Only rows whose column 2 equals 0 are used,
    and column 3 is taken as the integer class label. The two filtered files are compared
    row by row, so they must list the same samples in the same order.

    Row/column k of the result corresponds to label ``label_min + k``, where ``label_min``
    is the smallest label found in either file; for zero-based labels the index is the label.

    Args:
        noise_free_csv (str): Path to the noise-free CSV file.
        noise_injected_csv (str): Path to the noise-injected CSV file.
        output_file (str, optional): Path to save the noise transition matrix as a CSV. Defaults to None.

    Returns:
        np.ndarray: The noise transition matrix. Entry (i, j) is the fraction of samples with
        clean label i that carry noisy label j; rows for labels that never occur as a clean
        label are all zeros.

    Raises:
        ValueError: If the filtered files differ in length, or if no rows remain after filtering.
    """
    # Load the CSV files
    noise_free_data = pd.read_csv(noise_free_csv, header=None)
    noise_injected_data = pd.read_csv(noise_injected_csv, header=None)

    # Keep only the rows flagged with 0 in column 2
    # NOTE(review): column 2 is presumably a split/fold indicator — confirm against the data format.
    noise_free_data = noise_free_data[noise_free_data[2] == 0]
    noise_injected_data = noise_injected_data[noise_injected_data[2] == 0]

    # Ensure both files have the same number of rows
    if len(noise_free_data) != len(noise_injected_data):
        raise ValueError("The two CSV files must have the same number of rows.")
    if len(noise_free_data) == 0:
        raise ValueError("No rows with column 2 equal to 0 were found; cannot build a transition matrix.")

    # Extract labels
    noise_free_labels = noise_free_data[3].values
    noise_injected_labels = noise_injected_data[3].values

    # Size the matrix by the label span seen in either file, so noisy labels outside the
    # clean range still fit, and shift labels by the minimum so indexing stays in bounds
    # (and never wraps around) when the labels are not zero-based.
    label_min = min(noise_free_labels.min(), noise_injected_labels.min())
    label_max = max(noise_free_labels.max(), noise_injected_labels.max())
    num_labels = int(label_max - label_min + 1)

    # Initialize the noise transition matrix
    noise_matrix = np.zeros((num_labels, num_labels), dtype=np.float64)

    # Count (clean -> noisy) transitions; np.add.at accumulates repeated index pairs correctly
    np.add.at(
        noise_matrix,
        (noise_free_labels - label_min, noise_injected_labels - label_min),
        1,
    )

    # Normalize each row to get probabilities
    row_sums = noise_matrix.sum(axis=1, keepdims=True)
    noise_matrix = np.divide(
        noise_matrix,
        row_sums,
        out=np.zeros_like(noise_matrix),  # Rows with no samples stay all-zero
        where=row_sums != 0  # Avoid division by zero
    )

    # Save the noise transition matrix to a file (if specified)
    if output_file:
        np.savetxt(output_file, noise_matrix, delimiter=",", fmt="%.4f")
        print(f"Noise transition matrix saved to {output_file}")

    return noise_matrix

In [9]:
# Compare the clean and noisy Adience label files and report the realised transition matrix.
noise_free_csv_path = "facebase/data/Adience_256x256_resnet50_imagenet_dldl_v2_clean/data_split4.csv"
noise_injected_csv_path = "facebase/data/Adience_256x256_resnet50_imagenet_noisy_dldl_v2/data_split4.csv"

transition_matrix = calculate_noise_transition_matrix(noise_free_csv_path, noise_injected_csv_path)
# Adjust print options to display the entire matrix
np.set_printoptions(threshold=np.inf, linewidth=np.inf, suppress=True)

# Print the full noise transition matrix
print("Noise Transition Matrix:")
print(transition_matrix)
print("Row sums:", transition_matrix.sum(axis=1))
# Average off-diagonal mass per row; the class count comes from the matrix itself
# rather than a hard-coded 8, so the figure stays right for other datasets.
print("Overall noise rate:", (transition_matrix.sum() - np.trace(transition_matrix)) / transition_matrix.shape[0], '\n')

Noise Transition Matrix:
[[0.7242 0.0725 0.0592 0.0439 0.0353 0.0344 0.0172 0.0134]
 [0.1022 0.6496 0.0623 0.0613 0.0409 0.039  0.026  0.0186]
 [0.0853 0.1031 0.5829 0.0634 0.0547 0.0438 0.0374 0.0294]
 [0.0586 0.076  0.1045 0.5479 0.076  0.0538 0.0491 0.034 ]
 [0.0427 0.0552 0.0858 0.1013 0.5452 0.0573 0.0531 0.0594]
 [0.0273 0.0475 0.0514 0.0864 0.0896 0.5857 0.0545 0.0576]
 [0.0135 0.0189 0.0541 0.0378 0.0811 0.0919 0.6595 0.0432]
 [0.0042 0.0168 0.0294 0.0378 0.0567 0.0462 0.1155 0.6933]]
Row sums: [1. 1. 1. 1. 1. 1. 1. 1.]
Overall noise rate: 0.3764611424174591 

