In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from tqdm import tqdm

In [2]:
# User configuration: edit these values before running the rest of the notebook.
# experiment_dir: experiment folder, joined onto data_dir when the prediction
#   and review paths are built.
experiment_dir = 'outputs/mirror-mouse/hand=100_pseudo=1000'
# dataset: name of the dataset folder, used to locate the labeled frames.
dataset = "mirror-mouse"
# likelihood_threshold: rows are kept when the NUMBER of networks with
#   likelihood > 0.9 is <= this value (it is a count, not a probability).
likelihood_threshold = 4
# variance_threshold: rows are kept when ensemble_variance (x variance plus
#   y variance across networks) is <= this value.
variance_threshold = 400

In [3]:

# Workspace root; every path used by this notebook is built from it.
data_dir = '/teamspace/studios/this_studio'

# Folder of the experiment selected in the configuration cell.
experiment_root = os.path.join(data_dir, experiment_dir)

# One predictions file per ensemble member: networks/rng0 ... networks/rng4.
file_paths = [
    os.path.join(experiment_root, 'networks', f'rng{i}', 'predictions_new.csv')
    for i in range(5)
]

# Review artifacts (merged CSV, heatmaps, annotated frames) are written here.
review_dir = os.path.join(experiment_root, 'review_frames')
os.makedirs(review_dir, exist_ok=True)

output_file_name = 'frames_quality_control.csv'
output_file_path = os.path.join(review_dir, output_file_name)

def process_and_merge_csv_files(file_paths_and_identifiers, output_file_path):
    """Merge per-network prediction CSVs into one long-format quality-control table.

    Each file is read with ``pd.read_csv`` (first line as header). The first
    data row supplies the body-part names (every third column starting at
    column 1), the second data row is skipped, and every later row is one
    frame: column 0 is the image path, followed by ``x, y, likelihood``
    triplets, one triplet per body part.

    The name of the directory containing each file (e.g. ``rng0``) identifies
    the network and becomes the suffix of its ``x_``, ``y_`` and
    ``likelihood_`` columns. The per-network tables are inner-joined on
    ``(image_path, body_part)``, then ensemble statistics are added:
    mean / median / variance of x and y across networks (pandas' default
    sample variance), their summed variance, the number of networks with
    likelihood > 0.9, the ``log1p`` of the summed variance, and 10-bin
    labels for both the raw and the log-scaled variance.

    Args:
        file_paths_and_identifiers: Iterable of paths to prediction CSV files,
            one per network.
        output_file_path: Where the merged table is written as CSV.

    Returns:
        The merged ``pandas.DataFrame`` (also saved to ``output_file_path``).

    Raises:
        ValueError: If no input file is given.
    """
    # Use the argument itself; the loop previously read a module-level
    # variable and silently ignored what the caller passed in.
    prediction_files = list(file_paths_and_identifiers)
    if not prediction_files:
        raise ValueError("At least one prediction file is required.")

    key_columns = ["image_path", "body_part"]
    identifiers = []
    merged_data = None

    for file_path in prediction_files:
        # The parent directory name (e.g. 'rng0') identifies the network.
        rng_identifier = os.path.basename(os.path.dirname(file_path))
        identifiers.append(rng_identifier)
        data = pd.read_csv(file_path)

        # Body part names: first data row, every third column from column 1.
        body_parts = data.iloc[0, 1::3].tolist()

        # Keep only body parts whose full (x, y, likelihood) triplet exists.
        n_columns = data.shape[1]
        triplets = [
            (part, 1 + 3 * j)
            for j, part in enumerate(body_parts)
            if 3 + 3 * j < n_columns
        ]

        # Frames start at the third data row (index 2). Converting once to an
        # array avoids a costly .iloc lookup for every single cell.
        frames = data.iloc[2:].to_numpy()
        data_transformed = [
            [frame[0], part, frame[start], frame[start + 1], frame[start + 2]]
            for frame in frames
            for part, start in triplets
        ]

        transformed_df = pd.DataFrame(
            data_transformed,
            columns=key_columns + [
                f"x_{rng_identifier}",
                f"y_{rng_identifier}",
                f"likelihood_{rng_identifier}",
            ],
        )

        if merged_data is None:
            merged_data = transformed_df
        else:
            merged_data = pd.merge(merged_data, transformed_df, on=key_columns)

    def _network_values(prefix):
        # Values come out of the CSV as strings, hence the float conversion.
        columns = [f'{prefix}_{identifier}' for identifier in identifiers]
        return merged_data[columns].astype(float)

    def _interval_label(interval):
        return f'{interval.left:.1f}-{interval.right:.1f}'

    x_values = _network_values('x')
    y_values = _network_values('y')
    likelihoods = _network_values('likelihood')

    # Ensemble statistics across the networks that were actually loaded.
    merged_data['ensemble_mean_x'] = x_values.mean(axis=1)
    merged_data['ensemble_mean_y'] = y_values.mean(axis=1)
    merged_data['ensemble_median_x'] = x_values.median(axis=1)
    merged_data['ensemble_median_y'] = y_values.median(axis=1)
    merged_data['ensemble_variance_x'] = x_values.var(axis=1)
    merged_data['ensemble_variance_y'] = y_values.var(axis=1)
    merged_data['ensemble_variance'] = (
        merged_data['ensemble_variance_x'] + merged_data['ensemble_variance_y']
    )

    # Count likelihoods above 0.9
    merged_data['num_likelihood_above_0.9'] = likelihoods.gt(0.9).sum(axis=1)

    # Apply logarithmic scale to ensemble variance
    merged_data['ensemble_variance_log'] = np.log1p(merged_data['ensemble_variance'])

    # Integer bin index and human-readable label for the log-scaled variance
    merged_data['ensemble_variance_log_bin'] = pd.cut(
        merged_data['ensemble_variance_log'], bins=10, labels=False
    )
    log_bins = pd.cut(merged_data['ensemble_variance_log'], bins=10)
    merged_data['ensemble_variance_log_bin_label'] = log_bins.apply(_interval_label)

    # Human-readable label for the raw variance bins
    ensemble_variance_bins_labels = pd.cut(merged_data['ensemble_variance'], bins=10)
    merged_data['ensemble_variance_bin_label'] = ensemble_variance_bins_labels.apply(_interval_label)

    # Save the merged dataframe to a CSV file
    merged_data.to_csv(output_file_path, index=False)
    print(f"Concatenated data saved to {output_file_path}")

    return merged_data

# Build the merged quality-control table and write it to output_file_path.
quality_control_df = process_and_merge_csv_files(file_paths, output_file_path)



Concatenated data saved to /teamspace/studios/this_studio/outputs/mirror-mouse/hand=100_pseudo=1000/review_frames/frames_quality_control.csv


In [4]:
# Reload the quality-control table from the CSV at output_file_path.
# NOTE(review): every handler below only prints a message, so when the read
# fails `quality_control_df` keeps whatever value it had before this cell ran
# (or stays undefined on a fresh kernel) and later cells continue with that.
# Consider re-raising after printing.
try:
    quality_control_df = pd.read_csv(output_file_path)
except FileNotFoundError:
    print(f"Error: The file {output_file_path} was not found.")
except pd.errors.EmptyDataError:
    print(f"Error: The file {output_file_path} is empty.")
except Exception as e:
    print(f"An error occurred while reading the file: {str(e)}")

In [5]:
def create_heatmaps(data, output_dir):
    """Save side-by-side count heatmaps of confident networks vs. variance bin.

    Two tables are built from ``data``: rows are ``num_likelihood_above_0.9``
    and columns are the raw (``ensemble_variance_bin_label``) or log-scaled
    (``ensemble_variance_log_bin_label``) variance bins; each cell holds the
    number of rows falling in that combination. Both tables are drawn next to
    each other and saved as ``ensemble_variance_heatmaps.png`` in
    ``output_dir``.

    Args:
        data: DataFrame containing the three columns named above.
        output_dir: Directory receiving the PNG (created if missing).
    """
    import re  # local import: only needed to order the bin labels

    def left_edge(label):
        # Labels look like '<left>-<right>' (left may be negative). Anything
        # that does not parse is pushed to the end.
        match = re.match(r'\s*(-?\d+(?:\.\d+)?)-', str(label))
        return float(match.group(1)) if match else float('inf')

    def count_table(bin_column):
        table = data.pivot_table(
            index='num_likelihood_above_0.9',
            columns=bin_column,
            aggfunc='size',
            fill_value=0
        )
        # When the labels are plain strings (e.g. after a CSV round-trip) the
        # pivot orders them lexicographically ('1200.0-...' before '400.0-...').
        # Reorder by numeric left edge so bins run from low to high variance.
        return table[sorted(table.columns, key=left_edge)]

    def plot_heatmap(data, title, xlabel, ax):
        # Rows are reversed so the highest confident-network count is on top.
        sns.heatmap(data.iloc[::-1], annot=True, fmt="d", cmap="Blues", ax=ax)
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Number of Highly Confident Networks')
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')

    heatmap_data_normal = count_table('ensemble_variance_bin_label')
    heatmap_data_log = count_table('ensemble_variance_log_bin_label')

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(24, 8))

    plot_heatmap(heatmap_data_normal, 'Heatmap of Ensemble Variance Bins\nvs Number of Highly Confident Networks', 'Ensemble Variance Bin', ax1)
    plot_heatmap(heatmap_data_log, 'Heatmap of Log-Scaled Ensemble Variance Bins\nvs Number of Highly Confident Networks', 'Log-Scaled Ensemble Variance Bin', ax2)

    fig.tight_layout()

    os.makedirs(output_dir, exist_ok=True)
    output_image_path = os.path.join(output_dir, 'ensemble_variance_heatmaps.png')
    fig.savefig(output_image_path, dpi=300, bbox_inches='tight')
    plt.close(fig)  # release the figure's memory
    print(f"Heatmap image saved to {output_image_path}")

# Render the two overview heatmaps and save them into review_dir.
create_heatmaps(quality_control_df, review_dir)

    

Heatmap image saved to /teamspace/studios/this_studio/outputs/mirror-mouse/hand=100_pseudo=1000/review_frames/ensemble_variance_heatmaps.png


In [6]:
def create_body_part_heatmaps(data, output_dir):
    """Save a grid of per-body-part heatmaps (confident networks vs. log-variance bin).

    For every unique value of ``data['body_part']`` a table is built with rows
    ``num_likelihood_above_0.9`` and columns ``ensemble_variance_log_bin_label``
    whose cells count the matching rows. The heatmaps are laid out three per
    row and saved as ``body_part_heatmaps.png`` in ``output_dir``.

    Args:
        data: DataFrame containing the three columns named above.
        output_dir: Directory receiving the PNG (created if missing).
    """
    import re  # local import: only needed to order the bin labels

    def left_edge(label):
        # Labels look like '<left>-<right>' (left may be negative). Anything
        # that does not parse is pushed to the end.
        match = re.match(r'\s*(-?\d+(?:\.\d+)?)-', str(label))
        return float(match.group(1)) if match else float('inf')

    unique_body_parts = data['body_part'].unique()
    num_body_parts = len(unique_body_parts)
    cols = 3
    rows = (num_body_parts + cols - 1) // cols  # ceiling division

    fig, axes = plt.subplots(rows, cols, figsize=(18, 6 * rows), squeeze=False)
    flat_axes = axes.ravel()  # row-major: panel idx -> (idx // cols, idx % cols)

    for ax, body_part in zip(flat_axes, unique_body_parts):
        body_part_data = data[data['body_part'] == body_part]
        heatmap_data_body_part = body_part_data.pivot_table(
            index='num_likelihood_above_0.9',
            columns='ensemble_variance_log_bin_label',
            aggfunc='size',
            fill_value=0
        )
        # String labels are ordered lexicographically by the pivot; reorder by
        # numeric left edge so bins run from low to high variance.
        heatmap_data_body_part = heatmap_data_body_part[
            sorted(heatmap_data_body_part.columns, key=left_edge)
        ]

        # Rows are reversed so the highest confident-network count is on top.
        sns.heatmap(heatmap_data_body_part.iloc[::-1], annot=True, fmt="d", cmap="Blues", ax=ax)
        ax.set_title(f'Heatmap for {body_part}')
        ax.set_xlabel('Log-Scaled Ensemble Variance Bin')
        ax.set_ylabel('Number of Highly Confident Networks')
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')

    # Remove any unused subplots
    for ax in flat_axes[num_body_parts:]:
        fig.delaxes(ax)

    fig.tight_layout()

    os.makedirs(output_dir, exist_ok=True)
    output_image_path = os.path.join(output_dir, 'body_part_heatmaps.png')
    fig.savefig(output_image_path, dpi=300, bbox_inches='tight')
    plt.close(fig)  # Close the figure to free up memory
    print(f"Body part heatmap image saved to {output_image_path}")

# Render one heatmap per body part and save the grid into review_dir.
create_body_part_heatmaps(quality_control_df, review_dir)


Body part heatmap image saved to /teamspace/studios/this_studio/outputs/mirror-mouse/hand=100_pseudo=1000/review_frames/body_part_heatmaps.png


In [8]:


def _network_ids_from_columns(columns):
    """Return the network suffixes (e.g. 'rng0') that have x, y and likelihood columns."""
    present = set(columns)
    network_ids = []
    for column in columns:
        if not column.startswith('x_rng'):
            continue
        network_id = column[len('x_'):]
        if f'y_{network_id}' in present and f'likelihood_{network_id}' in present:
            network_ids.append(network_id)
    return network_ids


def _draw_review_figure(frame_rgb, row, network_ids, height=3, color='y', median_color='m'):
    """Draw one frame with every network's prediction and the ensemble median; return the figure."""
    low_confidence_alpha = 0.5  # shared by the plotted markers and their legend entry

    # Keep the image's aspect ratio at a fixed figure height.
    img_height, img_width = frame_rgb.shape[:2]
    width = height * (img_width / img_height)

    fig, ax = plt.subplots(1, 1, figsize=(width, height), facecolor='k')
    ax.imshow(frame_rgb, cmap='gray')

    # One marker per network; predictions with likelihood <= 0.9 are drawn fainter.
    for network_id in network_ids:
        x = row[f'x_{network_id}']
        y = row[f'y_{network_id}']
        if row[f'likelihood_{network_id}'] > 0.9:
            ax.plot(x, y, 'o', markersize=2, color=color, alpha=1)
        else:
            ax.plot(x, y, 'o', markersize=2, color=color, alpha=low_confidence_alpha,
                    markeredgecolor=color, markerfacecolor=color, markeredgewidth=0.5)

    # Plot the ensemble median
    ax.plot(row["ensemble_median_x"], row["ensemble_median_y"], 'o', markersize=2, color=median_color)

    # Title: image, body part, spread and number of confident networks.
    img_name = row['image_path']
    ensemble_variance = row["ensemble_variance"]
    ensemble_std_dev = np.sqrt(ensemble_variance) if not np.isnan(ensemble_variance) else np.nan
    title = (f"{img_name.replace('labeled-data/', '')} - {row['body_part']}\n"
             f"Ensemble Std Dev: {ensemble_std_dev:.2f} pixels\n"
             f"Highly Confident Networks: {row['num_likelihood_above_0.9']}")
    ax.set_title(title, fontsize=5, color='w', y=1.1, va='bottom', linespacing=1.5)

    # Legend proxies; the low-confidence entry uses the same alpha as the
    # plotted markers so the legend matches what is drawn.
    ax.plot([], [], 'o', color=color, alpha=1, label='Network pred. (conf > 0.9)', markersize=2)
    ax.plot([], [], 'o', color=color, alpha=low_confidence_alpha, label='Network pred. (conf < 0.9)',
            markersize=2, markeredgecolor=color, markerfacecolor=color)
    ax.plot([], [], 'o', color=median_color, label='Ensemble median', markersize=2)
    ax.legend(loc='lower right', fontsize=4, framealpha=0.7)

    ax.set_xticks([])
    ax.set_yticks([])
    return fig


def filter_csv_and_process_images(input_file, output_dir, data_dir, likelihood_threshold=4, variance_threshold=400):
    """Filter the quality-control table and save one annotated image per kept row.

    Rows are kept when ``num_likelihood_above_0.9 <= likelihood_threshold`` and
    ``ensemble_variance <= variance_threshold``. The kept rows are written to
    ``frames_confidence_leq_<likelihood>_variance_<variance>.csv`` in
    ``output_dir``. For each kept row the image at ``data_dir/<image_path>`` is
    loaded, every network's prediction and the ensemble median are drawn on it,
    and the figure is saved under
    ``output_dir/confidence_<n>/<video>_<image>_<body_part>.png``. Rows whose
    image is missing or unreadable are reported and skipped.

    Args:
        input_file: Path to the quality-control CSV.
        output_dir: Directory receiving the filtered CSV and the images
            (created if missing).
        data_dir: Directory that the ``image_path`` column is relative to.
        likelihood_threshold: Maximum number of networks with likelihood > 0.9.
        variance_threshold: Maximum ensemble variance.

    Returns:
        The filtered ``pandas.DataFrame``.

    Raises:
        ValueError: If the CSV has no ``x_rng*`` / ``y_rng*`` /
            ``likelihood_rng*`` column triplet.
    """
    # Read the input CSV file
    df = pd.read_csv(input_file)

    # Networks are discovered from the columns instead of assuming exactly five.
    network_ids = _network_ids_from_columns(df.columns)
    if not network_ids:
        raise ValueError(
            f"No x_rng*/y_rng*/likelihood_rng* column triplets found in {input_file}"
        )

    # Filter the data based on the criteria
    keep = (
        (df['num_likelihood_above_0.9'] <= likelihood_threshold) &
        (df['ensemble_variance'] <= variance_threshold)
    )
    filtered_df = df[keep]

    # Save the filtered data to a single CSV file
    os.makedirs(output_dir, exist_ok=True)
    output_filename = f"frames_confidence_leq_{likelihood_threshold}_variance_{variance_threshold}.csv"
    output_path = os.path.join(output_dir, output_filename)
    filtered_df.to_csv(output_path, index=False)
    print(f"Saved filtered data to: {output_path}")
    print(f"Total frames in filtered data: {len(filtered_df)}")

    # Iterate through each row in the filtered DataFrame
    for _, row in tqdm(filtered_df.iterrows(), total=filtered_df.shape[0]):
        img_name = row['image_path']
        img_path = os.path.join(data_dir, img_name)

        # Check if the image file exists
        if not os.path.exists(img_path):
            print(f"Image file does not exist: {img_path}")
            continue

        # Read the image; cv2.imread returns None when decoding fails
        frame = cv2.imread(img_path)
        if frame is None:
            print(f"Failed to read image: {img_path}")
            continue
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        fig = _draw_review_figure(frame, row, network_ids)

        # Expected layout is 'labeled-data/<video_name>/img<number>.png';
        # fall back to 'unknown' when the path has no directory component.
        path_parts = img_name.split('/')
        video_name = path_parts[1] if len(path_parts) > 1 else 'unknown'
        image_number = os.path.splitext(os.path.basename(img_name))[0]

        # One sub-directory per confidence level
        num_confident = row["num_likelihood_above_0.9"]
        confidence_level = int(num_confident) if not np.isnan(num_confident) else 'unknown'
        save_dir = os.path.join(output_dir, f'confidence_{confidence_level}')
        os.makedirs(save_dir, exist_ok=True)

        filename = os.path.join(save_dir, f"{video_name}_{image_number}_{row['body_part']}.png")
        try:
            # Save and close this specific figure rather than relying on
            # pyplot's implicit "current figure".
            fig.savefig(filename, dpi=300, bbox_inches='tight')
        finally:
            plt.close(fig)

    return filtered_df


# Anchor the labeled-data folder at data_dir, like every other path in this
# notebook, so the cell does not depend on the kernel's working directory.
labeled_data_dir = os.path.join(data_dir, 'data', dataset)

filtered_df = filter_csv_and_process_images(output_file_path, review_dir, labeled_data_dir, likelihood_threshold, variance_threshold)

Saved filtered data to: /teamspace/studios/this_studio/outputs/mirror-mouse/hand=100_pseudo=1000/review_frames/frames_confidence_leq_4_variance_400.csv
Total frames in filtered data: 466


  0%|          | 0/466 [00:00<?, ?it/s]

100%|██████████| 466/466 [02:35<00:00,  3.01it/s]
