# Workbook for inspecting bulk prediction data

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the confidence data and plot the histogram of distributions
# Confidence data is expected to be in a CSV file named 'boltz_results_combined.csv'

# Read the combined results table; the path is relative to the working directory.
confidence_data = pd.read_csv('boltz_results_combined.csv')


if 'confidence_score' in confidence_data.columns:
    # Histogram (50 bins) of the confidence scores with a KDE curve overlaid.
    plt.figure(figsize=(10, 6))
    sns.histplot(confidence_data['confidence_score'], bins=50, kde=True)
    plt.title('Distribution of Confidence Values')
    plt.xlabel('Boltz-2 ConfidenceScore')
    plt.ylabel('Frequency')
    plt.grid()
    plt.show()
else:
    # Previously a missing column produced neither a plot nor a message, which
    # is indistinguishable from the cell not having run at all. Report it and
    # list what the file actually contains so the header can be checked.
    print(
        "Column 'confidence_score' not found in boltz_results_combined.csv; "
        f"available columns: {list(confidence_data.columns)}"
    )






In [None]:
# Plot the distribution of distances to specific residues
# The distance data is expected to be in a CSV file named 'distance_matrices.csv'

distance_data = pd.read_csv('distance_matrices.csv')

# The distance data has the first column as the model name and each subsequent
# column as the distance to a specific residue.

# Initially we just define which columns to plot, one histogram panel each.
# NOTE(review): these values are used below as *positional* column indices via
# .iloc, so they equal the residue number only if the residue columns are
# consecutive and start at residue 1 - confirm against the CSV header.
binding_site_residue1 = 158
binding_site_residue2 = 90
binding_site_residue3 = 212
binding_site_residue4 = 57


plt.figure(figsize=(10, 10))

# One 2x2 grid panel per selected column. The loop replaces four copy-pasted
# stanzas that differed only in the panel number and the column plotted.
binding_site_residues = (
    binding_site_residue1,
    binding_site_residue2,
    binding_site_residue3,
    binding_site_residue4,
)
for panel_number, residue in enumerate(binding_site_residues, start=1):
    plt.subplot(2, 2, panel_number)
    sns.histplot(distance_data.iloc[:, residue], bins=50, kde=True)
    plt.title(f'Distribution of Distances to Residue {residue}')
    plt.xlabel('Mean ligand distance to Residue (angstroms)')
    plt.ylabel('Frequency')
    plt.grid()

plt.tight_layout()
plt.show()

# Summarise every column after the first by its mean and standard deviation,
# then plot the means with the standard deviations as error bars. This shows
# how the distances in each column are spread across all rows of the table.
column_positions = range(1, distance_data.shape[1])
mean_distance = [distance_data.iloc[:, pos].mean() for pos in column_positions]
std_distance = [distance_data.iloc[:, pos].std() for pos in column_positions]

# One summary row per distance column, labelled with that column's header.
distance_summary = pd.DataFrame(
    {
        'Residue': distance_data.columns[1:],
        'Mean Distance': mean_distance,
        'Standard Deviation': std_distance,
    }
)

# Wide figure so that the per-column x tick labels have room.
plt.figure(figsize=(40, 10))
plt.errorbar(
    distance_summary['Residue'],
    distance_summary['Mean Distance'],
    yerr=distance_summary['Standard Deviation'],
    fmt='o',
    capsize=5,
)
plt.title('Mean and Standard Deviation of Distances to Residues')
plt.xlabel('Residue')
plt.ylabel('Mean Distance (angstroms)')
plt.xticks(rotation=45)
plt.grid()
plt.tight_layout()
plt.show()

# Keep only the summary rows whose mean distance is strictly below the
# threshold, write them to a CSV file and echo them to the output.
distance_threshold = 12.0

# Boolean mask over the summary rows: True where the mean is under the cutoff.
is_below_threshold = distance_summary['Mean Distance'] < distance_threshold
filtered_distance_data = distance_summary.loc[is_below_threshold]

# index=False leaves the DataFrame's row index out of the CSV file.
filtered_distance_data.to_csv('filtered_distance_data.csv', index=False)

print(filtered_distance_data)







