<span style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">An Exception was encountered at '<a href="#papermill-error-cell">In [3]</a>'.</span>

# Data Import

In [1]:
# Environment switch, stored as the string "False" rather than a bool.
# The data-import cell reads it to choose which unified.db path to open.
virtual_env = "False"

In [2]:
# Parameters
# Run timestamp as a string; presumably injected by papermill at execution time (TODO confirm).
# Not referenced by any other cell visible in this notebook.
execution_date = "2024-05-21 14:35:11.555470+00:00"


<span id="papermill-error-cell" style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">Execution using papermill encountered an exception here and stopped:</span>

In [3]:
import sqlite3
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Choose the database location from the `virtual_env` switch.
# The switch is defined as a string ("False"), so comparing it with the bool
# `True` can never succeed and the /mnt path would be unreachable. Normalise
# it to lower-case text first so that real bools as well as "True"/"true"
# strings select the /mnt path, and everything else the relative path.
use_virtual_env = str(virtual_env).strip().lower() == "true"
if use_virtual_env:
    con = sqlite3.connect("/mnt/02-Data_Curation/unified.db")
else:
    con = sqlite3.connect("../02-Data_Curation/unified.db")

# Cursor on the same connection; the pandas call below uses `con` directly.
cur = con.cursor()

# Load the whole `prod` table into memory for the comparisons that follow.
df = pd.read_sql_query("SELECT * FROM prod", con)
df

ModuleNotFoundError: No module named 'matplotlib'

# Compare Positive/negative Data
To ensure that a model does not predict the AB label of a sequence from anything other than the sequence information itself, we try to match as many properties of the positive and negative data as possible. The goal is to avoid training a model that merely discriminates based on length or a similar property.

In [None]:
# Row count per AB label, aggregated inside SQLite, to check the class balance.
label_counts_sql = "SELECT COUNT(*), AB FROM prod GROUP BY AB"
pd.read_sql_query(label_counts_sql, con)

This is close enough to a 1:1 ratio. A balanced label distribution ensures that potential models cannot score well simply by always guessing the more frequent class.

# Compare lengths


In [None]:
# Sequence length per row.
# NOTE: this adds a `seq_length` column to `df` in place.
df['seq_length'] = df['seq'].map(len)

# Separate by AB values
positive = df[df['AB'] == 1]
negative = df[df['AB'] == 0]

# Use ONE set of bin edges for both groups. Letting np.histogram derive the
# edges separately per group yields different bin positions and widths, so
# the two density curves would not be comparable bin for bin.
plotted_lengths = pd.concat([positive['seq_length'], negative['seq_length']])
shared_bin_edges = np.histogram_bin_edges(plotted_lengths, bins=15)

# density=True normalises each group to unit area, so the different group
# sizes do not affect the comparison.
hist_data_pos, bin_edges_pos = np.histogram(positive['seq_length'], bins=shared_bin_edges, density=True)
hist_data_neg, bin_edges_neg = np.histogram(negative['seq_length'], bins=shared_bin_edges, density=True)

# Midpoints of bins (identical for both groups because the edges are shared)
bin_centers_pos = 0.5 * (bin_edges_pos[1:] + bin_edges_pos[:-1])
bin_centers_neg = 0.5 * (bin_edges_neg[1:] + bin_edges_neg[:-1])

# Step plot of both densities on a single axes
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(bin_centers_pos, hist_data_pos, label='Positive AB', drawstyle='steps-mid')
ax.plot(bin_centers_neg, hist_data_neg, label='Negative AB', drawstyle='steps-mid')
ax.legend()
ax.set_title('Sequence Length Distribution')
ax.set_xlabel('Sequence Length')
ax.set_ylabel('Density')
ax.grid(True)
plt.show()

The length distribution is roughly the same for both classes, meaning models can't guess the label based on sequence length alone.

# AA make up
Goal: check whether the amino acid (AA) composition is the same for positive-AB and negative-AB sequences.

In [None]:
# Calculate relative abundance
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Assuming df is provided (loaded by the data-import cell above).
# Snapshot of df at this point; df_backup is not read anywhere else in the visible cells.
df_backup = df.copy()

# One-letter codes of the amino acids that are counted
amino_acids = list('ACDEFGHIKLMNPQRSTVWY')


def calculate_abundance(seq):
    """Return the fraction of `seq` made up by each amino acid in `amino_acids`.

    Characters that are not in `amino_acids` are not counted, but they still
    contribute to the length used as the denominator, so the fractions add up
    to less than 1 when such characters are present.

    Parameters
    ----------
    seq : sized iterable of single-character residues (typically a str)

    Returns
    -------
    dict
        Maps every entry of `amino_acids`, in that order, to its fraction of
        `seq`. For an empty `seq` every value is the integer 0.
    """
    tally = dict.fromkeys(amino_acids, 0)
    for residue in seq:
        if residue in tally:
            tally[residue] += 1

    total = len(seq)
    if total <= 0:
        # Nothing to normalise by; hand back the raw zero counts.
        return tally
    return {letter: count / total for letter, count in tally.items()}

# Per-sequence amino-acid fractions, one column per amino acid.
# Building the frame once from a list of dicts replaces the row-by-row
# `.apply(pd.Series)` expansion, and `columns=amino_acids` fixes the column
# order explicitly instead of relying on a positional relabel later on.
abundance_df = pd.DataFrame(
    df['seq'].map(calculate_abundance).tolist(),
    index=df.index,
    columns=amino_acids,
)

# Put the AB label next to the fractions (aligned on df's index)
result_df = pd.concat([df['AB'], abundance_df], axis=1)

# Separate positive and negative groups
positive = result_df[result_df['AB'] == 1].drop(columns=['AB']).reset_index(drop=True)
negative = result_df[result_df['AB'] == 0].drop(columns=['AB']).reset_index(drop=True)

# Group sizes, shown in the legend so the reader can judge the error bars
n_pos = len(positive)
n_neg = len(negative)

# Mean and standard deviation of each amino-acid fraction per group.
# The rows are indexed by amino-acid letter because the reductions run over
# the amino-acid columns.
df_local = pd.concat(
    [positive.mean(), positive.std(), negative.mean(), negative.std()],
    axis=1,
    keys=['mean_pos', 'std_pos', 'mean_neg', 'std_neg'],
)

print(df_local)

# Plotting
fig, ax = plt.subplots(figsize=(12, 6))

# X-axis positions for the amino acids
x = np.arange(len(df_local))

# Group means with +/- one standard deviation as error bars; the two groups
# are offset horizontally so their markers do not overlap.
ax.errorbar(x - 0.1, df_local['mean_pos'], yerr=df_local['std_pos'], fmt='o',
            label=f'Positive AB (n={n_pos})', capsize=5)
ax.errorbar(x + 0.1, df_local['mean_neg'], yerr=df_local['std_neg'], fmt='o',
            label=f'Negative AB (n={n_neg})', capsize=5)

# Setting the labels and title
ax.set_xlabel('Amino Acids')
ax.set_ylabel('Relative Abundance')
ax.set_title('Relative Abundance of Amino Acids with Error Bars')

# Setting the x-ticks and labels
ax.set_xticks(x)
ax.set_xticklabels(df_local.index)

# Adding legend
ax.legend()

# Displaying the plot
plt.tight_layout()
plt.show()


The plot shows no large deviation in relative abundance for any amino acid between positive and negative sequences.