In [None]:

import pandas as pd

# Replace 'file1.csv', 'file2.csv', 'file3.csv' with your actual file paths.
# Each frame gets a constant 'source' column identifying the file it came from.
df1 = pd.read_csv('file1.csv').assign(source='AmpSeq')
df2 = pd.read_csv('file2.csv').assign(source='UMIExo')
df3 = pd.read_csv('file3.csv').assign(source='2Step')

# Stack the three labelled frames into one table with a fresh 0..n-1 index.
combined_df = pd.concat(
    [df1, df2, df3],
    ignore_index=True,
)

# Reshape wide -> long: the cat1/cat2/cat3 columns become rows, with the
# originating column name in 'category' and its cell content in 'value'.
melted_df = pd.melt(
    combined_df,
    id_vars=['pct1', 'pct2', 'pct3', 'class', 'source'],
    value_vars=['cat1', 'cat2', 'cat3'],
    var_name='category',
    value_name='value',
)

# A cell may hold several ';'-separated entries: turn each cell into a list,
# then give every list element its own row.
melted_df = melted_df.assign(value=melted_df['value'].str.split(';'))
exploded_df = melted_df.explode('value')
# Extract the first run of digits that is immediately followed by an
# underscore from each exploded value. Missing values (empty category cells)
# and values that do not match the pattern yield NaN, so the result is stored
# as nullable Int64 rather than force-cast with astype(int), which raises as
# soon as a single NaN is present.
position_text = exploded_df['value'].str.extract(r'(\d+)_')[0]
exploded_df['x_position'] = pd.to_numeric(position_text, errors='coerce').astype('Int64')

# Sum the three percentage columns per distinct value. The columns are
# selected with a list: indexing a GroupBy with a bare tuple of column names
# raises in pandas >= 2.0.
aggregated_df = exploded_df.groupby('value')[['pct1', 'pct2', 'pct3']].sum().reset_index()

# Calculate the total percentage (row-wise sum of the three summed columns)
aggregated_df['total_pct'] = aggregated_df[['pct1', 'pct2', 'pct3']].sum(axis=1)
# For every distinct value, collect the array of distinct 'source' labels of
# the rows it appears in.
source_presence = exploded_df.groupby('value')['source'].unique().reset_index()

# Create a 0/1 flag per source. The 'source' column holds the string labels
# 'AmpSeq', 'UMIExo' and '2Step', so membership has to be tested against those
# labels; testing against the integers 1/2/3 can never match and would leave
# every flag at 0.
source_presence['source1'] = source_presence['source'].apply(lambda labels: int('AmpSeq' in labels))
source_presence['source2'] = source_presence['source'].apply(lambda labels: int('UMIExo' in labels))
source_presence['source3'] = source_presence['source'].apply(lambda labels: int('2Step' in labels))

# Merge with the aggregated data (inner join on 'value', the pd.merge default)
final_df = pd.merge(aggregated_df, source_presence, on='value')
# Derive the plotting columns in one pass:
#   deviation - row-wise standard deviation across pct1, pct2, pct3 (example metric)
#   x         - integer parsed from the first digit run followed by '_' in 'value'
#   y         - the total percentage, copied from 'total_pct'
final_df = final_df.assign(
    deviation=lambda frame: frame[['pct1', 'pct2', 'pct3']].std(axis=1),
    x=lambda frame: frame['value'].str.extract(r'(\d+)_', expand=False).astype(int),
    y=lambda frame: frame['total_pct'],
)


In [ ]:
# Plotting cell; the styling can be adjusted later.
import plotly.express as px

# The 'source' column of final_df comes from a groupby(...).unique() and so
# holds one array of labels per row. Arrays are unhashable and cannot be used
# as discrete colour groups, so build a plotting copy with a plain-string
# label instead (sorted so the same set of sources always gets the same
# label). Cells that are already strings are passed through unchanged.
plot_df = final_df.assign(
    source_label=final_df['source'].map(
        lambda sources: sources if isinstance(sources, str)
        else ', '.join(sorted(map(str, sources)))
    )
)

# Scatter of total percentage against x position, with the row-wise
# deviation drawn as vertical error bars and one colour per source combination.
fig = px.scatter(
    plot_df,
    x='x',
    y='y',
    error_y='deviation',
    color='source_label',  # You can adjust this based on your needs
    labels={'source_label': 'source'},  # keep the legend title as 'source'
    hover_data=['value'],
)

fig.update_layout(title='Sum Percentage by X Position with Deviation',
                  xaxis_title='X Position',
                  yaxis_title='Sum Percentage')

fig.show()