In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

Load data

In [None]:
# Input tables are tab-separated text files; pandas infers gzip compression
# from the ".gz" extension. Paths are relative to the working directory.
expression_path = 'data/GSE104878_20170811_average_promoter_ELs_per_seq_OLS_Glu_goodCores_ALL.txt.gz'
scaffold_path = 'data/GSE104878_Scaffold_library_sequences.txt.gz'

expression_df = pd.read_csv(expression_path, sep="\t")
scaffold_sequences = pd.read_csv(scaffold_path, sep="\t")

Check that both DataFrames loaded correctly.

In [None]:
# Preview the first five rows of the scaffold sequence table.
scaffold_sequences.head(n=5)

In [None]:
# Preview the first five rows of the expression table.
expression_df.head(n=5)

Group by scaffold name and compute per-scaffold statistics of expression: mean, standard deviation, and number of rows.

In [None]:
def f(df):
  """Summarise the 'Expression' column of a DataFrame.

  Args:
    df: DataFrame (e.g. one groupby group) containing an 'Expression' column.

  Returns:
    pd.Series with three entries:
      'mean'  - mean of the 'Expression' values,
      'stdev' - standard deviation with ddof=0 (population form),
      'reads' - number of rows in the 'Expression' column.
  """
  expression = df['Expression']
  summary = {}
  summary['mean'] = expression.mean()
  # ddof=0 matches numpy's default for np.std.
  summary['stdev'] = expression.std(ddof=0)
  summary['reads'] = len(expression)
  return pd.Series(summary)

# Compute per-scaffold summary statistics by applying f to each group.
# Selecting [['Expression']] before apply keeps the grouping column out of the
# frames handed to f, which avoids the deprecation warning newer pandas
# releases emit when apply operates on grouping columns. f only reads
# 'Expression', so the resulting table (indexed by ScaffoldName) is unchanged.
grouped = expression_df.groupby('ScaffoldName')[['Expression']].apply(f)
grouped.head()

Merge to include designed scaffold sequence.

In [None]:
# Join the per-scaffold statistics with the scaffold sequence table on
# 'ScaffoldName' (merge's default inner join).
# Note: this reassigns `grouped`, so re-running this cell without first
# re-running the groupby cell merges the sequence columns a second time.
grouped = pd.merge(grouped, scaffold_sequences, on='ScaffoldName')
grouped.head()

In [None]:
# Scatter of per-scaffold mean against standard deviation, coloured by the
# 'reads' column (rows per scaffold). Labels and a title are set so the figure
# can be read on its own; the trailing ';' hides the return-value repr.
ax = sns.scatterplot(data=grouped, x='mean', y='stdev', hue='reads')
ax.set(
  xlabel='Mean expression',
  ylabel='Standard deviation of expression',
  title='Per-scaffold expression: mean vs. standard deviation',
);

In [None]:
# Keep scaffolds with mean > 5 and more than 200 rows, then order them from
# the largest to the smallest standard deviation.
has_high_mean = grouped['mean'] > 5
has_enough_reads = grouped['reads'] > 200
high_mean_sorted_stddev = (
  grouped[has_high_mean & has_enough_reads]
  .sort_values(by='stdev', ascending=False)
)

In [None]:
# Names of the 20 scaffolds with the largest stdev among the filtered rows.
high_stdev_names = high_mean_sorted_stddev['ScaffoldName'].head(20)

In [None]:
# Show the first 20 rows (largest stdev first).
high_mean_sorted_stddev.head(20)

In [None]:
# One histogram of 'Expression' per selected scaffold, laid out five per row
# in the order given by high_stdev_names.
g = sns.FacetGrid(
  data=expression_df,
  col="ScaffoldName",
  col_order=high_stdev_names,
  col_wrap=5,
)
g.map(sns.histplot, "Expression")

In [None]:
# Keep scaffolds whose name starts with 'Pure', with mean > 4 and more than
# 200 rows, ordered from the largest to the smallest standard deviation.
# NOTE(review): the variable name suggests the 'Pure' prefix marks random
# scaffolds - confirm against the library naming scheme.
name_starts_with_pure = grouped['ScaffoldName'].astype(str).str.startswith('Pure')
passes_thresholds = (grouped['mean'] > 4) & (grouped['reads'] > 200)
high_mean_sorted_stddev_random = (
  grouped[passes_thresholds & name_starts_with_pure]
  .sort_values(by='stdev', ascending=False)
)

In [None]:
# Show the first 20 rows (largest stdev first).
high_mean_sorted_stddev_random.head(20)

In [None]:
# One histogram of 'Expression' per scaffold for the first 20 names in
# high_mean_sorted_stddev_random, laid out five per row.
random_names = high_mean_sorted_stddev_random['ScaffoldName'].head(20)
g = sns.FacetGrid(
  data=expression_df,
  col="ScaffoldName",
  col_order=random_names,
  col_wrap=5,
)
g.map(sns.histplot, "Expression")