In [1]:
import pandas as pd 
import plotly.express as px 
import datetime
import numpy as np

In [None]:
# Read the k-mer table and append column 'k' holding the string length of
# each entry in the 'kmer' column.
data = pd.read_csv('../output/2024-01-26_two_peps_all_runs_1-2-3-4.csv').assign(
    k=lambda frame: frame['kmer'].str.len()
)

In [None]:
# Drop rows whose group is labelled 'Initial' (rather than control or WCL).
data = data[data['group'] != 'Initial']

# Number of highest-count k-mers kept for each group.
TOP_N_KMERS = 100


def _sum_counts_for_k(frame, k):
    """Return the summed 'count' per (group, kmer, k) for k-mers of length ``k``.

    The result is indexed by the (group, kmer, k) MultiIndex and has a single
    'count' column.
    """
    return frame[frame['k'] == k].groupby(['group', 'kmer', 'k']).agg({'count': 'sum'})


def _top_kmers_per_group(agg_frame, n=TOP_N_KMERS):
    """Return the ``n`` highest-count rows of every group as a flat DataFrame.

    ``agg_frame`` is expected to be indexed by (group, kmer, k) with a 'count'
    column, as produced by ``_sum_counts_for_k``. The returned frame has the
    columns group, kmer, k, count and a fresh integer index.
    """
    return (
        agg_frame
        # group_keys=False keeps the group key from being prepended a second
        # time, so no duplicated 'group' level has to be dropped afterwards.
        .groupby('group', group_keys=False)
        .apply(lambda grp: grp.nlargest(n, 'count'))
        .reset_index()
    )


# Per-length aggregated counts for 2-, 3- and 4-mers.
agg_data_2mer = _sum_counts_for_k(data, 2)
agg_data_3mer = _sum_counts_for_k(data, 3)
agg_data_4mer = _sum_counts_for_k(data, 4)

# Top k-mers per group for each length.
top_features_2mer = _top_kmers_per_group(agg_data_2mer)
top_features_3mer = _top_kmers_per_group(agg_data_3mer)
top_features_4mer = _top_kmers_per_group(agg_data_4mer)

In [2]:
# Load the k-mer enrichment table and display it.
# NOTE(review): the displayed frame has a leading 'Unnamed: 0' column, which
# looks like a saved index; consider index_col=0 when reading - confirm.
enrichment_csv_path = '../output/kmer_enrichment_log2fold.csv'
df = pd.read_csv(enrichment_csv_path)
df

Unnamed: 0,group,k-mer,p-value,log2 fold change
0,enriched in control,WEMG,2.974435e-02,4.254720
1,enriched in control,DPDD,1.773648e-11,1.107941
2,enriched in control,PDTT,3.817398e-02,0.951670
3,enriched in control,NDYS,2.060812e-02,1.046828
4,enriched in control,FMST,3.817398e-02,1.352729
...,...,...,...,...
843,enriched in wcl,RCHC,1.429642e-02,4.459525
844,enriched in wcl,RNLN,4.950251e-02,1.161444
845,enriched in wcl,INMQ,4.432111e-03,3.044488
846,enriched in wcl,CAWA,3.742330e-02,4.225060


In [3]:
def change_value(row):
    """Return a copy of ``row`` with the fold change negated for control rows.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or dict)
        Must provide the keys 'group' and 'log2 fold change' and a ``copy()``
        method.

    Returns
    -------
    A copy of ``row``. If ``row['group']`` equals 'enriched in control' the
    'log2 fold change' value is negated; otherwise the copy is unchanged.
    """
    # Work on a copy: pandas does not support functions that mutate the object
    # handed to them by DataFrame.apply.
    result = row.copy()
    if result['group'] == 'enriched in control':
        result['log2 fold change'] = -result['log2 fold change']
    return result

# Negate the fold change of control-enriched rows so that control falls on the
# negative side of the axis and WCL on the positive side.
# NOTE: this overwrites `df`; re-running this cell without reloading `df`
# negates the control rows a second time.
df = df.apply(change_value, axis=1)
# Use the base-10 logarithm so the values match the '-log10 p-value' label
# (np.log is the natural logarithm).
df['-log10 p-value'] = -np.log10(df['p-value'])

In [8]:
# Volcano plot: 'log2 fold change' on x against '-log10 p-value' on y, with the
# k-mer shown on hover.
fig = px.scatter(
    df,
    title='Log2 fold change vs -log10 p-value | Enriched in control (-), enriched in WCL (+)',
    template='simple_white',
    hover_name='k-mer',
    x='log2 fold change',
    y='-log10 p-value',
    # Alternative colouring: color='-log10 p-value'
    color='log2 fold change',
    # To change the continuous colour scale, uncomment the line below and pick
    # a built-in scale name from https://plotly.com/python/builtin-colorscales/
    # A two-tone scale is suggested for a volcano plot (or just one colour).
    # color_continuous_scale='RdBu',
)

# Single-colour variant: paint every marker black.
fig.update_traces(marker={'color': 'black'})
fig.show()
# Optional export of the figure:
# fig.write_html('../output/volcano_plot_color_log2fold.html')

In [None]:
# Grouped bar chart of the rows in top_features_2mer: one bar per k-mer and
# group, bar height taken from 'count', bars coloured by 'group'.
fig = px.bar(
    top_features_2mer,
    title='Top 2-mers in Control vs WCL',
    template='simple_white',
    x='kmer',
    y='count',
    color='group',
    barmode='group',
)

In [None]:
# Save the current figure as a date-stamped HTML file. The underscore after the
# date keeps it separate from the rest of the file name
# (e.g. 2024-01-26_bar_chart_...).
fig.write_html(f'../output/{datetime.date.today()}_bar_chart_2mer_counts_top_100_each_treatment.html')

In [None]:
# Render whichever figure is currently bound to `fig`.
fig.show()