In [2]:
import pandas as pd
import altair as alt

In [114]:
assays_long = pd.read_csv('results.csv')

In [115]:
assays_long['outcome'].unique()

array(['Active', 'Inactive'], dtype=object)

# Proportion stacked bar plot:

In [116]:
###Count up conditions using groupby:
numActive = assays_long.groupby(['lig_chemblid', 'pref_name']).apply( 
    lambda grouped_df: (grouped_df['outcome'].str.contains('Active').any())).values.sum()
numInactive = assays_long.groupby(['lig_chemblid', 'pref_name']).apply( 
    lambda grouped_df: (grouped_df['outcome'].str.contains('Inactive').any())).values.sum()

numBoth = assays_long.groupby(['lig_chemblid', 'pref_name']).apply( 
    lambda grouped_df: ((grouped_df['outcome'].str.contains('Inactive').any()) & 
                        (grouped_df['outcome'].str.contains('Active').any()))).values.sum()

numNone = assays_long.groupby(['lig_chemblid', 'pref_name']).apply( 
    lambda grouped_df: (~(grouped_df['outcome'].str.contains('Inactive').any()) & 
                        ~(grouped_df['outcome'].str.contains('Active').any()))).values.sum()

In [117]:
###Or count up conditions the boring way:
y=0
n=0
b=0
neither=0
instance_ids = assays_long.drop_duplicates(['lig_chemblid', 'pref_name'])['lig_chemblid']
pref_names = assays_long.drop_duplicates(['lig_chemblid', 'pref_name'])['pref_name']
for i, name in zip(instance_ids, pref_names):
    mask = (assays_long['pref_name']==name) & (assays_long['lig_chemblid']==i)
    group = assays_long[mask]
    outcomes = list(group['outcome'])
    if ('Active' in outcomes) and ('Inactive' in outcomes):
        b+=1
    elif ('Active' in outcomes):
        y+=1
    elif ('Inactive' in outcomes):
        n+=1
    else:
        neither+=1
        

In [118]:
assays_long.groupby(['lig_chemblid', 'pref_name']).apply( 
    lambda grouped_df: (~(grouped_df['outcome'].str.contains('Inactive').any()) & 
                        ~(grouped_df['outcome'].str.contains('Active').any()))).values.sum()

0

In [119]:
#Verify both ways are the same:
print(f'Active: {y}')
print(f'Inactive: {n}')
print(f'Both: {b}')
print(f'None: {neither}')

Active: 488
Inactive: 126
Both: 32
None: 0


In [120]:
#Both ways are the same:
print(f'Active: {numActive-numBoth}')
print(f'Inactive: {numInactive-numBoth}')
print(f'Both: {numBoth}')
print(f'None: {numNone}')

Active: 488
Inactive: 126
Both: 32
None: 0


In [121]:
df = pd.DataFrame(columns=['results','activity', 'number'])
df.loc[0]=['Results', 'Active', numActive]
df.loc[1]=['Results', 'Inactive', numInactive]
df.loc[2]=['Results', 'Both', numBoth]

###Remember - inconclusive or unspecified must be added to all the predictions that had NO records.
#without the 'unknown' records:
#df.loc[3]=['Results', 'Unknown', numNone ]
#including all absent records:
number_of_absent_records = 10000 - sum([df.iloc[0]['number'], df.iloc[1]['number']])
df.loc[3] = ['Results', 'Unknown', number_of_absent_records]

df['proportion'] = df['number']/df['number'].sum()
df

Unnamed: 0,results,activity,number,proportion
0,Results,Active,520,0.0518341
1,Results,Inactive,158,0.0157496
2,Results,Both,32,0.00318979
3,Results,Unknown,9322,0.929226


In [122]:
chart_one = alt.Chart(df).transform_joinaggregate(
        Total='sum(number)'
    ).transform_calculate(PercentOfTotal="datum.number / datum.Total").mark_bar().encode(
    x=alt.X('PercentOfTotal:Q', title='All assays'),#,sort='descending'),
    y=alt.Y('results', title=''),
    color=alt.Color('activity', title='Activity type'),
    order=alt.Order('activity', sort='ascending')
)#.add_selection(
 #   selector
#)

chart_two = alt.Chart(df.iloc[:3]).transform_joinaggregate(
        Total='sum(number)'
    ).transform_calculate(PercentOfTotal="datum.number / datum.Total").mark_bar().encode(
    x=alt.X('PercentOfTotal:Q', title='Assays with activity evidence'),#,sort='descending'),
    y=alt.Y('results', title=''),
    color=alt.Color('activity', title='Activity type'),
    order=alt.Order('activity', sort='ascending')
)
#.add_selection(
#    selector
#)

chart_one&chart_two


In [123]:
####Interactive version:
# brush = alt.selection(type='interval', encodings=['x'])

# base = alt.Chart(df).transform_joinaggregate(
#         Total='sum(number)'
#     ).transform_calculate(PercentOfTotal="datum.number / datum.Total").mark_bar().encode(
#     #x='number',
#     x='PercentOfTotal:Q',
#     y='results',
#     color='activity')

# upper = base.encode(
#     alt.X('PercentOfTotal:Q', scale=alt.Scale(domain=brush)))

# lower = base.add_selection(brush)

# lower&upper 


# Target type barplot:


In [124]:
out = assays_long.groupby(['lig_chemblid', 'pref_name']).apply( 
    lambda grouped_df: (~(grouped_df['outcome'].str.contains('Inactive').any()) & 
                        (grouped_df['outcome'].str.contains('Active').any())))

positive_targets=[i[1] for i, v in zip(out.index, out.values) if v==True]
all_pos_targets = pd.DataFrame(out[out==True]).reset_index().drop([0],axis=1)
pos_targets_gb= all_pos_targets.groupby(['pref_name']).count().reset_index().sort_values(by='lig_chemblid', ascending=False)
pos_targets_gb.columns=['pref_name', 'count']



# All together now:

In [125]:
chart_one = alt.Chart(df).transform_joinaggregate(
        Total='sum(number)'
    ).transform_calculate(PercentOfTotal="datum.number / datum.Total").mark_bar().encode(
    x=alt.X('PercentOfTotal:Q'),#,sort='descending'),
    y=alt.Y('results', title='',axis=alt.Axis(labels=False)),
    color=alt.Color('activity', title='Activity type'),
    order=alt.Order('activity', sort='ascending')
).properties(height=30, width=550)
#.add_selection(
 #   selector
#).properties(height=30, width=550)

chart_two = alt.Chart(df.iloc[:3]).transform_joinaggregate(
        Total='sum(number)'
    ).transform_calculate(PercentOfTotal="datum.number / datum.Total").mark_bar().encode(
    x=alt.X('PercentOfTotal:Q', title='Hi'),#,sort='descending'),
    y=alt.Y('results', title='',axis=alt.Axis(labels=False)),
    color=alt.Color('activity', title='Activity type'),
    order=alt.Order('activity', sort='ascending')
).properties(height=30, width=550)
#.add_selection(
 #   selector
#).properties(height=30, width=550)

chart = alt.Chart(all_pos_targets).mark_bar()

chart = chart.encode(
    x = alt.X('pref_name', sort='-y', title='Target preferred name'),
    y = alt.Y('count()'), 
).properties(height=200,width=700)

(chart_one&chart_two&chart
).configure_axisBottom(labelAngle=-35).configure_header(
    titleFontSize=40, 
    labelFontSize=40
).configure_title()

In [126]:
def parse_group(group):
    inactive = group['outcome'].str.contains('Inactive').any()
    active = group['outcome'].str.contains('Active').any()
    if inactive and active:
        return 'Both'
    elif active and ~inactive:
        return 'Active'
    elif inactive and ~active:
        return 'Inactive'
    else:
        return False
    
    
out = assays_long.groupby(['lig_chemblid', 'pref_name']).apply(parse_group)
out_df= pd.DataFrame(out[out!=False]).reset_index().drop('lig_chemblid',axis=1)
out_df.columns=['pref_name', 'activity']

In [127]:
out_df

Unnamed: 0,pref_name,activity
0,Serine/threonine-protein kinase PIM1,Inactive
1,Tyrosine-protein kinase JAK1,Inactive
2,Tyrosine-protein kinase JAK2,Inactive
3,Vascular endothelial growth factor receptor 2,Inactive
4,MAP kinase signal-integrating kinase 2,Inactive
...,...,...
641,Tyrosine-protein kinase JAK1,Inactive
642,Tyrosine-protein kinase JAK2,Inactive
643,Vascular endothelial growth factor receptor 2,Both
644,Serotonin 2a (5-HT2a) receptor,Inactive


In [128]:
chart_one = alt.Chart(df).transform_joinaggregate(
        Total='sum(number)'
    ).transform_calculate(PercentOfTotal="datum.number / datum.Total").mark_bar().encode(
    x=alt.X('PercentOfTotal:Q', title='Proportion'),#,sort='descending'),
    y=alt.Y('results', title='All records',axis=alt.Axis(labels=False)),
    color=alt.Color('activity', title='Activity type'),
    order=alt.Order('activity', sort='ascending')
).properties(height=30, width=750)
#.add_selection(
#    selector
#).properties(height=30, width=550)

chart_two = alt.Chart(df.iloc[:3]).transform_joinaggregate(
        Total='sum(number)'
    ).transform_calculate(PercentOfTotal="datum.number / datum.Total").mark_bar().encode(
    x=alt.X('PercentOfTotal:Q', title='Proportion'),#,sort='descending'),
    y=alt.Y('results', title='Records with evidence',axis=alt.Axis(labels=False)),
    color=alt.Color('activity', title='Activity type'),
    order=alt.Order('activity', sort='ascending')
).properties(height=30, width=750)
#.add_selection(
#    selector
#).properties(height=30, width=550)






ch = alt.Chart(out_df).mark_bar()

ch = ch.encode(
    x = alt.X('pref_name', sort='-y', title='Target preferred name'),
    y = alt.Y('count()'),
    color=alt.Color('activity'),
    order=alt.Order('activity', sort='ascending'),
).properties(height=200,width=750)

# ch = ch.transform_aggregate(
#     count='count()',
#     groupby=['pref_name']
# ).transform_filter(
#     'datum.count >= 5'
# ).mark_bar().encode(
#    alt.X('pref_name:N', sort='-y', title='Target preferred name'),
#    alt.Y('count:Q'),
#     color=alt.Color('activity'),
#     order=alt.Order('activity', sort='ascending'),
# ).properties(height=200,width=750)



(chart_one&chart_two&ch
).configure_axisBottom(labelAngle=-35).configure_header(
    titleFontSize=40, 
    labelFontSize=40
).configure_axisY(
    titleAngle=0,
    titleAlign="left",
    titleY=-10,
    titleX=-10,
    titleFontSize=16,
)


In [129]:


chart_one = alt.Chart(df).transform_joinaggregate(
        Total='sum(number)'
    ).transform_calculate(PercentOfTotal="datum.number / datum.Total").mark_bar().encode(
    y=alt.Y('PercentOfTotal:Q', title='All assays'),#,sort='descending'),
    x=alt.X('results', title='',axis=alt.Axis(labels=False)),
    color=alt.Color('activity', title=''),
    order=alt.Order('activity', sort='ascending')
).properties(height=500, width=30)

chart_two = alt.Chart(df.iloc[:3]).transform_joinaggregate(
        Total='sum(number)'
    ).transform_calculate(PercentOfTotal="datum.number / datum.Total").mark_bar().encode(
    y=alt.Y('PercentOfTotal:Q', title='Existing evidence',),#,sort='descending'),
    x=alt.X('results', title='',axis=alt.Axis(labels=False)),
    color=alt.Color('activity', title=''),
    order=alt.Order('activity', sort='ascending')
).properties(height=500, width=30)








ch = alt.Chart(out_df).mark_bar()
ch = ch.encode(
    y = alt.Y('pref_name', sort='-x', title='Target occurrence'),
    x = alt.X('count()', title='Count'),
    color=alt.Color('activity'),
    order=alt.Order('activity', sort='ascending'),
).properties(height=500,width=300)


(chart_one|chart_two|ch
).configure_axisBottom(labelAngle=-35).configure_header(
    titleFontSize=40, 
    labelFontSize=40
).configure_axisY(
    titleAngle=0,
    titleAlign="left",
    titleY=-10,
    titleX=-20,
    titleFontSize=16,
).configure_legend(orient='bottom',labelFontSize=22)
#top right' is not one of ['none', 'left', 'right', 'top', 
#'bottom', 'top-left', 'top-right', 'bottom-left', 'bottom-right']

In [130]:
temp = out_df.groupby('pref_name').filter(lambda x : len(x)>2)
ch = alt.Chart(temp).mark_bar()

# ch = ch.encode(
#     x = alt.X('pref_name', sort='-y', title='Target preferred name'),
#     y = alt.Y('count()'),
#     color=alt.Color('activity'),
#     order=alt.Order('activity', sort='ascending'),
# )

ch = ch.encode(
    y = alt.Y('pref_name', sort='-x', title='Target preferred name'),
    x = alt.X('count()', title='Count'),
    color=alt.Color('activity'),
    order=alt.Order('activity', sort='ascending'),
).properties(height=500,width=300)

ch.properties(height=200,width=750).configure_axisY(
titleFontSize=13,
).configure_axisX(
titleFontSize=13
).configure_axisBottom(labelAngle=-35).configure_header(
    titleFontSize=40, 
    labelFontSize=40
).configure_axisY(
    titleAngle=0,
    titleAlign="left",
    titleY=-10,
    titleX=-20,
    titleFontSize=16,
).configure_legend(orient='bottom',labelFontSize=22)
# ch = ch.transform_aggregate(
#     count='count()',
#     groupby=['pref_name']
# ).transform_filter(
#     'datum.count >= 5'
# ).mark_bar().encode(
#    alt.X('pref_name:N', sort='-y', title='Target preferred name'),
#    alt.Y('count:Q'),
#     color=alt.Color('activity'),
#     order=alt.Order('activity', sort='ascending'),
# ).properties(height=200,width=750)

ch

In [94]:
gb = out_df.groupby('pref_name')

In [107]:
out_df.groupby('pref_name').filter(lambda x : len(x)>3)

Unnamed: 0,pref_name,activity
0,Serine/threonine-protein kinase PIM1,Inactive
1,Tyrosine-protein kinase JAK2,Inactive
2,Vascular endothelial growth factor receptor 2,Inactive
4,Fibroblast growth factor receptor 2,Both
5,Fibroblast growth factor receptor 3,Inactive
...,...,...
418,Vascular endothelial growth factor receptor 2,Inactive
421,Tyrosine-protein kinase JAK2,Inactive
422,Vascular endothelial growth factor receptor 2,Both
423,Serotonin 2a (5-HT2a) receptor,Inactive


In [105]:
gb.count().reset_index()[(gb.count()>2)['activity']]

  """Entry point for launching an IPython kernel.


IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match).

In [103]:
(gb.count()>2)['activity']

pref_name
ADAM17                                           False
Acetylcholinesterase                              True
Alpha-1d adrenergic receptor                     False
Beta-secretase 1                                 False
Cannabinoid CB1 receptor                         False
                                                 ...  
Tyrosine-protein kinase TIE-2                    False
Tyrosine-protein kinase TYK2                      True
Tyrosine-protein kinase receptor FLT3             True
Vascular endothelial growth factor receptor 2     True
c-Jun N-terminal kinase 1                        False
Name: activity, Length: 73, dtype: bool

In [33]:
out_df

Unnamed: 0,pref_name,activity
0,Serine/threonine-protein kinase PIM1,Inactive
1,Tyrosine-protein kinase JAK2,Inactive
2,Vascular endothelial growth factor receptor 2,Inactive
3,MAP kinase signal-integrating kinase 2,Inactive
4,Fibroblast growth factor receptor 2,Both
...,...,...
420,Tyrosine-protein kinase TIE-2,Inactive
421,Tyrosine-protein kinase JAK2,Inactive
422,Vascular endothelial growth factor receptor 2,Both
423,Serotonin 2a (5-HT2a) receptor,Inactive
