In [None]:
# Input files, relative to the directory the notebook is run from.

# Whitespace-delimited HPI gene list; the first two fields of every line are used.
hpi = '../data/hpi.tsv'
# Orthologous group annotations (`query`, `preferred_og_name` and
# `representative_protein_length` columns are used).
names = '../out/associations/associated_ogs.final.tsv'
# Per-gene k-mer association summary, indexed by its first column.
kmer_hits = '../out/associations/summary_cont_lmm_kmer.tsv'
# Pairwise gene distances (`og1`, `og2`, `distance`, `set`, `strain`, `replicon`).
dists = '../out/associations/gene_distances.tsv'

In [None]:
# plotting imports
%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns
from adjustText import adjust_text

# Plain white seaborn background for every figure in the notebook
sns.set_style('white')

# Font sizes shared by all figures, applied one matplotlib rc group at a time
for rc_group, rc_settings in (('font', {'size': 11}),
                              ('xtick', {'labelsize': 11}),
                              ('ytick', {'labelsize': 11}),
                              ('axes', {'labelsize': 12, 'titlesize': 12}),
                              ('legend', {'fontsize': 11})):
    plt.rc(rc_group, **rc_settings)

In [None]:
import numpy as np
import pandas as pd
import networkx as nx

In [None]:
# Parse the HPI gene table in a single pass and close the file afterwards.
# The first field of each line goes to `rhpi` (matched against the og1/og2
# columns of the distance table further down), the second field to `hpi`
# (matched against the renamed index of the k-mer hits table).
# NOTE: `hpi` holds the file path up to this point and is rebound to the list
# of second-field values here, so re-running this cell requires re-running
# the cell that defines the input paths first.
with open(hpi) as hpi_handle:
    hpi_fields = [line.rstrip().split() for line in hpi_handle]
rhpi = [fields[0] for fields in hpi_fields]
hpi = [fields[1] for fields in hpi_fields]

In [None]:
# Orthologous group annotations, looked up by the `query` identifier
u = pd.read_table(names)
og_by_query = u.set_index('query')
# query -> preferred orthologous group name
n = og_by_query['preferred_og_name'].to_dict()
# query -> representative protein length, used to normalise the hit counts
nsize = og_by_query['representative_protein_length']

In [None]:
# Gene distance table; the cells below filter it on its `replicon`, `set`,
# `strain`, `og1`, `og2` and `distance` columns.
m = pd.read_table(dists)

In [None]:
# Per-gene k-mer association summary, indexed by the first column of the file
k = pd.read_table(kmer_hits, index_col=0)
# Hits divided by the representative protein length (aligned on the index)
k['normalized_hits'] = k['hits'] / nsize
# Show the preferred gene name where one is known, the original id otherwise
k.index = [n.get(x, x) for x in k.index]
# Keep only genes with at least one specific hit. The explicit copy makes `k`
# an independent frame, so that the later cell adding the `Gene` column does
# not write into a slice of the unfiltered table (SettingWithCopyWarning).
k = k[k['specific_hits'] > 0].copy()
# gene name -> normalised hits, used to size the nodes of the gene graph
kh = k['normalized_hits'].to_dict()

Top gene hits
---

In [None]:
# Genes ranked from the highest to the lowest normalised hit count
k.sort_values(by='normalized_hits', ascending=False)

In [None]:
# Scatter plot of `avg_maf` against `maxp`, one point per gene, with the
# marker size driven by `normalized_hits` (shown in the legend as
# 'Gene length fraction'). HPI genes are drawn in blue on top of the others.
plt.figure(figsize=(3.5, 3.5))

# Rename applied to both subsets so that the size legend gets a readable title
size_column = {'normalized_hits': 'Gene length fraction'}

sp = sns.scatterplot(x='avg_maf',
                     y='maxp',
                     size='Gene length fraction',
                     data=k.loc[k.index.difference(hpi)].rename(columns=size_column),
                     color='k',
                     sizes=(20, 200))
# Select only the HPI genes that are actually present in `k`: a plain
# `k.loc[hpi]` raises a KeyError as soon as one HPI gene has been removed by
# the `specific_hits` filter. This mirrors the selection used for the `Gene`
# column later in the notebook.
sp = sns.scatterplot(x='avg_maf',
                     y='maxp',
                     size='Gene length fraction',
                     data=k.loc[k.index.intersection(hpi)].rename(columns=size_column),
                     color=sns.xkcd_rgb['dark sky blue'],
                     sizes=(20, 200),
                     legend=False)

# Label the strongest non-HPI hits, skipping genes whose name starts with 'group'
texts = [plt.text(x, y, t,
                  ha='center', va='center')
         for (x, y), t in zip(k[['avg_maf', 'maxp']].values,
                              k.index)
         if y > 15 and t not in hpi and not t.startswith('group')]
# Move the labels apart so they do not overlap each other or the points
adjust_text(texts,
            arrowprops=dict(arrowstyle='-', color='k'),
            force_points=1.5)

plt.xlabel('Average MAF')
plt.ylabel('$-log_{10}(pvalue)$')

plt.savefig('hits.png',
            dpi=300, bbox_inches='tight',
            transparent=True)
plt.savefig('hits.svg',
            dpi=300, bbox_inches='tight',
            transparent=True);

In [None]:
# Tag every gene as part of the HPI list or not; used as the hue of the next plot
k['Gene'] = np.where(k.index.isin(hpi), 'HPI', 'Other')

In [None]:
# Orthologous group frequency (`OG_af`) against the average frequency of its
# unitigs (`avg_af`), coloured by gene class and sized by `maxp`.
plt.figure(figsize=(3.5, 3.5))

pvalue_label = '$-log_{10}(pvalue)$'
axis_limits = (-0.05, 1.05)

sp = sns.scatterplot(data=k.rename(columns={'maxp': pvalue_label}),
                     x='OG_af',
                     y='avg_af',
                     hue='Gene',
                     hue_order=['HPI', 'Other'],
                     palette=[sns.xkcd_rgb['dark sky blue'], 'k'],
                     size=pvalue_label,
                     sizes=(20, 200))

# Dashed diagonal marking equal frequencies on both axes
plt.plot(axis_limits, axis_limits,
         '--',
         color=sns.xkcd_rgb['grey'])

plt.xlim(*axis_limits)
plt.ylim(*axis_limits)

# Legend placed outside the axes, to the right
plt.legend(loc='center left',
           bbox_to_anchor=(1, 0.5))

# Label the strongest non-HPI hits, skipping genes whose name starts with 'group'
texts = [plt.text(og_af, unitig_af, gene,
                  ha='center', va='center')
         for (og_af, unitig_af, logp), gene in zip(k[['OG_af', 'avg_af', 'maxp']].values,
                                                   k.index)
         if logp > 15 and gene not in hpi and not gene.startswith('group')]
adjust_text(texts,
            arrowprops=dict(arrowstyle='-', color='k'),
            force_points=2)

plt.xlabel('Orthologous group AF')
plt.ylabel('Unitigs average AF')

for figure_name in ('unitigs_og.png', 'unitigs_og.svg'):
    plt.savefig(figure_name,
                dpi=300, bbox_inches='tight',
                transparent=True);

Gene hits distances
---

In [None]:
# Reload the unfiltered hits table (original identifiers as index); note that
# this rebinds `k`, replacing the filtered and renamed table used above.
k = pd.read_table(kmer_hits, index_col=0)
k['normalized_hits'] = k['hits'] / nsize
# Identifiers of the genes whose normalised hit count is at least 0.1
genes = set(k.index[k['normalized_hits'] >= 0.1])

In [None]:
# Minimum distance for every pair of distinct orthologous groups, restricted
# to rows where `replicon` is True. The `distance` column is selected *before*
# aggregating, so the minimum is computed for that column only instead of for
# every column of the table (cheaper, and it cannot fail on columns whose
# values are not orderable).

# Associated genes that are not in the first column of the HPI table
non_hpi_genes = genes.difference(rhpi)

# Associated genes outside the HPI, 'real' set, strain IAI39
d = (m[(m['replicon'] == True) &
       (m['set'] == 'real') &
       (m['og1'] != m['og2']) &
       (m['strain'] == 'IAI39') &
       (m['og1'].isin(non_hpi_genes)) &
       (m['og2'].isin(non_hpi_genes))]
     .groupby(['og1', 'og2'])['distance']
     .min()
     .reset_index())
# HPI genes only, 'real' set, strain IAI39
h = (m[(m['replicon'] == True) &
       (m['set'] == 'real') &
       (m['og1'] != m['og2']) &
       (m['strain'] == 'IAI39') &
       (m['og1'].isin(rhpi)) &
       (m['og2'].isin(rhpi))]
     .groupby(['og1', 'og2'])['distance']
     .min()
     .reset_index())
# 'random' set, strains IAI39 and IAI01
r = (m[(m['replicon'] == True) &
       (m['set'] == 'random') &
       (m['og1'] != m['og2']) &
       ((m['strain'] == 'IAI39') |
        (m['strain'] == 'IAI01'))]
     .groupby(['og1', 'og2'])['distance']
     .min()
     .reset_index())

In [None]:
# Density of the log10 minimum pairwise gene distance for the three gene sets
plt.figure(figsize=(3.5, 3.5))

# (distance table, legend label, colour, zorder); a higher zorder is drawn on top
kde_curves = ((h, 'HPI', sns.xkcd_rgb['dark sky blue'], 10),
              (d, 'Other Genes', 'r', 5),
              (r, 'Random genes', 'grey', 0))
for kde_table, kde_label, kde_color, kde_zorder in kde_curves:
    sns.kdeplot(np.log10(kde_table['distance']),
                label=kde_label,
                color=kde_color,
                zorder=kde_zorder)

plt.legend(loc='upper left',
           bbox_to_anchor=(0, 1))

# No y axis: only the shape of the curves is of interest
sns.despine(left=True)
plt.yticks([])
# Show the log10 tick positions as powers of ten
exponents = range(0, 10, 2)
plt.xticks(exponents,
           ['$10^{%d}$' % x for x in exponents])
plt.xlabel('Minimum gene distance (bp)')

for figure_name in ('genes.png', 'genes.svg'):
    plt.savefig(figure_name,
                dpi=300, bbox_inches='tight',
                transparent=True);

In [None]:
# Pairs of distinct associated genes (HPI included) whose distance in strain
# IAI39 is at most 500, with the minimum distance of each pair; these become
# the edges of the gene graph. Note that this rebinds `d`.
# `distance` is selected before aggregating so that only that column is
# reduced, rather than taking the minimum of every column and discarding the
# rest.
d = (m[(m['replicon'] == True) &
       (m['set'] == 'real') &
       (m['og1'] != m['og2']) &
       (m['strain'] == 'IAI39') &
       (m['og1'].isin(genes)) &
       (m['og2'].isin(genes)) &
       (m['distance'] <= 500)]
     .groupby(['og1', 'og2'])['distance']
     .min()
     .reset_index())

In [None]:
# Undirected graph with one edge per close gene pair; each edge carries the
# pair's minimum distance in its `distance` attribute.
g = nx.from_pandas_edgelist(d, 'og1', 'og2', 'distance')

In [None]:
# Force-directed node positions shared by both graph figures below. The seed
# is fixed so that the layout, and therefore the saved figures, is the same on
# every run instead of changing with each execution of the notebook.
graph_pos = nx.layout.spring_layout(g, k=0.3, seed=42)

In [None]:
# Gene proximity graph, with nodes labelled by preferred gene name
plt.figure(figsize=(9, 9))

# Size given to nodes that have no entry in `kh`; computed once rather than
# once per node.
smallest_hits = min(kh.values())

# Draw nodes
nx.draw_networkx_nodes(g, graph_pos,
                       # Node size depends on gene hits
                       node_size=[kh.get(n.get(x, x), smallest_hits) * 100
                                  for x in g.nodes()],
                       node_color=sns.xkcd_rgb['light grey'],
                       edgecolors='k')
# Draw edges with a constant width. The colour has to be passed as
# `edge_color`: `color` is not a parameter of draw_networkx_edges.
nx.draw_networkx_edges(g, graph_pos,
                       width=1,
                       edge_color='grey')
# Draw labels, using the preferred gene name where one is known
nx.draw_networkx_labels(g, graph_pos,
                        {x: n.get(x, x) for x in g.nodes()})

sns.despine(bottom=True, left=True)
plt.xticks([])
plt.yticks([])

plt.savefig('graphs.png',
            dpi=300, bbox_inches='tight',
            transparent=True)
plt.savefig('graphs.svg',
            dpi=300, bbox_inches='tight',
            transparent=True);

In [None]:
# Gene proximity graph, with nodes labelled by their original identifiers
plt.figure(figsize=(9, 9))

# Size given to nodes that have no entry in `kh`; computed once rather than
# once per node.
smallest_hits = min(kh.values())

# Draw nodes
nx.draw_networkx_nodes(g, graph_pos,
                       # Node size depends on gene hits
                       node_size=[kh.get(n.get(x, x), smallest_hits) * 100
                                  for x in g.nodes()],
                       node_color=sns.xkcd_rgb['light grey'],
                       edgecolors='k')
# Draw edges with a constant width. The colour has to be passed as
# `edge_color`: `color` is not a parameter of draw_networkx_edges.
nx.draw_networkx_edges(g, graph_pos,
                       width=1,
                       edge_color='grey')
# Draw labels; with no label mapping given, the node identifiers are used
nx.draw_networkx_labels(g, graph_pos)

sns.despine(bottom=True, left=True)
plt.xticks([])
plt.yticks([])

plt.savefig('graphs1.png',
            dpi=300, bbox_inches='tight',
            transparent=True)
plt.savefig('graphs1.svg',
            dpi=300, bbox_inches='tight',
            transparent=True);

<h5><a href="javascript:toggle()" target="_self">toggle source code</a></h5>