In [None]:
# Input files for this notebook, given as paths relative to the working directory.
# Tab-separated table loaded with its first column as the index; its
# 'odds-ratio' and 'lrt-pvalue' columns are the ones plotted.
odds_ratio = '../out/associations/odds_ratio.tsv'
# Header-less table loaded with its first column as the index.
associated = '../out/associations/associated_ogs.txt'
# Tab-separated table; the 'query', 'preferred_og_name' and
# 'representative_protein_length' columns are read from it.
names = '../out/associations/associated_ogs.final.tsv'
# Tab-separated table loaded with its first column as the index; the 'length',
# 'specific_hits', 'OG_af' and 'avg_af' columns are used.
kmer_hits = '../out/associations/summary_cont_lmm_kmer.tsv'
# Whitespace-delimited text file; the first two fields of every line are read.
# NOTE: the name `hpi` is later rebound from this path to a list of values.
hpi = '../data/hpi.tsv'
# Whitespace-delimited text file: an operon name followed by comma-separated
# pairs on every line.
others = '../data/others.tsv'

In [None]:
# plotting imports
%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns
from adjustText import adjust_text

# Plain white seaborn theme, then one shared set of font sizes for all figures.
sns.set_style('white')

plt.rcParams.update({
    'font.size': 11,
    'xtick.labelsize': 11,
    'ytick.labelsize': 11,
    'axes.labelsize': 12,
    'axes.titlesize': 12,
    'legend.fontsize': 11,
})

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Parse the HPI table in a single pass and close the file afterwards (the
# previous version opened it twice and never closed either handle).
# Every line must have at least two whitespace-separated fields; a shorter
# line raises IndexError.
# NOTE: `hpi` is rebound here from the file path to the list of second-column
# values, so the cell defining the paths must be re-run before re-running
# this one.
with open(hpi) as handle:
    hpi_fields = [line.rstrip().split() for line in handle]

# First column: labels later used to select rows of the odds-ratio table.
rhpi = [fields[0] for fields in hpi_fields]
# Second column -- presumably the matching gene names; TODO confirm.
hpi = [fields[1] for fields in hpi_fields]

In [None]:
# Per-operon lookup tables built from the `others` file.
# Each line is "<operon> <a>,<b> <a>,<b> ..." (whitespace between tokens):
#   other[operon]  maps a -> b
#   rother[operon] maps b -> a
# The file is read inside a `with` block so the handle is closed, and every
# line / pair is split only once.
other = {}
rother = {}
with open(others) as handle:
    for line in handle:
        fields = line.rstrip().split()
        operon = fields[0]
        pairs = [field.split(',') for field in fields[1:]]
        other[operon] = {pair[0]: pair[1] for pair in pairs}
        rother[operon] = {pair[1]: pair[0] for pair in pairs}

In [None]:
# Header-less, tab-separated table; its first column becomes the index.
f = pd.read_csv(associated, sep='\t', header=None, index_col=0)

In [None]:
# Odds-ratio table, indexed by its first column.
m = pd.read_csv(odds_ratio, sep='\t', index_col=0)

# The p-value column may carry the literal string 'NAN'; turn it into a real
# missing value and force every other entry to float.
m['lrt-pvalue'] = [np.nan if pvalue == 'NAN' else float(pvalue)
                   for pvalue in m['lrt-pvalue']]

# Keep only rows that are complete in every column.
m = m.dropna()

In [None]:
# Annotation table, re-indexed by 'query' once and reused for both lookups.
u = pd.read_table(names)
by_query = u.set_index('query')

# query -> preferred orthologous-group name (plain dict)
n = by_query['preferred_og_name'].to_dict()
# query -> representative protein length (kept as a Series for index alignment)
nsize = by_query['representative_protein_length']

In [None]:
# Add one entry per operon to the name lookup. `dict.update` produces the same
# mapping as the former explicit loop, without leaving module-level loop
# variables behind: the old loop variable `k` shared its name with the k-mer
# DataFrame, so re-running this cell after that table was loaded replaced the
# DataFrame with a string.
# NOTE(review): the stored values are the whole per-operon dicts, not name
# strings -- confirm whether per-gene entries were intended instead.
n.update(other)

In [None]:
# K-mer hit summary, indexed by its first column.
k = pd.read_csv(kmer_hits, sep='\t', index_col=0)

# 'length' divided by the representative protein length, aligned on the index.
k['normalized_hits'] = k['length'].div(nsize)

# Keep only rows with at least one specific hit.
k = k.loc[k['specific_hits'] > 0]

# index label -> normalized hits
kh = k['normalized_hits'].to_dict()

In [None]:
# Copy the normalized k-mer hits onto the rows of `m` that have an entry in `kh`.
shared = m.index.intersection(kh)
m.loc[shared, 'normalized_hits'] = [kh[label] for label in shared]

In [None]:
# -log10 of the p-value, so that smaller p-values give larger numbers.
m['logpvalue'] = np.negative(np.log10(m['lrt-pvalue']))

In [None]:
# Scatter of odds ratio (x) against -log10(p-value) (y) for the orthologous
# groups in `m`, drawn as one series per gene set, then saved as PNG and SVG.
fig, ax = plt.subplots(figsize=(3.5, 3.5))

# Rows of the k-mer table drawn as 'Core genes': OG_af above 0.9 and differing
# from avg_af by at least 0.1. Computed once and reused.
core_mask = (k['OG_af'] > 0.9) & ((k['OG_af'] - k['avg_af']).abs() >= 0.1)

# (row labels in `m`, legend label, colour, zorder); a higher zorder is drawn
# on top. The operon entries of `other` are dicts, so their keys are turned
# into explicit lists: pandas >= 2.0 rejects a dict passed as a `.loc` indexer.
# HPI and operon labels are looked up strictly -- a label missing from `m`
# raises KeyError instead of being silently dropped.
groups = [
    (rhpi, 'HPI', sns.xkcd_rgb['dark sky blue'], 10),
    (list(other['aerobactin']), 'Aerobactin', sns.xkcd_rgb['bluish green'], 9),
    (list(other['sitABCD']), 'sitABCD', sns.xkcd_rgb['yellow orange'], 8),
    (m.index.intersection(k[core_mask].index), 'Core genes',
     sns.xkcd_rgb['pinkish red'], 5),
    (m.index.intersection(k.index), 'Other genes', 'k', 0),
]

for ogs, label, color, zorder in groups:
    # Select the rows once and reuse them for both coordinates.
    subset = m.loc[ogs]
    ax.plot(subset['odds-ratio'],
            -np.log10(subset['lrt-pvalue']),
            'o',
            color=color,
            label=label,
            zorder=zorder)

ax.legend(loc='upper left',
          bbox_to_anchor=(0, 1),
          frameon=True)

ax.set_xlabel('Orthologous group odds ratio')
ax.set_ylabel('$-log_{10}(pvalue)$')

fig.savefig('odds_ratio.png',
            dpi=300, bbox_inches='tight',
            transparent=True)
fig.savefig('odds_ratio.svg',
            dpi=300, bbox_inches='tight',
            transparent=True);