In [None]:
# Input files, listed in the order the cells below load them.
# All paths are relative to the directory the notebook is run from.

# read without a header row, first column used as index
associated = '../out/associations/associated_ogs.txt'
# table providing the 'odds-ratio' and 'lrt-pvalue' columns
odds_ratio = '../out/associations/odds_ratio.tsv'
# table providing 'query', 'preferred_og_name' and
# 'representative_protein_length'
names = '../out/associations/associated_ogs.final.tsv'
# table providing the 'hits' and 'specific_hits' columns
kmer_hits = '../out/associations/summary_cont_lmm_kmer.tsv'

In [None]:
# plotting imports
%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns
from adjustText import adjust_text

# Plain white seaborn background for every figure in the notebook.
sns.set_style('white')

# Matplotlib rc overrides, applied in the listed order:
# (rc group, keyword settings for that group).
_rc_overrides = (
    ('font', dict(size=11)),
    ('xtick', dict(labelsize=11)),
    ('ytick', dict(labelsize=11)),
    ('axes', dict(labelsize=12, titlesize=12)),
    ('legend', dict(fontsize=11)),
)
for _rc_group, _rc_settings in _rc_overrides:
    plt.rc(_rc_group, **_rc_settings)

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Tab-separated file with no header row; the first column becomes the index.
# NOTE(review): `f` is not referenced by any of the other cells shown here.
f = pd.read_csv(associated, sep='\t', index_col=0, header=None)

In [None]:
# Association table (tab-separated), indexed by its first column.
m = pd.read_csv(odds_ratio, sep='\t', index_col=0)

# The 'lrt-pvalue' column can hold the literal string 'NAN'; turn that
# marker into a real NaN and every other entry into a float.
m['lrt-pvalue'] = [np.nan if raw == 'NAN' else float(raw)
                   for raw in m['lrt-pvalue']]

# Discard every row that has a missing value in any column.
m = m.dropna()

In [None]:
# Annotation table (tab-separated), one record per 'query' identifier.
u = pd.read_table(names)

# Index by 'query' once and derive both lookups from that frame.
u_by_query = u.set_index('query')
# query -> preferred OG name (used later to label points in the plot)
n = u_by_query['preferred_og_name'].to_dict()
# query -> representative protein length (kept as a Series so that the
# division in the k-mer cell aligns on the index)
nsize = u_by_query['representative_protein_length']

In [None]:
# K-mer hit summary (tab-separated), indexed by its first column.
k = pd.read_csv(kmer_hits, sep='\t', index_col=0)

# Hits divided by the representative protein length; the division aligns
# `k`'s index with `nsize`'s index, so labels missing from `nsize` give NaN.
# The index is left untouched (not translated through `n`) because a later
# cell matches these labels against `m.index`.
k = k.assign(normalized_hits=k['hits'].div(nsize))

# Keep only the rows that have at least one specific hit.
k = k.loc[k['specific_hits'].gt(0)]

# label -> normalized hit count
kh = k['normalized_hits'].to_dict()

In [None]:
# Attach the normalized k-mer hit count to every row of `m`.
#
# Each index label of `m` is looked up in `kh`; labels that are not keys of
# `kh` receive NaN, which is what later cells rely on (`m.dropna()`) to tell
# rows with a k-mer hit from rows without one.
#
# A single vectorised lookup is used instead of assigning a list to
# `m.loc[intersection, ...]` so that:
#   * the column is always created, even when no label of `m` is in `kh`;
#   * repeated labels in `m.index` cannot cause a length mismatch;
#   * re-running the cell fully overwrites the column.
m['normalized_hits'] = (m.index.to_series()
                         .map(kh)
                         .to_numpy(dtype=float))

In [None]:
# -log10 of the LRT p-value, stored as the y coordinate for the plot below.
m['logpvalue'] = np.negative(np.log10(m['lrt-pvalue']))

In [None]:
# Odds ratio vs. -log10(p-value) for every row of `m`.
plt.figure(figsize=(6.5, 6.5))

# Split `m` into rows without any missing value and the remaining rows
# (those with at least one NaN, e.g. no 'normalized_hits' entry).
complete_rows = m.dropna()
incomplete_rows = m.loc[m.index.difference(complete_rows.index)]

# Background layer: incomplete rows as faint black dots. The label starts
# with an underscore so the layer is left out of the legend.
plt.plot(incomplete_rows['odds-ratio'],
         incomplete_rows['logpvalue'],
         'k.',
         alpha=0.3,
         label='_')

# Foreground layer: red markers whose area scales with the normalized hits.
# The column is renamed only so that the legend title reads nicely.
scatter_data = m.reset_index().rename(
    columns={'normalized_hits': 'Normalized hits'})
sns.scatterplot(data=scatter_data,
                x='odds-ratio',
                y='logpvalue',
                size='Normalized hits',
                color='r',
                sizes=(20, 200))

# One text label per complete row: the preferred OG name when `n` has one,
# otherwise the raw index label.
texts = []
label_xy = complete_rows[['odds-ratio', 'logpvalue']].values
for (x_pos, y_pos), og_id in zip(label_xy, complete_rows.index):
    texts.append(plt.text(x_pos, y_pos, n.get(og_id, og_id),
                          ha='center', va='center'))

# Let adjustText move the labels and connect them to their points with
# plain black lines.
adjust_text(texts,
            arrowprops=dict(arrowstyle='-', color='k'),
            force_points=0.5)

# Legend placed outside the axes, vertically centred on the right edge.
plt.legend(loc='center left',
           bbox_to_anchor=(1, 0.5),
           frameon=True)

plt.xlabel('OG odds ratio')
plt.ylabel('$-log_{10}(pvalue)$')

# Write the figure to the working directory in both raster and vector form.
for out_file in ('odds_ratio.png', 'odds_ratio.svg'):
    plt.savefig(out_file,
                dpi=300, bbox_inches='tight',
                transparent=True)