In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
def read_mi(fname, name='all', date=True, max_pos=29674):
    """Load an annotated MI table and drop uninformative position pairs.

    The table is read as tab-separated text and must provide the columns
    ``pos_source``, ``pos_target``, ``gene_source``, ``gene_target``,
    ``feature_codon_source``, ``feature_codon_target``, ``codon_distance``
    and ``distance``.

    A row is kept only when both positions are ``<= max_pos`` (pairs
    reaching past the end of the genome are dropped) and one of the
    following holds:

    * both positions are in the same gene, in different codons that are
      more than one codon apart;
    * the positions are in different genes and more than one base apart.

    Parameters
    ----------
    fname : str or file-like
        Anything accepted by ``pandas.read_csv``.
    name : object, default 'all'
        Label stored in the ``name`` column of every returned row. When
        ``date`` is true it must be a string starting with ``'YYYY-MM'``.
    date : bool, default True
        If true, add integer ``year`` and ``month`` columns parsed from
        ``name``.
    max_pos : int, default 29674
        Largest genome position retained for either member of a pair.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the filtered rows with the extra columns.
    """
    table = pd.read_csv(fname, sep='\t')

    # 1. end of the genome
    within_genome = ((table['pos_source'] <= max_pos) &
                     (table['pos_target'] <= max_pos))
    # 2. same gene: require distinct, non-adjacent codons
    same_gene_ok = ((table['gene_source'] == table['gene_target']) &
                    (table['feature_codon_source'] !=
                     table['feature_codon_target']) &
                    (table['codon_distance'] > 1))
    # 3. different gene: require the positions not to be too close
    other_gene_ok = ((table['gene_source'] != table['gene_target']) &
                     (table['distance'] > 1))

    # Explicit copy: adding columns to a boolean-indexed selection would
    # otherwise trigger pandas' SettingWithCopyWarning.
    kept = table[within_genome & (same_gene_ok | other_gene_ok)].copy()

    kept['name'] = name
    if date:
        # Split once and reuse both fields.
        fields = name.split('-')
        kept['year'] = int(fields[0])
        kept['month'] = int(fields[1])

    return kept

In [3]:
# Full dataset without time stratification: keep the default 'all' label
# and skip the year/month columns.
a = read_mi(
    '../out/all-notime/mi_annotated.tsv.gz',
    date=False,
)

In [4]:
# Number of position pairs that survived filtering.
len(a)

474

In [5]:
# Number of pairs whose outlier level is at least 4.
len(a.loc[a['outlier'] >= 4])

222

In [6]:
# Number of distinct genome positions appearing on either side of a pair.
len(set(a['pos_source']) | set(a['pos_target']))

247

In [7]:
# Pairs per source gene (non-null `pos_source` entries in each group).
a['pos_source'].groupby(a['gene_source']).count()

gene_source
E           3
M          18
N          32
ORF10       3
ORF1ab    179
ORF3a      10
ORF6        4
ORF7a       6
ORF7b       3
ORF8       17
S         185
Name: pos_source, dtype: int64

In [8]:
# Pairs per outlier level (non-null `pos_source` entries in each group).
a['pos_source'].groupby(a['outlier']).count()

outlier
1     92
2     96
3     64
4    222
Name: pos_source, dtype: int64

In [9]:
# Write the filtered full-dataset table as a tab-separated file,
# without the DataFrame index.
a.to_csv(
    'supplementary_table_1.tsv',
    sep='\t',
    index=False,
)

In [10]:
# List the exported column names, one per line.
print(*a.columns, sep='\n')

pos_source
pos_target
distance
outlier
mi
feature_position_source
gene_source
codon_source
feature_codon_source
feature_position_target
gene_target
codon_target
feature_codon_target
codon_distance
interaction
name


In [11]:
for name, idir in zip([1_000, 10_000,
                       100_000, 1_000_000],
                      ['../out/subsets-notime/'] * 4):
    a = read_mi(f'{idir}{name}/mi_annotated.tsv.gz', name, date=False)
    a.to_csv(f'subset_{name}.tsv', sep='\t', index=False)
!zip supplementary_material_1.zip subset_*.tsv

  adding: subset_1000000.tsv (deflated 79%)
  adding: subset_100000.tsv (deflated 81%)
  adding: subset_10000.tsv (deflated 83%)
  adding: subset_1000.tsv (deflated 89%)


In [12]:
# Load the 2023-03 table; date=True adds year/month columns parsed
# from the '2023-03' label.
no_time = read_mi(
    '../out/time-subsets-notime/2023-03/mi_annotated.tsv.gz',
    '2023-03',
    date=True,
)

In [13]:
# Write the 2023-03 table as a tab-separated file, without the index.
no_time.to_csv(
    'supplementary_table_2.tsv',
    sep='\t',
    index=False,
)

In [14]:
# The .EC file is space-separated and has no header row, so the six
# column labels are attached right after reading.
p = pd.read_csv(
    '../out/spikes/2023-03/2023-03.EC',
    sep=' ',
    header=None,
).set_axis(
    ['pos_source', 'base_source', 'pos_target', 'base_target', '_', 'plmc'],
    axis=1,
)

In [15]:
# Sanity check: display the column labels just assigned to `p`.
p.columns

Index(['pos_source', 'base_source', 'pos_target', 'base_target', '_', 'plmc'], dtype='object')

In [16]:
# Export only the position pair and its plmc value as tab-separated text.
# The '.zip' suffix makes pandas compress the output (compression is
# inferred from the file name by default).
p.to_csv(
    'supplementary_material_2.tsv.zip',
    sep='\t',
    index=False,
    columns=['pos_source', 'pos_target', 'plmc'],
)

In [17]:
for name in sorted(os.listdir('../out/time-filtered-subsets/')):
    try:
        a = read_mi(f'../out/time-filtered-subsets/{name}/mi_annotated.tsv.gz',
                    name,
                    date=True)
        a.sort_values('mi', ascending=False
                     ).to_csv(f'time_subset_{name}.tsv',
                              sep='\t', index=False)
    except FileNotFoundError:
        pass
!zip supplementary_material_3.zip time_subset_*.tsv

  adding: time_subset_2020-03.tsv (deflated 90%)
  adding: time_subset_2020-04.tsv (deflated 90%)
  adding: time_subset_2020-05.tsv (deflated 90%)
  adding: time_subset_2020-06.tsv (deflated 90%)
  adding: time_subset_2020-07.tsv (deflated 90%)
  adding: time_subset_2020-08.tsv (deflated 89%)
  adding: time_subset_2020-09.tsv (deflated 89%)
  adding: time_subset_2020-10.tsv (deflated 89%)
  adding: time_subset_2020-11.tsv (deflated 89%)
  adding: time_subset_2020-12.tsv (deflated 89%)
  adding: time_subset_2021-01.tsv (deflated 89%)
  adding: time_subset_2021-02.tsv (deflated 89%)
  adding: time_subset_2021-03.tsv (deflated 89%)
  adding: time_subset_2021-04.tsv (deflated 89%)
  adding: time_subset_2021-05.tsv (deflated 88%)
  adding: time_subset_2021-06.tsv (deflated 88%)
  adding: time_subset_2021-07.tsv (deflated 88%)
  adding: time_subset_2021-08.tsv (deflated 88%)
  adding: time_subset_2021-09.tsv (deflated 88%)
  adding: time_subset_2021-10.tsv (deflated 89%)
  adding: time_subse