### Import libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Find the number of sequences

In [2]:
import pathlib

# Directories that hold the virus and host sequence files.
vir_path = pathlib.Path('./sequences/edwards/virus/')
host_path = pathlib.Path('./sequences/edwards/host/')

# Count every entry iterdir() yields; nothing is filtered by type or name,
# so subdirectories or hidden entries (if any) are counted as well.
vir_count = sum(1 for _ in vir_path.iterdir())
host_count = sum(1 for _ in host_path.iterdir())

# The last figure is the size of the full virus x host cross product.
print(f'virus files: {vir_count}')
print(f'host files: {host_count}')
print(f'paired files: {vir_count * host_count}')

virus files: 820
host files: 2699
paired files: 2213180


## Import the datasets

### Merge the datasets

In [3]:
import pathlib
from functools import reduce

features_path = pathlib.Path('./features/')

# Read every entry of the features directory as a tab-separated table whose
# first row is the header.
# NOTE(review): iterdir() yields entries in arbitrary order, so the column
# order of the merged frame follows whatever the filesystem returns. Any later
# step that labels columns by re-listing this directory assumes the listing
# order is the same both times -- confirm.
dfs = [
    pd.read_csv(feat_file, sep='\t', header=0)
    for feat_file in features_path.iterdir()
]

# Fold the tables into a single frame with successive outer joins on the
# ("#virus", "bacteria") key pair; a pair absent from one table is kept and
# gets NaN in that table's columns.
final_df = reduce(
    lambda merged, nxt: merged.merge(nxt, on=["#virus", "bacteria"], how="outer"),
    dfs,
)
final_df.shape

(2213180, 11)

### Replace the column names

In [4]:
# Label the two key columns, then one column per entry in the features
# directory, named after the file stem.
# NOTE(review): this assumes iterdir() returns the entries in the same order
# as when the tables were loaded and merged, and that each table contributed
# exactly one non-key column -- confirm, since iterdir() order is arbitrary.
col_names = [
    '#virus',
    'host',
    *(feat_file.stem for feat_file in features_path.iterdir()),
]
final_df.columns = col_names
final_df.head()

Unnamed: 0,#virus,host,blastn,crisprdetect-2mismatch,gc_content,k25,k6-chebyshev,k6-kendalltau,k6-manhattan,piler-2mismatch,wish
0,NC_010363,NC_008527,60.8,,0.021973,,0.002122,0.398421,0.382144,,-1.33553
1,NC_010363,NC_002662,59.0,,0.016709,,0.001929,0.397773,0.377498,,-1.33035
2,NC_010363,NC_017949,59.0,,0.020818,,0.002088,0.396969,0.379686,,-1.33341
3,NC_010363,NC_017492,59.0,,0.022209,,0.002131,0.396148,0.38093,,-1.33767
4,NC_010363,NC_009004,59.0,,0.020871,,0.002088,0.397095,0.379834,,-1.33341


In [5]:
# Keep only the rows in which every column holds a value (no NaN anywhere).
check_rows = final_df[final_df.notna().all(axis=1)]
check_rows.head()

Unnamed: 0,#virus,host,blastn,crisprdetect-2mismatch,gc_content,k25,k6-chebyshev,k6-kendalltau,k6-manhattan,piler-2mismatch,wish
3611,NC_011357,NC_012731,670.0,1.0,0.06762,2.0,0.00266,0.468791,0.410775,1.0,-1.38249
4182,NC_004585,NC_008022,22561.0,2.0,0.001751,14364.0,0.000713,0.285006,0.232619,1.0,-1.3304
4183,NC_004585,NC_008021,22561.0,2.0,0.002832,10850.0,0.000736,0.288697,0.235694,2.0,-1.33668
4187,NC_004585,NC_002737,12311.0,2.0,0.00257,6765.0,0.000741,0.290344,0.23706,2.0,-1.34749
4190,NC_004585,NC_020540,4621.0,3.0,0.002453,5271.0,0.000749,0.290872,0.237612,1.0,-1.36364


## Save the dataframe to files

In [7]:
# Persist the merged table in two formats, both in the working directory:
# a CSV without the index column, and a pickle of the frame.
final_df.to_csv(path_or_buf='main_df.csv', index=False)
final_df.to_pickle(path='./main_df.pkl')