##### Author:
    Diana Y. Lee, Luque Lab, SDSU
    dlee@sdsu.edu

##### Purpose:
    This is an example of using the current G2T model for predicting the capsid architecture (as measured by the 
    T-number) of a tailed phage from the genome

##### Requires: 

    phage_functions.ipynb  :  Functions for calculating T based on genome size

    data\PHAGE_GENOMES.csv : phage genome data for which we want to predict T-numbers. Must include the following columns:
        'genome_length' (in bp)
        'Virus_ID'
        
    Note that this notebook includes instructions for updating those column names after import if necessary

##### Creates:
    results\G2TResults.csv  :  results of the random forest prediction
    results\G2TResults_db.db  :  database state after the prediction


In [1]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
np.random.seed(42)
import csv
import time
import random

In [2]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqUtils import GC

In [3]:
from ipynb.fs.full.phage_functions import tNearest
from ipynb.fs.full.phage_functions import tNearestValid
from ipynb.fs.full.phage_functions import tModel
from ipynb.fs.full.phage_functions import tNum
from ipynb.fs.full.phage_functions import tList
from ipynb.fs.full.phage_functions import tDictAll

In [4]:
# create T dictionaries
tdict2,tdict2rev = tDictAll(7)
# set the error margin
errMar = 0.09

In [5]:
# import a single fasta for a whole genome
for record in SeqIO.parse("data/test_genome.fasta", "fasta"):
    print(record.description)
    print(len(record.seq))
    test_genome = len(record.seq)

Mycobacterium phage 20ES
53124


In [14]:
# predict a T-number for a specific genome length in bps
#test_genome = 57385
T_raw_test = round(tNum(test_genome/1000,0),4)
T_nearest_test = tNum(test_genome/1000,1)
T_errMar_test = tNum(test_genome/1000,2,errMar)

print("T raw from model: ", T_raw_test)
print("Nearest T, no restriction: ", T_nearest_test)
print("Nearest T within specified error margin (",errMar,"): ", T_errMar_test)

T raw from model:  8.2763
Nearest T, no restriction:  9.0
Nearest T within specified error margin ( 0.09 ):  9.0


In [7]:
# import bulk phage data for multiple phages
phageData = pd.read_csv("data\PHAGE_DATA.csv")

In [8]:
# exmaine your phage data and see if any changes are necessary to the required column names 
# ('genome_length', 'MCP_len', 'Virus_ID', 'MCP_Sequence')

phageData[0:5]

Unnamed: 0,ID,GBK_ID,CDS,DEFINITION,LOCUS_COMPLETE_GENOME,COMPLETE_GENOME_BP,ORGANISM,NCBI_GENPEPT_PROTEIN_ID,PROTEIN_PRODUCT,GENE_LOCUS,PROTEIN_BP,TRANSLATION
0,0,1262521.3,bp:11683..12636,"Leuconostoc phage phiLNTR3, complete genome.",NC_024378_1_28015,28015,"Viruses; dsDNA viruses, no RNA stage; Caudovir...",YP_009044222.1,putative major capsid protein,HL53_gp15,317,MGIEFLSTSKAVELYAKLALETQGNTETFSRKWKDIVSERSEQAIT...
1,1,1273740.3,bp:9604..10494,"Bacillus phage Curly, complete genome.",NC_020479_1_49425,49425,"Viruses; dsDNA viruses, no RNA stage; Caudovir...",YP_007517560.1,major capsid protein,CURLY_16,296,MADIVLGQHPLLKKVFLDRRIKDFTASGFVADQLFTNISVDALAIK...
2,2,1282994.3,bp:13216..14253,"Burkholderia phage ST79, complete genome.",NC_021343_1_35430,35430,"Viruses; dsDNA viruses, no RNA stage; Caudovir...",YP_008060494.1,major capsid protein,M190_gp20,345,MNPITRRALTRYMDNIAKLNGVASVAEKFAVAPSVQQTLEKRIQES...
3,3,633135.2,bp:4469..5662,Streptococcus phage Abc2,NC_013645_1_34882,34882,"Viruses; dsDNA viruses, no RNA stage; Caudovir...",YP_003347415.1,Phage major capsid protein # ACLAME 20,SP-Abc2_gp06,397,MKTSNELHDLWVAQGDKVENLNEKLNVAMLDDSVTAEELQKIKNER...
4,5,1168563.3,bp:5642..6655,"Stenotrophomonas phage Smp131, complete genome.",NC_023588_1_33525,33525,"Viruses; dsDNA viruses, no RNA stage; Caudovir...",YP_009008364.1,phage major capsid protein precursor,CH36_gp09,337,MRTKTRRLFEGYTQQVATLNNVSGVANTFSVEPTVQQSLEARMQES...


In [9]:
# change any necessary column names using this command, with the arguments formatted as {"original_column_name" : "New_name"}
phageData = phageData.rename(columns={"COMPLETE_GENOME_BP": 'genome_length',"ID": 'Virus_ID'})

In [10]:
n = len(phageData["Virus_ID"])

In [11]:
# calculate T numbers
ny = phageData.shape[0]
Y_T = []

for i in range(ny):
    Y_T.append(phageData.iloc[i]["Virus_ID"])
    Y_T.append(round(tNum(phageData.iloc[i]["genome_length"]/1000,0),4))
    Y_T.append(tNum(phageData.iloc[i]["genome_length"]/1000,1))
    Y_T.append(tNum(phageData.iloc[i]["genome_length"]/1000,2,errMar))
    Y_T.append(tdict2[tNum(phageData.iloc[i]["genome_length"]/1000,2,errMar)])
    
Y = np.asarray(Y_T)
Y = np.reshape(np.ravel(Y), (ny, 5));
Y = np.asarray(Y)

df_T = pd.DataFrame(Y)
df_T = df_T.rename(columns={0: 'Virus_ID', 1: 'T_raw', 2: 'T_nearest', 3: 'T_nearest_errMar', 4: 'T_nearest_errMar_code'})

df_T["T_raw"] = df_T["T_raw"].astype('float64')
df_T["T_nearest"] = df_T["T_nearest"].astype('float64')
df_T["T_nearest_errMar"] = df_T["T_nearest_errMar"].astype('float64')
df_T["T_nearest_errMar_code"] = df_T["T_nearest_errMar_code"].astype('int64')

df_T [0:5]

Unnamed: 0,Virus_ID,T_raw,T_nearest,T_nearest_errMar,T_nearest_errMar_code
0,0.0,5.2455,5.33,5.33,4
1,1.0,7.8613,7.0,0.0,47
2,2.0,6.201,7.0,0.0,47
3,3.0,6.1325,5.33,0.0,47
4,5.0,5.9615,5.33,0.0,47


In [12]:
# add T predictions to the phage data
phageG2TResult = phageData.merge(df_T, how='left', on='Virus_ID')
phageG2TResult[0:5]

Unnamed: 0,Virus_ID,GBK_ID,CDS,DEFINITION,LOCUS_COMPLETE_GENOME,genome_length,ORGANISM,NCBI_GENPEPT_PROTEIN_ID,PROTEIN_PRODUCT,GENE_LOCUS,PROTEIN_BP,TRANSLATION,T_raw,T_nearest,T_nearest_errMar,T_nearest_errMar_code
0,0,1262521.3,bp:11683..12636,"Leuconostoc phage phiLNTR3, complete genome.",NC_024378_1_28015,28015,"Viruses; dsDNA viruses, no RNA stage; Caudovir...",YP_009044222.1,putative major capsid protein,HL53_gp15,317,MGIEFLSTSKAVELYAKLALETQGNTETFSRKWKDIVSERSEQAIT...,5.2455,5.33,5.33,4
1,1,1273740.3,bp:9604..10494,"Bacillus phage Curly, complete genome.",NC_020479_1_49425,49425,"Viruses; dsDNA viruses, no RNA stage; Caudovir...",YP_007517560.1,major capsid protein,CURLY_16,296,MADIVLGQHPLLKKVFLDRRIKDFTASGFVADQLFTNISVDALAIK...,7.8613,7.0,0.0,47
2,2,1282994.3,bp:13216..14253,"Burkholderia phage ST79, complete genome.",NC_021343_1_35430,35430,"Viruses; dsDNA viruses, no RNA stage; Caudovir...",YP_008060494.1,major capsid protein,M190_gp20,345,MNPITRRALTRYMDNIAKLNGVASVAEKFAVAPSVQQTLEKRIQES...,6.201,7.0,0.0,47
3,3,633135.2,bp:4469..5662,Streptococcus phage Abc2,NC_013645_1_34882,34882,"Viruses; dsDNA viruses, no RNA stage; Caudovir...",YP_003347415.1,Phage major capsid protein # ACLAME 20,SP-Abc2_gp06,397,MKTSNELHDLWVAQGDKVENLNEKLNVAMLDDSVTAEELQKIKNER...,6.1325,5.33,0.0,47
4,5,1168563.3,bp:5642..6655,"Stenotrophomonas phage Smp131, complete genome.",NC_023588_1_33525,33525,"Viruses; dsDNA viruses, no RNA stage; Caudovir...",YP_009008364.1,phage major capsid protein precursor,CH36_gp09,337,MRTKTRRLFEGYTQQVATLNNVSGVANTFSVEPTVQQSLEARMQES...,5.9615,5.33,0.0,47


In [13]:
# exports results to csv
phageG2TResult.to_csv(r'results\G2TResults.csv', index=False)
# saves kernel state
import dill
dill.dump_session('results\G2TResults_db.db')