##### Author:
    Diana Y. Lee, Luque Lab, SDSU
    dlee@sdsu.edu

##### Purpose:
    Standalone random forest model for predicting the capsid architecture (as measured by the T-number) 
    of a tailed phage from the MCP sequence

##### Requires: 
    PHAGE_TABLE4.xlsx : phage data with indexes, genome size, and translations
    phage_functions.ipynb
    
    
##### Database file: 
    MCP2T_RF_state.db
    
##### Creates:
    MCP2T_RF_state(new).db : kernel state (including the trained random forest) saved with dill



In [1]:
# imports
import pandas as pd
import numpy as np
np.random.seed(42)
import random
import statistics
import copy

In [2]:
# ML imports
from sklearn.metrics import mean_squared_error
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

In [3]:
from ipynb.fs.full.phage_functions import tNearest
from ipynb.fs.full.phage_functions import tNearestValid
from ipynb.fs.full.phage_functions import tModel
from ipynb.fs.full.phage_functions import tNum
from ipynb.fs.full.phage_functions import tList
from ipynb.fs.full.phage_functions import tDictAll

In [4]:
from Bio.SeqUtils.IsoelectricPoint import IsoelectricPoint as IP

In [5]:
# Build the list of possible, valid T-numbers, plus the separate T-number
# lists for T_h and T_t (the argument 7 is passed straight to tList; its
# meaning is defined in phage_functions)
tps2, tps, tps_t, tps_h = tList(7)

# T dictionaries: tdict2 is used further down to turn a T-number into its
# integer class code; tdict2rev is presumably the reverse lookup -- confirm
# against phage_functions
tdict2, tdict2rev = tDictAll(7, 1)

# error margin handed to tNum when the error-margin T assignment is computed
errMar = 0.09

# Equivalent genome length for each T in tps, using the model coefficients
# returned by tModel:  TDNA = exp((ln(T) - tMod[2]) / tMod[0])
tMod = tModel()
TDNA = np.exp((np.log(tps) - tMod[2]) / tMod[0])

note that if the consolidation option is selected, tdict_reverse will not be complete


In [8]:
# custom function to count amino acids
# amino acids are hardcoded to avoid broken dependencies, since they do not change
def createFreq(acidSeq, normF=None):
    """Count the 20 standard amino acids in every sequence of a collection.

    Parameters
    ----------
    acidSeq : list, array, or pandas Series
        One amino-acid sequence (one-letter code) per entry. Each entry is
        passed through ``str()`` before counting.
    normF : int, optional
        0 (the default, also used when None is given) returns raw counts;
        1 returns every row divided by that row's total count.

    Returns
    -------
    numpy.ndarray or None
        Array of shape (n, 20), one row per input sequence, with columns in
        the order A, R, N, D, C, Q, E, G, H, I, L, K, M, F, P, S, T, W, Y, V.
        None is returned, after printing a message, when normF is neither
        0 nor 1.

    Notes
    -----
    Characters other than the 20 letters above are not counted. With
    normF=1, a row in which nothing was counted has a zero total, so that
    row comes back as NaN.
    """
    normF = normF or 0
    # Validate before doing any work, so values such as -1 or 0.5 are
    # reported instead of silently producing None after the counting.
    if normF not in (0, 1):
        print("Valid tTypes are 0 (raw) or 1 (normalized by sample). Defaults to 0.")
        return

    # amino acids are hardcoded to avoid broken dependencies
    aaList = ('A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I',
              'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V')

    # Convert the input once (not once per amino acid per sequence).
    sequences = [str(seq) for seq in np.asarray(acidSeq)]
    counts = [[seq.count(acid) for acid in aaList] for seq in sequences]
    rFreq = np.asarray(counts).reshape((len(sequences), len(aaList)))

    if normF == 0:
        return rFreq

    # normF == 1: divide each row by its own total in one broadcast step.
    nFreq = rFreq.astype(float)
    return nFreq / nFreq.sum(axis=1, keepdims=True)

In [9]:
# custom function to create dataset with only sequence length, frequency, and isoelectric point
# requires a dataframe with fields "Virus_ID", "MCP_Sequence", "IPC", "MCP_len", and "T_nearest_errMar_code"

def createDataset3(dF):
    """Assemble the per-phage model table from a phage dataframe.

    Reads the columns "Virus_ID", "IPC", "MCP_len", "MCP_Sequence" and
    "T_nearest_errMar_code" of dF and returns an (n, 24) array, where n is
    the number of rows of dF:

        column 0      Virus_ID
        column 1      IPC (isoelectric point)
        column 2      MCP_len (length of the MCP sequence)
        columns 3-22  per-sequence amino-acid frequencies from createFreq(..., 1)
        column 23     T_nearest_errMar_code (target T code)

    All values pass through one flat list and np.ravel, so the array has a
    single dtype chosen by numpy for the combined values.
    """
    n_rows = dF.shape[0]
    aa_freq = createFreq(dF["MCP_Sequence"], 1)

    flat = []
    for r in range(n_rows):
        record = dF.iloc[r]
        flat.extend([record["Virus_ID"], record["IPC"], record["MCP_len"]])
        flat.extend(aa_freq[r][c] for c in range(20))
        flat.append(record["T_nearest_errMar_code"])

    # one record per row: 3 leading fields + 20 frequencies + 1 target = 24
    table = np.reshape(np.ravel(flat), (n_rows, 24))
    return np.asarray(table)

In [10]:
# import all phage data
testData = pd.read_excel("../data/PHAGE_TABLE4.xlsx")
# remove records from the dataframe if the ID is missing
# (a single dropna replaces the row-by-row np.isnan/drop loop, which copied the
# frame on every drop and raised TypeError on a non-float ID column; the
# surviving rows keep their original index labels, exactly as before)
testData = testData.dropna(subset=["ID"])
# get a count of the remaining records
nt = len(testData["ID"])
# change any necessary column names using this command, with the arguments formatted as {"original_column_name" : "New_name"}
testData = testData.rename(columns={
    "COMPLETE_GENOME_BP": 'genome_length',
    "PROTEIN_BP": 'MCP_len',
    "ID": 'Virus_ID',
    'TRANSLATION': 'MCP_Sequence',
})


In [11]:
# calculate T numbers and isoelectric point for every record in testData;
# each record contributes six values, in the column order of df_T below
ny = testData.shape[0]
Y_T = []

for i in range(ny):
    # fetch the row once instead of re-slicing the frame for every field
    row = testData.iloc[i]
    # genome_length comes from the COMPLETE_GENOME_BP column, so /1000 gives kbp
    genome_kbp = row["genome_length"]/1000

    # ID
    Y_T.append(row["Virus_ID"])
    # isoelectric point
    Y_T.append(IP(str(row["MCP_Sequence"])).pi())
    # T raw
    Y_T.append(round(tNum(genome_kbp, 0), 4))
    # Check and see if the structure is on the High-Res list
    if row['HR data'] == "x":
        # high-resolution T is used for T nearest, T nearest err mar, and its code
        hr_T = row['HR_T']
        Y_T.extend([hr_T, hr_T, tdict2[hr_T]])
    else:
        # T nearest
        t_nearest = tNum(genome_kbp, 1)
        # T nearest err mar (evaluated once and reused for the code lookup)
        t_errMar = tNum(genome_kbp, 2, errMar)
        # T nearest, T nearest err mar, T nearest err mar code
        Y_T.extend([t_nearest, t_errMar, tdict2[t_errMar]])

# fold the flat list into one row of six values per record
Y = np.reshape(np.ravel(np.asarray(Y_T)), (ny, 6))

df_T = pd.DataFrame(Y)
df_T = df_T.rename(columns={0: 'Virus_ID', 1: 'IPC', 2: 'T_raw', 3: 'T_nearest', 4: 'T_nearest_errMar', 5: 'T_nearest_errMar_code'})

# the array has a single dtype, so restore the numeric type of each column
for col in ("T_raw", "T_nearest", "T_nearest_errMar", "IPC"):
    df_T[col] = df_T[col].astype('float64')
df_T["T_nearest_errMar_code"] = df_T["T_nearest_errMar_code"].astype('int64')

In [12]:
# attach the computed T values and isoelectric point to the phage records;
# a left join on Virus_ID keeps every row of testData
testDataset = pd.merge(testData, df_T, how='left', on='Virus_ID')
testDataset.shape

(635, 16)

In [13]:
# build the random forest table for the full database
xTest_Phage = createDataset3(testDataset)

# features are columns 1-22 (IPC, MCP length, 20 amino-acid frequencies);
# the label is column 23 (T code); only the first ny rows are used
xTest_actual = xTest_Phage[:ny, 1:23].astype(float)
yTest_actual = xTest_Phage[:ny, 23].astype(int)

# number of feature columns
n_feat = xTest_actual.shape[1]

In [14]:
# random forest configured with our best hyperparameters
rfBest_clf = RandomForestClassifier(
    n_estimators=250,
    max_depth=20,
    max_features=4,
    min_samples_split=24,
    min_samples_leaf=1,
    bootstrap=True,
    class_weight='balanced',
    random_state=42,
)

In [15]:
# train the random forest on the feature matrix (xTest_actual) and T-code labels (yTest_actual)
# NOTE(review): in the cells shown here the model is fit on all ny records, with no hold-out split
rfBest_clf.fit(xTest_actual, yTest_actual)

RandomForestClassifier(class_weight='balanced', max_depth=20, max_features=4,
                       min_samples_split=24, n_estimators=250, random_state=42)

In [16]:
# saves kernel state (via dill.dump_session) to a file in the current working directory
# NOTE(review): the header lists the database file as MCP2T_RF_state.db, while this
# writes 'MCP2T_RF_state(new).db' -- presumably to avoid overwriting it; confirm
import dill
dill.dump_session('MCP2T_RF_state(new).db')