# Model Comparison

## Imports and setup

In [1]:
from custom_models import *

In [2]:
from Bio import SeqIO
from pathlib import Path

# Dataset locations, resolved relative to the directory the notebook is run from.
cwd = Path.cwd()
dataset_root = cwd / "Dataset"
benchmark_path = dataset_root / "Benchmark_dataset"
independent_path = dataset_root / "Independent_test_dataset"

# Maps a FASTA record id ("70", "54", ...) to the PromoterType member SIGMA_<id>.
super_transform = {
    sigma_id: getattr(PromoterType, f"SIGMA_{sigma_id}")
    for sigma_id in ("70", "54", "38", "32", "28", "24")
}

# Read the independent set (FASTA format, parsed with Biopython) into two
# parallel lists: the raw sequence strings and their PromoterType labels.
# A record whose id is not a key of `super_transform` raises KeyError.
sequences, labels = [], []
for record in SeqIO.parse(independent_path / "independent.txt", "fasta"):
    seq, label = str(record.seq), super_transform[record.id]
    sequences.append(seq)
    labels.append(label)

In [3]:
def _read_fasta_sequences(path):
    """Return every sequence in the FASTA file at ``path`` as a plain ``str``.

    The file is opened with the built-in ``open`` and closed before returning;
    records are parsed with ``SeqIO.parse(handle, "fasta")``.
    """
    with open(path) as handle:
        return [str(record.seq) for record in SeqIO.parse(handle, "fasta")]


binary_dir = benchmark_path / 'promoter_and_non-promoter'
subtype_dir = benchmark_path / 'sigma_subtypes'

# `promoters` is loaded for completeness; it is not part of `benchmark_dataset`
# below, which uses the per-sigma files for the positive classes instead.
promoters = _read_fasta_sequences(binary_dir / 'positive2860.txt')
non_promoters = _read_fasta_sequences(binary_dir / 'negative2860.txt')
sigma24promoter = _read_fasta_sequences(subtype_dir / 'sigma24promoter.txt')
sigma28promoter = _read_fasta_sequences(subtype_dir / 'sigma28promoter.txt')
sigma32promoter = _read_fasta_sequences(subtype_dir / 'sigma32promoter.txt')
sigma38promoter = _read_fasta_sequences(subtype_dir / 'sigma38promoter.txt')
sigma54promoter = _read_fasta_sequences(subtype_dir / 'sigma54promoter.txt')
sigma70promoter = _read_fasta_sequences(subtype_dir / 'sigma70promoter.txt')

# Pair every class label with its sequences exactly once, so the dataset and
# the label list are derived from the same ordering and cannot drift apart.
labelled_groups = [
    (PromoterType.NON_PROMOTER, non_promoters),
    (PromoterType.SIGMA_24, sigma24promoter),
    (PromoterType.SIGMA_28, sigma28promoter),
    (PromoterType.SIGMA_32, sigma32promoter),
    (PromoterType.SIGMA_38, sigma38promoter),
    (PromoterType.SIGMA_54, sigma54promoter),
    (PromoterType.SIGMA_70, sigma70promoter),
]

# Flat, index-aligned lists: benchmark_labels[i] is the class of benchmark_dataset[i].
benchmark_dataset = [seq for _, group in labelled_groups for seq in group]
benchmark_labels = [label for label, group in labelled_groups for _ in group]

In [4]:
from tabulate import tabulate
def make_test_table(model, sequences, labels):
    """Print a per-class metrics table and the total accuracy for ``model``.

    Calls ``model.test(sequences, labels)`` and unpacks its result as
    ``(results, acc)``. ``results`` is iterated with ``.items()``; each key
    must expose ``.name`` and each value must be subscriptable by the metric
    names "TP", "FP", "TN", "FN", "Specificity", "Sensitivity", "MCC" and
    "Accuracy".

    Output goes to stdout: the model name centred in a 99-character field,
    a ``tabulate`` table with one row per metric and one column per class,
    then the total accuracy. Returns ``None``.
    """
    metric_names = ("TP", "FP", "TN", "FN", "Specificity", "Sensitivity", "MCC", "Accuracy")
    per_class, total_accuracy = model.test(sequences, labels)

    print(f"{model.name:^99}")

    # The first column has an empty header and carries the metric names;
    # every following column holds one class's values in the same order.
    columns = {"": metric_names}
    for promoter_type, metrics in per_class.items():
        columns[promoter_type.name] = tuple(metrics[key] for key in metric_names)

    print(tabulate(columns, headers="keys"))
    print(f"\nTotal accuracy: {total_accuracy}")

## Model loading

### Order definition for cascade-based models

In [5]:
# Sigma-class order passed to each cascade-based model's constructor.
# The two models differ only in where SIGMA_32 and SIGMA_28 sit.
iPromoter_order = [
    PromoterType.SIGMA_70,
    PromoterType.SIGMA_24,
    PromoterType.SIGMA_32,
    PromoterType.SIGMA_38,
    PromoterType.SIGMA_28,
    PromoterType.SIGMA_54,
]
pcPromoter_order = [
    PromoterType.SIGMA_70,
    PromoterType.SIGMA_24,
    PromoterType.SIGMA_28,
    PromoterType.SIGMA_38,
    PromoterType.SIGMA_32,
    PromoterType.SIGMA_54,
]

### pcPromoter-CNN

In [7]:
# Weight files for pcPromoter-CNN, in the order they are handed to PCPromoter:
# the promoter/non-promoter file first, then one file per sigma class following
# the first five entries of pcPromoter_order (70, 24, 28, 38, 32).
# NOTE(review): there is no SIGMA_54 weight file; presumably the last class in
# the order is the fall-through of the cascade. Confirm in custom_models.
pcPromoter_weights_dir = Path("./weights/PCPromoter")
pcPromoter_weight_files = (
    "best_weights_Pro_NonPro.h5",
    "best_weights_Sigma70.h5",
    "best_weights_Sigma24_v1.h5",
    "best_weights_Sigma28.h5",
    "best_weights_Sigma38.h5",
    "best_weights_Sigma32.h5",
)
pcPromoter_paths = [pcPromoter_weights_dir / filename for filename in pcPromoter_weight_files]

pcPromoter = PCPromoter(
    "pcPromoter-CNN",
    pcPromoter_paths,
    pcPromoter_order,
    ModelType.WEIGHTS_ONLY,
)

### iPromoter-BnCNN

In [8]:
# Saved-model files for iPromoter-BnCNN, in the order they are handed to
# IPromoter: the promoter/non-promoter model first, then one file per sigma
# class following the first five entries of iPromoter_order (70, 24, 32, 38, 28).
# NOTE(review): there is no SIGMA_54 model file; presumably the last class in
# the order is the fall-through of the cascade. Confirm in custom_models.
iPromoter_weights_dir = Path("./weights/IPromoter")
iPromoter_model_files = (
    "promoter_saved_model.h5",
    "sigma70_saved_model.h5",
    "sigma24_saved_model.h5",
    "sigma32_saved_model.h5",
    "sigma38_saved_model.h5",
    "sigma28_saved_model.h5",
)
iPromoter_paths = [iPromoter_weights_dir / filename for filename in iPromoter_model_files]

iPromoter = IPromoter("iPromoter-BnCNN", iPromoter_paths, iPromoter_order)

### PromoterLCNN

In [9]:
# PromoterLCNN takes two saved-model locations, passed in this order.
# NOTE(review): judging by the directory names, the first is the
# promoter/non-promoter stage and the second the sigma-subtype stage. Confirm.
promoterLCNN_paths = [
    Path("./weights/PromoterLCNN/IsPromoter_fold_5"),
    Path("./weights/PromoterLCNN/PromotersOnly_fold_1"),
]
promoterLCNN = PromoterLCNN("PromoterLCNN", promoterLCNN_paths)

## Results tables

### pcPromoter-CNN

#### Validation dataset (independent test set)

In [11]:
# Evaluate pcPromoter-CNN on `sequences`/`labels`, i.e. the records read from independent.txt.
make_test_table(pcPromoter, sequences, labels)

                                          pcPromoter-CNN                                           
               NON_PROMOTER    SIGMA_70    SIGMA_24    SIGMA_28     SIGMA_38    SIGMA_32    SIGMA_54
-----------  --------------  ----------  ----------  ----------  -----------  ----------  ----------
TP                 0         162          22           2           0            2           0
FP                29          15           6           1           5            0          12
TN               227          42         220         251         241          243         244
FN                 0          37           8           2          10           11           0
Specificity        0.886719    0.736842    0.973451    0.996032    0.979675     1           0.953125
Sensitivity        0           0.81407     0.733333    0.5         0            0.153846    0
MCC                0           0.496188    0.728376    0.571635   -0.0284564    0.383645    0
Accuracy           0.886719    0.

#### Training dataset (benchmark set)

In [12]:
# Evaluate pcPromoter-CNN on the benchmark data (non-promoters plus the six sigma-subtype files).
make_test_table(pcPromoter, benchmark_dataset, benchmark_labels)

                                          pcPromoter-CNN                                           
               NON_PROMOTER     SIGMA_70     SIGMA_24     SIGMA_28      SIGMA_38     SIGMA_32      SIGMA_54
-----------  --------------  -----------  -----------  -----------  ------------  -----------  ------------
TP              2585         1400          400           85            6            67            8
FP               308          407          129           27           94            23          181
TN              2552         3619         5107         5559         5463          5406         5445
FN               275          294           84           49          157           224           86
Specificity        0.892308     0.898907     0.975363     0.995166     0.983084      0.995763     0.967828
Sensitivity        0.903846     0.826446     0.826446     0.634328     0.0368098     0.230241     0.0851064
MCC                0.796207     0.712376     0.770265     0.687198   

### iPromoter-BnCNN

#### Validation dataset (independent test set)

In [14]:
# Evaluate iPromoter-BnCNN on `sequences`/`labels`, i.e. the records read from independent.txt.
make_test_table(iPromoter, sequences, labels)

                                          iPromoter-BnCNN                                          
               NON_PROMOTER    SIGMA_70    SIGMA_24    SIGMA_28    SIGMA_38    SIGMA_32    SIGMA_54
-----------  --------------  ----------  ----------  ----------  ----------  ----------  ----------
TP                 0         178          25           1           3           9           0
FP                13           9           9           1           3           4           1
TN               243          48         217         251         243         239         255
FN                 0          21           5           3           7           4           0
Specificity        0.949219    0.842105    0.960177    0.996032    0.987805    0.983539    0.996094
Sensitivity        0           0.894472    0.833333    0.25        0.3         0.692308    0
MCC                0           0.690617    0.752058    0.346569    0.36857     0.675847    0
Accuracy           0.949219    0.882812   

#### Training dataset (benchmark set)

In [16]:
# Evaluate iPromoter-BnCNN on the benchmark data (non-promoters plus the six sigma-subtype files).
make_test_table(iPromoter, benchmark_dataset, benchmark_labels)

                                          iPromoter-BnCNN                                          
               NON_PROMOTER     SIGMA_70     SIGMA_24     SIGMA_28     SIGMA_38     SIGMA_32     SIGMA_54
-----------  --------------  -----------  -----------  -----------  -----------  -----------  -----------
TP              2736         1579          452          118          120          266           70
FP                73          120           48           26           49           52           11
TN              2787         3906         5188         5560         5508         5377         5615
FN               124          115           32           16           43           25           24
Specificity        0.974476     0.970194     0.990833     0.995346     0.991182     0.990422     0.998045
Sensitivity        0.956643     0.932113     0.933884     0.880597     0.736196     0.914089     0.744681
MCC                0.931267     0.901538     0.911195     0.845738     0.714736 

### PromoterLCNN

#### Validation dataset (independent test set)

In [17]:
# Evaluate PromoterLCNN on `sequences`/`labels`, i.e. the records read from independent.txt.
make_test_table(promoterLCNN, sequences, labels)

                                           PromoterLCNN                                            
               NON_PROMOTER    SIGMA_70    SIGMA_24    SIGMA_28    SIGMA_38    SIGMA_32    SIGMA_54
-----------  --------------  ----------  ----------  ----------  ----------  ----------  ----------
TP                 0         176          25           2           1           5                  0
FP                29           8           6           0           3           1                  0
TN               227          49         220         252         243         242                256
FN                 0          23           5           2           9           8                  0
Specificity        0.886719    0.859649    0.973451    1           0.987805    0.995885           1
Sensitivity        0           0.884422    0.833333    0.5         0.1         0.384615           0
MCC                0           0.688499    0.795427    0.704317    0.137169    0.552184           0


#### Training dataset (benchmark set)

In [18]:
# Evaluate PromoterLCNN on the benchmark data (non-promoters plus the six sigma-subtype files).
make_test_table(promoterLCNN, benchmark_dataset, benchmark_labels)

                                           PromoterLCNN                                            
               NON_PROMOTER     SIGMA_70     SIGMA_24     SIGMA_28     SIGMA_38     SIGMA_32     SIGMA_54
-----------  --------------  -----------  -----------  -----------  -----------  -----------  -----------
TP              2696         1578          435          102          129          252           64
FP               196          143           48            8           33           33            3
TN              2664         3883         5188         5578         5524         5396         5623
FN               164          116           49           32           34           39           30
Specificity        0.931469     0.964481     0.990833     0.998568     0.994062     0.993922     0.999467
Sensitivity        0.942657     0.931523     0.89876      0.761194     0.791411     0.865979     0.680851
MCC                0.874181     0.891944     0.890428     0.836763     0.787822 