In [9]:
import os
import pandas as pd
import numpy as np
import csv
from collections import defaultdict
import urllib
import matplotlib.pyplot as plt
from keras.utils import to_categorical
import gffpandas.gffpandas as gffpd  # Ensure this is uncommented
from pyfaidx import Fasta
from Bio import SeqIO

import warnings
warnings.filterwarnings('ignore')


In [14]:
def load_GFP_fasta(fasta_file_path):
    """
    Reads the first sequence from a FASTA file and returns it as a string.

    :param fasta_file_path: Path to the FASTA file.
    :return: DNA sequence as a string.
    """
    with open(fasta_file_path, 'r') as fasta_file:
        sequence_record = next(SeqIO.parse(fasta_file, "fasta"))
        dna_sequence = str(sequence_record.seq)

    return dna_sequence


GFPseq = load_GFP_fasta('GFP.fa')

print(GFPseq)
len_cd = len(GFPseq)
print(len_cd)

ATGGTCAGTAAGGGTGAAGAATTATTCACTGGTGTTGTTCCAATCTTGGTTGAATTGGATGGTGATGTTAACGGTCACAAGTTTTCTGTTTCTGGTGAAGGTGAAGGTGATGCTACTTATGGTAAATTGACCTTGAAGTTCATCTGTACCACAGGTAAATTGCCAGTTCCATGGCCAACTTTGGTTACTACTTTGACTTATGGTGTCCAATGCTTCTCTAGATACCCAGATCATATGAAGCAACACGACTTTTTCAAATCCGCTATGCCAGAAGGTTACGTTCAAGAAAGAACCATCTTCTTCAAGGATGACGGTAACTACAAAACTAGAGCCGAAGTTAAGTTCGAAGGTGATACCTTGGTTAACAGAATCGAATTGAAGGGTATCGACTTCAAAGAAGATGGTAACATCTTGGGTCATAAGTTGGAATACAACTACAACTCCCACAACGTTTACATTATGGCCGATAAGCAAAAGAACGGTATCAAGGTTAACTTCAAGATCAGACACAACATCGAAGATGGTAGTGTTCAATTGGCTGATCACTACCAACAAAACACTCCAATTGGTGATGGTCCAGTTTTGTTGCCAGATAACCATTACTTGTCTACCCAATCTGCTTTGTCTAAGGACCCAAACGAAAAAAGAGATCACATGGTCTTGTTGGAATTCGTTACTGCTGCTGGTATTACTTTGGGTATGGACGAATTATACAAGTAA
720


In [10]:


def count_codons(GFPseq):
    '''Codon frequency counter for sequences in a FASTA file.'''
    CodonsDict = {
        'TTT': 0, 'TTC': 0, 'TTA': 0, 'TTG': 0, 'CTT': 0,
        'CTC': 0, 'CTA': 0, 'CTG': 0, 'ATT': 0, 'ATC': 0,
        'ATA': 0, 'ATG': 0, 'GTT': 0, 'GTC': 0, 'GTA': 0,
        'GTG': 0, 'TAT': 0, 'TAC': 0, 'TAA': 0, 'TAG': 0,
        'CAT': 0, 'CAC': 0, 'CAA': 0, 'CAG': 0, 'AAT': 0,
        'AAC': 0, 'AAA': 0, 'AAG': 0, 'GAT': 0, 'GAC': 0,
        'GAA': 0, 'GAG': 0, 'TCT': 0, 'TCC': 0, 'TCA': 0,
        'TCG': 0, 'CCT': 0, 'CCC': 0, 'CCA': 0, 'CCG': 0,
        'ACT': 0, 'ACC': 0, 'ACA': 0, 'ACG': 0, 'GCT': 0,
        'GCC': 0, 'GCA': 0, 'GCG': 0, 'TGT': 0, 'TGC': 0,
        'TGA': 0, 'TGG': 0, 'CGT': 0, 'CGC': 0, 'CGA': 0,
        'CGG': 0, 'AGT': 0, 'AGC': 0, 'AGA': 0, 'AGG': 0,
        'GGT': 0, 'GGC': 0, 'GGA': 0, 'GGG': 0}
    
    
    # make the codon dictionary local
    codon_count = CodonsDict.copy()
    
    if str(GFPseq).islower():
        dna_sequence = str(GFPseq).upper()
    else:
        dna_sequence = str(GFPseq)
    
    for i in range(0, len(dna_sequence) - 2, 3):  # Adjusted to avoid index error
        codon = dna_sequence[i:i + 3]
        if codon in codon_count:
            codon_count[codon] += 1
        else:
            print(f"Illegal codon {codon}")
    
    # Prepare output array
    out = [value for key, value in sorted(codon_count.items())]
    
    return np.asarray(out)

# Now call the function with the path to your FASTA file
freq_cd = count_codons(GFPseq)
print(freq_cd)

[ 6 13 14  0  1  5  0 10  6  0  0  2  0  9  6  3  8  6  0  3 10  0  0  0
  0  0  0  0  0  0  0  0 16  5  0 13  0  2  0  6  0  0  0 22  0  3  0 15
  1  9  0  2  0  2  0  6  0  1  1  1  2 10 19  2]


In [11]:
def count_codon_GC(GFPseq):
    out1, out2, out3 = list(), list(), list()
    for i in range(0, len(GFPseq) - 2, 3):
        out1.append(int(GFPseq[i] in ['G', 'C']))
        out2.append(int(GFPseq[i+1] in ['G', 'C']))
        out3.append(int(GFPseq[i+2] in ['G', 'C']))
    return (3 * sum(out1) / len(GFPseq), 
            3 * sum(out2) / len(GFPseq), 
            3 * sum(out3) / len(GFPseq))



# Calculate GC content at each codon position
GC1, GC2, GC3 = count_codon_GC(GFPseq)
print(count_codon_GC(GFPseq))

(0.45416666666666666, 0.3125, 0.4375)


In [15]:

input_dir = 'Species_Xval'
output_dir = 'GFP_Xval'

if not os.path.exists(output_dir):
    os.makedirs(output_dir)



for file_name in os.listdir(input_dir):
    if file_name.endswith('_Xval.npy'):
        file_path = os.path.join(input_dir, file_name)
        Xval = np.load(file_path, allow_pickle=True)
        
        # Assuming the new values should be applied to every row in Xval
        for i in range(len(Xval)):
              
            Xval[i, 1] = np.int16(len_cd)
            Xval[i, 5] = np.int16(GC1 * 1000)
            Xval[i, 6] = np.int16(GC2 * 1000)
            Xval[i, 7] = np.int16(GC3 * 1000)
            Xval[i, 8:] = np.asarray(freq_cd, dtype=np.int16)  # Adjust based on actual data
        
        # Save the modified Xval array back to the file
        #new_file_path = os.path.join(output_dir, file_name)
        #np.save(new_file_path, Xval)

In [16]:

input_dir = 'Species_Xval'
output_dir = 'GFP_Xval'
gfp_xval_files = []  # This list will hold the paths or identifiers of the modified files

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

for file_name in os.listdir(input_dir):
    if file_name.endswith('_Xval.npy'):
        file_path = os.path.join(input_dir, file_name)
        Xval = np.load(file_path, allow_pickle=True)
        
        # Here you would modify Xval as needed
        # Assuming 'len_cd', 'GC1', 'GC2', 'GC3', and 'freq_cd' are predefined:
        for i in range(len(Xval)):
            Xval[i, 1] = np.int16(len_cd)
            Xval[i, 5] = np.int16(GC1 * 1000)
            Xval[i, 6] = np.int16(GC2 * 1000)
            Xval[i, 7] = np.int16(GC3 * 1000)
            Xval[i, 8:] = np.asarray(freq_cd, dtype=np.int16)  # Adjust based on actual data
            print(Xval)
        # Save the modified Xval array back to the file in the output directory
        new_file_path = os.path.join(output_dir, file_name)
        #np.save(new_file_path, Xval)

        # Add the path of the newly saved file to the list
        gfp_xval_files.append(new_file_path)

# Now gfp_xval_files contains the paths of all modified files


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



[[478.0 720 90.0 ... 10 19 2]
 [139.0 156.0 287.0 ... 1.0 0.0 1.0]
 [373.0 486.0 232.0 ... 5.0 7.0 2.0]
 ...
 [165.0 816.0 173.0 ... 4.0 14.0 4.0]
 [115.0 210.0 37.0 ... 2.0 1.0 6.0]
 [136.0 219.0 44.0 ... 0.0 1.0 2.0]]
[[478.0 720 90.0 ... 10 19 2]
 [139.0 720 287.0 ... 10 19 2]
 [373.0 486.0 232.0 ... 5.0 7.0 2.0]
 ...
 [165.0 816.0 173.0 ... 4.0 14.0 4.0]
 [115.0 210.0 37.0 ... 2.0 1.0 6.0]
 [136.0 219.0 44.0 ... 0.0 1.0 2.0]]
[[478.0 720 90.0 ... 10 19 2]
 [139.0 720 287.0 ... 10 19 2]
 [373.0 720 232.0 ... 10 19 2]
 ...
 [165.0 816.0 173.0 ... 4.0 14.0 4.0]
 [115.0 210.0 37.0 ... 2.0 1.0 6.0]
 [136.0 219.0 44.0 ... 0.0 1.0 2.0]]
[[478.0 720 90.0 ... 10 19 2]
 [139.0 720 287.0 ... 10 19 2]
 [373.0 720 232.0 ... 10 19 2]
 ...
 [165.0 816.0 173.0 ... 4.0 14.0 4.0]
 [115.0 210.0 37.0 ... 2.0 1.0 6.0]
 [136.0 219.0 44.0 ... 0.0 1.0 2.0]]
[[478.0 720 90.0 ... 10 19 2]
 [139.0 720 287.0 ... 10 19 2]
 [373.0 720 232.0 ... 10 19 2]
 ...
 [165.0 816.0 173.0 ... 4.0 14.0 4.0]
 [115.0 210.0 3

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



[[262.0 720 197.0 ... 10 19 2]
 [159.0 1803.0 45.0 ... 12.0 23.0 16.0]
 [94.0 195.0 239.0 ... 0.0 0.0 0.0]
 ...
 [186.0 960.0 395.0 ... 8.0 15.0 6.0]
 [183.0 2280.0 184.0 ... 14.0 37.0 18.0]
 [485.0 291.0 252.0 ... 4.0 4.0 4.0]]
[[262.0 720 197.0 ... 10 19 2]
 [159.0 720 45.0 ... 10 19 2]
 [94.0 195.0 239.0 ... 0.0 0.0 0.0]
 ...
 [186.0 960.0 395.0 ... 8.0 15.0 6.0]
 [183.0 2280.0 184.0 ... 14.0 37.0 18.0]
 [485.0 291.0 252.0 ... 4.0 4.0 4.0]]
[[262.0 720 197.0 ... 10 19 2]
 [159.0 720 45.0 ... 10 19 2]
 [94.0 720 239.0 ... 10 19 2]
 ...
 [186.0 960.0 395.0 ... 8.0 15.0 6.0]
 [183.0 2280.0 184.0 ... 14.0 37.0 18.0]
 [485.0 291.0 252.0 ... 4.0 4.0 4.0]]
[[262.0 720 197.0 ... 10 19 2]
 [159.0 720 45.0 ... 10 19 2]
 [94.0 720 239.0 ... 10 19 2]
 ...
 [186.0 960.0 395.0 ... 8.0 15.0 6.0]
 [183.0 2280.0 184.0 ... 14.0 37.0 18.0]
 [485.0 291.0 252.0 ... 4.0 4.0 4.0]]
[[262.0 720 197.0 ... 10 19 2]
 [159.0 720 45.0 ... 10 19 2]
 [94.0 720 239.0 ... 10 19 2]
 ...
 [186.0 960.0 395.0 ... 8.0 15

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



[[140.0 720 49.0 ... 10 19 2]
 [487.0 1572.0 93.0 ... 21.0 6.0 14.0]
 [23.0 807.0 83.0 ... 5.0 6.0 6.0]
 ...
 [62.0 1803.0 170.0 ... 17.0 8.0 9.0]
 [34.0 1587.0 119.0 ... 18.0 7.0 15.0]
 [89.0 387.0 237.0 ... 2.0 0.0 4.0]]
[[140.0 720 49.0 ... 10 19 2]
 [487.0 720 93.0 ... 10 19 2]
 [23.0 807.0 83.0 ... 5.0 6.0 6.0]
 ...
 [62.0 1803.0 170.0 ... 17.0 8.0 9.0]
 [34.0 1587.0 119.0 ... 18.0 7.0 15.0]
 [89.0 387.0 237.0 ... 2.0 0.0 4.0]]
[[140.0 720 49.0 ... 10 19 2]
 [487.0 720 93.0 ... 10 19 2]
 [23.0 720 83.0 ... 10 19 2]
 ...
 [62.0 1803.0 170.0 ... 17.0 8.0 9.0]
 [34.0 1587.0 119.0 ... 18.0 7.0 15.0]
 [89.0 387.0 237.0 ... 2.0 0.0 4.0]]
[[140.0 720 49.0 ... 10 19 2]
 [487.0 720 93.0 ... 10 19 2]
 [23.0 720 83.0 ... 10 19 2]
 ...
 [62.0 1803.0 170.0 ... 17.0 8.0 9.0]
 [34.0 1587.0 119.0 ... 18.0 7.0 15.0]
 [89.0 387.0 237.0 ... 2.0 0.0 4.0]]
[[140.0 720 49.0 ... 10 19 2]
 [487.0 720 93.0 ... 10 19 2]
 [23.0 720 83.0 ... 10 19 2]
 ...
 [62.0 1803.0 170.0 ... 17.0 8.0 9.0]
 [34.0 1587.0 1

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



[[162.0 720 263.0 ... 10 19 2]
 [101.0 555.0 134.0 ... 7.0 3.0 1.0]
 [565.0 1068.0 115.0 ... 4.0 2.0 10.0]
 ...
 [126.0 1413.0 158.0 ... 8.0 4.0 3.0]
 [290.0 1278.0 53.0 ... 26.0 3.0 3.0]
 [244.0 1185.0 67.0 ... 2.0 8.0 8.0]]
[[162.0 720 263.0 ... 10 19 2]
 [101.0 720 134.0 ... 10 19 2]
 [565.0 1068.0 115.0 ... 4.0 2.0 10.0]
 ...
 [126.0 1413.0 158.0 ... 8.0 4.0 3.0]
 [290.0 1278.0 53.0 ... 26.0 3.0 3.0]
 [244.0 1185.0 67.0 ... 2.0 8.0 8.0]]
[[162.0 720 263.0 ... 10 19 2]
 [101.0 720 134.0 ... 10 19 2]
 [565.0 720 115.0 ... 10 19 2]
 ...
 [126.0 1413.0 158.0 ... 8.0 4.0 3.0]
 [290.0 1278.0 53.0 ... 26.0 3.0 3.0]
 [244.0 1185.0 67.0 ... 2.0 8.0 8.0]]
[[162.0 720 263.0 ... 10 19 2]
 [101.0 720 134.0 ... 10 19 2]
 [565.0 720 115.0 ... 10 19 2]
 ...
 [126.0 1413.0 158.0 ... 8.0 4.0 3.0]
 [290.0 1278.0 53.0 ... 26.0 3.0 3.0]
 [244.0 1185.0 67.0 ... 2.0 8.0 8.0]]
[[162.0 720 263.0 ... 10 19 2]
 [101.0 720 134.0 ... 10 19 2]
 [565.0 720 115.0 ... 10 19 2]
 ...
 [126.0 1413.0 158.0 ... 8.0 4.0

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

