In [1]:
from IPython.display import Markdown as md

In [2]:
from datasets import load_dataset
import pandas as pd

In [3]:
inputFile="./S288C_Expression/EmpiricalExp/S288C.datatable_0.3Top_0.3Bot.csv"
df=pd.read_csv(inputFile)
print(df[:3])

   Unnamed: 0   GeneName                                           Sequence  \
0        1941  YHR079C-A  ATGAACTATTTGGAAACACAGTTAAATAAAAAGCAAAAACAGATAC...   
1        4329    YPL121C  ATGCACAATCAGGAAGAGTGGCTAGACAAGGACAAAACTTTGGTGA...   
2        1496    YGL170C  ATGGGAGCTGGCACTCTTTTAAATGGATTAGAAAAGGAAAACTTTC...   

         EXP  Label  
0  59.599320      0  
1  59.995476      0  
2  62.303841      0  


After loading the data, the next step is to convert the raw coding sequences to text sentences compsed of codon usage ranks.

In [4]:
#Start

In [5]:
from Bio import SeqIO
import sys
# insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, './CUB_Code')
import CodonLibraries as CL

seqList=df["Sequence"]
sentenceList=[]
for seq in seqList:
    codonList = CL.loadSequence(seq)
    sentence=""
    for codon in codonList:
        sentence+=codon+" "
    sentenceList.append(sentence[:-1])#this removes the extra redundant space in the end

Now we have converted the sequence to codonLists, then concated them into string sentences 

In [6]:
print(sentenceList[:1])

['AAC TAT TTG GAA ACA CAG TTA AAT AAA AAG CAA AAA CAG ATA CAG GAA TAC GAA AGT ATG AAT GGC AAC CTG ATA AAG ATG TTT GAG CAA TTG TCT AAA GAA AAG AAA AAT GAT GAG ACA CCA AAA AAA ATT TCC TCG ACG TAC ATT AAA GAG TTA AAG GAG TAC AAC GAA TTG AGA GAT GCC GGT TTA AGG TTG GCC CAA ATA ATT GCT GAT GAA AAG CAA TGC AAA ATT AAG GAT GTT TTT GAA GAG ATC GGT TAT TCA ATG AAG GAC']


The next step is to get the genome wide codon usage and generate codon ranks, which will used to replace the codon strings

In [7]:
import findSequenceById as FSBID
from CAI import RSCU
import scipy.stats as ss




def process(seqList,genomeFile,tag=""):
    geneDict=FSBID.findSequenceByID(genomeFile)
    keyList=[]
    rscu=RSCU(seqList)
    rscu_rank=convertRSCUtoRanks(rscu)
    sentenceList = []
    for seq in seqList:
        codonList=CL.loadSequence(seq)
        #remove the first and last five codons:
        codonList = codonList[5:]
        codonList = codonList[:-5]
        try:
            codonRankList=[rscu_rank[codon] for codon in codonList]
            sentence=""
            for rank in codonRankList:
                sentence+=str(rank)+" "
            sentenceList.append(sentence[:-1])
        except :
            print("one error on ",seq)
    
    return sentenceList




def convertRSCUtoRanks(rscu):
    synonymousCodons = {
        'CYS': ['TGT', 'TGC'],
        'ASP': ['GAT', 'GAC'],
        'SER': ['TCT', 'TCG', 'TCA', 'TCC', 'AGC', 'AGT'],
        'GLN': ['CAA', 'CAG'],
        'MET': ['ATG'],
        'ASN': ['AAC', 'AAT'],
        'PRO': ['CCT', 'CCG', 'CCA', 'CCC'],
        'LYS': ['AAG', 'AAA'],
        'THR': ['ACC', 'ACA', 'ACG', 'ACT'],
        'PHE': ['TTT', 'TTC'],
        'ALA': ['GCA', 'GCC', 'GCG', 'GCT'],
        'GLY': ['GGT', 'GGG', 'GGA', 'GGC'],
        'ILE': ['ATC', 'ATA', 'ATT'],
        'LEU': ['TTA', 'TTG', 'CTC', 'CTT', 'CTG', 'CTA'],
        'HIS': ['CAT', 'CAC'],
        'ARG': ['CGA', 'CGC', 'CGG', 'CGT', 'AGG', 'AGA'],
        'TRP': ['TGG'],
        'VAL': ['GTA', 'GTC', 'GTG', 'GTT'],
        'GLU': ['GAG', 'GAA'],
        'TYR': ['TAT', 'TAC']}#'CYS': ['TGT', 'TGC']
    rscu_rank=dict()
    for aa in synonymousCodons:
        codonList=synonymousCodons[aa]
        rscuList=[rscu[codon] for codon in codonList]
        rankList=ss.rankdata([-1*x for x in rscuList])
        for codon,rank in zip(codonList,rankList):
            rscu_rank[codon]=int(rank)
    return rscu_rank



In [8]:
genomeFile="./Data/s288c.fasta"
print(len(seqList))
print(seqList[:10])


2740
0    ATGAACTATTTGGAAACACAGTTAAATAAAAAGCAAAAACAGATAC...
1    ATGCACAATCAGGAAGAGTGGCTAGACAAGGACAAAACTTTGGTGA...
2    ATGGGAGCTGGCACTCTTTTAAATGGATTAGAAAAGGAAAACTTTC...
3    ATGAATCAAGGTTACACACAGCTTTCCGCACCGGAACTGAAGGAGA...
4    ATGCTTTTCGCTAGATTAGTGCTGCTGTTGGTGTATTTGGCACCAG...
5    ATGGATTGCCCCTCAAACGTTGTGTTATTGTTGCTGCAATTAGTTT...
6    ATGAACATTAAGACTTTGTGTCATCCAGAATATAAAAGAATCTCCG...
7    ATGAGTTTCCTAAGCATTTTTACTTTTTTCAGCGTCCTTATTTCTG...
8    ATGTACGAGTACTGCTCAGTTGTAATAAAGAAATACTCCAAGTATA...
9    ATGGTGACTGGTGAAGAAAATGTGTATCTAAAGTCAAGCTTATCCA...
Name: Sequence, dtype: object


In [9]:
sentenceList=process(list(seqList),genomeFile)

Selected id Type: locus_tag
There are 0 entries NOT found out of 5990
5990 distinct record in 5990 entries


In [10]:
print(len(sentenceList))
print(sentenceList[:1])

2740
['2 2 1 1 2 1 1 2 3 2 1 2 1 4 1 1 3 2 5 3 2 1 1 2 1 1 1 1 1 2 1 1 1 2 2 1 1 1 1 3 6 4 2 1 1 2 2 2 2 2 2 1 1 1 1 3 1 2 2 1 3 1 3 1 1 1 1 2 1 2 1 1 2 1 1 1 1 2 2 1']


We now have the intermediate sentence structure composed of characters including all codon rnaks, this can also be used for training the model and tokenizer, but we will apply one more operation on this file, composing single codon ranks into words of size k

In [11]:
def convertLineToWords(line,n):
    line= line.replace(' ','')
    wordList=[]
    for i in range(0,len(line)-n,n):
        wordList.append(line[i:i+n])
    sentence=" ".join(wordList)
    return sentence



In [None]:
df.to_csv("expressionPrediction_S288C.csv")


In [12]:
k=4 #k specifies the wordsize, aka number of codon ranks in a word, or understood as size of codon windows
wordSentenceList=[]
for sentence in sentenceList:
    wordSentence=convertLineToWords(sentence,k)
    wordSentenceList.append(wordSentence)

In [13]:
print(len(wordSentenceList))
print(wordSentenceList[:2])




2740
['2211 2112 3212 1411 3253 2112 1111 1211 1221 1113 6421 1222 2221 1113 1221 3131 1112 1211 2111', '1322 2111 4211 1112 2211 1212 2112 1621 1511 2212 2242 2211 4311 2224 1141 4621 1112 2221 1116 5122 2122 1133 1112 2112 1151 1121 3122 1211 1331 1113 1111 2112 1111 1212 2111 1121 1321 4121 1142 1121 1212 4111 1251 2111 1134 2511 1124 5115 1411 1211 1321 2214']


In [14]:
labelList=df["Label"]
print(len(labelList))
print(labelList)



2740
0       0
1       0
2       0
3       0
4       0
       ..
2735    1
2736    1
2737    1
2738    1
2739    1
Name: Label, Length: 2740, dtype: int64


In [19]:
df["WordSentence"]=wordSentenceList

In [25]:
df_sub=df[["WordSentence","Label"]].copy()
df_sub=df_sub.rename(columns={"WordSentence": "text", "Label": "label"})
df_sub.to_csv("expressionPrediction_S288C.csv")

In [26]:
from datasets import Dataset
dataset = Dataset.from_pandas(df_sub)