In [1]:
from IPython.display import Markdown as md

In [2]:
from datasets import load_dataset
import pandas as pd

In [3]:
# Expression table for S288C: one row per gene with its coding sequence,
# expression value (EXP) and class label.
inputFile="./S288C_Expression/EmpiricalExp/Full_S288C.datatable.csv"
df=pd.read_csv(inputFile)
# Preview the first three rows to confirm the columns loaded as expected.
print(df.head(3))

   Unnamed: 0 GeneName                                           Sequence  \
0           0  YAL067C  ATGTATTCAATTGTTAAAGAGATTATTGTAGATCCTTACAAAAGAC...   
1           1  YAL062W  ATGACAAGCGAACCAGAGTTTCAGCAGGCTTACGATGAGATCGTTT...   
2           2  YAL061W  ATGAGAGCCTTAGCGTATTTCGGTAAAGGTAACATCAGATTCACCA...   

          EXP  Label  
0  207.761417      0  
1  257.287421      0  
2  712.980404      0  


After loading the data, the next step is to convert the raw coding sequences into text sentences composed of codon usage ranks.

In [4]:
#Start

In [5]:
from Bio import SeqIO
import sys
# insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, './CUB_Code')
import CodonLibraries as CL

# Turn every coding sequence into a "sentence": its codons joined by single spaces.
seqList=df["Sequence"]
sentenceList=[" ".join(CL.loadSequence(seq)) for seq in seqList]

Now we have converted each sequence into a list of codons and concatenated the codons into string sentences.

In [6]:
# Show the first codon sentence to check the conversion.
print(sentenceList[:1])

['TAT TCA ATT GTT AAA GAG ATT ATT GTA GAT CCT TAC AAA AGA CTA AAA TGG GGT TTT ATT CCA GTA AAG CGG CAG GTG GAA GAC CTG CCA GAT GAC TTA AAT TCA ACA GAA ATT GTC ACT ATC TCC AAC AGT ATC CAG AGT CAT GAA ACA GCT GAA AAT TTC ATC ACG ACT ACA AGT GAA AAA GAT CAA CTA CAT TTT GAG ACT AGT AGC TAT AGT GAA CAT AAA GAC AAT GTG AAC GTT ACT AGA AGT TAT GAA TAT AGA GAT GAA GCC GAT AGG CCA TGG TGG AGA TTT TTC GAT GAA CAA GAG TAT CGG ATC AAT GAA AAG GAA AGA TCT CAC AAT AAA TGG TAT AGT TGG TTC AAA CAG GGT ACC TCT TTC AAA GAA AAA AAA TTA TTA ATT AAA TTG GAT GTC CTT TTA GCC TTT TAT TCT TGT ATT GCT TAT TGG GTG AAA TAT CTG GAT ACG GTT AAT ATA AAC AAC GCT TAC GTT TCG GGA ATG AAG GAA GAT TTA GGC TTT CAA GGT AAT GAT TTG GTG CAT ACT CAA GTA ATG TAC ACA GTT GGT AAT ATT ATA TTT CAA TTG CCA TTT TTG ATT TAC CTG AAC AAG CTC CCA TTA AAC TAT GTT TTA CCA AGC CTC GAC TTA TGT TGG TCG CTT TTA ACC GTT GGT GCT GCA TAT GTC AAT TCT GTA CCA CAC TTG AAA GCA ATT AGG TTT TTC ATT GGG GCT TTT GAA GCG CCA AGT TAT TTG GCA TAC CAA TAT TT

The next step is to compute the genome-wide codon usage and derive codon ranks, which will be used to replace the codon strings.

In [7]:
import findSequenceById as FSBID
from CAI import RSCU
import scipy.stats as ss




def process(seqList,genomeFile,tag=""):
    """Convert coding sequences into sentences of codon-usage ranks.

    RSCU values are computed over all of ``seqList`` and converted to
    per-amino-acid ranks with ``convertRSCUtoRanks``. Each sequence is split
    into codons with ``CL.loadSequence``, the first and last five codons are
    dropped, and every remaining codon is replaced by its rank.

    Parameters
    ----------
    seqList : sequence of coding sequences
        Passed as-is to ``RSCU`` and, one at a time, to ``CL.loadSequence``
        (the notebook passes a list of strings).
    genomeFile : str
        Path handed to ``FSBID.findSequenceByID``.
    tag : str, optional
        Unused; kept so existing callers keep working.

    Returns
    -------
    list of str
        One space-separated rank sentence per sequence that could be
        converted. A sequence containing a codon with no rank is reported and
        skipped, so the result may be shorter than ``seqList`` and its
        positions may no longer line up with the input.
    """
    # NOTE(review): the lookup returned here is never used; the call is kept
    # only because it may have side effects (it appears to print a summary).
    # Confirm whether it can be removed.
    FSBID.findSequenceByID(genomeFile)
    rscu_rank=convertRSCUtoRanks(RSCU(seqList))
    sentenceList = []
    for seq in seqList:
        # remove the first and last five codons
        codonList=CL.loadSequence(seq)[5:-5]
        try:
            sentence=" ".join(str(rscu_rank[codon]) for codon in codonList)
        except KeyError:
            # Only a codon missing from the rank table is expected here; any
            # other failure (or a keyboard interrupt) should surface.
            print("one error on ",seq)
            continue
        sentenceList.append(sentence)

    return sentenceList




def convertRSCUtoRanks(rscu):
    """Rank every codon among its synonymous codons by RSCU value.

    Within each amino-acid family the codon with the highest RSCU receives
    rank 1. Ranks are produced by ``scipy.stats.rankdata`` on the negated
    values and then truncated with ``int``, so tied codons share the
    truncated value of their common rank.

    Parameters
    ----------
    rscu : mapping
        Codon -> RSCU value; must contain every codon listed below.

    Returns
    -------
    dict
        Codon -> integer rank within its synonymous family.
    """
    synonymousCodons = {
        'CYS': ['TGT', 'TGC'],
        'ASP': ['GAT', 'GAC'],
        'SER': ['TCT', 'TCG', 'TCA', 'TCC', 'AGC', 'AGT'],
        'GLN': ['CAA', 'CAG'],
        'MET': ['ATG'],
        'ASN': ['AAC', 'AAT'],
        'PRO': ['CCT', 'CCG', 'CCA', 'CCC'],
        'LYS': ['AAG', 'AAA'],
        'THR': ['ACC', 'ACA', 'ACG', 'ACT'],
        'PHE': ['TTT', 'TTC'],
        'ALA': ['GCA', 'GCC', 'GCG', 'GCT'],
        'GLY': ['GGT', 'GGG', 'GGA', 'GGC'],
        'ILE': ['ATC', 'ATA', 'ATT'],
        'LEU': ['TTA', 'TTG', 'CTC', 'CTT', 'CTG', 'CTA'],
        'HIS': ['CAT', 'CAC'],
        'ARG': ['CGA', 'CGC', 'CGG', 'CGT', 'AGG', 'AGA'],
        'TRP': ['TGG'],
        'VAL': ['GTA', 'GTC', 'GTG', 'GTT'],
        'GLU': ['GAG', 'GAA'],
        'TYR': ['TAT', 'TAC'],
    }
    rscu_rank={}
    for family in synonymousCodons.values():
        # Negate so that the largest RSCU value gets the smallest rank.
        negated=[-1*rscu[codon] for codon in family]
        ranks=ss.rankdata(negated)
        rscu_rank.update((codon,int(rank)) for codon,rank in zip(family,ranks))
    return rscu_rank



In [8]:
# Genome FASTA handed to process() in the next cell.
genomeFile="./Data/s288c.fasta"
# How many sequences there are, and what the first ten look like.
print(len(seqList))
print(seqList.head(10))


4568
0    ATGTATTCAATTGTTAAAGAGATTATTGTAGATCCTTACAAAAGAC...
1    ATGACAAGCGAACCAGAGTTTCAGCAGGCTTACGATGAGATCGTTT...
2    ATGAGAGCCTTAGCGTATTTCGGTAAAGGTAACATCAGATTCACCA...
3    ATGAGAGCTTTGGCATATTTCAAGAAGGGTGATATTCACTTCACTA...
4    ATGTGGGAACAAAGACGACAAAAGGTAGTTTTTTCCTTGACTATAC...
5    ATGAAATTTTCTGCGTATTTATGGTGGCTGTTTTTGAATCTAGCGT...
6    ATGGAAATTTCCAGTTCACCATGGAACGACGGTGGATACAGCCCCT...
7    ATGCCACCACCATCAAGAAGTAGAATAAACAAAACAAGAACATTAG...
8    ATGTCGCCCTCTGCCGTACAATCATCAAAACTAGAAGAACAGTCAA...
9    ATGATCTTCCTAAACACCTTCGCAAGGTGCCTTTTAACGTGTTTCG...
Name: Sequence, dtype: object


In [9]:
# Build the rank sentences. Note that this rebinds sentenceList, which until
# now held the codon sentences.
sentenceList=process(list(seqList),genomeFile)

Selected id Type: locus_tag
There are 0 entries NOT found out of 5990
5990 distinct record in 5990 entries


In [10]:
# Sanity check: number of rank sentences produced and a look at the first one.
print(len(sentenceList))
print(sentenceList[:1])

4568
['2 1 1 2 1 2 2 1 1 3 1 1 1 1 1 1 2 2 6 2 4 1 2 5 1 1 2 2 1 2 2 1 1 3 1 3 4 2 3 3 2 3 1 1 2 1 1 1 2 3 4 1 2 3 1 1 1 1 3 1 1 2 1 3 5 1 3 1 1 1 2 1 4 2 1 1 1 3 1 1 1 1 1 1 3 1 2 1 1 1 1 1 2 1 1 1 2 1 6 3 1 1 2 1 1 1 2 1 1 1 1 3 1 2 1 2 1 3 1 2 1 1 1 1 2 2 1 1 1 1 3 4 2 3 1 1 1 1 1 1 1 1 4 1 1 5 1 4 1 1 2 2 2 1 2 1 6 2 1 2 1 1 2 3 1 1 1 1 1 1 4 1 1 1 2 1 2 2 1 1 1 1 2 1 1 1 1 1 1 1 2 5 2 2 6 1 2 2 1 1 2 1 5 6 2 2 1 1 6 4 2 3 1 1 1 2 1 3 1 1 2 1 2 1 1 2 1 2 1 2 1 4 1 1 1 4 1 3 1 1 2 2 1 1 1 1 1 4 1 2 1 1 1 1 1 4 3 3 1 1 1 2 1 1 3 2 1 3 1 1 3 4 1 1 4 3 2 2 3 2 1 2 6 2 1 1 2 1 1 2 2 2 1 1 1 2 1 1 1 2 1 1 4 1 3 2 4 3 4 1 2 1 2 4 5 1 1 2 1 2 2 1 1 1 1 2 2 1 1 1 1 1 2 1 4 2 1 1 2 1 1 2 1 2 1 1 3 1 1 1 2 1 2 2 1 1 1 5 1 1 2 1 2 3 1 1 1 2 2 1 2 1 2 1 1 1 2 1 1 1 2 3 1 1 2 1 4 2 2 3 3 1 1 1 1 1 1 1 2 1 1 2 2 6 1 2 2 4 1 1 1 4 1 2 1 1 1 1 1 1 4 1 1 1 1 2 1 1 2 2 1 3 1 1 4 1 1 1 1 2 1 2 1 3 1 1 2 4 2 2 3 1 1 2 3 2 1 2 3 1 1 1 2 1 1 5 1 1 1 1 1 1 1 1 2 1 2 2 1 1 1 2 1 1 1 5 4 1 1 1 1 1 1 1 1 2 1

We now have an intermediate sentence structure in which every token is a single codon rank. This could already be used to train the model and tokenizer, but we apply one more operation: grouping consecutive codon ranks into words of size k.

In [11]:
def convertLineToWords(line,n):
    """Group a space-separated rank sentence into words of ``n`` characters.

    All spaces are removed from ``line`` and the remaining characters are cut
    into consecutive, non-overlapping chunks of length ``n``. Trailing
    characters that do not fill a whole word are discarded.

    Parameters
    ----------
    line : str
        Sentence whose tokens are separated by single spaces; every character
        left after removing the spaces counts as one position in a word.
    n : int
        Word size. ``n == 0`` raises ``ValueError`` (zero step for ``range``).

    Returns
    -------
    str
        The words joined by single spaces; empty when fewer than ``n``
        characters remain.
    """
    line= line.replace(' ','')
    # The +1 keeps the final word when the length is an exact multiple of n;
    # without it that last complete word was silently dropped.
    wordList=[line[i:i+n] for i in range(0,len(line)-n+1,n)]
    return " ".join(wordList)



In [12]:
# k is the word size: the number of codon ranks per word, i.e. the size of
# the codon window.
k=10
wordSentenceList=[convertLineToWords(sentence,k) for sentence in sentenceList]

In [13]:
# Class labels from the input table, printed to check count and values.
labelList=df["Label"]
print(len(labelList))
print(labelList)



4568
0       0
1       0
2       0
3       1
4       0
       ..
4563    1
4564    1
4565    1
4566    0
4567    0
Name: Label, Length: 4568, dtype: int64


In [14]:
# Attach the word sentences to the table. This needs exactly one sentence per
# row, so it fails if process() skipped any sequence.
df["WordSentence"]=wordSentenceList

In [15]:
# Length is the character count of the word sentence, spaces included.
df["SentenceLength"]=df["WordSentence"].str.len()
# Keep only rows whose sentence is between 2 and 1000 characters, inclusive.
df=df[df["SentenceLength"].between(2,1000)]
print(len(df))

4066


In [16]:
# Keep only the model inputs, renamed to the text/label column names, and
# save them to CSV.
df_sub=(
    df[["WordSentence","Label"]]
    .copy()
    .rename(columns={"WordSentence": "text", "Label": "label"})
)
df_sub.to_csv("expressionPrediction_S288C.csv")

In [17]:
from datasets import Dataset
# Wrap the text/label table in a Dataset object.
# NOTE(review): df_sub still carries the row index left by the length filter;
# confirm whether that index should be dropped before conversion.
dataset = Dataset.from_pandas(df_sub)