## Jupyter Notebook responsible for creating maize circRNA k-mer datasets

Raw data downloaded from: http://deepbiology.cn/crop/index.php/Home/Html/Download



In [None]:
#!pip install biopython gensim umap-learn

In [2]:
# Download data from PlantCircBase
#!wget -O zma10381_genomic_seq.txt.gz http://ibi.zju.edu.cn/plantcircbase/download/zma10381_genomic_seq.txt.gz
!yes | gzip -d zma10381_genomic_seq.txt.gz

In [3]:
# Import Libs
from Bio import SeqIO
import numpy as np
import pandas as pd

In [19]:
# Length of each k-mer; used as the default `k` by circrna_to_kmers and circrna_to_vec.
kmer_size = 4
# Dimensionality of the Word2Vec embeddings; also the length of every sequence vector
# and the number of feature columns generated further down.
vector_size = 256

In [20]:
def circrna_to_kmers(circrna_sequence, k=kmer_size):
  """Return every overlapping window of length ``k`` from a sequence.

  Windows start at each position in turn (stride 1), so a sequence of
  length ``n`` yields ``n - k + 1`` k-mers, in left-to-right order.

  Args:
    circrna_sequence: Sliceable sequence (e.g. a ``str``) to split.
    k: Window length; defaults to the notebook-level ``kmer_size``.

  Returns:
    A list of slices of ``circrna_sequence``; empty when the sequence
    is shorter than ``k``.
  """
  window_count = len(circrna_sequence) - k + 1
  return [circrna_sequence[start:start + k] for start in range(window_count)]

In [21]:
# Turn every FASTA record into one corpus line of space-separated k-mers.
# Both files are opened in a single `with` so the FASTA handle is closed as
# well (it used to be left open for the lifetime of the kernel).
with open('zma10381_genomic_seq.txt', 'r') as fasta_handle, \
     open('maize_corpus.txt', 'w') as corpus_handle:
  fasta_parser = SeqIO.parse(fasta_handle, 'fasta')
  for record in fasta_parser:
    record_kmers = circrna_to_kmers(str(record.seq))
    corpus_handle.write(' '.join(record_kmers) + '\n')

In [22]:
!head -5 maize_corpus.txt

CTCA TCAT CATA ATAG TAGC AGCT GCTG CTGA TGAT GATG ATGC TGCC GCCT CCTA CTAA TAAA AAAG AAGA AGAG GAGG AGGA GGAA GAAA AAAA AAAA AAAA AAAT AATT ATTG TTGT TGTG GTGT TGTT GTTG TTGC TGCC GCCC CCCT CCTC CTCA TCAT CATT ATTC TTCA TCAG CAGG AGGT GGTG GTGT TGTA GTAG TAGC AGCA GCAA CAAT AATC ATCT TCTC CTCA TCAA CAAA AAAA AAAC AACA ACAG CAGA AGAT GATC ATCT TCTC CTCA TCAC CACA ACAC CACA ACAT CATC ATCT TCTC CTCA TCAA CAAG AAGA AGAT GATA ATAG TAGA AGAT GATA ATAA TAAG AAGA AGAC GACA ACAG CAGA AGAA GAAT AATT ATTT TTTT TTTC TTCA TCAC CACC ACCA CCAG CAGT AGTG GTGT TGTT GTTT TTTT TTTC TTCA TCAA CAAG AAGG AGGC GGCT GCTG CTGC TGCA GCAA CAAA AAAG AAGA AGAT GATT ATTA TTAA TAAG AAGA AGAA GAAA AAAA AAAG AAGA AGAA GAAA AAAT AATG ATGA TGAA GAAG AAGT AGTA GTAC TACA ACAA CAAA AAAT AATA ATAA TAAA AAAA AAAT AATC ATCA TCAG CAGC AGCG GCGC CGCA GCAT CATT ATTT TTTC TTCA TCAG CAGG AGGA GGAT GATA ATAA TAAT AATC ATCT TCTA CTAA TAAG AAGT AGTT GTTA TTAA TAAA AAAG AAGG AGGA GGAT GATT ATTA TTAG TAGC AGCA GCAT CATA ATAA TAAA AAAA 

In [23]:
from gensim.models import Word2Vec

# Create an untrained Word2Vec model whose embeddings have `vector_size` dimensions;
# no other hyperparameter is passed, so the library defaults apply.
w2v_model = Word2Vec(vector_size=vector_size)
# Build the vocabulary from the k-mer corpus file written above
# (one sequence per line, k-mers separated by spaces).
w2v_model.build_vocab(corpus_file='maize_corpus.txt')

In [24]:
w2v_model.corpus_count

13192

In [25]:
# Train for a single epoch over the corpus file, passing the word total recorded
# on the model (`corpus_total_words`) as `total_words`.
w2v_model.train(corpus_file='maize_corpus.txt', total_words=w2v_model.corpus_total_words, epochs=1)

(25319664, 42262575)

In [26]:
def circrna_to_vec(circrna_sequence, k=kmer_size):
  """Embed a sequence as the sum of the Word2Vec vectors of its k-mers.

  The sequence is split with ``circrna_to_kmers``; every k-mer found in
  ``w2v_model.wv`` contributes its vector to the running sum, and k-mers
  missing from the model are skipped.

  Args:
    circrna_sequence: Sequence to embed.
    k: k-mer length forwarded to ``circrna_to_kmers``; defaults to the
      notebook-level ``kmer_size``.

  Returns:
    A NumPy array of length ``vector_size``; all zeros when no k-mer of
    the sequence is known to the model.
  """
  embeddings = w2v_model.wv
  total = np.zeros(vector_size)
  for token in circrna_to_kmers(circrna_sequence, k=k):
    if token not in embeddings:
      continue
    total = total + embeddings[token]
  return total

In [27]:
# Load the circRNA table; its 'seq' column supplies the sequences embedded below.
circ_df = pd.read_excel('maize_db.xlsx')

In [None]:
# Sequences to embed. Named `sequences` rather than `list` so the builtin
# `list` type is not shadowed for the rest of the kernel session.
sequences = circ_df['seq']

# One feature column per embedding dimension, numbered from 1.
columns = [f'wc_4mer_{v+1}' for v in range(vector_size)]

# Compute every row first and build the DataFrame in a single call.
# DataFrame.append copied the whole frame on each iteration (quadratic time)
# and was removed in pandas 2.0. The resulting frame has a default
# 0..n-1 index, matching what append(..., ignore_index=True) produced.
df_vecs = pd.DataFrame(
    [circrna_to_vec(sequence) for sequence in sequences],
    columns=columns)

In [29]:
# Place the embedding columns next to the original table (column-wise concat, axis=1).
new_df = pd.concat([circ_df, df_vecs], axis=1)

# Preview the first rows of the combined table.
new_df.head()

Unnamed: 0,circName,circID,gene,isoform,stress,tissue,chr,start,end,strand,...,wc_4mer_247,wc_4mer_248,wc_4mer_249,wc_4mer_250,wc_4mer_251,wc_4mer_252,wc_4mer_253,wc_4mer_254,wc_4mer_255,wc_4mer_256
0,zma-circ1-Zm00001d002325,2:10317309-10317467_-,Zm00001d002325,Zm00001d002325_T001,-,multipleTissue,2,10317309,10317467,-,...,15.677424,38.900346,-3.102404,-4.442807,-10.046855,-9.627969,-4.195266,4.189774,9.732552,4.227022
1,zma-circ2-Zm00001d038675,6:162376852-162378246_+,Zm00001d038675,Zm00001d038675_T004,-,multipleTissue,6,162376852,162378246,+,...,36.455992,-20.06317,-83.387029,59.792782,-98.033561,68.308304,17.6391,-13.828875,-82.591465,64.354806
2,zma-circ3-Zm00001d038163,6:150032431-150032595_+,Zm00001d038163,Zm00001d038163_T001,-,multipleTissue,6,150032431,150032595,+,...,6.675268,-28.79728,11.152429,23.036428,1.149696,-10.554642,5.191221,13.023831,42.685613,6.735523
3,zma-circ4-Zm00001d049552,4:34381638-34381747_-,Zm00001d049552,Zm00001d049552_T004,-,multipleTissue,4,34381638,34381747,-,...,-16.163142,15.93597,-23.395082,7.818691,30.864781,7.972365,8.410871,-10.671463,-2.896606,19.580173
4,zma-circ5-Zm00001d032567,1:230724608-230725226_-,Zm00001d032567,Zm00001d032567_T001,-,multipleTissue,1,230724608,230725226,-,...,-22.312784,33.259835,67.920903,-24.463687,-53.406053,-7.545128,29.81287,-55.991946,-21.702738,-47.711335


In [30]:
# Save the combined table to an Excel file, omitting the DataFrame index.
new_df.to_excel('maize_word2vec_4mer_dataset.xlsx', index=False)