## Jupyter notebook for creating maize circRNA k-mer datasets

Raw data downloaded from: http://deepbiology.cn/crop/index.php/Home/Html/Download



In [None]:
#!pip install biopython gensim umap-learn

In [2]:
# Download data from PlantCircBase
# NOTE: the wget line below is commented out, so zma10381_genomic_seq.txt.gz
# must already be present in the working directory when this cell runs.
#!wget -O zma10381_genomic_seq.txt.gz http://ibi.zju.edu.cn/plantcircbase/download/zma10381_genomic_seq.txt.gz
# Decompress the archive; the FASTA parsing cell reads the resulting
# zma10381_genomic_seq.txt. `yes` is piped in presumably to auto-answer any
# prompt gzip might show (e.g. when the output already exists) -- TODO confirm.
!yes | gzip -d zma10381_genomic_seq.txt.gz

In [3]:
# Import Libs
from Bio import SeqIO
import numpy as np
import pandas as pd

In [4]:
# Length of the overlapping k-mers; used as the default `k` of the helper
# functions below. Note that the output column names hard-code "3mer".
kmer_size = 3
# Dimensionality of the Word2Vec embedding and of each per-sequence vector.
vector_size = 64

In [5]:
def circrna_to_kmers(circrna_sequence, k=kmer_size):
  """Split a sequence into its overlapping k-mers (sliding window, stride 1).

  Args:
    circrna_sequence: sliceable sequence (a str in this notebook).
    k: window length; defaults to the notebook-level `kmer_size`.

  Returns:
    A list of the consecutive length-k windows in left-to-right order.
    The list is empty when the sequence is shorter than k.
  """
  window_count = len(circrna_sequence) - k + 1
  return [circrna_sequence[start:start + k] for start in range(window_count)]

In [6]:
# Convert every FASTA record into one corpus line of space-separated k-mers.
# Both files are managed by the same `with` statement so the input handle is
# closed as well as the output handle, even if parsing fails midway.
with open('zma10381_genomic_seq.txt', 'r') as fasta_handle, \
     open('maize_corpus.txt', 'w') as corpus_handle:
  # SeqIO.parse is lazy, so it must be consumed while fasta_handle is open.
  fasta_parser = SeqIO.parse(fasta_handle, 'fasta')
  for record in fasta_parser:
    record_kmers = circrna_to_kmers(str(record.seq))
    corpus_handle.write(' '.join(record_kmers) + '\n')

In [8]:
!head -5 maize_corpus.txt

CTC TCA CAT ATA TAG AGC GCT CTG TGA GAT ATG TGC GCC CCT CTA TAA AAA AAG AGA GAG AGG GGA GAA AAA AAA AAA AAA AAT ATT TTG TGT GTG TGT GTT TTG TGC GCC CCC CCT CTC TCA CAT ATT TTC TCA CAG AGG GGT GTG TGT GTA TAG AGC GCA CAA AAT ATC TCT CTC TCA CAA AAA AAA AAC ACA CAG AGA GAT ATC TCT CTC TCA CAC ACA CAC ACA CAT ATC TCT CTC TCA CAA AAG AGA GAT ATA TAG AGA GAT ATA TAA AAG AGA GAC ACA CAG AGA GAA AAT ATT TTT TTT TTC TCA CAC ACC CCA CAG AGT GTG TGT GTT TTT TTT TTC TCA CAA AAG AGG GGC GCT CTG TGC GCA CAA AAA AAG AGA GAT ATT TTA TAA AAG AGA GAA AAA AAA AAG AGA GAA AAA AAT ATG TGA GAA AAG AGT GTA TAC ACA CAA AAA AAT ATA TAA AAA AAA AAT ATC TCA CAG AGC GCG CGC GCA CAT ATT TTT TTC TCA CAG AGG GGA GAT ATA TAA AAT ATC TCT CTA TAA AAG AGT GTT TTA TAA AAA AAG AGG GGA GAT ATT TTA TAG AGC GCA CAT ATA TAA AAA AAA AAG AGA GAA AAA AAA AAA AAC ACT CTT TTT TTG TGA GAA AAT ATG TGA GAT ATA TAA AAG AGT GTA TAG AGA GAT ATA TAG AGT GTT TTG TGT GTG TGT GTT TTC TCC CCA CAA AAT ATA TAC ACA CAG AGG GGA GAT ATC TCA CAA 

In [10]:
from gensim.models import Word2Vec

# Create a Word2Vec model whose embedding size is `vector_size`; no other
# hyperparameter is passed, so everything else is the gensim default.
w2v_model = Word2Vec(vector_size=vector_size)
# Build the vocabulary from the k-mer corpus written above (one record per
# line, k-mers separated by spaces).
w2v_model.build_vocab(corpus_file='maize_corpus.txt')

In [11]:
w2v_model.corpus_count

13192

In [13]:
# Train on the corpus file for a single pass (epochs=1); total_words reuses
# the word count already stored on the model.
w2v_model.train(corpus_file='maize_corpus.txt', total_words=w2v_model.corpus_total_words, epochs=1)

(13454237, 42273036)

In [14]:
def circrna_to_vec(circrna_sequence, k=kmer_size):
  """Embed a sequence as the sum of the Word2Vec vectors of its k-mers.

  K-mers missing from the trained model's vocabulary are skipped, so a
  sequence with no known k-mer maps to the all-zero vector.

  Args:
    circrna_sequence: sequence handed to `circrna_to_kmers`.
    k: k-mer length; defaults to the notebook-level `kmer_size`.

  Returns:
    A numpy array of length `vector_size`. The k-mer vectors are summed,
    not averaged.
  """
  embeddings = w2v_model.wv
  total = np.zeros(vector_size)
  for token in circrna_to_kmers(circrna_sequence, k=k):
    if token not in embeddings:
      continue
    total = total + embeddings[token]
  return total

In [15]:
# Load the circRNA table; its 'seq' column is the input for the embedding step.
# NOTE(review): the preview of the combined frame shows an "Unnamed: 0" column,
# which looks like a saved row index in the spreadsheet -- confirm whether
# index_col=0 is wanted here.
circ_df = pd.read_excel('maize_db.xlsx')

In [None]:
# Sequences to embed. Named `sequences` so the builtin `list` is not shadowed
# for the rest of the kernel session.
sequences = circ_df['seq']

# One output column per embedding dimension: wc_3mer_1 .. wc_3mer_<vector_size>.
columns = [f'wc_3mer_{v+1}' for v in range(vector_size)]

# Compute every row first and build the frame once. Growing a DataFrame row by
# row copies it on each step, and DataFrame.append no longer exists in pandas 2.x.
# The result has a default 0..n-1 index, one row per sequence, in input order.
df_vecs = pd.DataFrame(
    [circrna_to_vec(record) for record in sequences],
    columns=columns,
)

In [17]:
# Place the embedding columns to the right of the original table columns;
# concat with axis=1 aligns the two frames on their row index.
new_df = pd.concat([circ_df, df_vecs], axis=1)

new_df.head()

Unnamed: 0,circName,circID,gene,isoform,stress,tissue,chr,start,end,strand,...,wc_3mer_55,wc_3mer_56,wc_3mer_57,wc_3mer_58,wc_3mer_59,wc_3mer_60,wc_3mer_61,wc_3mer_62,wc_3mer_63,wc_3mer_64
0,zma-circ1-Zm00001d002325,2:10317309-10317467_-,Zm00001d002325,Zm00001d002325_T001,-,multipleTissue,2,10317309,10317467,-,...,-1.649599,31.57955,8.262636,-8.476553,-21.353326,-5.830368,5.679497,-0.869754,-16.990953,-28.061553
1,zma-circ2-Zm00001d038675,6:162376852-162378246_+,Zm00001d038675,Zm00001d038675_T004,-,multipleTissue,6,162376852,162378246,+,...,-276.418821,-19.796359,-14.883323,-210.64013,-108.069105,-196.095334,-8.178139,196.293086,-111.923786,-176.260473
2,zma-circ3-Zm00001d038163,6:150032431-150032595_+,Zm00001d038163,Zm00001d038163_T001,-,multipleTissue,6,150032431,150032595,+,...,41.822434,8.608161,-6.166661,31.397092,23.119942,14.479912,7.400303,-15.383993,25.829718,-35.97836
3,zma-circ4-Zm00001d049552,4:34381638-34381747_-,Zm00001d049552,Zm00001d049552_T004,-,multipleTissue,4,34381638,34381747,-,...,-28.654313,9.646898,16.798625,4.431155,-22.220375,-12.460058,8.829987,17.305939,-19.056328,-6.713643
4,zma-circ5-Zm00001d032567,1:230724608-230725226_-,Zm00001d032567,Zm00001d032567_T001,-,multipleTissue,1,230724608,230725226,-,...,-27.083839,-10.496368,-3.498599,-47.558427,34.946736,-30.822783,-12.50635,11.3057,-11.05898,-78.504665


In [18]:
# Save the combined table; index=False leaves the row index out of the file.
new_df.to_excel('maize_word2vec_3mer_dataset.xlsx', index=False)