## Jupyter Notebook responsible for creating rice circRNA word2vec datasets

Raw data downloaded from: http://deepbiology.cn/crop/index.php/Home/Html/Download



In [2]:
!pip install biopython gensim umap-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting biopython
  Using cached biopython-1.81-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
Collecting umap-learn
  Using cached umap_learn-0.5.3-py3-none-any.whl
Collecting pynndescent>=0.5 (from umap-learn)
  Using cached pynndescent-0.5.10-py3-none-any.whl
Installing collected packages: biopython, pynndescent, umap-learn
Successfully installed biopython-1.81 pynndescent-0.5.10 umap-learn-0.5.3


In [4]:
# Download data from PlantCircBase
# NOTE(review): the wget line below is commented out, so this cell assumes that
# osaj43883_genomic_seq.txt.gz is already present in the working directory.
# If the download is re-enabled, its -O target (v4_osa_genomic_seq.txt.gz) does
# not match the file name that gzip decompresses below; one of them must change.
#!wget -O v4_osa_genomic_seq.txt.gz http://ibi.zju.edu.cn/plantcircbase/download/osaj43883_genomic_seq.txt.gz
# Decompress the archive in place. `yes` is piped into gzip, presumably to
# auto-confirm an overwrite prompt — TODO confirm it has the intended effect.
!yes | gzip -d osaj43883_genomic_seq.txt.gz

In [3]:
# Import Libs
from Bio import SeqIO
import numpy as np
import pandas as pd

In [22]:
# Width of the overlapping k-mers ("words") cut from each sequence; used as the
# default `k` of circrna_to_kmers and circrna_to_vec.
kmer_size = 3
# Dimensionality of the word2vec embeddings; also the length of the
# per-sequence vector and the number of feature columns generated later.
vector_size = 64

In [23]:
def circrna_to_kmers(circrna_sequence, k=kmer_size):
  """Split a sequence into its overlapping k-mers.

  A window of width `k` slides over `circrna_sequence` one position at a time,
  from left to right.

  Args:
    circrna_sequence: sequence to tokenize (must support `len()` and slicing).
    k: window width; defaults to the notebook-level `kmer_size`.

  Returns:
    A list with the `len(circrna_sequence) - k + 1` windows in order, or an
    empty list when the sequence is shorter than `k`.
  """
  window_count = len(circrna_sequence) - k + 1
  return [circrna_sequence[start:start + k] for start in range(window_count)]

In [24]:
# Build the word2vec training corpus: one output line per FASTA record, holding
# that record's overlapping k-mers separated by single spaces.
# Both files are opened through `with` so the FASTA handle is closed as well
# (it used to stay open for the lifetime of the kernel).
with open('osaj43883_genomic_seq.txt', 'r') as fasta_handle:
  fasta_parser = SeqIO.parse(fasta_handle, 'fasta')

  with open('oryza_corpus.txt', 'w') as corpus_handle:
    # Records are streamed one by one, so the whole FASTA file is never held in memory.
    for record in fasta_parser:
      record_kmers = circrna_to_kmers(str(record.seq))
      corpus_handle.write(' '.join(record_kmers) + '\n')

In [25]:
# Preview the first lines of the generated corpus (one line of space-separated
# k-mers per FASTA record).
!head -5 oryza_corpus.txt

GCT CTG TGC GCT CTG TGT GTA TAC ACC CCA CAA AAG AGC GCC CCA CAT ATC TCC CCA CAA AAA AAG AGA GAC ACG CGC GCA CAA AAC ACA CAT ATT TTG TGA GAA AAG AGT GTG TGC GCA CAT ATC TCG CGG GGG GGT GTC TCA CAA AAA AAC ACC CCA CAG AGC GCC CCA CAA AAT ATG TGC GCT CTC TCC CCT CTG TGA GAT ATT TTT TTG TGT GTT TTG TGT GTC TCT CTT TTT TTT TTG TGT GTC TCA CAA AAT ATG TGA GAT ATA TAT ATC TCG CGA GAG AGT GTT TTC TCC CCA CAC ACA CAT ATA TAA AAC ACC CCA CAC ACC CCA CAT ATC TCT CTC TCT CTA TAG AGC GCA CAT ATG TGT GTC TCT CTT TTG TGC GCT CTC TCT CTC TCA CAG AGT GTT TTG TGG GGT GTT TTG TGC GCC CCC CCC CCT CTT TTC TCT CTA TAC ACA CAA AAT ATT TTG TGC GCA CAA AAT ATG TGG GGG GGA GAA AAC ACA CAC ACC CCT CTG TGG GGA GAT ATC TCT CTA TAT ATG TGG GGA GAC ACT CTT TTC TCA CAG AGG GGC GCT CTG TGC GCT CTG TGC GCC CCT CTT TTT TTG TGT GTC TCA CAA AAC ACA CAT ATT TTG TGT GTT TTG TGA GAC ACA CAT ATA TAC ACA CAA AAT ATT TTG TGC GCC CCA CAG AGA GAA AAC ACA CAC ACG CGA GAG AGC GCC CCA CAG AGG GGG GGG GGC GCC CCA CAT ATC TCT CTG TGT 

In [26]:
from gensim.models import Word2Vec

# Create an untrained word2vec model with `vector_size`-dimensional embeddings;
# every other hyper-parameter is left at the gensim default.
w2v_model = Word2Vec(vector_size=vector_size)
# Scan the corpus file to build the k-mer vocabulary before training.
w2v_model.build_vocab(corpus_file='oryza_corpus.txt')

In [27]:
# Sanity check: number of corpus lines (one per FASTA record) counted while
# building the vocabulary.
w2v_model.corpus_count

47735

In [28]:
# Train the embeddings on the corpus file for a single epoch; `total_words`
# reuses the word count gathered by build_vocab. The value returned by
# train() is displayed as the cell output.
w2v_model.train(corpus_file='oryza_corpus.txt', total_words=w2v_model.corpus_total_words, epochs=1)

(23163435, 73683738)

In [29]:
def circrna_to_vec(circrna_sequence, k=kmer_size):
  """Embed a sequence as the sum of the word2vec vectors of its k-mers.

  The sequence is tokenized with `circrna_to_kmers`; every k-mer known to the
  trained `w2v_model` contributes its vector, k-mers missing from the
  vocabulary are skipped. The sum is not normalized by the sequence length.

  Args:
    circrna_sequence: sequence to embed.
    k: k-mer width passed to `circrna_to_kmers`; defaults to `kmer_size`.

  Returns:
    A NumPy array of length `vector_size`; all zeros when no k-mer of the
    sequence is in the vocabulary.
  """
  keyed_vectors = w2v_model.wv
  embedding = np.zeros(vector_size)
  for token in circrna_to_kmers(circrna_sequence, k=k):
    if token not in keyed_vectors:
      continue
    # Accumulate in encounter order, exactly like a running `vec = vec + ...`.
    embedding += keyed_vectors[token]
  return embedding

In [30]:
# Load the circRNA table; its `seq` column holds the sequences that are turned
# into embedding vectors further below.
circ_df = pd.read_excel('rice_db.xlsx')

In [None]:
# Embed every sequence of the table and collect the vectors in one DataFrame.
# The frame is built in a single construction: `DataFrame.append` was removed
# in pandas 2.0, and growing a frame row by row is quadratic in the row count.
# The column is bound to `sequences` instead of `list`, so the builtin `list`
# is no longer shadowed for the rest of the notebook.
sequences = circ_df['seq']

# One feature column per embedding dimension, numbered from 1.
# NOTE: the "3mer" part of the prefix is hard-coded and does not follow `kmer_size`.
columns = [f'wc_3mer_{v+1}' for v in range(vector_size)]

# Each vector returned by circrna_to_vec becomes one row, in table order; the
# resulting default 0..n-1 index matches what `ignore_index=True` produced.
df_vecs = pd.DataFrame(
    [circrna_to_vec(record) for record in sequences],
    columns=columns)

In [33]:
# Put the embedding columns next to the original table columns; rows are
# aligned on the index of the two frames.
new_df = pd.concat([circ_df, df_vecs], axis=1)

# Preview the first rows of the combined frame.
new_df.head()

Unnamed: 0,circName,circID,gene,isoform,stress,tissue,chr,start,end,strand,...,wc_3mer_55,wc_3mer_56,wc_3mer_57,wc_3mer_58,wc_3mer_59,wc_3mer_60,wc_3mer_61,wc_3mer_62,wc_3mer_63,wc_3mer_64
0,osa-circ1-OS01T0723400,1:30167620-30167771_+,OS01T0723400,OS01T0723400-01,-,multipleTissue,1,30167620,30167771,+,...,-25.19884,2.249939,-4.701558,-3.633969,5.212902,-6.579618,11.032136,-8.533007,-8.32791,18.141064
1,osa-circ2-OS03T0223400,3:6461672-6462146_-,OS03T0223400,OS03T0223400-01,-,multipleTissue,3,6461672,6462146,-,...,-27.264821,51.35945,-73.188428,-1.662928,56.56528,28.610637,80.878607,38.090248,-30.54351,28.859375
2,osa-circ3-OS11T0210300,11:5715883-5716030_-,OS11T0210300,OS11T0210300-02,-,multipleTissue,11,5715883,5716030,-,...,-69.36683,1.113416,-42.68958,-6.150814,23.203639,-7.541412,-11.218056,-40.615067,16.391329,1.324206
3,osa-circ4-OS02T0200900,2:5631023-5631244_-,OS02T0200900,OS02T0200900-02,-,multipleTissue,2,5631023,5631244,-,...,42.391724,50.721551,-67.643923,-24.403869,41.171276,45.067768,9.704687,9.653121,7.829608,-11.254961
4,osa-circ5-OS05T0494800,5:24302336-24302448_+,OS05T0494800,OS05T0494800-01,-,multipleTissue,5,24302336,24302448,+,...,-75.243917,-36.308267,7.583992,-27.560256,-24.934966,-15.739278,9.237141,-16.380438,-18.462203,7.718794


In [34]:
# Save the combined table (original columns + embedding features) to Excel
# without writing the DataFrame index as an extra column.
new_df.to_excel('rice_word2vec_3mer_dataset.xlsx', index=False)