## Jupyter Notebook responsible for creating rice circRNA word2vec datasets

Raw data downloaded from: http://deepbiology.cn/crop/index.php/Home/Html/Download



In [2]:
# Install the third-party packages used by this notebook: Biopython (FASTA
# parsing) and gensim (Word2Vec). umap-learn is installed as well, although it
# is not imported in the cells visible here.
!pip install biopython gensim umap-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting biopython
  Using cached biopython-1.81-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
Collecting umap-learn
  Using cached umap_learn-0.5.3-py3-none-any.whl
Collecting pynndescent>=0.5 (from umap-learn)
  Using cached pynndescent-0.5.10-py3-none-any.whl
Installing collected packages: biopython, pynndescent, umap-learn
Successfully installed biopython-1.81 pynndescent-0.5.10 umap-learn-0.5.3


In [4]:
# Download data from PlantCircBase
# NOTE: the download is commented out, so the archive is expected to already be
# in the working directory. The -O name must match the file gzip decompresses below.
#!wget -O osaj43883_genomic_seq.txt.gz http://ibi.zju.edu.cn/plantcircbase/download/osaj43883_genomic_seq.txt.gz
# Decompress to osaj43883_genomic_seq.txt; `yes` auto-answers gzip's overwrite
# prompt if that file already exists. gzip -d replaces the .gz with the
# decompressed file, so re-running this cell needs the archive again.
!yes | gzip -d osaj43883_genomic_seq.txt.gz

In [3]:
# Import Libs
from Bio import SeqIO
import numpy as np
import pandas as pd

In [35]:
# Length of each k-mer "word"; used as the default k by the k-mer helpers.
kmer_size = 4
# Dimensionality of the Word2Vec embeddings and of the per-sequence vectors.
vector_size = 256

In [36]:
def circrna_to_kmers(circrna_sequence, k=kmer_size):
  """Split a sequence into its overlapping k-mers (stride 1).

  Args:
    circrna_sequence: Sliceable sequence, e.g. a string of nucleotides.
    k: k-mer length; defaults to the value `kmer_size` had when this
      function was defined.

  Returns:
    List of the consecutive length-k windows of the sequence, in order
    (len(circrna_sequence) - k + 1 of them); empty when the sequence is
    shorter than k.
  """
  window_count = len(circrna_sequence) - k + 1
  return [circrna_sequence[start:start + k] for start in range(window_count)]

In [37]:
# Build the Word2Vec training corpus: one line per FASTA record, holding that
# record's overlapping k-mers separated by single spaces.
# Both files are opened in a single `with` so the FASTA handle is closed as
# well as the corpus file, even if parsing raises part-way through.
with open('osaj43883_genomic_seq.txt', 'r') as fasta_handle, \
     open('oryza_corpus.txt', 'w') as corpus_handle:
  fasta_parser = SeqIO.parse(fasta_handle, 'fasta')
  for record in fasta_parser:
    record_kmers = circrna_to_kmers(str(record.seq))
    corpus_handle.write(' '.join(record_kmers) + '\n')

In [38]:
# Sanity check: show the first 5 lines of the generated corpus file.
!head -5 oryza_corpus.txt

GCTG CTGC TGCT GCTG CTGT TGTA GTAC TACC ACCA CCAA CAAG AAGC AGCC GCCA CCAT CATC ATCC TCCA CCAA CAAA AAAG AAGA AGAC GACG ACGC CGCA GCAA CAAC AACA ACAT CATT ATTG TTGA TGAA GAAG AAGT AGTG GTGC TGCA GCAT CATC ATCG TCGG CGGG GGGT GGTC GTCA TCAA CAAA AAAC AACC ACCA CCAG CAGC AGCC GCCA CCAA CAAT AATG ATGC TGCT GCTC CTCC TCCT CCTG CTGA TGAT GATT ATTT TTTG TTGT TGTT GTTG TTGT TGTC GTCT TCTT CTTT TTTT TTTG TTGT TGTC GTCA TCAA CAAT AATG ATGA TGAT GATA ATAT TATC ATCG TCGA CGAG GAGT AGTT GTTC TTCC TCCA CCAC CACA ACAT CATA ATAA TAAC AACC ACCA CCAC CACC ACCA CCAT CATC ATCT TCTC CTCT TCTA CTAG TAGC AGCA GCAT CATG ATGT TGTC GTCT TCTT CTTG TTGC TGCT GCTC CTCT TCTC CTCA TCAG CAGT AGTT GTTG TTGG TGGT GGTT GTTG TTGC TGCC GCCC CCCC CCCT CCTT CTTC TTCT TCTA CTAC TACA ACAA CAAT AATT ATTG TTGC TGCA GCAA CAAT AATG ATGG TGGG GGGA GGAA GAAC AACA ACAC CACC ACCT CCTG CTGG TGGA GGAT GATC ATCT TCTA CTAT TATG ATGG TGGA GGAC GACT ACTT CTTC TTCA TCAG CAGG AGGC GGCT GCTG CTGC TGCT GCTG CTGC TGCC GCCT CCTT CTTT TTTG TTGT 

In [39]:
from gensim.models import Word2Vec

# Create an untrained model; only the embedding size is set here, every other
# hyper-parameter is left at gensim's default.
# NOTE(review): gensim training with several worker threads is not guaranteed
# to be bit-for-bit reproducible — confirm whether workers=1 is needed.
w2v_model = Word2Vec(vector_size=vector_size)
# Scan the corpus file (one space-separated k-mer sentence per line) to build
# the vocabulary before training.
w2v_model.build_vocab(corpus_file='oryza_corpus.txt')

In [40]:
# Number of sentences (corpus lines) counted while building the vocabulary.
w2v_model.corpus_count

47732

In [41]:
# Train for a single epoch over the corpus file, passing the word total that
# build_vocab recorded.
# NOTE(review): train() updates the existing model, so re-running this cell
# trains it further rather than starting over.
# The displayed tuple is presumably (effective words trained, raw words seen) — confirm against gensim docs.
w2v_model.train(corpus_file='oryza_corpus.txt', total_words=w2v_model.corpus_total_words, epochs=1)

(45860120, 73653142)

In [42]:
def circrna_to_vec(circrna_sequence, k=kmer_size):
  """Embed a sequence as the sum of its k-mers' Word2Vec vectors.

  Args:
    circrna_sequence: Sequence to embed; split into overlapping k-mers with
      `circrna_to_kmers`.
    k: k-mer length; defaults to the value `kmer_size` had when this
      function was defined.

  Returns:
    NumPy array of length `vector_size` holding the element-wise sum of the
    vectors of every k-mer present in `w2v_model.wv`. k-mers missing from the
    vocabulary are skipped, so the result is all zeros if none are known.
  """
  embeddings = w2v_model.wv
  total = np.zeros(vector_size)
  for word in circrna_to_kmers(circrna_sequence, k=k):
    if word not in embeddings:
      continue  # out-of-vocabulary k-mer contributes nothing
    total = total + embeddings[word]
  return total

In [43]:
# Load the circRNA annotation table; later cells read its 'seq' column.
circ_df = pd.read_excel('rice_db.xlsx')

In [None]:
# Embed every sequence in circ_df['seq'] and collect the results into df_vecs:
# one row per sequence, one column per embedding dimension.
columns = [f'wc_4mer_{v+1}' for v in range(vector_size)]

# Collect all vectors first and build the frame once. DataFrame.append was
# removed in pandas 2.0, and appending row by row copies the frame each time.
# Iterate the column directly rather than binding it to the name `list`,
# which would shadow the builtin for the rest of the session.
vector_rows = [circrna_to_vec(record) for record in circ_df['seq']]

# Default RangeIndex (0..n-1) matches what append(..., ignore_index=True) produced.
df_vecs = pd.DataFrame(vector_rows, columns=columns)

In [45]:
# Attach the embedding columns to the annotation table side by side (axis=1);
# rows are matched on their index labels.
new_df = pd.concat([circ_df, df_vecs], axis=1)

# Preview the first rows of the combined table.
new_df.head()

Unnamed: 0,circName,circID,gene,isoform,stress,tissue,chr,start,end,strand,...,wc_4mer_247,wc_4mer_248,wc_4mer_249,wc_4mer_250,wc_4mer_251,wc_4mer_252,wc_4mer_253,wc_4mer_254,wc_4mer_255,wc_4mer_256
0,osa-circ1-OS01T0723400,1:30167620-30167771_+,OS01T0723400,OS01T0723400-01,-,multipleTissue,1,30167620,30167771,+,...,-3.65022,-6.532469,-10.394247,-18.915748,-8.209992,20.900938,10.121081,-8.663392,-3.891558,16.968743
1,osa-circ2-OS03T0223400,3:6461672-6462146_-,OS03T0223400,OS03T0223400-01,-,multipleTissue,3,6461672,6462146,-,...,65.037102,-54.414726,-7.826274,1.359892,10.894437,115.083468,-11.790173,-90.684992,-113.248148,-76.719906
2,osa-circ3-OS11T0210300,11:5715883-5716030_-,OS11T0210300,OS11T0210300-02,-,multipleTissue,11,5715883,5716030,-,...,-69.418313,2.976611,-89.222495,-14.065688,-11.113202,13.712974,-9.093328,18.582649,36.676268,28.016192
3,osa-circ4-OS02T0200900,2:5631023-5631244_-,OS02T0200900,OS02T0200900-02,-,multipleTissue,2,5631023,5631244,-,...,10.868538,-94.599972,4.269204,58.609413,36.12965,66.836573,-36.547408,-34.347736,-103.831539,15.517953
4,osa-circ5-OS05T0494800,5:24302336-24302448_+,OS05T0494800,OS05T0494800-01,-,multipleTissue,5,24302336,24302448,+,...,-32.298986,75.309151,-70.483905,-60.826504,-24.527909,-15.224254,49.960103,-62.473183,50.843977,62.039499


In [46]:
# Save the combined table to an Excel file without writing the index column.
new_df.to_excel('rice_word2vec_4mer_dataset.xlsx', index=False)