# HyperSmORF
## Using hyperbolic neural networks to identify small proteins

In [None]:
# install packages
!pip install keras
!

In [50]:
# Import libraries

import sys
from pathlib import Path

import keras
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the full smORF dataset from its tab-separated file and preview the first rows

df = pd.read_csv("datasets/dataset_FINAL.tsv", delimiter='\t')
df.head()

Unnamed: 0,set,clust,name,origin,y,upstream,downstream,smorf
0,train,smorfam02316,SRS015578|100_12635.12635|324|473|1,real,positive,GCAGTGGAAACTGTGGAAGCTGCGGTGGATGCAGCGGTGCTGCTTT...,TGGTAGATATAAT,ATGGGAAATTCAACAGGATCCACAATGTACATGTTTGGTGCGGTCA...
1,train,smorfam00029,SRS017247|9023|255|395|1,real,positive,AANATGACTTTTTTTCTCTTTTGCCTTCATTTTTCTGATATAGAAC...,TTCTGACAGGNTTCGGATAAATCATNTATCTTTGCAAGGATAGTGA...,ATGAAACAGAGAAAAGTTTTAGTAGGTATCGCTATTGCCATTTTCA...
2,train,smorfam02445,SRS049896|22905|185|292|1,real,positive,GCGGAATTTTTTCTTAAAAGGCATTGACAAGGGAAACGATGCGTGA...,TGCTGTTTGGTTTTGCGGGTATTGCCGCCATCGCGGGTCTGATCTT...,ATGAAGATCGTAAAGAAGCTGGTAGCCGCCCTGATGGTGCTGGGCG...
3,train,smorfam00002,SRS015174|51208|59|208|-1,real,positive,ATATTTAGAAGTATGATTATGAAGTGCAAATAACTGCGTCAGGATT...,TAACTGAAGGCCCCCCCCCCTTTTCCGCCCCTTTTTTTTTTTTTTT...,ATGAGCAGAATTAAGACAATTTCAACAAGAAATATGGCAAAGTCAA...
4,train,smorfam01037,SRS063985|100_30365.30365|110|235|-1,real,positive,AAGCAAAACCGGCAGTGGAAGAAAACAAATACATCGCCATTGAAGG...,TTACAATAGACTATTTTAGGAGGAATGACTTATGTTAGTACCAAGT...,ATGTCAGACAAAAATCCAAAACATCCATTAAAAAAGAAGAAAGTAA...


In [4]:
# Save a random subset of the dataset.
# The fixed random_state makes the same 10,000 rows come out on every run.

df_subset=df.sample(10000, random_state=42)

# to_csv does not create missing directories, so make sure the target folder exists first
subset_path = Path('DATA/smorf_subset.csv')
subset_path.parent.mkdir(parents=True, exist_ok=True)
df_subset.to_csv(subset_path)

In [85]:
# Keep only the training rows of the subset, then split them into
# inputs (the smorf nucleotide sequence strings) and boolean labels
# (True where the y column equals 'positive')

train = df_subset[df_subset['set'] == 'train']
x = train['smorf'].tolist()
y = train['y'] == 'positive'

In [82]:
# encode a nucleotide sequence as a one-hot matrix over a fixed alphabet

def onehote(sequence, alphabet="ACGT"):
  """One-hot encode a sequence, one row per character.

  Every sequence is encoded against the same ``alphabet``, so column ``j``
  always stands for ``alphabet[j]`` and every result has ``len(alphabet)``
  columns. Fitting an encoder on each sequence separately would make the
  number and meaning of the columns depend on which characters that one
  sequence happens to contain (an ambiguous ``N`` base would add a column,
  a sequence lacking one base would drop a column).

  Parameters
  ----------
  sequence : str or iterable of single-character strings
      The sequence to encode. May be empty.
  alphabet : str or iterable of single-character strings, default "ACGT"
      Symbols that receive a column, in column order. Matching is
      case-sensitive.

  Returns
  -------
  numpy.ndarray
      float64 array of shape ``(len(sequence), len(alphabet))``. Row ``i``
      has a 1.0 in the column of ``sequence[i]``; characters that are not in
      ``alphabet`` (e.g. ``N``) produce an all-zero row.
  """
  # explicit string dtype keeps the comparison well-defined for empty input
  symbols = np.array(list(alphabet), dtype=str)
  seq_array = np.array(list(sequence), dtype=str)
  # broadcast (length, 1) against (1, n_symbols) to get the full indicator matrix
  matches = seq_array[:, None] == symbols[None, :]
  return matches.astype(float)

In [86]:
# Sanity check: one-hot encode the second training sequence (x[1]) and display the matrix
onehote(x[1])

array([[1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],


In [75]:
# Display the list of training sequences (this prints the whole list, so the output is long)
x

['ATGTCGTGGCTGTCATTTCATTCCCAGCTGTCCTTGAGCCCCGCCGCCGTCCAGCAGGATAACCCATCTTCTCGCTCCAACGCAGGTGCCGCTCCGGAGACCGGCCCGGCCGCGCTGACGCCACCGTTGCTGGCGCAGTAA',
 'ATGAATGAGTTAGACCATGAGGTAAAGAGCTTACGTAAAGAATTAAACACAGAACGTACACAAAGAGTCAGTGTAGATGAGTCTTTAGGTATGCGTTTTGATAATCTAATATTTCACCTAGATAACAAGTAA',
 'ATGCTGCCGCGCTGCGCTCAGAAAAAATCGGATAACTTGACGATACCCTGGATTCCGCGCATGAACGGGAGGCCGGCCGAGGTATCGCAGGTGCGGGAGTCTGGCATGGGGGCGCGCGAGGATCCCGCCTCACAAAGGCTGGAGTGA',
 'ATGAATATATCAGCTAGTAGCCTCAGAATTGGTTGGACTTTTGGAAGAATAGCTATTAGTATTAGGATTATTATAATGTGCGGCTATCTCAGGATTTAA',
 'ATGAGTGATAATTATAAAAAGAACTTTTTAGCGATTTGTTTTCTATCAAATTTAGCGTTGTTGGTTTTATCTGACAATTTAGCACTTCGTGTTTTGAACTTTTTAGCGATTGTTTTAGTGATTTATGTGTTTTCGCTAGTTGATAAAAATTAA',
 'ATGAAACGAAATACTAAATTCGGAAACTTTGCCGCCGAAAAACAGGAGGTAACGTTTGACTTAATTAGTCAAAGAGTATAG',
 'ATGAAAAGAAGATGTTATGAAATACCACATGGGTTTCATTATATGGATGTCTTACAACCGTATTTTGAAATGGGGTGGTCTTTAACAAAAGTTATGCGTATCGGTCAAGATACGTCAGTTATTATTGTTCACCCATAG',
 'ATGCGAGTTCGGCGTTTTTTTTATGACAATCGGGATAAAGGCTCCAGCCATTTACCGCGAGATAGCTCGTGG

In [64]:
# View the training sequences as a NumPy array of strings (the result is displayed, not stored)
np.array(x)

array(['ATGTCGTGGCTGTCATTTCATTCCCAGCTGTCCTTGAGCCCCGCCGCCGTCCAGCAGGATAACCCATCTTCTCGCTCCAACGCAGGTGCCGCTCCGGAGACCGGCCCGGCCGCGCTGACGCCACCGTTGCTGGCGCAGTAA',
       'ATGAATGAGTTAGACCATGAGGTAAAGAGCTTACGTAAAGAATTAAACACAGAACGTACACAAAGAGTCAGTGTAGATGAGTCTTTAGGTATGCGTTTTGATAATCTAATATTTCACCTAGATAACAAGTAA',
       'ATGCTGCCGCGCTGCGCTCAGAAAAAATCGGATAACTTGACGATACCCTGGATTCCGCGCATGAACGGGAGGCCGGCCGAGGTATCGCAGGTGCGGGAGTCTGGCATGGGGGCGCGCGAGGATCCCGCCTCACAAAGGCTGGAGTGA',
       'ATGAATATATCAGCTAGTAGCCTCAGAATTGGTTGGACTTTTGGAAGAATAGCTATTAGTATTAGGATTATTATAATGTGCGGCTATCTCAGGATTTAA',
       'ATGAGTGATAATTATAAAAAGAACTTTTTAGCGATTTGTTTTCTATCAAATTTAGCGTTGTTGGTTTTATCTGACAATTTAGCACTTCGTGTTTTGAACTTTTTAGCGATTGTTTTAGTGATTTATGTGTTTTCGCTAGTTGATAAAAATTAA',
       'ATGAAACGAAATACTAAATTCGGAAACTTTGCCGCCGAAAAACAGGAGGTAACGTTTGACTTAATTAGTCAAAGAGTATAG',
       'ATGAAAAGAAGATGTTATGAAATACCACATGGGTTTCATTATATGGATGTCTTACAACCGTATTTTGAAATGGGGTGGTCTTTAACAAAAGTTATGCGTATCGGTCAAGATACGTCAGTTATTATTGTTCACCCATAG',
       'ATGCGAGTTCGGCGTTTTTTTTAT