In [None]:
from genet import database as db
import pandas as pd

In [None]:
from Bio import Entrez

In [None]:
# First get the UIDs of the ClinVar records at the same genomic position.
# Credits: https://entrezpy.readthedocs.io/en/master/tutorials/esearch/esearch_uids.html
# NOTE(review): `variants`, `entrezpy` and `self` are not defined or imported in any
# cell visible above, so this cell cannot run on a fresh kernel as written. It looks
# like a snippet lifted from a class method — confirm and supply the missing names.
# NOTE(review): `chr` shadows the Python builtin of the same name.
chr = variants["chr"].split("chr")[1]  # text after the "chr" separator; presumably "chr17" -> "17" — verify input format
start, end = str(variants["start"]), str(variants["end"])

es = entrezpy.esearch.esearcher.Esearcher('esearcher', self.entrez_email)
# The query term has the form "<chr>[chr] AND <start>:<end>"; the field tag for the
# coordinate range is left commented out at the end of the line.
genomic_pos = chr + "[chr]" + " AND " + start + ":" + end  # + "[chrpos37]"
entrez_query = es.inquire(
    {'db': 'clinvar',
     'term': genomic_pos,
     'retmax': 100000,
     'retstart': 0,
     'rettype': 'uilist'})  # 'usehistory': False
entrez_uids = entrez_query.get_result().uids

In [None]:
# To look up reference-sequence information by gene name, the NCBI search
# string has to be built in the form shown below.

item = 'BRCA1'
# Binomial species name. The earlier value 'Homo sapien' was a misspelling and
# ended up verbatim inside the [Organism] term of the query.
animal = 'Homo sapiens'
search_string = f"{item}[Gene] AND {animal}[Organism] AND biomol_genomic[PROP] AND RefSeqGene[Filter]"

# Resulting query for the values above:
# 'BRCA1[Gene] AND Homo sapiens[Organism] AND biomol_genomic[PROP] AND RefSeqGene[Filter]'


In [None]:
# Run the query against the NCBI "nucleotide" database and keep the matching IDs.
# NOTE(review): no visible cell sets Entrez.email; Biopython's documentation asks
# for it to be set before calling NCBI — confirm it is configured elsewhere.
handle = Entrez.esearch(db="nucleotide", term=search_string)
try:
    record = Entrez.read(handle)
finally:
    # Always release the network handle, even if parsing the response fails.
    handle.close()
ids = record['IdList']

In [None]:
from Bio import SeqIO

# Download the GenBank entry for the IDs found by the search cell and parse it.
# - retmode must be 'text' for SeqIO's 'genbank' parser (the previous 'xlm' was a typo).
# - `ids` is used instead of record['IdList'] because this cell rebinds `record`
#   to the parsed sequence record; reading record['IdList'] here would fail the
#   second time the cell is run.
# NOTE(review): SeqIO.read expects exactly one record in the handle — confirm the
# search returns a single ID for the genes of interest.
fetch = Entrez.efetch(db='nucleotide', id=ids, rettype='gb', retmode='text')
try:
    record = SeqIO.read(fetch, 'genbank')
finally:
    # Release the network handle whether or not parsing succeeded.
    fetch.close()

record

In [None]:
# Inspect the first annotated feature of the fetched record: its Python type,
# its printed form, then its type label, location and qualifiers.
cRec = record.features[0]
for detail in (type(cRec), cRec, cRec.type, cRec.location, cRec.qualifiers):
    print(detail)

In [None]:
# Build a GetGene object for BRCA1 from genet's database module and display it.
gene = db.GetGene('BRCA1')
gene

In [None]:
# Transcript listing is currently disabled:
# print(gene.transcripts())

# Collect whatever gene.exons() returns for the BRCA1 gene object created above.
list_exons = gene.exons()

In [None]:
# Display the exon collection.
list_exons

In [None]:
# NOTE(review): `list_transcript` is not assigned in any cell visible in this
# notebook (the gene.transcripts() call above is commented out), so this cell
# raises NameError on a fresh kernel — confirm where it should come from.
list_transcript[1]

In [None]:
from pprint import pprint
# List every attribute name exposed by `seq`.
# NOTE(review): `seq` is not assigned in any cell visible in this notebook —
# confirm which object it is meant to be.
pprint(dir(seq))

In [None]:
# Keep the `.seq` attribute of `seq` under a short name
# (presumably the raw sequence — verify against the type of `seq`).
a = seq.seq

In [None]:
# Length of the value taken from seq.seq.
len(a)

In [None]:
# Print the text form of `seq`.
print(seq)

In [None]:
from genet import database as db

In [None]:
# Re-create the BRCA1 gene object.
# NOTE(review): this repeats an earlier cell and rebinds `gene` — likely removable.
gene = db.GetGene('BRCA1')

In [None]:

# Lookup table used as dict_models[cell_type][pe_system] -> model identifier string.
dict_models = {
    # HEK293T covers the most editors; its PE2 entry is the only 'DeepPrime_base' value.
    'HEK293T': {
        'PE2': 'DeepPrime_base',
        'NRCH_PE2': 'DeepPrime_FT/DPFT_293T_NRCH_PE2',
        'NRCH_PE2max': 'DeepPrime_FT/DPFT_293T_NRCH_PE2max',
        'PE2max': 'DeepPrime_FT/DPFT_293T_PE2max',
        'PE4max': 'DeepPrime_FT/DPFT_293T_PE4max',
    },
    # Remaining cell types, each with one or two editors.
    'A549': {'PE4max': 'DeepPrime_FT/DPFT_A549_PE4max'},
    'DLD1': {
        'NRCH_PE4max': 'DeepPrime_FT/DPFT_DLD1_NRCH_PE4max',
        'PE4max': 'DeepPrime_FT/DPFT_DLD1_PE4max',
    },
    'HCT116': {'PE2': 'DeepPrime_FT/DPFT_HCT116_PE2'},
    'HeLa': {'PE2max': 'DeepPrime_FT/DPFT_HeLa_PE2max'},
    'MDA-MB-231': {'PE2': 'DeepPrime_FT/DPFT_MDA_PE2'},
    'NIH3T3': {'NRCH_PE4max': 'DeepPrime_FT/DPFT_NIH_NRCH_PE4max'},
}

In [None]:
import sys

# Pick the model identifier for one (cell type, prime editor) combination.
pe_system = 'PE4max'
# NOTE(review): 'DLD11' is not a key of dict_models ('DLD1' is), so this cell always
# takes the failure branch below — confirm whether that is an intentional check of
# the error path or a typo.
cell_type = 'DLD11'

try:
    model_type = dict_models[cell_type][pe_system]
except KeyError:
    # Only a missing cell type / editor key means "not available". Any other
    # error (e.g. dict_models not defined yet) now surfaces instead of being
    # reported with this message.
    print('Not available Prime Editor')
    sys.exit()

print(model_type)

In [None]:
import torch
import torch.nn.functional as F
import torch.nn as nn

In [None]:
class GeneInteractionModel(nn.Module):
    """Two-branch regressor combining a sequence-like input `g` with a feature vector `x`.

    The `g` branch is a 2-D convolution, three pooled 1-D convolutions, a
    bidirectional GRU and a linear projection to 12 values. The `x` branch is a
    three-layer MLP producing 128 values. Both are concatenated (140 values) and
    mapped by the head to a single softplus-activated output per sample.

    Args:
        hidden_size: hidden width of the GRU (per direction).
        num_layers: number of stacked GRU layers.
        num_features: size of the last dimension of `x`.
        dropout: dropout probability used in the MLP and in the head.
    """

    @staticmethod
    def _pooled_conv1d(in_channels, out_channels):
        """Return the layers of one Conv1d -> BatchNorm1d -> GELU -> AvgPool1d(2) stage."""
        return [
            nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm1d(out_channels),
            nn.GELU(),
            nn.AvgPool1d(kernel_size=2, stride=2),
        ]

    def __init__(self, hidden_size, num_layers, num_features=24, dropout=0.1):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # 2-D convolution over a 4-channel input; the (2, 3) kernel has no padding
        # along the first spatial axis.
        self.c1 = nn.Sequential(
            nn.Conv2d(in_channels=4, out_channels=128, kernel_size=(2, 3), stride=1, padding=(0, 1)),
            nn.BatchNorm2d(128),
            nn.GELU(),
        )

        # Three pooled stages, flattened into one Sequential so the sub-module
        # indices stay 0..11.
        self.c2 = nn.Sequential(
            *self._pooled_conv1d(128, 108),
            *self._pooled_conv1d(108, 108),
            *self._pooled_conv1d(108, 128),
        )

        self.r = nn.GRU(128, hidden_size, num_layers, batch_first=True, bidirectional=True)

        # Projects the concatenated forward/backward GRU state down to 12 values.
        self.s = nn.Linear(2 * hidden_size, 12, bias=False)

        # Bias-free MLP for the feature vector: num_features -> 96 -> 64 -> 128.
        self.d = nn.Sequential(
            nn.Linear(num_features, 96, bias=False),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(96, 64, bias=False),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(64, 128, bias=False)
        )

        # 140 = 12 (sequence branch) + 128 (feature branch).
        self.head = nn.Sequential(
            nn.BatchNorm1d(140),
            nn.Dropout(dropout),
            nn.Linear(140, 1, bias=True),
        )

    def forward(self, g, x):
        """Return softplus(head(concat(seq_branch(g), mlp(x)))), one value per sample.

        Presumably `g` is (batch, 4, 2, length) and `x` is (batch, num_features) —
        verify against the caller.
        """
        conv_out = self.c2(self.c1(g).squeeze(2))
        # GRU is batch_first, so move channels to the last axis.
        gru_out, _ = self.r(conv_out.transpose(1, 2))
        seq_part = self.s(gru_out[:, -1])  # output at the last time step

        feat_part = self.d(x)

        merged = torch.cat([seq_part, feat_part], dim=1)
        return F.softplus(self.head(merged))


In [None]:
# Rebuild the network and load one set of fine-tuned weights from disk.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# NOTE(review): absolute, machine-specific path — consider deriving it from a
# configurable models directory.
m = r'D:\github_project\genet\genet\predict\models\DeepPrime\DeepPrime_FT\DPFT_293T_NRCH_PE2\final_model_0.pt'
model = GeneInteractionModel(hidden_size=128, num_layers=1).to(device)
# map_location remaps the stored tensors onto the device chosen above. Without
# it, a checkpoint saved from a GPU cannot be loaded on a CPU-only machine, even
# though `device` already falls back to 'cpu'.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a
# trusted source.
model.load_state_dict(torch.load(m, map_location=device))

In [None]:
# Write the model's state_dict back out under a new file name next to the original.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
path = r'D:\github_project\genet\genet\predict\models\DeepPrime\DeepPrime_FT\DPFT_293T_NRCH_PE2\final_model_0_new.pt'
torch.save(model.state_dict(), path)