In [1]:
## bring in needed mods
import pandas as pd, numpy as np, os, subprocess
from Bio.Seq import Seq
from Bio import SeqIO

In [2]:
## Set chromosome of interest
chrom = 'Chr14'
peak = (355325,421543)

In [3]:
chromint = str(int(chrom[-2:]))

In [4]:
contigpath = '/Users/croth/Desktop/CRYPTO_QTL/FILES/%s/%s.fasta'%(chrom,chrom)
contigrec = [rec for rec in SeqIO.parse(contigpath,contigpath.split('.')[-1])][0]

In [5]:
vardf = pd.read_csv('../FILES/OLD/OCT2018/CDx-ill-SNP-INDEL-CHR%s-df-104.csv'%(chromint),
                    index_col=0)
vardf.head()

Unnamed: 0,Pos,Ref,Alt,Start,End,aStart,aEnd,Subtype,status,invcf,...,SS-B598,SS-B565,SS-B600,SS-B574,SS-B872_Correction,SS-B873_Correction,SS-B360,SS-B397,SS-B564,SS-B382
0,6648,T,G,6647,6648,6647,6648,tv,LOW,CNN00020,...,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0
1,24175,C,T,24174,24175,24174,24175,ts,LOW,CNN00070,...,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0
2,28073,CA,CG,28072,28074,28073,28074,ins,LOW,CNN00080,...,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0
3,28198,TTACA,TTACG,28197,28202,28198,28202,ins,MODIFIER,CNN00070,...,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0
4,31161,T,A,31160,31161,31160,31161,tv,MODIFIER,CNN00085,...,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0


In [6]:
#ch_gff_path  ='../FILES/%s/%s_xl280_gff.csv'%(chrom,chrom)
gff_path = '../FILES/xl280genome.gff.csv'
gff = pd.read_csv(gff_path,index_col=0,header=0)

In [7]:
ch_gff = gff[(gff.xl280_chrom==chrom)].sort_values(by='start')
ch_gff.drop(['chrom','start','end'],inplace=True,axis=1)
ch_gff.columns = ['type','strand','phase','attribute','gene','start','end','chrom']
ch_gff.head()

Unnamed: 0,type,strand,phase,attribute,gene,start,end,chrom
95304,gene,+,.,ID=CNN00010;description=unspecified product,CNN00010,2481,4297,Chr14
95305,mRNA,+,.,ID=CNN00010.mRNA;Parent=CNN00010;description=u...,CNN00010,2481,4297,Chr14
95306,exon,+,.,ID=exon_CNN00010-E1;Parent=CNN00010.mRNA,CNN00010,2481,4297,Chr14
95307,CDS,+,0,ID=CNN00010.mRNA-p1-CDS1;Parent=CNN00010.mRNA;...,CNN00010,2481,4297,Chr14
100166,three_prime_UTR,-,.,ID=utr_CNN00020-t26_1_1;Parent=CNN00020-t26_1,CNN00020,5071,5096,Chr14


In [8]:
def gff_gene(gene,vardf,gff,sequen,savepath=None,verbose=False):
    ggff = gff[(gff.gene==gene)].sort_values(by='start')
    gvdf = vardf[(vardf.Pos >= ggff.start.min()) & 
             (vardf.Pos <= ggff.end.max())][[
            'Start','End','Ref','Alt']].copy()
    if gvdf.shape[0] == 0:
        if verbose:
            print(gvdf.shape)
            print('No genetic variants in %s'%(gene))
        return []
    gvdf['Var'] = 1
    variant_positions = np.concatenate([np.arange(*a[1][['Start','End']]) 
                                    for a in gvdf.iterrows()])
    non_variant_positions = [a for a in np.arange(ggff.start.min(),ggff.end.max()) 
                         if a not in variant_positions]
    non_variants = [str(sequen[p:p+1]) for p in non_variant_positions]
    gnvdf = pd.DataFrame([non_variant_positions,non_variant_positions,
                      non_variants,non_variants,
                      list(np.zeros(len(non_variant_positions)))],
                     index=['Start','End','Ref','Alt','Var']).T
    gdf = pd.concat([gvdf,gnvdf]).sort_values(by=['Start','End']).reset_index(drop=True)
    gdf['Strand'] = [1 if ggff.strand.unique()[0] == '+' else -1][0]
    gdf['Seq'] = 0
    gdf['Gene'] = gene
    for g in ggff[(ggff.type=='CDS')].iterrows():
        fpos = np.arange(g[1].start,g[1].end)
        gdf.at[gdf.Start.isin(fpos),'Seq'] = 1
    for g in ggff[(ggff.type=='three_prime_UTR')].iterrows():
        fpos = np.arange(g[1].start,g[1].end)
        gdf.at[gdf.Start.isin(fpos),'Seq'] = 3
    for g in ggff[(ggff.type=='five_prime_UTR')].iterrows():
        fpos = np.arange(g[1].start,g[1].end)
        gdf.at[gdf.Start.isin(fpos),'Seq'] = 5
    if savepath is not None:
        gdf.to_csv(savepath)
    return gdf

def gene_protien(gdf,a='Ref',tostop=True):
    strand = gdf.Strand.unique()[0]
    if strand > 0:
        aa = Seq(''.join(gdf[(gdf.Seq==1)][a].tolist())).translate(to_stop=tostop)
    else:
        assert strand < 0
        aa = Seq(''.join(gdf[(gdf.Seq==1)][a].tolist())).reverse_complement().translate(to_stop=tostop)
    return aa

def protien_compare(a,b):
    if len(a) != len(b):
        flag = 'Modified:%s'%(str(len(a)-len(b)))
    else:
        flag = []
        for i, k in enumerate(a):
            if k != b[i]:
                flag.append(i)
        if len(flag) == 0:
            assert len(a) == len(b)
            flag = 'Synonymous'
        else:
            flag = ','.join([str(a) for a in flag])
    return flag   

In [9]:
peak_genes = ch_gff[(ch_gff.start>=peak[0]) & (ch_gff.end<=peak[-1])].gene.unique()

In [10]:
len(peak_genes)

30

In [11]:
peak_gene_res = []
for gene in peak_genes:
    genegdf = gff_gene(gene,vardf,ch_gff,contigrec.seq)
    if len(genegdf) == 0:
        peak_gene_res.append('Nonvariant')
        continue
    gene_r_aa = gene_protien(genegdf,a='Ref')
    gene_a_aa = gene_protien(genegdf,a='Alt')
    peak_gene_res.append(protien_compare(gene_r_aa,gene_a_aa))

In [12]:
peak_gene_res_df = pd.DataFrame([peak_genes,peak_gene_res],index=['Name','Result']).T

In [13]:
cnn01270 = gff_gene('CNN01270',vardf,ch_gff,contigrec.seq)
cnn01270_r_aa = gene_protien(cnn01270,a='Ref',tostop=True)
len(cnn01270_r_aa)*3

1368

In [14]:
Ref_seq = ''.join(cnn01270[(cnn01270.Seq==1)].sort_values('Start').Ref.tolist())

In [15]:
len(Ref_seq)

2076

In [16]:
temp = Seq(Ref_seq).translate()

In [17]:
len(temp)

692

In [18]:
for i, k in enumerate(temp):
    if k == '*':
        print(i)

456
691


In [19]:
cnn01270gff = gff[(gff.xl280_chrom=='Chr14') & (gff.gene=='CNN01270') & 
                  (gff.type.isin(['gene','CDS','three_prime_UTR','five_prime_UTR']))]

In [21]:
cnn01270gff

Unnamed: 0,chrom,type,start,end,strand,phase,attribute,gene,xl280_start,xl280_end,xl280_chrom
64388,AE017356.1,gene,386822,389688,+,.,ID=CNN01270;description=hypothetical protein,CNN01270,386833,389699,Chr14
64398,AE017356.1,CDS,387145,387245,+,0,ID=CNN01270-t26_1-p1-CDS1;Parent=CNN01270-t26_...,CNN01270,387156,387256,Chr14
64399,AE017356.1,CDS,387309,387417,+,2,ID=CNN01270-t26_1-p1-CDS2;Parent=CNN01270-t26_...,CNN01270,387320,387428,Chr14
64400,AE017356.1,CDS,387473,387895,+,2,ID=CNN01270-t26_1-p1-CDS3;Parent=CNN01270-t26_...,CNN01270,387484,387906,Chr14
64401,AE017356.1,CDS,387940,388089,+,0,ID=CNN01270-t26_1-p1-CDS4;Parent=CNN01270-t26_...,CNN01270,387951,388100,Chr14
64402,AE017356.1,CDS,388135,388711,+,1,ID=CNN01270-t26_1-p1-CDS5;Parent=CNN01270-t26_...,CNN01270,388146,388722,Chr14
64403,AE017356.1,CDS,388766,388906,+,1,ID=CNN01270-t26_1-p1-CDS6;Parent=CNN01270-t26_...,CNN01270,388777,388917,Chr14
64404,AE017356.1,CDS,388959,389540,+,2,ID=CNN01270-t26_1-p1-CDS7;Parent=CNN01270-t26_...,CNN01270,388970,389551,Chr14
64405,AE017356.1,five_prime_UTR,386822,387025,+,.,ID=utr_CNN01270-t26_1_1;Parent=CNN01270-t26_1,CNN01270,386833,387036,Chr14
64406,AE017356.1,five_prime_UTR,387075,387145,+,.,ID=utr_CNN01270-t26_1_2;Parent=CNN01270-t26_1,CNN01270,387086,387156,Chr14


In [22]:
#cnn01270[(cnn01270.Start>=388777) & (cnn01270.End<388917)]

In [23]:
jecchrom = cnn01270gff.chrom.tolist()[0]

In [24]:
jeccontigpath = '/Users/croth/Desktop/CRYPTO_QTL/FILES/%s/%s.fasta'%(jecchrom,jecchrom)
jeccontigrec = [rec for rec in SeqIO.parse(jeccontigpath,jeccontigpath.split('.')[-1])][0]

In [25]:
xl280cnn01270 = contigrec.seq[cnn01270gff.xl280_start.min():cnn01270gff.xl280_end.max()]
jeccnn01270 = jeccontigrec.seq[cnn01270gff.start.min():cnn01270gff.end.max()]

In [26]:
xl280cnn01270_ref = Seq(''.join(cnn01270.Ref.tolist()))
xl280cnn01270_alt = Seq(''.join(cnn01270.Alt.tolist()))

In [27]:
for i, s in enumerate(xl280cnn01270):
    if s != jeccnn01270[i]:
        print(i)

1957


In [28]:
jeccnn01270_ref = Seq('caagtatcgccccatcacttccgactttccgctccctttggtcgatcatccctcacattcccaggtcgcagactcccagctcccccgcagcagatcgggtggacaccttatatctcaaaatcctcgtggcatccatccgacgccgtccgccagcaacaacttctcgagcggacccggatacgagagagctcatcgtacgccaagttagtataatatattaccgcgtagccagtccgtaattcataacctctagcaaatagcaagcgagttaatccaattgtgtactcggcattgagggcctcagacgctgaaacgaagtcataATGTCCGTCAAGAGCCTATTAGCTTCGAGCTACGGCTCTATTGAGAACCGACGATACTCTTTAGATAGAATACGAGGGGAACTGAATGCTGTAGTAGAAGgtaagtaatcaccgtacccccgtaatgaccattcgttcacttgtctgctgcactgctgttatagCTCTCCCGACGCAAGTGCATCCAGCTGAAAGAGAGATTTACCTGAACGACATTCTATCCGACCTGCATCCGAAAGAAGAAGCAATATGGAGGGATTGGCCAAGCGATGgtaagctcatcttgtgcctagagaatatcagaataccgcgtttatcaagagcgtagTCTACCTACTTGCCCTCACGGCTGTCAAATGCCTCGGACGAAACCCCATTGGATCAGAAACCCTGCTCTCAAGCCATCACATTTCCACTCTCATGTTTCATTCAACATTACTATCAGATGATTCTGCGACGTTTCCGACCGTCGTTGCCCAGTCATCAACGTCACCACAGGCTCGTGAAGCCTTGAAGATCTTGGCCAACATGCTAGTTCTTCACACCGCCGGACGTATCGAATTCTTCAAAGCCCGCGGAGCTATAGCGGTTGCTCGCGCTTTGTCCGAAAGTTCTACACAAGACGAGAATGATGCAGCGCATGCGGAAAAGCTGTTTTTACTGGGGAGACTTGGATTTTTGGTCACTGTGGAAAGGGCTGAAGCGGCGAAGCAGATGGTAGATAACGAGGTGGTGGATGCCCTCGTGCAGgtgagactgtccatctcgtccaagacagaactaacgaaggagcagATTTTCATGTCGGCCTCTGTCTTACCCGCTTCTTTCTTGTATCTGAGCGAATTGCTCAAACTGACGAACGCATTACTTCAATTCTATCCTTATAAACAGCCCTCTGATGCTACAACTGGTGTTGATCCTTGGGACGAAAAGTTTGATTAgtgagtaattgttgttaaacagttcagttctgatcttatctctaagCCTTCTTTACCCTCTTTTGCGCCTTTTTTACGCTACTCCAACGGTCGATCTTAGCCCGCCACTTACCCACATCATGAATACTCTGCTTTTGATACCGTTCAAAACACGTTTGTTACCAACATGGGCTTCGGTCCCTGAAAGTCCCGGTTCCCTACAAGTCAACAGTCCCTCGAGCCCTTCGTCCACAATGAGAAATATTCTCACCAAGCTGGGAAACATAGCTTCACCATCCAGCCCTCGCAAGTCTTCGGCCGGCTCACTTGCCCCGCCTGGTTCTAGGCCGCCAGCAGGTGGTCAGAGATCCGCACCAAATTCACCGCGAGGTAGTTTCTCGTCTTCGAGGCCTGGCTTTGGTGCGACATCCGATCATTTCGCGCTATCGTCAAGGCTTCTGAAGATCCTTGACCGGTTCTTTGAGGCATATCTACCATACCCTAAACGGCCGGATGACGATCTGCCTCATTCACTTGTACTGGATGAAATACTTCCTCCGTTATTACTCTTAATGACACGAGCCACATGGGGTTTGGAAAACGTGCGGTTATCGATCAAAGAAATTTTATTACCTTCCTCTCTgtaggtctgctctctggtggattttatcaaattcaattgacgttttatgctgcagAGACCGCTCATCAGAAGCAGGGCCGCTTGAATCCCGCAAGGGTTTACTAGGCAACATTCTGCGTTTAATGGCTTGTGCAGGTCATACTCAAACAAGAAATGCTGCCGGAGAGTTGATGTGGGCAATCTGCAATGGTAATGgtatgtttagattctcccatattttacaagcaactttgctgagcttcggacagCTTCCGATCTCTGCGTAGAAATCGGCTATGGCAACGCAGCTGGCATACTGTTCCAAAAAGGGCTCACTGGTCCACCATCGGCAAAGGTCGAAGAGATTGACCAGCCTAATCCTTCGCAGACCGTGATGCAAATAGCCAAGTCATCTAATCGCCGATCAGACGTCCGGTCGCCTGTAACAACTGTCCAGCCGGCTACGCCTTCTTCTACGTCGACATTCAGCGCCGAAACTTTGAGGAATCCAATCACAGGTATTGAGAACGGTAATCAAGCGACAGGTGACCTGGACGAAATGACTCAGGAAGAGAAGGAGCGCGAAGCAGAAAGGTTGTTTATTTTGTTTGATAGGATGGAGAAGAATCCTGTGATTAGCATGAAGTCAGGGGATGACCAAGACGGACAAAAGAGTAAGGCTCAAGGGCTGAAGGATATAATGAGAGAAAAGTTAGAGAGTGGGGATATGGAGAGATGGGATCGCAAGGATGAGCAAGAGGAGAGACAGAGATTGGAGGAGGAGGCGCAAAAAGACGAAGAGGAAGCTTTCCGGGAGCTTGCGGCCTATAAACGACGCACTGGAAGGTAGtaaccttacagatataatctatcaatcagagacattggggcccaaatcgttttcgactgtgatatttggttatatcttgtgccttgcgtaaaagttgtcatgctatatgtgacatatccgtactatacaatgatgtctactccttcat')

In [29]:
cnn01270_seqs = [jeccnn01270_ref,jeccnn01270,xl280cnn01270,xl280cnn01270_ref,xl280cnn01270_alt]

In [30]:
cnn01270gff

Unnamed: 0,chrom,type,start,end,strand,phase,attribute,gene,xl280_start,xl280_end,xl280_chrom
64388,AE017356.1,gene,386822,389688,+,.,ID=CNN01270;description=hypothetical protein,CNN01270,386833,389699,Chr14
64398,AE017356.1,CDS,387145,387245,+,0,ID=CNN01270-t26_1-p1-CDS1;Parent=CNN01270-t26_...,CNN01270,387156,387256,Chr14
64399,AE017356.1,CDS,387309,387417,+,2,ID=CNN01270-t26_1-p1-CDS2;Parent=CNN01270-t26_...,CNN01270,387320,387428,Chr14
64400,AE017356.1,CDS,387473,387895,+,2,ID=CNN01270-t26_1-p1-CDS3;Parent=CNN01270-t26_...,CNN01270,387484,387906,Chr14
64401,AE017356.1,CDS,387940,388089,+,0,ID=CNN01270-t26_1-p1-CDS4;Parent=CNN01270-t26_...,CNN01270,387951,388100,Chr14
64402,AE017356.1,CDS,388135,388711,+,1,ID=CNN01270-t26_1-p1-CDS5;Parent=CNN01270-t26_...,CNN01270,388146,388722,Chr14
64403,AE017356.1,CDS,388766,388906,+,1,ID=CNN01270-t26_1-p1-CDS6;Parent=CNN01270-t26_...,CNN01270,388777,388917,Chr14
64404,AE017356.1,CDS,388959,389540,+,2,ID=CNN01270-t26_1-p1-CDS7;Parent=CNN01270-t26_...,CNN01270,388970,389551,Chr14
64405,AE017356.1,five_prime_UTR,386822,387025,+,.,ID=utr_CNN01270-t26_1_1;Parent=CNN01270-t26_1,CNN01270,386833,387036,Chr14
64406,AE017356.1,five_prime_UTR,387075,387145,+,.,ID=utr_CNN01270-t26_1_2;Parent=CNN01270-t26_1,CNN01270,387086,387156,Chr14


In [31]:
cnn01270[(cnn01270.Seq==1) & (cnn01270.Var==1)].head()

Unnamed: 0,Start,End,Ref,Alt,Var,Strand,Seq,Gene
377,387210,387211,T,G,1,1,1,CNN01270
534,387367,387368,G,T,1,1,1,CNN01270
781,387614,387615,G,A,1,1,1,CNN01270
1048,387881,387886,CGAGG,CG,1,1,1,CNN01270
1350,388187,388189,CG,CA,1,1,1,CNN01270


In [32]:
[len(a) for a in cnn01270_seqs]

[2866, 2866, 2866, 2866, 2863]

In [33]:
from Bio.SeqRecord import SeqRecord

In [40]:
cnn01270_ids = ['fungidb_jec21 ','fasta_jec21','fasta_xl280','xl280','431']
fileout = '/Users/croth/Desktop/xl280_cnn01270.fasta'
with open(fileout, 'w') as f_out:
        for i,seq in enumerate(cnn01270_seqs):
            seq_record = SeqRecord(seq=seq,id=cnn01270_ids[i])
            r = SeqIO.write(seq_record, f_out,'fasta')
            f_out.write('\n')
            if r != 1: print('Error while writing sequence: ' + seq_record.id)

In [36]:
len(jeccnn01270)

2866

In [37]:
for i, s in enumerate(xl280cnn01270):
    if s != jeccnn01270[i]:
        print(i)

1957


In [38]:
xl280cnn01270[1950:1960]

Seq('CTCATCATAA', SingleLetterAlphabet())

In [39]:
jeccnn01270[1950:1960]

Seq('CTCATCAGAA', SingleLetterAlphabet())

In [53]:
peak_gene_nonsyn =  peak_gene_res_df[(peak_gene_res_df.Result!='Synonymous') & 
                 (peak_gene_res_df.Result!='Nonvariant')]
print(peak_gene_nonsyn.shape[0])

16


In [54]:
h99_names = ['CNAG_06390','CNAG_06393','CNAG_06394','CNAG_06396','CNAG_07936',
             'CNAG_06401','CNAG_06405','CNAG_06406','CNAG_06408','CNAG_06409',
             'CNAG_06411','CNAG_06412','Missing','CNAG_06415',
             'CNAG_06416','CNAG_06418'] #CNAG_06413

In [55]:
peak_gene_nonsyn['H99_name'] = h99_names
peak_gene_nonsyn.to_csv('../FILES/%s/%s_peak_genes.csv'%(chrom,chrom),index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [56]:
peak_gene_nonsyn

Unnamed: 0,Name,Result,H99_name
0,CNN01170,Modified:2,CNAG_06390
3,CNN01190,450,CNAG_06393
4,CNN01195,1263987100,CNAG_06394
6,CNN01210,89179,CNAG_06396
7,CNN01220,"13,44,73,107,118,162,201,269,292,302,303,306,4...",CNAG_07936
11,CNN01270,Modified:-235,CNAG_06401
15,CNN01310,120274308477,CNAG_06405
16,CNN01320,86,CNAG_06406
18,CNN01340,52137223234,CNAG_06408
19,CNN01345,131,CNAG_06409


In [41]:
cnn01385 = gff_gene('CNN01385',vardf,ch_gff,contigrec.seq)
cnn01385_r_aa = gene_protien(cnn01270,a='Ref',tostop=True)

In [62]:
gff[(gff.gene=='CNN01385')]

Unnamed: 0,chrom,type,start,end,strand,phase,attribute,gene,xl280_start,xl280_end,xl280_chrom
35171,AE017356.1,gene,408804,409768,-,.,ID=CNN01385;description=hypothetical protein,CNN01385,408815,409779,Chr14
35172,AE017356.1,mRNA,408804,409768,-,.,ID=CNN01385-t26_1;Parent=CNN01385;description=...,CNN01385,408815,409779,Chr14
35173,AE017356.1,exon,408804,409346,-,.,ID=exon_CNN01385-E3;Parent=CNN01385-t26_1,CNN01385,408815,409357,Chr14
35174,AE017356.1,exon,409410,409582,-,.,ID=exon_CNN01385-E2;Parent=CNN01385-t26_1,CNN01385,409421,409593,Chr14
35175,AE017356.1,exon,409627,409768,-,.,ID=exon_CNN01385-E1;Parent=CNN01385-t26_1,CNN01385,409638,409779,Chr14
35176,AE017356.1,CDS,409043,409346,-,0,ID=CNN01385-t26_1-p1-CDS2;Parent=CNN01385-t26_...,CNN01385,409054,409357,Chr14
35177,AE017356.1,CDS,409410,409578,-,0,ID=CNN01385-t26_1-p1-CDS1;Parent=CNN01385-t26_...,CNN01385,409421,409589,Chr14
35178,AE017356.1,three_prime_UTR,408804,409043,-,.,ID=utr_CNN01385-t26_1_1;Parent=CNN01385-t26_1,CNN01385,408815,409054,Chr14
35179,AE017356.1,five_prime_UTR,409578,409582,-,.,ID=utr_CNN01385-t26_1_2;Parent=CNN01385-t26_1,CNN01385,409589,409593,Chr14
35180,AE017356.1,five_prime_UTR,409627,409768,-,.,ID=utr_CNN01385-t26_1_3;Parent=CNN01385-t26_1,CNN01385,409638,409779,Chr14


In [58]:
cnn01385_seq = Seq(''.join(cnn01385.Ref.tolist())).reverse_complement()

In [59]:
len(cnn01385_seq)

964

In [60]:
print(cnn01385_seq)

GTTCGGCATCACCAACCAGTGGATTGCCAGCTGCAAAGTTTTCCAGCGCGAGCACTCGCGGCGTGGGAGAGGAAGAGTCAGCAAAGAAGGCAAAGGTGGCAGAGGAGGGTGGCAAACGGAGGCCATCACGCTCGAGAAGCCGTGAGTATGCGTCGTTATGGTACCCTGTGCTAAAAGAAGGGATAGACCGATGAAAGCGATGTGGTGGCGGCACCTGCAAAAAAAACAAAGAAAGCGGCTGGCGCAAAGGAGCAGTACCTACATAACCCTTTCGACATTCCTTCCCAGGTACTCGCGGTCGAAAATTTTGCAGCTGGCAATCCATTGGTTGGTGATGCCGAACACGATACTCTGGCCAGTGAGTAACAATATCCCCTTATTGTGAGGTTTGAAACGTCTCTGACATGAAACACATTGACCAGGCTGACAGGCGTCTCAGGCATGACGCCTACCCCCTCCTTTGCGCGGCCGCAAGTGCCCGCTCGAATGTGGCCGCTGTGCTGCAAACGAACGAGCACAAGCGCCACTGGGGAAAGCCCATGTCAGCTGGGGATGCCGAAATCCTTAACTGGACGAGGTCTCAGCTGCACAAGGCCAGCCAAGATCTGGACATCGTTTGCAATTTCGCGAGAGAGGCTGCTGGCTATCCAAATATGACCAACGCAAGGTCATTATATATACGCATTGGACAAACCTCATCTGGCCCTGCAGATGTACCTAGCTAGGGTAGGAATTTTCTCTTCGGCCTTACATGGCAGCCTCACCATGGAAGAGCGCGAGCGGGTGGTACTAAGAGTTTCAACGCGACGATGACCATCCATGTACCCGATTGGCTGAATTCTTGGGTACATTCATGTACGTGATAAAAATACGTGATTATAACACCACCTCTATTGCTTGGTTGGAGAGTTAAACATAAAAACAAAAACAAAAAAGAATCCTGCCCCGGAGAGACTCGAACTCT


In [63]:
assert ch_gff[(ch_gff.gene=='CNN01220') & (ch_gff.type=='CDS')].shape[0] == 28

In [64]:
len(peak_gene_nonsyn[(peak_gene_nonsyn.Name=='CNN01220')].Result.values[0].split(','))

17

In [67]:
modifiedix = [i[0] for i in peak_gene_nonsyn.iterrows()
              if i[1]['Result'].split(':')[0] == 'Modified']

In [68]:
modifiedix

[0, 11]

In [70]:
agene_name = peak_gene_res_df['Name'][modifiedix].values
agene_name

array(['CNN01170', 'CNN01270'], dtype=object)

In [42]:
gene_gdf = gff_gene('CNN01270',vardf,ch_gff,contigrec.seq)
print(gene_gdf.shape)

(2859, 8)


In [55]:
gene_ref_aa = gene_protien(gene_gdf,a='Ref',tostop=True)
gene_alt_aa = gene_protien(gene_gdf,a='Alt',tostop=True)
print(protien_compare(gene_ref_aa,gene_alt_aa))

Modified:-235


In [56]:
gene_gdf_zb = gene_gdf.copy()

In [57]:
456*3 

1368

In [58]:
len(gene_ref_aa)

456

In [68]:
len(gene_alt_aa)

691

In [64]:
len(''.join(gene_gdf[gene_gdf.Seq!=0].Alt.tolist()))

2494

In [66]:
gene_gdf[(gene_gdf.Seq!=0) & (gene_gdf.Var==1)]

Unnamed: 0,Start,End,Ref,Alt,Var,Strand,Seq,Gene
64,386897,386898,G,A,1,1,5,CNN01270
161,386994,386995,T,A,1,1,5,CNN01270
377,387210,387211,T,G,1,1,1,CNN01270
534,387367,387368,G,T,1,1,1,CNN01270
781,387614,387615,G,A,1,1,1,CNN01270
1048,387881,387886,CGAGG,CG,1,1,1,CNN01270
1350,388187,388189,CG,CA,1,1,1,CNN01270
1357,388195,388198,CTT,CTC,1,1,1,CNN01270
1381,388221,388222,G,A,1,1,1,CNN01270
1408,388248,388249,A,C,1,1,1,CNN01270


In [None]:
gene_gdf[(gene_gdf.Seq==1) & (gene_gdf.Var==1)]

In [None]:
len('AGGCATGGGCATGGGCATGG')

In [None]:
len('AGGCATGGGCATGG')

In [None]:
print(len(gene_ref_aa))
gene_ref_aa

In [None]:
print(len(gene_alt_aa))
gene_alt_aa