In [0]:
pwd

'/Users/cbib-02/Documents/Jupyter_Notebooks'

In [0]:
%%bash 
cd /Users/cbib-02/Documents/TNT
pwd
ls -ltrh

/Users/cbib-02/Documents/TNT
total 67944
-rwxrwxrwx@  1 cbib-02  staff   6.2M Nov 10  2017 JCJR2.fasta
-rw-r--r--   1 cbib-02  staff    83K Aug 14 15:54 pseudo_sp_vs_mandeleii.delta
-rw-r--r--   1 cbib-02  staff    33K Aug 14 16:00 pseudo_sp_vs_mandeleii.dot.uniqueAnchorFiltered_l10000.delta.gz
-rw-r--r--   1 cbib-02  staff    56K Aug 14 16:00 pseudo_sp_vs_mandeleii.dot.coords.idx
-rw-r--r--   1 cbib-02  staff    38K Aug 14 16:00 pseudo_sp_vs_mandeleii.dot.coords
-rw-r--r--   1 cbib-02  staff   202B Aug 14 16:00 nucmer.sh
-rw-r--r--   1 cbib-02  staff   6.7M Aug 21 09:41 JCJR12.fasta
-rw-r--r--@  1 cbib-02  staff   9.7K Sep 11 08:50 number_nitro-OYEs.ods
-rw-r--r--@  1 cbib-02  staff   829B Sep 11 08:54 XenBs_angi.fasta
-rw-r--r--   1 cbib-02  staff     0B Sep 11 09:03 AAF02539.1
-rw-r--r--   1 cbib-02  staff     0B Sep 11 09:06 pseudo_xenb.fasta
-rw-r--r--@  1 cbib-02  staff   2.5K Sep 11 09:09 xenBs.aln
-rw-r--r--@  1 cbib-02  staff   411B Sep 11 09:25 xenb_jcjr2.fasta
-rw-r--r--   1

In [0]:
import pandas as pd
import re
pd.set_option('max_colwidth', 2000)

  return f(*args, **kwds)
  return f(*args, **kwds)


In [0]:
def Fastd_to_pandas(gff_file):
    '''Takes a gff file from `FASTD` and converts it to a pandas data frame with every column named
    as the GFF3 specification. It cleans the rows that have Nans in the `source` column (fasta sequences are appended 
    to the end of the file).  
    Input: GFF file name (full path expected)
    Output: Pandas Dataframe'''
    
    import pandas as pd
    
    col_names = ['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes']
    gff_pd = pd.read_csv(gff_file, sep='\t', low_memory=False, comment='#', header=None, names=col_names)    
    gff_pd = gff_pd[~gff_pd['source'].isnull()]

    return gff_pd


def Unique_types_sources(gff_file):
    '''Takes a gff dataframe and prints the unique types to the stdout'''
    
    pd_dataframe = Fastd_to_pandas(gff_file)
    print('Unique types', pd_dataframe['type'].unique())
    print('Unique sources', pd_dataframe['source'].unique())
    
    return ""

def Anot_by_annotators(gff_file):
    '''Prints the number of annotation made by each software (annotator)'''
    
    df = Fastd_to_pandas(gff_file)
    sources = df['source'].unique()

    for source in sources:# Nans not included
        print(f'Number of entries from {source} ==>', df[df['source'] == source].shape[0])  

In [0]:
def Type_counts(*gff_files, **species):    
    
    '''Returns the counts of each feature type (column named "type"). Names every column 
    with the specie name.
    Input: A list of file names *args and keyword args with the specia names. Order of keywords 
    matters, use the same as the list of gff_files
    Output: A data frame with type counts for each gff file'''
    
    import pandas as pd
    
    gff_files = gff_files
    
    gffs_pds = [Fastd_to_pandas(gff_file) for gff_file in gff_files]
    gffs_value_counts = [df["type"].value_counts() for df in gffs_pds]
    type_counts = pd.concat(gffs_value_counts, axis=1, sort=True)
    
    col_names = species
    type_counts.columns = col_names.values()
    return type_counts

In [0]:
def Most_common_products(gff_file, n=10, position_string=1):
    '''Takes in a dfast gff_file and pprints the n most common products.
    
    Input: gff_file, n (integer) which represents the number of top elements and position_string tells position
    of product qualifier.
    
    output: Prints to stdout the products and returns a counter dic with the  most 
    common products'''

    from pprint import pprint
    from collections import Counter
    
    pd = Fastd_to_pandas(gff_file)
    attribute_product= pd.iloc[: , 8].str.split(";").str[position_string]
    attribute_product= attribute_product.str.replace("product=", "")
    
    attribute_prod_dic = Counter()
    for element in attribute_product:
        attribute_prod_dic[element] += 1

    pprint(attribute_prod_dic.most_common()[:n])
    
    return attribute_prod_dic

In [0]:
def CDS_composition(*gff_files, ec_number_string ="EC_number", uniprot="UniProtKB", COG="COG", **species_names):
    '''Takes a list of GFF files, splits the attribute column (the last one) and returns it'''
    import pandas as pd
    
    gff_files = gff_files
    species = species_names.values()
    
    attributes_dic_list = []
    attributes_dic = {}
    ec_number_string = ec_number_string
    uniprot = uniprot

    product_types = [ec_number_string, uniprot, COG,"hypothetical protein"] 
    gffs_dfs = [Fastd_to_pandas(gff_file)["attributes"] for gff_file in gff_files]
    
    for df in gffs_dfs:
        gffs_attr_counts = {product_type : df.str.contains(product_type, case=False).sum() for product_type in product_types}
        attributes_dic_list.append(gffs_attr_counts)
        gffs_attr_counts = {}
    #print(type(attributes_dic_list),attributes_dic_list)
    df_attributes = pd.DataFrame(attributes_dic_list, index=species)
    df_attributes.columns =  ["COG", "UniProtKB", "EC number", "Hypothetical protein"]
        
    return round(df_attributes / gffs_dfs[0].shape[0] * 100, 3)

        

In [0]:
def Searching_atts(gff_file, patterns, dfast=True):
    '''Takes the attributes column, replaces the translation string if dfast=True, 
    the default (for prokka set it to False) and then searches for a list
    of patterns returning the number of ocurrences and indexes of each one
    
    input: list of patterns
    output: dictionary of each pattern and its indexes as a list of values'''
    
    import re
    from collections import defaultdict


    df = Fastd_to_pandas(gff_file)
    if dfast:
        atts = df.iloc[: , 8].str.replace("translation=[A-Z]+;", "")
        red_genes_locs = defaultdict()
    else:
        atts = df.iloc[: , 8]
        red_genes_locs = defaultdict()
    for red in potential_reds:
        #print(red, atts[atts.str.contains(red, flags=re.I)].shape)
        #print(atts[atts.str.contains(red, flags=re.I)].index, end="\n\n")
        red_genes_locs[red] = atts[atts.str.contains(red, flags=re.I)].index
    
    return(red_genes_locs)
    
        
potential_reds = ["nitroreductase", 'aldehyde.+oxidase', "amide.+dehydrogenase",\
                      "diaphorase","hydrogenase", "xanthine.+oxidase",\
                      "cytochrome.+reductase", "xen[A-Z]", 'PETN', "oxidoreductase",\
                      "xenobiotic", "OYE", "PETN", "flavin oxidoreductase", "nitro", 
                      "COG1902", "COG0778", "old yellow"]


In [0]:
#paths
JCJR2_path = '/Users/cbib-02/Documents/TNT/JCJR2/6666666.367937.gff'
JCJR12_path = '/Users/cbib-02/Documents/TNT/JCJR12/6666666.367945.gff'
JCJR2_path = '/run/media/cbib/Bionano/My_imac/TNT/JCJR12/'
JCJR12_path = '/Users/cbib-02/Documents/TNT/JCJR12/6666666.367945.gff'

In [0]:
JCJR2_df = Fastd_to_pandas(JCJR2_path)
JCJR12_df = Fastd_to_pandas(JCJR12_path)
JCJR2_df.head()

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes
0,contig_1,FIG,CDS,1,63,.,-,1,ID=fig|6666666.367937.peg.1;Name=hypothetical protein
1,contig_1,FIG,CDS,785,2413,.,+,2,ID=fig|6666666.367937.peg.2;Name=hypothetical protein
2,contig_1,FIG,CDS,2425,4047,.,-,1,ID=fig|6666666.367937.peg.3;Name=Methyl-accepting chemotaxis sensor/transducer protein
3,contig_1,FIG,CDS,4324,5280,.,-,1,ID=fig|6666666.367937.peg.4;Name=Uncharacterized methyltransferase PA1407
4,contig_1,FIG,CDS,5342,6271,.,-,2,ID=fig|6666666.367937.peg.5;Name=Auxin efflux carrier family protein


In [0]:
JCJR12_df

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes
0,contig100|mandelii,FIG,CDS,1,93,.,-,1,ID=fig|6666666.367945.peg.1;Name=hypothetical protein
1,contig100|mandelii,FIG,CDS,318,446,.,-,0,ID=fig|6666666.367945.peg.2;Name=hypothetical protein
2,contig101|mandelii,FIG,CDS,1,444,.,+,1,ID=fig|6666666.367945.peg.3;Name=Phosphodiesterase/alkaline phosphatase D
3,contig102|mandelii,FIG,CDS,2,184,.,-,2,ID=fig|6666666.367945.peg.4;Name=Multi antimicrobial extrusion protein (Na(+)/drug antiporter)%2C MATE family of MDR efflux pumps
4,contig103|mandelii,FIG,CDS,3,443,.,-,0,ID=fig|6666666.367945.peg.5;Name=hypothetical protein
5,contig106|mandelii,FIG,CDS,1,441,.,-,1,ID=fig|6666666.367945.peg.6;Name=Mobile element protein
6,contig107|mandelii,FIG,CDS,20,442,.,+,2,ID=fig|6666666.367945.peg.7;Name=Deoxyguanosinetriphosphate triphosphohydrolase (EC 3.1.5.1);Ontology_term=KEGG_ENZYME:3.1.5.1
7,contig109|mandelii,FIG,CDS,1,276,.,-,1,ID=fig|6666666.367945.peg.8;Name=SSU ribosomal protein S8p (S15Ae)
8,contig109|mandelii,FIG,CDS,369,440,.,+,0,ID=fig|6666666.367945.peg.9;Name=hypothetical protein
9,contig10|mandelii,FIG,CDS,2,604,.,+,2,"ID=fig|6666666.367945.peg.10;Name=Xylulose-5-phosphate phosphoketolase (EC 4.1.2.9) @ Fructose-6-phosphate phosphoketolase (EC 4.1.2.22);Ontology_term=KEGG_ENZYME:4.1.2.9,KEGG_ENZYME:4.1.2.22"


In [0]:
JCJR12_df.iloc[9, 8]

'ID=fig|6666666.367945.peg.10;Name=Xylulose-5-phosphate phosphoketolase (EC 4.1.2.9) @ Fructose-6-phosphate phosphoketolase (EC 4.1.2.22);Ontology_term=KEGG_ENZYME:4.1.2.9,KEGG_ENZYME:4.1.2.22'

In [0]:
print(Unique_types_sources(JCJR12_path), Unique_types_sources(JCJR2_path))

Unique types ['CDS' 'tRNA' 'RNA']
Unique sources ['FIG']
Unique types ['CDS' 'tRNA' 'RNA']
Unique sources ['FIG']
 


In [0]:
print(JCJR2_df.shape, JCJR12_df.shape)

(6299, 9) (6763, 9)


In [0]:
Type_counts(JCJR2_path, JCJR12_path, specie1='JCJR2', specie2='JCJR12')

Unnamed: 0,JCJR2,JCJR12
CDS,6110,6568
RNA,15,12
tRNA,174,183


In [0]:
common_prods_jcjr2 = Most_common_products(JCJR2_path)

[('Name=hypothetical protein', 1534),
 ('Name=Transcriptional regulator%2C LysR family', 64),
 ('Name=Uncharacterized MFS-type transporter', 51),
 ('Name=putative lipoprotein', 39),
 ('Name=Transcriptional regulator%2C AraC family', 36),
 ('Name=Mobile element protein', 36),
 ('Name=putative membrane protein', 33),
 ('Name=Methyl-accepting chemotaxis sensor/transducer protein', 30),
 ('Name=Transcriptional regulator%2C AcrR family', 26),
 ('Name=Acetyltransferase%2C GNAT family', 17)]


In [0]:
common_prods_jcjr12 = Most_common_products(JCJR12_path)

[('Name=hypothetical protein', 1569),
 ('Name=Transcriptional regulator%2C LysR family', 75),
 ('Name=putative lipoprotein', 46),
 ('Name=Uncharacterized MFS-type transporter', 45),
 ('Name=putative membrane protein', 42),
 ('Name=Transcriptional regulator%2C AraC family', 40),
 ('Name=Mobile element protein', 36),
 ('Name=Two-component transcriptional response regulator%2C LuxR family', 28),
 ('Name=Methyl-accepting chemotaxis sensor/transducer protein', 27),
 ('Name=Transcriptional regulator%2C AcrR family', 21)]


In [0]:
JCJR2_df[JCJR2_df["attributes"].str.contains("COG")]

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes
284,contig_1,FIG,CDS,308470,308712,.,-,1,ID=fig|6666666.367937.peg.285;Name=COGs COG2002
524,contig_10,FIG,CDS,42104,42583,.,-,2,ID=fig|6666666.367937.peg.525;Name=COGs COG3558
709,contig_11,FIG,CDS,32034,32573,.,+,0,ID=fig|6666666.367937.peg.710;Name=Putative analog of CcoH%2C COG3198
1171,contig_13,FIG,CDS,116361,117131,.,+,0,ID=fig|6666666.367937.peg.1172;Name=COG2041: Sulfite oxidase and related enzymes
1207,contig_13,FIG,CDS,149745,150584,.,+,0,ID=fig|6666666.367937.peg.1208;Name=COGs COG3777
1266,contig_14,FIG,CDS,16307,16603,.,-,2,ID=fig|6666666.367937.peg.1267;Name=COGs COG3492
2107,contig_19,FIG,CDS,120263,120703,.,+,2,ID=fig|6666666.367937.peg.2108;Name=Universal stress protein family COG0589
2794,contig_22,FIG,CDS,124980,125570,.,-,0,ID=fig|6666666.367937.peg.2795;Name=FIG000605: protein co-occurring with transport systems (COG1739)
3375,contig_28,FIG,CDS,56093,57340,.,-,2,ID=fig|6666666.367937.peg.3376;Name=COG2907: Amine oxidase%2C flavin-containing
3380,contig_28,FIG,CDS,60922,61884,.,-,1,ID=fig|6666666.367937.peg.3381;Name=COG1683: Uncharacterized conserved protein / FIG143828: Hypothetical protein YbgA


**EC numbers in `Rast` are represented by the strings `EC xx.xx.xx`. It seems almost always there's a space between the string `EC` and the numbers but in one case there wasn't, look below**

In [0]:
JCJR2_df[JCJR2_df["attributes"].str.contains('\(EC\S', flags=re.I)]

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes
5801,contig_8,FIG,CDS,128314,129597,.,+,1,ID=fig|6666666.367937.peg.5802;Name=Gamma-glutamyl-putrescine oxidase (EC1.4.3.-);Ontology_term=KEGG_ENZYME:1.4.3.-


In [0]:
JCJR2_df[JCJR2_df["attributes"].str.contains('\(EC\s', flags=re.I)]

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes
5,contig_1,FIG,CDS,6525,7559,.,+,0,ID=fig|6666666.367937.peg.6;Name=Selenide%2Cwater dikinase (EC 2.7.9.3);Ontology_term=KEGG_ENZYME:2.7.9.3
13,contig_1,FIG,CDS,14362,14703,.,-,1,ID=fig|6666666.367937.peg.14;Name=Peptidylprolyl isomerase%2C FKBP-type (EC 5.2.1.8);Ontology_term=KEGG_ENZYME:5.2.1.8
18,contig_1,FIG,CDS,18901,19680,.,+,1,ID=fig|6666666.367937.peg.19;Name=2%2C3-butanediol dehydrogenase%2C S-alcohol forming%2C (S)-acetoin-specific (EC 1.1.1.76);Ontology_term=KEGG_ENZYME:1.1.1.76
20,contig_1,FIG,CDS,21899,22876,.,+,2,ID=fig|6666666.367937.peg.21;Name=Acetoin dehydrogenase E1 component alpha-subunit (EC 2.3.1.190);Ontology_term=KEGG_ENZYME:2.3.1.190
21,contig_1,FIG,CDS,22933,23952,.,+,1,ID=fig|6666666.367937.peg.22;Name=Acetoin dehydrogenase E1 component beta-subunit (EC 2.3.1.190);Ontology_term=KEGG_ENZYME:2.3.1.190
22,contig_1,FIG,CDS,23949,25055,.,+,0,ID=fig|6666666.367937.peg.23;Name=Dihydrolipoamide acetyltransferase component (E2) of acetoin dehydrogenase complex (EC 2.3.1.12);Ontology_term=KEGG_ENZYME:2.3.1.12
25,contig_1,FIG,CDS,27491,28639,.,-,2,ID=fig|6666666.367937.peg.26;Name=Cystathionine beta-lyase%2C type II (EC 4.4.1.8);Ontology_term=KEGG_ENZYME:4.4.1.8
28,contig_1,FIG,CDS,29327,30403,.,-,2,ID=fig|6666666.367937.peg.29;Name=Chemotaxis response regulator protein-glutamate methylesterase CheB (EC 3.1.1.61);Ontology_term=KEGG_ENZYME:3.1.1.61
30,contig_1,FIG,CDS,30890,31696,.,-,2,ID=fig|6666666.367937.peg.31;Name=Chemotaxis protein methyltransferase CheR (EC 2.1.1.80);Ontology_term=KEGG_ENZYME:2.1.1.80
39,contig_1,FIG,CDS,40490,41665,.,+,2,ID=fig|6666666.367937.peg.40;Name=Acetylornithine aminotransferase (EC 2.6.1.11);Ontology_term=KEGG_ENZYME:2.6.1.11


In [0]:
JCJR2_df[JCJR2_df["attributes"].str.contains('\(EC', flags=re.I)]

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes
5,contig_1,FIG,CDS,6525,7559,.,+,0,ID=fig|6666666.367937.peg.6;Name=Selenide%2Cwater dikinase (EC 2.7.9.3);Ontology_term=KEGG_ENZYME:2.7.9.3
13,contig_1,FIG,CDS,14362,14703,.,-,1,ID=fig|6666666.367937.peg.14;Name=Peptidylprolyl isomerase%2C FKBP-type (EC 5.2.1.8);Ontology_term=KEGG_ENZYME:5.2.1.8
18,contig_1,FIG,CDS,18901,19680,.,+,1,ID=fig|6666666.367937.peg.19;Name=2%2C3-butanediol dehydrogenase%2C S-alcohol forming%2C (S)-acetoin-specific (EC 1.1.1.76);Ontology_term=KEGG_ENZYME:1.1.1.76
20,contig_1,FIG,CDS,21899,22876,.,+,2,ID=fig|6666666.367937.peg.21;Name=Acetoin dehydrogenase E1 component alpha-subunit (EC 2.3.1.190);Ontology_term=KEGG_ENZYME:2.3.1.190
21,contig_1,FIG,CDS,22933,23952,.,+,1,ID=fig|6666666.367937.peg.22;Name=Acetoin dehydrogenase E1 component beta-subunit (EC 2.3.1.190);Ontology_term=KEGG_ENZYME:2.3.1.190
22,contig_1,FIG,CDS,23949,25055,.,+,0,ID=fig|6666666.367937.peg.23;Name=Dihydrolipoamide acetyltransferase component (E2) of acetoin dehydrogenase complex (EC 2.3.1.12);Ontology_term=KEGG_ENZYME:2.3.1.12
25,contig_1,FIG,CDS,27491,28639,.,-,2,ID=fig|6666666.367937.peg.26;Name=Cystathionine beta-lyase%2C type II (EC 4.4.1.8);Ontology_term=KEGG_ENZYME:4.4.1.8
28,contig_1,FIG,CDS,29327,30403,.,-,2,ID=fig|6666666.367937.peg.29;Name=Chemotaxis response regulator protein-glutamate methylesterase CheB (EC 3.1.1.61);Ontology_term=KEGG_ENZYME:3.1.1.61
30,contig_1,FIG,CDS,30890,31696,.,-,2,ID=fig|6666666.367937.peg.31;Name=Chemotaxis protein methyltransferase CheR (EC 2.1.1.80);Ontology_term=KEGG_ENZYME:2.1.1.80
39,contig_1,FIG,CDS,40490,41665,.,+,2,ID=fig|6666666.367937.peg.40;Name=Acetylornithine aminotransferase (EC 2.6.1.11);Ontology_term=KEGG_ENZYME:2.6.1.11


In [0]:
CDS_composition(JCJR2_path, JCJR12_path, ec_number_string='\(EC', specie1='JCJR2', specie2='JCJR12')

Unnamed: 0,COG,UniProtKB,EC number,Hypothetical protein
JCJR2,0.365,0.0,21.416,25.448
JCJR12,0.349,0.0,23.131,26.163


In [0]:
JCJR2_df[JCJR2_df["attributes"].str.contains('uniprot', flags=re.I)]

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes


In [0]:
red_genes_locs12 = Searching_atts(JCJR12_path,potential_reds, False)
red_genes_locs12

defaultdict(None,
            {'nitroreductase': Int64Index([6310, 6418], dtype='int64'),
             'aldehyde.+oxidase': Int64Index([], dtype='int64'),
             'amide.+dehydrogenase': Int64Index([1349, 1350, 2182, 2183, 2620, 3871, 4274, 5102], dtype='int64'),
             'diaphorase': Int64Index([], dtype='int64'),
             'hydrogenase': Int64Index([  17,   25,   51,   57,   79,   80,   82,  168,  192,  204,
                         ...
                         6259, 6340, 6344, 6346, 6442, 6458, 6510, 6524, 6546, 6555],
                        dtype='int64', length=272),
             'xanthine.+oxidase': Int64Index([], dtype='int64'),
             'cytochrome.+reductase': Int64Index([605, 1311, 1877, 1878, 1903, 3212, 3983, 4106, 5093, 5094, 5095,
                         6084],
                        dtype='int64'),
             'xen[A-Z]': Int64Index([592], dtype='int64'),
             'PETN': Int64Index([], dtype='int64'),
             'oxidoreductase': Int64Index([

In [0]:
print(JCJR12_df.iloc[red_genes_locs["flavin oxidoreductase"], 8])

224      ID=fig|6666666.367945.peg.225;Name=NADH:flavin oxidoreductases%2C Old Yellow Enzyme family
860                                ID=fig|6666666.367945.peg.861;Name=NAD(P)H-flavin oxidoreductase
3135                        ID=fig|6666666.367945.peg.3136;Name=putative NADH:flavin oxidoreductase
5438    ID=fig|6666666.367945.peg.5439;Name=NADH:flavin oxidoreductases%2C Old Yellow Enzyme family
5444         ID=fig|6666666.367945.peg.5445;Name=NADH-dependent flavin oxidoreductase%2C Oye family
5588                       ID=fig|6666666.367945.peg.5589;Name=NADH-dependent flavin oxidoreductase
Name: attributes, dtype: object


In [0]:
print(JCJR12_df.iloc[red_genes_locs['xen[A-Z]'], 8])

592    ID=fig|6666666.367945.peg.593;Name=Isohexenylglutaconyl-CoA hydratase (EC 4.2.1.57);Ontology_term=KEGG_ENZYME:4.2.1.57
Name: attributes, dtype: object


In [0]:
print(JCJR12_df.iloc[red_genes_locs['old yellow'], 8])

224      ID=fig|6666666.367945.peg.225;Name=NADH:flavin oxidoreductases%2C Old Yellow Enzyme family
5438    ID=fig|6666666.367945.peg.5439;Name=NADH:flavin oxidoreductases%2C Old Yellow Enzyme family
Name: attributes, dtype: object


In [0]:
red_genes_locs2 = Searching_atts(JCJR2_path,potential_reds, False)
red_genes_locs2

defaultdict(None,
            {'nitroreductase': Int64Index([4041, 4427], dtype='int64'),
             'aldehyde.+oxidase': Int64Index([], dtype='int64'),
             'amide.+dehydrogenase': Int64Index([22, 1283, 2014, 2015, 3954, 5019], dtype='int64'),
             'diaphorase': Int64Index([], dtype='int64'),
             'hydrogenase': Int64Index([  14,   18,   19,   20,   21,   22,   67,  102,  103,  118,
                         ...
                         5762, 5763, 5782, 5787, 5809, 5817, 5821, 5912, 5913, 5930],
                        dtype='int64', length=238),
             'xanthine.+oxidase': Int64Index([], dtype='int64'),
             'cytochrome.+reductase': Int64Index([764, 1717, 1871, 4153, 4154, 4155, 5988], dtype='int64'),
             'xen[A-Z]': Int64Index([420, 3307, 4587], dtype='int64'),
             'PETN': Int64Index([], dtype='int64'),
             'oxidoreductase': Int64Index([  15,   37,  133,  149,  157,  204,  219,  267,  301,  322,  367,
               

In [0]:
print(JCJR2_df.iloc[red_genes_locs["flavin oxidoreductase"], 8])

224     ID=fig|6666666.367937.peg.225;Name=ABC transporter%2C ATP-binding protein (cluster 5%2C nickel/peptides/opines) / ABC transporter%2C ATP-binding protein (cluster 5%2C nickel/peptides/opines)
860                                                                              ID=fig|6666666.367937.peg.861;Name=Gamma-butyrobetaine dioxygenase (EC 1.14.11.1);Ontology_term=KEGG_ENZYME:1.14.11.1
3135                                                           ID=fig|6666666.367937.peg.3136;Name=Phytochrome%2C two-component sensor histidine kinase (EC 2.7.3.-);Ontology_term=KEGG_ENZYME:2.7.3.-
5438                                                                                                                                          ID=fig|6666666.367937.peg.5439;Name=hypothetical protein
5444                                                                                                                                        ID=fig|6666666.367937.peg.5445;Name=Mobile element protein
5588 

In [0]:
JCJR2_df[JCJR2_df["attributes"].str.contains("peg.3146")]

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes
3145,contig_26,FIG,CDS,6274,6486,.,+,1,ID=fig|6666666.367937.peg.3146;Name=hypothetical protein


In [0]:
JCJR2_df[JCJR2_df["attributes"].str.contains("xenobiotic", case=False)]+

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes
420,contig_1,FIG,CDS,443870,444958,.,+,2,ID=fig|6666666.367937.peg.421;Name=xenobiotic reductase A
3307,contig_27,FIG,CDS,89275,90324,.,+,1,ID=fig|6666666.367937.peg.3308;Name=Xenobiotic reductase B
4587,contig_4,FIG,CDS,194253,195359,.,-,0,ID=fig|6666666.367937.peg.4588;Name=xenobiotic reductase%2C putative


**Angi encontro Xen A y B en mandelii (JCJR12), yo no**

In [0]:
JCJR12_df[JCJR12_df["attributes"].str.contains("xenobiotic", case=False)]

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes


al buscar la secuencia aminoacidica en mi `.faa` la encuentro en el siguiente contig/peg **peg.5822**

In [0]:
JCJR12_df[JCJR12_df["attributes"].str.contains("peg.5822")]

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes
5821,contig7|mandelii,FIG,CDS,293477,294526,.,-,2,ID=fig|6666666.367945.peg.5822;Name=N-ethylmaleimide reductase (EC 1.-.-.-);Ontology_term=KEGG_ENZYME:1.-.-.-


E2_sequence_Biopython.ipynb  TNT_Rast_gffs.ipynb


In [0]:
ls -ltrh

total 2152
-rw-r--r--  1 cbib-02  staff   869K Sep  4 16:32 E2_sequence_Biopython.ipynb
-rw-r--r--  1 cbib-02  staff   201K Sep 24 16:51 TNT_Rast_gffs.ipynb


### Angi gffs

In [0]:
JCJR2_path = '/Users/cbib-02/Documents/TNT/JCJR2/Rastk/6666666.320807_pseudo.gff'
JCJR12_path = '/Users/cbib-02/Documents/TNT/JCJR12/Rastk/6666666.320830_mandelii.gff'

In [0]:
JCJR2_df = Fastd_to_pandas(JCJR2_path)
JCJR12_df = Fastd_to_pandas(JCJR12_path)
print(JCJR2_df.shape, JCJR12_df.shape)
JCJR2_df.head()

(5985, 9) (6351, 9)


Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes
0,contig_1,FIG,CDS,785,2413,.,+,2,ID=fig|6666666.320807.peg.1;Name=diguanylate cyclase/phosphodiesterase (GGDEF & EAL domains) with PAS/PAC sensor(s)
1,contig_1,FIG,CDS,2425,4047,.,-,1,ID=fig|6666666.320807.peg.2;Name=Methyl-accepting chemotaxis protein I (serine chemoreceptor protein)
2,contig_1,FIG,CDS,4324,5292,.,-,1,ID=fig|6666666.320807.peg.3;Name=Methyltransferase (EC 2.1.1.-);Ontology_term=KEGG_ENZYME:2.1.1.-
3,contig_1,FIG,CDS,5342,6271,.,-,2,ID=fig|6666666.320807.peg.4;Name=FIG040954: Transporter
4,contig_1,FIG,CDS,6525,7559,.,+,0,ID=fig|6666666.320807.peg.5;Name=Selenide%2Cwater dikinase (EC 2.7.9.3);Ontology_term=KEGG_ENZYME:2.7.9.3


In [0]:
print(Unique_types_sources(JCJR12_path), Unique_types_sources(JCJR2_path))

Unique types ['CDS' 'tRNA' 'rRNA']
Unique sources ['FIG']
Unique types ['CDS' 'tRNA' 'rRNA']
Unique sources ['FIG']
 


In [0]:
Type_counts(JCJR2_path, JCJR12_path, specie1='JCJR2', specie2='JCJR12')

Unnamed: 0,JCJR2,JCJR12
CDS,5781,6138
rRNA,30,30
tRNA,174,183


In [0]:
common_prods_jcjr2 = Most_common_products(JCJR2_path)
print('\n')
common_prods_jcjr12 = Most_common_products(JCJR12_path)

[('Name=hypothetical protein', 684),
 ('Name=Transcriptional regulator%2C LysR family', 35),
 ('Name=Transcriptional regulator%2C AraC family', 29),
 ('Name=Transcriptional regulator%2C TetR family', 25),
 ('Name=Methyl-accepting chemotaxis protein I (serine chemoreceptor protein)',
  24),
 ('Name=Permeases of the major facilitator superfamily', 24),
 ('Name=Mobile element protein', 24),
 ('Name=5S RNA', 24),
 ('Name=membrane protein%2C putative', 20),
 ('Name=Phosphonate ABC transporter phosphate-binding periplasmic component '
  '(TC 3.A.1.9.1)',
  19)]


[('Name=hypothetical protein', 647),
 ('Name=Transcriptional regulator%2C LysR family', 50),
 ('Name=Mobile element protein', 32),
 ('Name=Transcriptional regulator%2C AraC family', 30),
 ('Name=Phosphonate ABC transporter phosphate-binding periplasmic component '
  '(TC 3.A.1.9.1)',
  27),
 ('Name=5S RNA', 24),
 ('Name=Transcriptional regulator%2C TetR family', 23),
 ('Name=diguanylate cyclase/phosphodiesterase (GGDEF & EAL domains

In [0]:
CDS_composition(JCJR2_path, JCJR12_path, ec_number_string='\(EC', specie1='JCJR2', specie2='JCJR12')

Unnamed: 0,COG,UniProtKB,EC number,Hypothetical protein
JCJR2,0.735,0.0,23.659,18.062
JCJR12,0.769,0.0,25.564,19.198


In [0]:
JCJR12_df[JCJR12_df["attributes"].str.contains("xenobiotic", case=False)]

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes
1319,contig10,FIG,CDS,212708,213799,.,-,2,ID=fig|6666666.320830.peg.1320;Name=xenobiotic reductase A
5056,contig6,FIG,CDS,349097,350188,.,+,2,ID=fig|6666666.320830.peg.5059;Name=xenobiotic reductase A


In [0]:
JCJR2_df[JCJR2_df["attributes"].str.contains("xenobiotic", case=False)]

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes
410,contig_1,FIG,CDS,443870,444958,.,+,2,ID=fig|6666666.320807.peg.411;Name=xenobiotic reductase A
