In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os, sys
import seaborn as sns
import http.client

import json

In [2]:
import shutil



In [16]:
# Working directories, all resolved relative to the notebook's current working directory.
SampledTCSpath = os.path.join(os.getcwd(), "SampledTCS") # TCS identified from sampled genomes
SampledGenomespath = os.path.join(os.getcwd(), "SampledGenomes") # list of sampled genomes from a taxonomic level
Genomespath = os.path.join(os.getcwd(), "GenomeDBs") # list of genomes for a taxonomic level
Source_path = os.path.join(os.getcwd(), "Source_Files") # .hmm files

#### Load file with all genomes available in MISTDB

In [4]:
# Load the full genome table and discard the leftover "Unnamed: 0" column
# that the CSV carries from an earlier save.
Bacteria = (
    pd.read_csv(os.path.join(Genomespath, "BacterialGenomesDB.csv"), index_col=None)
    .drop(columns="Unnamed: 0")
)

In [38]:
# Preview the first rows of the genome table.
Bacteria.head()

Unnamed: 0,id,worker_id,accession,version,version_number,genbank_accession,genbank_version,taxonomy_id,name,refseq_category,...,family,genus,species,strain,stats,meta,biosample_id,createdAt,updatedAt,WorkerId
0,257,,GCF_000010525,GCF_000010525.1,1,GCA_000010525,GCA_000010525.1,438753,Azorhizobium caulinodans ORS 571,representative genome,...,Xanthobacteraceae,Azorhizobium,Azorhizobium caulinodans,ORS 571,{},{},5831728,2018-03-28T14:09:25.187Z,2021-03-04T04:38:47.735Z,
1,258,,GCF_000007365,GCF_000007365.1,1,GCA_000007365,GCA_000007365.1,198804,Buchnera aphidicola str. Sg (Schizaphis graminum),representative genome,...,Erwiniaceae,Buchnera,Buchnera aphidicola,str. Sg (Schizaphis graminum),{},{},2604269,2018-03-28T14:09:25.187Z,2021-03-04T04:38:49.137Z,
2,273,,GCF_000218545,GCF_000218545.1,1,GCA_000218545,GCA_000218545.1,593907,Cellulomonas gilvus ATCC 13127,representative genome,...,Cellulomonadaceae,Cellulomonas,Cellulomonas gilvus,ATCC 13127,{},{},2232005,2018-03-28T14:09:25.187Z,2021-03-04T04:39:14.919Z,
3,274,,GCF_000020965,GCF_000020965.1,1,GCA_000020965,GCA_000020965.1,309799,Dictyoglomus thermophilum H-6-12,representative genome,...,Dictyoglomaceae,Dictyoglomus,Dictyoglomus thermophilum,H-6-12,{},{},2603928,2018-03-28T14:09:25.187Z,2021-03-04T04:39:16.228Z,
4,275,,GCF_000378225,GCF_000378225.1,1,GCA_000378225,GCA_000378225.1,1122236,Methylophilus methylotrophus DSM 46235 = ATCC ...,representative genome,...,Methylophilaceae,Methylophilus,Methylophilus methylotrophus,DSM 46235 = ATCC 53528,{},{},2440991,2018-03-28T14:09:25.187Z,2021-03-04T04:39:17.517Z,


## A quick look inside the database
- **Goal**: generate new dataframes to pull out species based on taxonomic level

### Which Phyla  are in the bacterial kingdom?

In [39]:
# Number of rows (genomes) per phylum.
Bacteria.phylum.value_counts()

Proteobacteria           1995
Firmicutes               1246
Actinobacteria           1020
Bacteroidetes             577
Tenericutes               116
Cyanobacteria              79
Spirochaetes               71
Deinococcus-Thermus        43
Chloroflexi                30
Thermotogae                26
Fusobacteria               25
Planctomycetes             22
Verrucomicrobia            17
Acidobacteria              16
Chlamydiae                 16
Synergistetes              16
Aquificae                  14
Chlorobi                   12
Thermodesulfobacteria       9
Nitrospirae                 9
Deferribacteres             5
Balneolaeota                4
Fibrobacteres               3
Elusimicrobia               3
Dictyoglomi                 2
Armatimonadetes             2
Coprothermobacterota        2
Gemmatimonadetes            2
Ignavibacteriae             2
Chrysiogenetes              2
Caldiserica                 1
Nitrospinae                 1
Lentisphaerae               1
Calditrich

### Which classes are in the phylum proteobacteria?

In [40]:

# Restrict to the Proteobacteria phylum, save that subset, then count rows per class.
Proteobacteria = Bacteria.loc[Bacteria["phylum"] == "Proteobacteria"]
Proteobacteria.to_csv(
    os.path.join(Genomespath, "ProteobacteriaGenomesDB.csv"),
    index=False,
)
Proteobacteria["class"].value_counts()


Gammaproteobacteria      809
Alphaproteobacteria      696
Betaproteobacteria       267
Deltaproteobacteria      134
Epsilonproteobacteria     78
Acidithiobacillia          4
Oligoflexia                3
Hydrogenophilalia          2
Zetaproteobacteria         2
Name: class, dtype: int64

### Which orders  are in the class gammaproteobacteria?

In [41]:
# Restrict to the Gammaproteobacteria class, save that subset, then count rows per order.
Gammaproteobacteria = Bacteria.loc[Bacteria["class"] == "Gammaproteobacteria"]
Gammaproteobacteria.to_csv(
    os.path.join(Genomespath, "GammaproteobacteriaGenomesDB.csv"),
    index=False,
)
Gammaproteobacteria["order"].value_counts()


Pseudomonadales          109
Alteromonadales          109
Oceanospirillales        102
Enterobacterales         101
Xanthomonadales           65
Vibrionales               61
Chromatiales              47
Legionellales             44
Thiotrichales             37
Cellvibrionales           29
Pasteurellales            24
Methylococcales           22
Aeromonadales             16
Nevskiales                11
Cardiobacteriales          5
Salinisphaerales           2
Orbales                    2
Acidiferrobacterales       1
Immundisolibacterales      1
Name: order, dtype: int64

### Which species are in the genus Pseudomonas?

In [42]:
# Restrict the Proteobacteria table to the genus Pseudomonas, save it,
# then count rows per genome name.
Pseudomonas = Proteobacteria.loc[Proteobacteria["genus"] == "Pseudomonas"]
Pseudomonas.to_csv(
    os.path.join(Genomespath, "PseudomonasGenomesDB.csv"),
    index=False,
)
Pseudomonas["name"].value_counts()

Pseudomonas aeruginosa PAO1                    1
Pseudomonas litoralis                          1
Pseudomonas guineae                            1
Pseudomonas marincola                          1
Pseudomonas sabulinigri                        1
Pseudomonas xinjiangensis                      1
Pseudomonas caeni DSM 24390                    1
Pseudomonas massiliensis                       1
Pseudomonas pelagia CL-AP6                     1
Pseudomonas benzenivorans                      1
Pseudomonas taeanensis MS-3                    1
Pseudomonas flexibilis                         1
Pseudomonas aestusnigri                        1
Pseudomonas pohangensis                        1
Pseudomonas formosensis                        1
Pseudomonas kuykendallii                       1
Pseudomonas zeshuii                            1
Pseudomonas sagittaria                         1
Pseudomonas guangdongensis                     1
Pseudomonas oryzae                             1
Pseudomonas hussaini

In [43]:
# Preview the first rows of the Pseudomonas subset.
Pseudomonas.head()

Unnamed: 0,id,worker_id,accession,version,version_number,genbank_accession,genbank_version,taxonomy_id,name,refseq_category,...,family,genus,species,strain,stats,meta,biosample_id,createdAt,updatedAt,WorkerId
85,360,,GCF_000006765,GCF_000006765.1,1,GCA_000006765,GCA_000006765.1,208964,Pseudomonas aeruginosa PAO1,reference genome,...,Pseudomonadaceae,Pseudomonas,Pseudomonas aeruginosa,PAO1,{},{},2603714,2018-03-28T14:09:25.281Z,2021-03-04T04:41:39.586Z,
88,363,,GCF_000237065,GCF_000237065.1,1,GCA_000237065,GCA_000237065.1,1114970,Pseudomonas fluorescens F113,representative genome,...,Pseudomonadaceae,Pseudomonas,Pseudomonas fluorescens,F113,{},{},2604099,2018-03-28T14:09:25.281Z,2021-03-04T04:41:43.817Z,
89,364,,GCF_000016565,GCF_000016565.1,1,GCA_000016565,GCA_000016565.1,399739,Pseudomonas mendocina ymp,representative genome,...,Pseudomonadaceae,Pseudomonas,Pseudomonas mendocina,ymp,{},{},2598392,2018-03-28T14:09:25.281Z,2021-03-04T04:41:44.944Z,
90,365,,GCF_000007565,GCF_000007565.2,2,GCA_000007565,GCA_000007565.2,160488,Pseudomonas putida KT2440,reference genome,...,Pseudomonadaceae,Pseudomonas,Pseudomonas putida,KT2440,{},{},2603999,2018-03-28T14:09:25.281Z,2021-03-04T04:41:46.314Z,
92,368,,GCF_000219605,GCF_000219605.1,1,GCA_000219605,GCA_000219605.1,316,Pseudomonas stutzeri,representative genome,...,Pseudomonadaceae,Pseudomonas,Pseudomonas stutzeri,,{},{},2603041,2018-03-28T14:09:25.281Z,2021-03-04T04:41:52.191Z,


In [44]:
# Rows whose species is "Pseudomonas putida". randomSubset() reads this global
# and appends it to its sample when called with add_PP=True (the default).
PP = Pseudomonas[Pseudomonas["species"] == "Pseudomonas putida"]

# Interacting with the Microbial signal transduction database (MISTDB) API
- The following class can be used to generate tables of two-component system (TCS) proteins, such as response regulators, for the genomes selected at a given taxonomic level

In [6]:
class get_MISTDB():
    """Client for the genome endpoints of the Microbial Signal Transduction
    Database (MiST) REST API, bound to one genome version.

    Parameters
    ----------
    GenomeAccession : str
        Versioned genome accession placed in the request path,
        e.g. ``"GCF_000006765.1"``.

    Notes
    -----
    ``get_GenePage`` locates a gene by index arithmetic on its numeric id,
    which presumes the ``/genes`` listing is sorted by ``id`` without gaps.
    The record found that way is now checked against the requested id and a
    page scan is used as a fallback, so a gap in the id sequence raises or
    recovers instead of silently returning a different gene.
    """

    PAGE_SIZE = 100  # records requested per API page

    def __init__(self, GenomeAccession):
        self.GenomeAccession = GenomeAccession
        self.conn = http.client.HTTPSConnection("api.mistdb.caltech.edu")  # connect to MIST API
        self._genePageCache = {}  # page number -> parsed "/genes" page

    def close(self):
        """Close the HTTPS connection (http.client reopens it on the next request)."""
        conn = getattr(self, "conn", None)
        if conn is not None:
            conn.close()

    def getGenomeAPI(self, APIinput = "/genes"):
        """GET ``/v1/genomes/<accession><APIinput>`` and return the parsed JSON.

        Raises
        ------
        RuntimeError
            If the server answers with an HTTP status of 400 or above.
        """
        path = "/v1/genomes/{}{}".format(self.GenomeAccession, APIinput)
        self.conn.request("GET", path)

        res = self.conn.getresponse()
        # Always drain the body so the keep-alive connection can be reused.
        data = res.read()
        if res.status >= 400:
            raise RuntimeError(
                "MiST API request {} failed: {} {}".format(path, res.status, res.reason)
            )
        return json.loads(data.decode("utf-8"))

    def _getGenePage(self, page):
        """Return one page of the ``/genes`` listing, fetching it at most once."""
        # setdefault keeps this working for instances built without __init__.
        cache = self.__dict__.setdefault("_genePageCache", {})
        if page not in cache:
            cache[page] = self.getGenomeAPI(
                APIinput = "/genes?page={}&per_page={}".format(page, self.PAGE_SIZE)
            )
        return cache[page]

    def get_GeneID(self, page = 1, idx = 0, dict_key = "id"):
        """Return one entry (or one field of it) from a page of the gene listing.

        Parameters
        ----------
        page : int
            1-based page number of the ``/genes`` listing.
        idx : int
            Position of the gene on that page.
        dict_key : str or None
            Field to return from the gene record; a falsy value returns the
            whole record.

        Returns
        -------
        The requested field or record, or ``None`` when the page has no entry
        at ``idx`` (previously an IndexError).
        """
        APIpage = self._getGenePage(page)
        # Negative indices keep their usual Python meaning, as before.
        if not isinstance(APIpage, list) or not -len(APIpage) <= idx < len(APIpage):
            return None
        gene = APIpage[idx]
        if not dict_key:
            return gene
        return gene[dict_key] if gene else None

    def _findGeneByID(self, geneID, lastPage):
        """Scan gene pages from ``lastPage`` down to 1 for a record whose id is ``geneID``."""
        for page in range(lastPage, 0, -1):
            genes = self._getGenePage(page)
            if not isinstance(genes, list) or not genes:
                continue
            for gene in genes:
                if gene.get("id") == geneID:
                    return gene
            # Presumes pages are sorted by id (the index arithmetic in
            # get_GenePage already depends on it): once a page starts at or
            # below the target, earlier pages cannot contain it.
            firstID = genes[0].get("id")
            if firstID is not None and firstID <= geneID:
                break
        return None

    def get_GenePage(self, geneID_N):
        """Return the gene record whose numeric id is ``geneID_N``.

        Raises
        ------
        LookupError
            If no record with that id can be found for this genome.
        """
        # id of the first gene on the first page
        geneID_0 = self.get_GeneID()
        if geneID_0 is None or geneID_N < geneID_0:
            raise LookupError(
                "gene id {} not found in genome {}".format(geneID_N, self.GenomeAccession)
            )

        # page on which the Nth id falls if ids are contiguous
        page = (geneID_N - geneID_0) // self.PAGE_SIZE + 1

        # id of the first gene on that page, then the offset within the page
        geneID_N_0 = self.get_GeneID(page = page)
        gene = None
        if geneID_N_0 is not None and geneID_N >= geneID_N_0:
            gene = self.get_GeneID(page = page, idx = geneID_N - geneID_N_0, dict_key = None)

        # Verify the hit; fall back to a scan when ids are not contiguous.
        if not gene or gene.get("id") != geneID_N:
            gene = self._findGeneByID(geneID_N, page)
        if gene is None:
            raise LookupError(
                "gene id {} not found in genome {}".format(geneID_N, self.GenomeAccession)
            )
        return gene

    @staticmethod
    def _classifyTCS(domains):
        """Map a signal gene's domain-count keys to a TCS type label, or None.

        Membership is an exact key match, so a key such as ``"HK_CA:Che"``
        does not count as ``"HK_CA"``.
        NOTE(review): confirm whether such sub-typed keys should count as HK.
        """
        if "RR" in domains:
            if "HK_CA" in domains:
                return "Hybrid_HK"        # receiver + kinase catalytic domain
            if "Hpt" in domains:
                return "Hybrid_HK_Hpt"    # receiver + Hpt domain
            if "HisKA" in domains:
                return "Hybrid_HK_HisKA"  # receiver + HisKA domain
            return "RR"                   # receiver without any of the above
        if "HK_CA" in domains:
            return "HK"                   # kinase catalytic domain, no receiver
        return None

    def make_TCSdf(self):
        """Build a table of the genome's two-component-system signal genes.

        Pages through ``/signal-genes`` until a page shorter than
        ``PAGE_SIZE`` is returned, keeps the genes that ``_classifyTCS``
        labels, and looks up each gene's ``stable_id`` and ``aseq_id``.

        Returns
        -------
        pandas.DataFrame
            Object-dtype columns ``TCStype, gene_id, domain_counts,
            stable_id, aseq_id``; empty when the genome has no such genes.
            Rows are in reverse processing order, as before.
        """
        records = []
        pageNum, pageLen = 1, self.PAGE_SIZE

        try:
            while pageLen == self.PAGE_SIZE:
                signalGenes = self.getGenomeAPI(
                    APIinput = "/signal-genes?page={}&per_page={}".format(pageNum, self.PAGE_SIZE)
                )
                pageLen = len(signalGenes)
                pageNum += 1

                for signalGene_meta in signalGenes:
                    TCStype = self._classifyTCS(signalGene_meta["counts"])
                    if TCStype is None:
                        continue

                    genepage = self.get_GenePage(signalGene_meta["gene_id"])
                    records.append({
                        "TCStype": TCStype,
                        "gene_id": signalGene_meta["gene_id"],
                        "domain_counts": signalGene_meta["counts"],
                        "stable_id": genepage["stable_id"],
                        "aseq_id": genepage["aseq_id"],
                    })
        finally:
            self.close()

        # One constructor call instead of a concat per row; dtype=object and the
        # reversed order reproduce what the row-by-row prepend used to yield.
        return pd.DataFrame(records[::-1], dtype=object)
                

# test function with a GCF id from the Bacteria dataframe
# (performs live requests against the MiST API)
TCS = get_MISTDB("GCF_000006765.1").make_TCSdf()
TCS.head()

Unnamed: 0,TCStype,gene_id,domain_counts,stable_id,aseq_id
0,RR,1069051,"{'RR': 1, 'GerE': 1}",GCF_000006765.1-PA0034,suqpcEUaLpnkC-9T0BXN8A
1,RR,1069196,{'RR': 1},GCF_000006765.1-PA0179,nWGyZzSDubUX1v4GivsVgQ
2,RR,1069428,{'RR': 1},GCF_000006765.1-PA0408,zCQ1gf5_ysbDseV0AQoN5g
3,RR,1069429,{'RR': 1},GCF_000006765.1-PA0409,2W8pAYbNBLYSGjEkKKs6cg
4,Hybrid_HK_Hpt,1069433,"{'RR': 1, 'Hpt': 6, 'CheW': 1, 'HK_CA:Che': 1}",GCF_000006765.1-PA0413,HykcXflWuBa56iCvZG98DA


In [7]:
def makeSeqDF(TCS):
    """Fetch protein sequences for TCS genes and cut out the domain region.

    For every row of ``TCS`` the amino-acid record ``/v1/aseqs/<aseq_id>`` is
    requested and its ``pfam31`` predictions are scanned:

    * ``TCStype == "RR"``: the envelope of the ``Response_reg`` hit;
    * ``TCStype`` ``"HK"`` or ``"Hybrid_HK"``: from the ``env_from`` of a hit
      whose name contains ``HisKA`` to the ``env_to`` of ``HATPase_c``;
    * any other type gets no bounds.

    When several hits match, the last one wins. Genes without both bounds
    are dropped.

    Parameters
    ----------
    TCS : pandas.DataFrame
        Needs the columns ``gene_id``, ``aseq_id`` and ``TCStype``.

    Returns
    -------
    pandas.DataFrame
        ``gene_id, env_from, env_to, sequence, domain_seq`` left-merged with
        ``TCS`` on ``gene_id``. Empty (with those columns) when no gene had
        both bounds, instead of raising KeyError.

    Raises
    ------
    RuntimeError
        If the API answers with an HTTP status of 400 or above.
    """
    records = []
    conn = http.client.HTTPSConnection("api.mistdb.caltech.edu")

    try:
        for gene_id, aseq_id, TCStype in zip(TCS.gene_id, TCS.aseq_id, TCS.TCStype):
            path = "/v1/aseqs/{}".format(aseq_id)
            conn.request("GET", path)
            res = conn.getresponse()
            data = res.read()
            if res.status >= 400:
                raise RuntimeError(
                    "MiST API request {} failed: {} {}".format(path, res.status, res.reason)
                )
            aseq = json.loads(data.decode("utf-8"))

            seqDict = {"gene_id": gene_id}
            # "pfam31" may be missing or null for a sequence without predictions.
            for pfam in aseq.get("pfam31") or []:
                name = pfam["name"]
                if TCStype == "RR":
                    if name == "Response_reg":
                        seqDict["env_from"], seqDict["env_to"] = pfam["env_from"], pfam["env_to"]
                elif TCStype in ("HK", "Hybrid_HK"):
                    if name == "HATPase_c":
                        seqDict["env_to"] = pfam["env_to"]
                    elif "HisKA" in name:
                        seqDict["env_from"] = pfam["env_from"]
            seqDict["sequence"] = aseq["sequence"]
            records.append(seqDict)
    finally:
        conn.close()

    # Build the frame once. Fixed columns guarantee the env_* columns exist
    # even if no record set them; dtype=object keeps the bounds as Python ints
    # (a float column could not be used for slicing). The reversed order
    # matches the former row-by-row prepend.
    dfSeq = pd.DataFrame(
        records[::-1],
        columns = ["gene_id", "env_from", "env_to", "sequence"],
        dtype = object,
    )
    dfSeq = dfSeq.dropna(subset = ["env_to", "env_from"])

    # NOTE(review): HMMER-style envelope coordinates are usually 1-based and
    # inclusive; if that holds for MiST, this slice omits the first residue of
    # the domain and should be sequence[env_from - 1:env_to]. Left unchanged
    # until confirmed against the API documentation.
    dfSeq = dfSeq.assign(
        domain_seq = [
            sequence[env_from:env_to]
            for sequence, env_from, env_to in zip(dfSeq["sequence"], dfSeq["env_from"], dfSeq["env_to"])
        ]
    )

    return pd.merge(dfSeq, TCS, on = "gene_id", how = "left")
    

In [8]:
def TCSdomains(DB):
    """Collect TCS domain sequences for every genome version listed in ``DB``.

    For each value of ``DB.version`` the TCS gene table is built with
    ``get_MISTDB(version).make_TCSdf()``, tagged with a ``version`` column and
    passed to ``makeSeqDF``. Genomes without TCS genes are skipped.

    Parameters
    ----------
    DB : pandas.DataFrame
        Must have a ``version`` column of genome versions.

    Returns
    -------
    pandas.DataFrame
        All per-genome tables stacked (most recently processed genome first,
        columns in sorted order). An empty frame is returned when no genome
        produced a table; previously that case raised AttributeError on
        ``df.TCStype``.
    """
    frames = []
    for version in DB.version:
        dfTCS = get_MISTDB(version).make_TCSdf()
        if dfTCS.empty:
            continue
        dfTCS["version"] = version
        frames.append(makeSeqDF(TCS = dfTCS))

    if not frames:
        return pd.DataFrame()

    # Single concat instead of one per genome. Reversing the list and sorting
    # the columns reproduces the layout the repeated prepend with sort=True gave.
    df = pd.concat(frames[::-1], sort = True)
    df = df[sorted(df.columns)]

    print(df.TCStype.value_counts())
    print(df.version.value_counts())
    return df


### Sampling a subset of the species based on taxonomic level
 - Genome availability differs widely between taxa (e.g. Gammaproteobacteria are better represented than any other class in the phylum Proteobacteria). To limit the resulting bias, we take a subset of the data by randomly sampling species within each group at the chosen taxonomic level, using the function below
 - We are not considering genomes with assembly level "contig" 

In [9]:
def randomSubset(df = None, groupby = "phylum", frac = 0.1, second_frac=None, add_PP = True, random_state = None):
    """Draw a random sample of genomes from every group of a taxonomic level.

    Steps
    -----
    1. Drop rows with no value in ``groupby`` and rows whose
       ``assembly_level`` is "contig" (compared case-insensitively).
    2. Sample ``frac`` of each group.
    3. For groups that received no row in step 2: if ``second_frac`` is set,
       sample that fraction of each; then, in every case, add one more random
       row per such group so each group is represented.
    4. Keep one row per value of the next finer level (phylum -> class ->
       order -> genus -> species).
    5. If ``add_PP`` is true, append the global ``PP`` frame.
    6. Drop duplicate ``version`` values and sort by group and finer level.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Genome table; defaults to the global ``Bacteria`` frame, looked up at
        call time. The frame passed in is not modified.
    groupby : {"phylum", "class", "order", "genus"}
        Column that defines the groups.
    frac : float
        Fraction of each group drawn in the first pass.
    second_frac : float, optional
        Fraction drawn from groups the first pass left empty.
    add_PP : bool
        Append the rows of the global ``PP`` frame to the sample.
    random_state : int, optional
        Seed forwarded to every ``sample`` call to make the draw reproducible.

    Returns
    -------
    pandas.DataFrame
        The sampled rows; the former index is kept in an ``index`` column.

    Raises
    ------
    KeyError
        If ``groupby`` is not one of the supported levels.
    """
    # pull a random subset of species from indicated taxonomic level
    sorterDict = {"phylum":"class","class":"order", "order":"genus", "genus":"species"}
    if groupby not in sorterDict:
        raise KeyError("groupby must be one of {}, got {!r}".format(sorted(sorterDict), groupby))
    finer = sorterDict[groupby]

    if df is None:
        df = Bacteria

    # Work on new frames: the in-place dropna used to modify the caller's
    # table and triggered SettingWithCopyWarning when given a slice.
    df = df.dropna(subset = [groupby], axis=0)
    df = df[df["assembly_level"].astype(str).str.lower() != "contig"]

    sampled = df.groupby(groupby).sample(frac=frac, random_state=random_state).reset_index()

    # groups for which the first pass drew nothing
    drawn = set(sampled[groupby].unique())
    excluded = [group for group in df[groupby].unique() if group not in drawn]
    if excluded:
        missed = df[df[groupby].isin(excluded)]
        if second_frac:
            second = missed.groupby(groupby).sample(frac=second_frac, random_state=random_state).reset_index()
            sampled = pd.concat([sampled, second])
        # Always add one row per missed group, even after the second pass;
        # duplicates are removed below.
        single = missed.groupby(groupby).sample(n=1, random_state=random_state).reset_index()
        sampled = pd.concat([sampled, single])

    sampled = sampled.drop_duplicates(subset = finer)
    if add_PP:
        sampled = pd.concat([sampled, PP], sort = True)
    sampled = sampled.drop_duplicates(subset = "version")
    sampled = sampled.sort_values(by = [groupby, finer])

    return sampled
# Quick check: rows sampled per phylum (10% first pass, 50% for phyla the first pass missed).
# add_PP is left at its default (True), so the global PP frame must already be defined.
randomSubset(df = Bacteria, groupby = "phylum", frac=0.1, second_frac = 0.5).value_counts("phylum")

NameError: name 'PP' is not defined

In [137]:
# Run label embedded in the output file names written by the cells below.
iteration = 4

#### Pseudomonas

In [138]:
# Run the TCS pipeline on every row of the Pseudomonas table (no sub-sampling).
DBname = "Pseudomonas"
DB = Pseudomonas

df = TCSdomains(DB=DB)

# Save the resulting domain table and the genome list it was built from.
df.to_csv(
    os.path.join(SampledTCSpath, f"TCSPlaygound_{DBname}DB_{iteration}.csv"),
    index=False,
)
DB.to_csv(
    os.path.join(SampledGenomespath, f"Sampled{DBname}DB_{iteration}.csv"),
    index=False,
)

RR           2859
HK           1340
Hybrid_HK     796
Name: TCStype, dtype: int64
GCF_000237065.1    156
GCF_000397205.1    153
GCF_900108595.1    144
GCF_900111835.1    142
GCF_001654435.1    136
GCF_001913135.1    130
GCF_000007565.2    129
GCF_002091755.1    129
GCF_000012245.1    128
GCF_000007805.1    124
GCF_000826105.1    123
GCF_000006765.1    122
GCF_002091635.1    120
GCF_000213805.1    118
GCF_000759445.1    118
GCF_900141925.1    116
GCF_900100495.1    112
GCF_000425625.1    110
GCF_000498575.2    110
GCF_000412695.1    109
GCF_000761155.1    108
GCF_900106975.1    107
GCF_001534745.1    102
GCF_000016565.1    101
GCF_900115555.1    100
GCF_000219605.1     99
GCF_000818015.1     98
GCF_900103845.1     98
GCF_900116605.1     89
GCF_001597285.1     89
GCF_900105255.1     86
GCF_000756775.1     84
GCF_900105355.1     84
GCF_000410875.1     82
GCF_002197985.1     82
GCF_900104805.1     82
GCF_900115715.1     82
GCF_900114765.1     81
GCF_900113745.1     80
GCF_900109735.1     7

#### Proteobacteria

In [158]:
# Keep the first row listed for each genus, then sample 5% of every class.
ProteobacteriaSubset = Proteobacteria.drop_duplicates(subset="genus")
randomSubsetClass = randomSubset(
    df=ProteobacteriaSubset,
    groupby="class",
    frac=0.05,
)
randomSubsetClass["class"].value_counts()

Gammaproteobacteria      8
Alphaproteobacteria      4
Deltaproteobacteria      4
Betaproteobacteria       3
Acidithiobacillia        1
Epsilonproteobacteria    1
Hydrogenophilalia        1
Oligoflexia              1
Zetaproteobacteria       1
Name: class, dtype: int64

In [47]:
%%time
# Run the TCS pipeline on the class-level Proteobacteria sample.
DBname = "Proteobacteria"
DB = randomSubsetClass

df = TCSdomains(DB=DB)

# Save the resulting domain table and the sampled genome list.
df.to_csv(
    os.path.join(SampledTCSpath, f"TCSPlaygound_{DBname}DB_{iteration}.csv"),
    index=False,
)
DB.to_csv(
    os.path.join(SampledGenomespath, f"Sampled{DBname}DB_{iteration}.csv"),
    index=False,
)

RR           968
HK           418
Hybrid_HK    284
Name: TCStype, dtype: int64
GCF_001483865.1    178
GCF_001184205.1    164
GCF_900099695.1    145
GCF_000007565.2    129
GCF_000828835.1    125
GCF_900104445.1    117
GCF_001010405.1     99
GCF_000423825.1     74
GCF_000210915.2     65
GCF_000153765.1     63
GCF_001975225.1     59
GCF_000154705.2     57
GCF_000710775.1     55
GCF_000152725.1     46
GCF_000008465.1     46
GCF_001661675.2     45
GCF_900112605.1     44
GCF_000425565.1     40
GCF_001697225.1     27
GCF_000186245.1     26
GCF_000510805.1     18
GCF_000600005.1     16
GCF_002079945.1     16
GCF_001888055.1     10
GCF_000024505.1      6
Name: version, dtype: int64
CPU times: user 47.4 s, sys: 2.06 s, total: 49.5 s
Wall time: 6min 58s


#### Bacteria

In [48]:
# Keep the first row listed for each family, then sample 25% of every phylum.
BacteriaSubset = Bacteria.drop_duplicates(subset="family")
randomSubsetPhylum = randomSubset(
    df=BacteriaSubset,
    groupby="phylum",
    frac=0.25,
)
randomSubsetPhylum["phylum"].value_counts()

Proteobacteria           7
Firmicutes               4
Bacteroidetes            4
Actinobacteria           3
Chloroflexi              3
Cyanobacteria            2
Tenericutes              1
Synergistetes            1
Spirochaetes             1
Thermodesulfobacteria    1
Planctomycetes           1
Fusobacteria             1
Nitrospirae              1
Nitrospinae              1
Lentisphaerae            1
Kiritimatiellaeota       1
Ignavibacteriae          1
Thermotogae              1
Gemmatimonadetes         1
Acidobacteria            1
Elusimicrobia            1
Fibrobacteres            1
Dictyoglomi              1
Deinococcus-Thermus      1
Deferribacteres          1
Coprothermobacterota     1
Chrysiogenetes           1
Chlorobi                 1
Chlamydiae               1
Calditrichaeota          1
Caldiserica              1
Balneolaeota             1
Armatimonadetes          1
Aquificae                1
Verrucomicrobia          1
Name: phylum, dtype: int64

In [49]:
%%time
DB = randomSubsetPhylum
# Exclude the genome version that is also filtered out of the Chloroflexi table.
# A boolean mask is used instead of DB.drop(<index labels>): randomSubset
# concatenates frames without renumbering, so index labels can repeat and
# drop() could remove unrelated rows.
DB = DB[DB.version != "GCF_000178855.1"]

DBname = "Bacteria"
# Run the pipeline on the filtered table (the unfiltered randomSubsetPhylum was
# passed here before, so the exclusion above had no effect on the request loop).
df = TCSdomains(DB = DB)

df.to_csv(os.path.join(SampledTCSpath, f"TCSPlaygound_{DBname}DB_{iteration}.csv"), index = False)
DB.to_csv(os.path.join(SampledGenomespath, f"Sampled{DBname}DB_{iteration}.csv"), index = False)

RR           1875
HK            943
Hybrid_HK     447
Name: TCStype, dtype: int64
GCF_000219105.1    240
GCF_000172555.1    232
GCF_000423665.1    203
GCF_001306135.1    158
GCF_900099695.1    145
GCF_000934435.1    141
GCF_001886815.1    132
GCF_000007565.2    129
GCF_000012325.1    105
GCF_000010305.1     99
GCF_000279145.1     90
GCF_000379805.1     88
GCF_000724625.1     80
GCF_000469585.1     79
GCF_000196175.1     75
GCF_000423825.1     74
GCF_000011385.1     73
GCF_900101745.1     70
GCF_000022565.1     68
GCF_000165715.2     67
GCF_000341545.2     65
GCF_000474745.1     62
GCF_000024985.1     61
GCF_000375465.1     49
GCF_000218625.1     47
GCF_900167165.1     42
GCF_900128955.1     42
GCF_000011905.1     40
GCF_001294365.1     39
GCF_000736515.1     39
GCF_000284095.1     36
GCF_000661895.1     35
GCF_000190535.1     34
GCF_000284315.1     32
GCF_000426825.1     30
GCF_000092425.1     27
GCF_000170755.1     27
GCF_000158195.2     26
GCF_001017655.1     26
GCF_000734015.1     2

## Alignment of REC domains
- This analysis depends on the alignment of the REC domain only
- *Note that aligning full length sequences will result in massively gapped regions and alignment bias*
- Below we use hmmalign to align the RR sequences against the Response_reg profile HMM (`Response_reg.hmm`)



In [19]:
def Align(df, DBname, TCStypes = ["RR", "HK"], iteration = 2, min_length = 80):
    """Write one FASTA file per TCS type and align it with ``hmmalign``.

    For each entry of ``TCStypes`` the rows of that type are selected (for
    ``"HK"`` the ``"Hybrid_HK"`` rows are added), their full-length
    ``sequence`` is written to
    ``Alignment_Files/Full_<type>_<DBname>_<iteration>.fasta`` and aligned
    against ``Source_Files/Response_reg.hmm`` into
    ``Alignment_Files/<type>_<DBname>_hmmAlign_<iteration>.fasta``.
    Both folders are resolved relative to the current working directory.

    Changes with respect to the previous version:

    * The earlier first pass wrote ``domain_seq`` to the same FASTA path and
      was then overwritten by the full-sequence pass; that dead write is gone,
      so the files left on disk are the same as before.
      NOTE(review): if a domain-only alignment was intended, it needs its own
      output path.
    * ``df.drop(...)`` results were discarded. The returned frame now really
      omits the records that were left out of the FASTA files.
    * A non-zero ``hmmalign`` exit status raises instead of being reported as
      "complete".

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``TCStype``, ``stable_id`` and ``sequence``.
        It is not modified.
    DBname : str
        Label used in the file names.
    TCStypes : sequence of str
        TCS types to process.
    iteration : int
        Run label used in the file names.
    min_length : int
        Sequences must be longer than this to be written (default 80).

    Returns
    -------
    pandas.DataFrame
        ``df`` without the rows whose ``stable_id`` was skipped because the
        sequence was missing or not longer than ``min_length``.

    Raises
    ------
    RuntimeError
        If ``hmmalign`` exits with a non-zero status.
    """
    import subprocess  # only needed here; avoids relying on an import in another cell

    alignDir = os.path.join(os.getcwd(), "Alignment_Files")
    os.makedirs(alignDir, exist_ok = True)
    hmmfile = os.path.join(os.getcwd(), "Source_Files/", "Response_reg.hmm")

    skipped = set()
    for TCStype in TCStypes:
        dfsub = df[df.TCStype == TCStype]
        if TCStype == "HK":
            dfsub = pd.concat([dfsub, df[df.TCStype == "Hybrid_HK"]])

        seqfile = os.path.join(alignDir, f"Full_{TCStype}_{DBname}_{iteration}.fasta")
        alignment_output = os.path.join(alignDir, f"{TCStype}_{DBname}_hmmAlign_{iteration}.fasta")

        with open(seqfile, "w") as fasta:
            for stable_id, sequence in zip(dfsub.stable_id, dfsub.sequence):
                # A value read back from CSV can be NaN (a float) rather than a string.
                if isinstance(sequence, str) and len(sequence) > min_length:
                    fasta.write(">" + stable_id + "\n" + sequence + "\n")
                else:
                    skipped.add(stable_id)

        # Argument list instead of a shell string: paths with spaces stay intact.
        command = ["hmmalign", "-o", alignment_output, "--trim", "--amino",
                   "--outformat", "afa", hmmfile, seqfile]
        print(" ".join(command))
        result = subprocess.run(command, stdout = subprocess.PIPE, stderr = subprocess.PIPE,
                                universal_newlines = True)
        if result.stdout:
            print(result.stdout, end = "")
        if result.returncode != 0:
            raise RuntimeError(
                "hmmalign failed for {} (exit status {}): {}".format(
                    TCStype, result.returncode, result.stderr)
            )
        print("Alignment for {} complete!".format(TCStype))

    return df[~df.stable_id.isin(skipped)]

In [54]:
# Labels of the sampled TCS tables written by the cells above.
DBs = [
    "Pseudomonas",
    "Proteobacteria", 
    "Bacteria"
]

# Note: DB is rebound to a string label here (earlier cells use it for a DataFrame).
for DB in DBs:
    # Reload the saved table for the current value of `iteration`.
    df = pd.read_csv(os.path.join(SampledTCSpath, f"TCSPlaygound_{DB}DB_{iteration}.csv"))

    # Align response-regulator sequences only.
    Align(df, DBname = DB , TCStypes = ["RR"], iteration = iteration)

hmmalign -o /opt/jupyterhub/shared_notebooks/m-group/ALI/MG_stuff/RR_bioinformatics/TCSplayground/Alignment_Files/RR_Pseudomonas_hmmAlign_3.fasta --trim --amino --outformat afa /opt/jupyterhub/shared_notebooks/m-group/ALI/MG_stuff/RR_bioinformatics/TCSplayground/Source_Files/Response_reg.hmm /opt/jupyterhub/shared_notebooks/m-group/ALI/MG_stuff/RR_bioinformatics/TCSplayground/Alignment_Files/Full_RR_Pseudomonas_3.fasta
Alignment for RR complete!
hmmalign -o /opt/jupyterhub/shared_notebooks/m-group/ALI/MG_stuff/RR_bioinformatics/TCSplayground/Alignment_Files/RR_Proteobacteria_hmmAlign_3.fasta --trim --amino --outformat afa /opt/jupyterhub/shared_notebooks/m-group/ALI/MG_stuff/RR_bioinformatics/TCSplayground/Source_Files/Response_reg.hmm /opt/jupyterhub/shared_notebooks/m-group/ALI/MG_stuff/RR_bioinformatics/TCSplayground/Alignment_Files/Full_RR_Proteobacteria_3.fasta
Alignment for RR complete!
hmmalign -o /opt/jupyterhub/shared_notebooks/m-group/ALI/MG_stuff/RR_bioinformatics/TCSplaygro

## Generate databases for other Phyla
- Chloroflexi
- Firmicutes
- Bacteroidetes
- Actinobacteria

In [56]:
# Phyla handled by the cells below; this list is iterated later to align each saved table.
Other_Phyla = ["Chloroflexi", "Firmicutes", "Bacteroidetes", "Actinobacteria" ]
# Run label used in the output file names from here on (replaces the earlier value).
iteration = 2

#### Chloroflexi

In [57]:
# Chloroflexi rows, minus one genome version that could not be processed.
Chloroflexi = Bacteria.loc[Bacteria["phylum"] == "Chloroflexi"]
Chloroflexi = Chloroflexi.loc[Chloroflexi["version"] != "GCF_000178855.1"]  # this genome wasn't working for some reason

# First row per genus, then sample per class (frac = 1) without appending PP.
ChloroflexiSubset = Chloroflexi.drop_duplicates(subset="genus")
randomSubsetClass = randomSubset(
    df=ChloroflexiSubset,
    groupby="class",
    frac=1,
    add_PP=False,
)
print(randomSubsetClass["class"].value_counts())

DBname = "Chloroflexi"
DB = randomSubsetClass

df = TCSdomains(DB=DB)

# Save the phylum table, the TCS domain table and the sampled genome list.
Chloroflexi.to_csv(os.path.join(Genomespath, f"{DBname}DB.csv"), index=False)
df.to_csv(
    os.path.join(SampledTCSpath, f"TCSPlaygound_{DBname}DB_{iteration}.csv"),
    index=False,
)
DB.to_csv(
    os.path.join(SampledGenomespath, f"Sampled{DBname}DB_{iteration}.csv"),
    index=False,
)

Chloroflexia       2
Dehalococcoidia    2
Thermomicrobia     2
Anaerolineae       1
Ardenticatenia     1
Caldilineae        1
Ktedonobacteria    1
Thermoflexia       1
Name: class, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


RR           487
HK           301
Hybrid_HK     63
Name: TCStype, dtype: int64
GCF_001306135.1    158
GCF_000017805.1    122
GCF_001748285.1     99
GCF_001050235.1     93
GCF_001293545.1     79
GCF_000281175.1     66
GCF_000024985.1     61
GCF_900187885.1     61
GCF_000143165.1     49
GCF_000011905.1     40
GCF_000021685.1     23
Name: version, dtype: int64


#### Firmicutes

In [58]:
# Firmicutes rows of the genome table.
Firmicutes = Bacteria.loc[Bacteria["phylum"] == "Firmicutes"]

# First row per genus, then sample per class (frac = 1) without appending PP.
FirmicutesSubset = Firmicutes.drop_duplicates(subset="genus")
randomSubsetClass = randomSubset(
    df=FirmicutesSubset,
    groupby="class",
    frac=1,
    add_PP=False,
)
print(randomSubsetClass["class"].value_counts())

DBname = "Firmicutes"
DB = randomSubsetClass

df = TCSdomains(DB=DB)

# Save the phylum table, the TCS domain table and the sampled genome list.
Firmicutes.to_csv(os.path.join(Genomespath, f"{DBname}DB.csv"), index=False)
df.to_csv(
    os.path.join(SampledTCSpath, f"TCSPlaygound_{DBname}DB_{iteration}.csv"),
    index=False,
)
DB.to_csv(
    os.path.join(SampledGenomespath, f"Sampled{DBname}DB_{iteration}.csv"),
    index=False,
)

Clostridia          4
Negativicutes       3
Bacilli             2
Tissierellia        2
Erysipelotrichia    1
Limnochordia        1
Name: class, dtype: int64
RR           303
HK           154
Hybrid_HK      2
Name: TCStype, dtype: int64
GCF_000173815.1    113
GCF_000219125.1     60
GCF_001544015.1     54
GCF_000497245.1     50
GCF_000213255.1     42
GCF_000020005.1     34
GCF_000020485.1     21
GCF_900120165.1     18
GCF_000154285.1     18
GCF_000425865.1     18
GCF_900059565.1     15
GCF_900103425.1      8
GCF_900112895.1      8
Name: version, dtype: int64


#### Bacteroidetes

In [59]:
# Bacteroidetes rows of the genome table.
Bacteroidetes = Bacteria.loc[Bacteria["phylum"] == "Bacteroidetes"]

# First row per genus, then sample per class (frac = 1) without appending PP.
BacteroidetesSubset = Bacteroidetes.drop_duplicates(subset="genus")
randomSubsetClass = randomSubset(
    df=BacteroidetesSubset,
    groupby="class",
    frac=1,
    add_PP=False,
)
print(randomSubsetClass["class"].value_counts())

DBname = "Bacteroidetes"
DB = randomSubsetClass

df = TCSdomains(DB=DB)

# Save the phylum table, the TCS domain table and the sampled genome list.
Bacteroidetes.to_csv(os.path.join(Genomespath, f"{DBname}DB.csv"), index=False)
df.to_csv(
    os.path.join(SampledTCSpath, f"TCSPlaygound_{DBname}DB_{iteration}.csv"),
    index=False,
)
DB.to_csv(
    os.path.join(SampledGenomespath, f"Sampled{DBname}DB_{iteration}.csv"),
    index=False,
)

Bacteroidia         2
Chitinophagia       1
Cytophagia          1
Flavobacteriia      1
Saprospiria         1
Sphingobacteriia    1
Name: class, dtype: int64
RR           264
HK           135
Hybrid_HK     67
Name: TCStype, dtype: int64
GCF_000759025.1    94
GCF_900106985.1    82
GCF_000143765.1    77
GCF_900141875.1    76
GCF_000265405.1    63
GCF_001439665.1    57
GCF_002201515.1    17
Name: version, dtype: int64


#### Actinobacteria

In [60]:
# Actinobacteria rows of the genome table.
Actinobacteria = Bacteria.loc[Bacteria["phylum"] == "Actinobacteria"]

# First row per genus, then sample 10% of every class without appending PP.
ActinobacteriaSubset = Actinobacteria.drop_duplicates(subset="genus")
randomSubsetClass = randomSubset(
    df=ActinobacteriaSubset,
    groupby="class",
    frac=0.1,
    add_PP=False,
)
print(randomSubsetClass["class"].value_counts())

DBname = "Actinobacteria"
DB = randomSubsetClass

df = TCSdomains(DB=DB)

# Save the phylum table, the TCS domain table and the sampled genome list.
Actinobacteria.to_csv(os.path.join(Genomespath, f"{DBname}DB.csv"), index=False)
df.to_csv(
    os.path.join(SampledTCSpath, f"TCSPlaygound_{DBname}DB_{iteration}.csv"),
    index=False,
)
DB.to_csv(
    os.path.join(SampledGenomespath, f"Sampled{DBname}DB_{iteration}.csv"),
    index=False,
)

Actinobacteria     7
Acidimicrobiia     1
Coriobacteriia     1
Nitriliruptoria    1
Rubrobacteria      1
Thermoleophilia    1
Name: class, dtype: int64
RR           509
HK           240
Hybrid_HK     37
Name: TCStype, dtype: int64
GCF_000284295.1    187
GCF_000969705.1    100
GCF_000504285.1     96
GCF_000519325.1     79
GCF_000062885.1     67
GCF_900105065.1     58
GCF_900129455.1     45
GCF_000210055.1     39
GCF_000949295.1     37
GCF_000661895.1     35
GCF_001887245.1     24
GCF_001941425.1     19
Name: version, dtype: int64


In [70]:
DBs = Other_Phyla

# Note: DB is rebound to a string label here (earlier cells use it for a DataFrame).
for DB in DBs:
    # The f-string is already fully interpolated. The extra .format() that used
    # to follow it did nothing useful and would raise if the path ever
    # contained a brace.
    df = pd.read_csv(os.path.join(SampledTCSpath, f"TCSPlaygound_{DB}DB_{iteration}.csv"))

    # Align response-regulator sequences only.
    Align(df, DBname = DB , TCStypes = ["RR"], iteration = iteration)

hmmalign -o /opt/jupyterhub/shared_notebooks/m-group/ALI/MG_stuff/RR_bioinformatics/TCSplayground/Alignment_Files/RR_Chloroflexi_hmmAlign_2.fasta --trim --amino --outformat afa /opt/jupyterhub/shared_notebooks/m-group/ALI/MG_stuff/RR_bioinformatics/TCSplayground/Source_Files/Response_reg.hmm /opt/jupyterhub/shared_notebooks/m-group/ALI/MG_stuff/RR_bioinformatics/TCSplayground/Alignment_Files/Full_RR_Chloroflexi_2.fasta
Alignment for RR complete!
hmmalign -o /opt/jupyterhub/shared_notebooks/m-group/ALI/MG_stuff/RR_bioinformatics/TCSplayground/Alignment_Files/RR_Firmicutes_hmmAlign_2.fasta --trim --amino --outformat afa /opt/jupyterhub/shared_notebooks/m-group/ALI/MG_stuff/RR_bioinformatics/TCSplayground/Source_Files/Response_reg.hmm /opt/jupyterhub/shared_notebooks/m-group/ALI/MG_stuff/RR_bioinformatics/TCSplayground/Alignment_Files/Full_RR_Firmicutes_2.fasta
Alignment for RR complete!
hmmalign -o /opt/jupyterhub/shared_notebooks/m-group/ALI/MG_stuff/RR_bioinformatics/TCSplayground/Alig

# Generate Databases for classes in Proteobacteria
 - Alphaproteobacteria
 - Betaproteobacteria
 - Gammaproteobacteria

In [62]:
# Proteobacteria classes that get their own sampled database below.
Proteo_classes = [
    "Alphaproteobacteria",
    "Betaproteobacteria",
    "Gammaproteobacteria",
]
# Suffix used in the output file names written by the following cells.
iteration = 2

##### Alphaproteobacteria

In [63]:
# Build the Alphaproteobacteria genome table, draw a per-order sample from it,
# run TCSdomains on the sample, and write all three tables to disk.

# Every genome whose class column equals "Alphaproteobacteria".
Alphaproteobacteria = Bacteria.loc[Bacteria["class"] == "Alphaproteobacteria"]

# One row per genus (the first occurrence is kept) before sampling.
AlphaproteobacteriaSubset = Alphaproteobacteria.drop_duplicates(subset="genus")

# NOTE(review): randomSubset is defined elsewhere; the meaning of
# `second_frac` is not visible here -- confirm against its definition.
randomSubsetOrder = randomSubset(
    df=AlphaproteobacteriaSubset,
    groupby="order",
    frac=0.05,
    second_frac=0.25,
    add_PP=False,
)
print(randomSubsetOrder["order"].value_counts())

DBname = "Alphaproteobacteria"
DB = randomSubsetOrder

df = TCSdomains(DB=DB)

# Full class table, TCSdomains result, and the sampled genomes.
Alphaproteobacteria.to_csv(
    os.path.join(Genomespath, f"{DBname}DB.csv"),
    index=False,
)
df.to_csv(
    os.path.join(SampledTCSpath, f"TCSPlaygound_{DBname}DB_{iteration}.csv"),
    index=False,
)
DB.to_csv(
    os.path.join(SampledGenomespath, f"Sampled{DBname}DB_{iteration}.csv"),
    index=False,
)

Rhodobacterales     5
Rhizobiales         4
Caulobacterales     2
Rhodospirillales    2
Rickettsiales       2
Holosporales        1
Kiloniellales       1
Kordiimonadales     1
Magnetococcales     1
Parvularculales     1
Sneathiellales      1
Sphingomonadales    1
Name: order, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


RR           797
HK           409
Hybrid_HK    252
Name: TCStype, dtype: int64
GCF_002109495.1    151
GCF_001746755.1    126
GCF_000204015.1    125
GCF_000374005.1     95
GCF_000018545.1     85
GCF_001484065.1     84
GCF_000383415.1     81
GCF_000739895.2     75
GCF_000496075.1     74
GCF_000375545.1     72
GCF_000420665.1     71
GCF_002208825.2     71
GCF_900188425.1     62
GCF_900172325.1     60
GCF_900172315.1     49
GCF_000429365.1     49
GCF_900199215.1     44
GCF_000378465.1     43
GCF_000325745.1     16
GCF_001192655.1     11
GCF_000063545.1      8
GCF_000026005.1      6
Name: version, dtype: int64


##### Betaproteobacteria

In [64]:
# Build the Betaproteobacteria genome table, draw a per-order sample from it,
# run TCSdomains on the sample, and write all three tables to disk.

# Every genome whose class column equals "Betaproteobacteria".
Betaproteobacteria = Bacteria.loc[Bacteria["class"] == "Betaproteobacteria"]

# One row per genus (the first occurrence is kept) before sampling.
BetaproteobacteriaSubset = Betaproteobacteria.drop_duplicates(subset="genus")

# NOTE(review): randomSubset is defined elsewhere; the meaning of
# `second_frac` is not visible here -- confirm against its definition.
randomSubsetOrder = randomSubset(
    df=BetaproteobacteriaSubset,
    groupby="order",
    frac=0.05,
    second_frac=0.25,
    add_PP=False,
)
print(randomSubsetOrder["order"].value_counts())

DBname = "Betaproteobacteria"
DB = randomSubsetOrder

df = TCSdomains(DB=DB)

# Full class table, TCSdomains result, and the sampled genomes.
Betaproteobacteria.to_csv(
    os.path.join(Genomespath, f"{DBname}DB.csv"),
    index=False,
)
df.to_csv(
    os.path.join(SampledTCSpath, f"TCSPlaygound_{DBname}DB_{iteration}.csv"),
    index=False,
)
DB.to_csv(
    os.path.join(SampledGenomespath, f"Sampled{DBname}DB_{iteration}.csv"),
    index=False,
)

Burkholderiales     4
Rhodocyclales       2
Ferrovales          1
Neisseriales        1
Nitrosomonadales    1
Name: order, dtype: int64
RR           564
HK           274
Hybrid_HK    138
Name: TCStype, dtype: int64
GCF_000519045.1    155
GCF_000214035.1    146
GCF_001293525.1    142
GCF_000236665.1    133
GCF_900112675.1    106
GCF_900115065.1     95
GCF_000970345.1     94
GCF_002214645.1     56
GCF_000735045.1     49
Name: version, dtype: int64


##### Gammaproteobacteria

In [65]:
# Build the Gammaproteobacteria genome table, draw a per-order sample from it,
# run TCSdomains on the sample, and write all three tables to disk.

# Every genome whose class column equals "Gammaproteobacteria".
Gammaproteobacteria = Bacteria.loc[Bacteria["class"] == "Gammaproteobacteria"]

# One row per genus (the first occurrence is kept) before sampling.
GammaproteobacteriaSubset = Gammaproteobacteria.drop_duplicates(subset="genus")

# NOTE(review): randomSubset is defined elsewhere; the meaning of
# `second_frac` is not visible here -- confirm against its definition.
randomSubsetOrder = randomSubset(
    df=GammaproteobacteriaSubset,
    groupby="order",
    frac=0.1,
    second_frac=0.25,
    add_PP=False,
)
print(randomSubsetOrder["order"].value_counts())

DBname = "Gammaproteobacteria"
DB = randomSubsetOrder

df = TCSdomains(DB=DB)

# Full class table, TCSdomains result, and the sampled genomes.
Gammaproteobacteria.to_csv(
    os.path.join(Genomespath, f"{DBname}DB.csv"),
    index=False,
)
df.to_csv(
    os.path.join(SampledTCSpath, f"TCSPlaygound_{DBname}DB_{iteration}.csv"),
    index=False,
)
DB.to_csv(
    os.path.join(SampledGenomespath, f"Sampled{DBname}DB_{iteration}.csv"),
    index=False,
)


Enterobacterales         5
Chromatiales             3
Oceanospirillales        3
Xanthomonadales          2
Pasteurellales           2
Alteromonadales          2
Cardiobacteriales        2
Cellvibrionales          2
Thiotrichales            2
Vibrionales              1
Salinisphaerales         1
Pseudomonadales          1
Acidiferrobacterales     1
Orbales                  1
Nevskiales               1
Aeromonadales            1
Legionellales            1
Immundisolibacterales    1
Methylococcales          1
Name: order, dtype: int64
RR           837
HK           354
Hybrid_HK    162
Name: TCStype, dtype: int64
GCF_000711985.1    118
GCF_001442515.1     95
GCF_000260135.1     94
GCF_000314975.1     93
GCF_000425345.1     90
GCF_000377745.1     62
GCF_001975225.1     59
GCF_000009365.1     52
GCF_000300815.1     51
GCF_000757785.1     44
GCF_002077135.1     43
GCF_000439085.1     42
GCF_001586165.1     41
GCF_000215955.2     40
GCF_000711315.1     39
GCF_002072955.1     38
GCF_000565345.

#### Alignments

In [69]:
# Run Align on the "RR" TCS type for each Proteobacteria class, reading each
# table back from the CSV written for the current `iteration`.
DBs = Proteo_classes

for DB in DBs:
    df = pd.read_csv(
        os.path.join(SampledTCSpath, f"TCSPlaygound_{DB}DB_{iteration}.csv")
    )

    Align(
        df,
        DBname=DB,
        TCStypes=["RR"],
        iteration=iteration,
    )

hmmalign -o /opt/jupyterhub/shared_notebooks/m-group/ALI/MG_stuff/RR_bioinformatics/TCSplayground/Alignment_Files/RR_Alphaproteobacteria_hmmAlign_2.fasta --trim --amino --outformat afa /opt/jupyterhub/shared_notebooks/m-group/ALI/MG_stuff/RR_bioinformatics/TCSplayground/Source_Files/Response_reg.hmm /opt/jupyterhub/shared_notebooks/m-group/ALI/MG_stuff/RR_bioinformatics/TCSplayground/Alignment_Files/Full_RR_Alphaproteobacteria_2.fasta
Alignment for RR complete!
hmmalign -o /opt/jupyterhub/shared_notebooks/m-group/ALI/MG_stuff/RR_bioinformatics/TCSplayground/Alignment_Files/RR_Betaproteobacteria_hmmAlign_2.fasta --trim --amino --outformat afa /opt/jupyterhub/shared_notebooks/m-group/ALI/MG_stuff/RR_bioinformatics/TCSplayground/Source_Files/Response_reg.hmm /opt/jupyterhub/shared_notebooks/m-group/ALI/MG_stuff/RR_bioinformatics/TCSplayground/Alignment_Files/Full_RR_Betaproteobacteria_2.fasta
Alignment for RR complete!
hmmalign -o /opt/jupyterhub/shared_notebooks/m-group/ALI/MG_stuff/RR_b

#### Alphaproteobacteria genera that represent each order
- Searched through the database to find the Alphaproteobacteria genera with the largest number of genomes
- DBs = ["Sphingomonas", "Paracoccus","Bartonella", "Rickettsia","Rhizobium","Acetobacter", "Brevundimonas" ]
- Concatenate DBs with Pseudomonas putida KT2440 for cluster annotation


In [107]:
# Load the sampled Pseudomonas TCS table (iteration 3) and keep only the rows
# whose stable_id contains "PP".
# NOTE(review): presumably this is meant to select the PP_xxxx locus tags; a
# bare "PP" substring could match other ids as well -- confirm.
dfPseudmonas = pd.read_csv(os.path.join(SampledTCSpath, "TCSPlaygound_PseudomonasDB_3.csv"))
# regex=False: plain substring match. na=False: rows with a missing stable_id
# are dropped instead of yielding a NaN mask that boolean indexing rejects.
dfPP = dfPseudmonas[dfPseudmonas["stable_id"].str.contains("PP", regex=False, na=False)]

In [129]:
# Genus names used as database names in the alignment cells.
DBs = [
    "Sphingomonas",
    "Paracoccus",
    "Bartonella",
    "Rickettsia",
    "Rhizobium",
    "Acetobacter",
    "Brevundimonas",
]

# iteration = 1

# for DBname in DBs:
#     print(DBname)
#     DB =  Proteobacteria[Proteobacteria.genus == DBname]
#     DB.to_csv(os.path.join(Genomespath, f"{DBname}GenomesDB.csv"), index= False)
#     DB.to_csv(os.path.join(SampledGenomespath, f"Sampled{DBname}DB_{iteration}.csv"), index = False)

#     df = TCSdomains(DB = DB)

#     df.to_csv(os.path.join(SampledTCSpath, f"TCSPlaygound_{DBname}DB_{iteration}.csv"), index = False)
    
    
#     dfPP_concat = pd.concat([df, dfPP])
#     dfPP_concat.to_csv(os.path.join(SampledTCSpath, f"TCSPlaygound_{DBname}_PPDB_{iteration}.csv"), index = False)
#     Bacteria[Bacteria["version"].isin(dfPP_concat["version"].unique())].to_csv(os.path.join(Genomespath, f"{DBname}_PPGenomesDB.csv"), index = False)




In [130]:
# Preview the "_PP"-suffixed database name for every entry in DBs.
[f"{name}_PP" for name in DBs]

['Sphingomonas_PP',
 'Paracoccus_PP',
 'Bartonella_PP',
 'Rickettsia_PP',
 'Rhizobium_PP',
 'Acetobacter_PP',
 'Brevundimonas_PP']

In [132]:
# Run Align on the "RR" TCS type for each genus table and for its "_PP"
# counterpart, reading each table back from the CSV for the current `iteration`.
Databases = DBs + [f"{name}_PP" for name in DBs]

for DB in Databases:
    df = pd.read_csv(
        os.path.join(SampledTCSpath, f"TCSPlaygound_{DB}DB_{iteration}.csv")
    )

    Align(
        df,
        DBname=DB,
        TCStypes=["RR"],
        iteration=iteration,
    )

hmmalign -o /opt/jupyterhub/shared_notebooks/m-group/ALI/MG_stuff/RR_bioinformatics/TCSplayground/Alignment_Files/RR_Sphingomonas_hmmAlign_1.fasta --trim --amino --outformat afa /opt/jupyterhub/shared_notebooks/m-group/ALI/MG_stuff/RR_bioinformatics/TCSplayground/Source_Files/Response_reg.hmm /opt/jupyterhub/shared_notebooks/m-group/ALI/MG_stuff/RR_bioinformatics/TCSplayground/Alignment_Files/Full_RR_Sphingomonas_1.fasta
Alignment for RR complete!
hmmalign -o /opt/jupyterhub/shared_notebooks/m-group/ALI/MG_stuff/RR_bioinformatics/TCSplayground/Alignment_Files/RR_Paracoccus_hmmAlign_1.fasta --trim --amino --outformat afa /opt/jupyterhub/shared_notebooks/m-group/ALI/MG_stuff/RR_bioinformatics/TCSplayground/Source_Files/Response_reg.hmm /opt/jupyterhub/shared_notebooks/m-group/ALI/MG_stuff/RR_bioinformatics/TCSplayground/Alignment_Files/Full_RR_Paracoccus_1.fasta
Alignment for RR complete!
hmmalign -o /opt/jupyterhub/shared_notebooks/m-group/ALI/MG_stuff/RR_bioinformatics/TCSplayground/Al

In [13]:
# Select every Proteobacteria genome of genus Sphingomonas, save the table,
# and show how many rows each genome name has.
Sphingomonas = Proteobacteria.loc[Proteobacteria["genus"] == "Sphingomonas"]
Sphingomonas.to_csv(
    os.path.join(Genomespath, "SphingomonasGenomesDB.csv"),
    index=False,
)
Sphingomonas["name"].value_counts()

Sphingomonas paucimobilis NBRC 13935           1
Sphingomonas adhaesiva NBRC 15099              1
Sphingomonas panacis                           1
Sphingomonas taxi                              1
Sphingomonas indica                            1
Sphingomonas laterariae                        1
Sphingomonas haloaromaticamans                 1
Sphingomonas rubra                             1
Sphingomonas hankookensis                      1
Sphingomonas changbaiensis NBRC 104936         1
Sphingomonas astaxanthinifaciens DSM 22298     1
Sphingomonas sanxanigenens DSM 19645 = NX02    1
Sphingomonas jaspsi DSM 18422                  1
Sphingomonas mucosissima                       1
Sphingomonas dokdonensis                       1
Sphingomonas azotifigens NBRC 15497            1
Sphingomonas soli NBRC 100801                  1
Sphingomonas phyllosphaerae 5.2                1
Sphingomonas wittichii RW1                     1
Sphingomonas melonis TY                        1
Sphingomonas pituito

In [17]:
# Run TCSdomains on the full Sphingomonas genome table and save both the
# genome table and the TCSdomains result under iteration 1.
iteration = 1
DBname = "Sphingomonas"
DB = Sphingomonas

df = TCSdomains(DB=DB)

df.to_csv(
    os.path.join(SampledTCSpath, f"TCSPlaygound_{DBname}DB_{iteration}.csv"),
    index=False,
)
DB.to_csv(
    os.path.join(SampledGenomespath, f"Sampled{DBname}DB_{iteration}.csv"),
    index=False,
)

In [32]:
# Allow up to 500 rows in displayed output so the full listing is visible.
pd.set_option("display.max_rows", 500)

# Select every Proteobacteria genome of genus Paracoccus, save the table,
# and show how many rows each genome name has.
Paracoccus = Proteobacteria.loc[Proteobacteria["genus"] == "Paracoccus"]
Paracoccus.to_csv(
    os.path.join(Genomespath, "ParacoccusGenomesDB.csv"),
    index=False,
)
Paracoccus["name"].value_counts()

Paracoccus denitrificans PD1222             1
Paracoccus alcaliphilus                     1
Paracoccus sanguinis                        1
Paracoccus sediminis                        1
Paracoccus sphaerophysae                    1
Paracoccus isoporae                         1
Paracoccus chinensis                        1
Paracoccus saliphilus                       1
Paracoccus halophilus                       1
Paracoccus homiensis                        1
Paracoccus tibetensis                       1
Paracoccus zeaxanthinifaciens ATCC 21588    1
Paracoccus seriniphilus                     1
Paracoccus yeei                             1
Paracoccus alkenifer                        1
Paracoccus solventivorans                   1
Paracoccus versutus                         1
Paracoccus aminovorans                      1
Paracoccus aminophilus JCM 7686             1
Paracoccus contaminans                      1
Name: name, dtype: int64

In [33]:
# Run TCSdomains on the full Paracoccus genome table and save both the
# genome table and the TCSdomains result under iteration 1.
iteration = 1
DBname = "Paracoccus"
DB = Paracoccus

df = TCSdomains(DB=DB)

df.to_csv(
    os.path.join(SampledTCSpath, f"TCSPlaygound_{DBname}DB_{iteration}.csv"),
    index=False,
)
DB.to_csv(
    os.path.join(SampledGenomespath, f"Sampled{DBname}DB_{iteration}.csv"),
    index=False,
)

RR           549
HK           252
Hybrid_HK     81
Name: TCStype, dtype: int64
GCF_001546115.1    62
GCF_000763885.1    62
GCF_000203895.1    61
GCF_900102505.1    60
GCF_900110285.1    55
GCF_000444995.1    54
GCF_002073635.2    52
GCF_900199195.1    48
GCF_900156835.1    43
GCF_900188295.1    41
GCF_900142875.1    40
GCF_900102885.1    40
GCF_900111675.1    39
GCF_000420145.1    36
GCF_900108405.1    34
GCF_000763805.1    33
GCF_900106665.1    32
GCF_900101865.1    32
GCF_000763905.1    30
GCF_002105555.1    28
Name: version, dtype: int64


In [121]:
# Run Align on the "RR" TCS type for each genus table and for its "_PP"
# counterpart, reading each table back from the CSV for the current `iteration`.
DBs = ["Sphingomonas", "Paracoccus","Bartonella", "Rickettsia","Rhizobium","Acetobacter", "Brevundimonas" ]


# DBs is rebuilt from the literal list above on every run, so extending it
# here does not accumulate duplicates when the cell is re-executed.
DBs = DBs + [f"{DB}_PP" for DB in DBs]

for DB in DBs:
    # The f-string is already fully interpolated; a trailing .format() would
    # re-parse the finished path and fail on any literal "{" or "}" in it.
    df = pd.read_csv(os.path.join(SampledTCSpath, f"TCSPlaygound_{DB}DB_{iteration}.csv"))

    Align(df, DBname = DB , TCStypes = ["RR"], iteration = iteration)

hmmalign -o /opt/jupyterhub/shared_notebooks/m-group/ALI/MG_stuff/RR_bioinformatics/TCSplayground/Alignment_Files/RR_Sphingomonas_hmmAlign_1.fasta --trim --amino --outformat afa /opt/jupyterhub/shared_notebooks/m-group/ALI/MG_stuff/RR_bioinformatics/TCSplayground/Source_Files/Response_reg.hmm /opt/jupyterhub/shared_notebooks/m-group/ALI/MG_stuff/RR_bioinformatics/TCSplayground/Alignment_Files/Full_RR_Sphingomonas_1.fasta
Alignment for RR complete!
hmmalign -o /opt/jupyterhub/shared_notebooks/m-group/ALI/MG_stuff/RR_bioinformatics/TCSplayground/Alignment_Files/RR_Paracoccus_hmmAlign_1.fasta --trim --amino --outformat afa /opt/jupyterhub/shared_notebooks/m-group/ALI/MG_stuff/RR_bioinformatics/TCSplayground/Source_Files/Response_reg.hmm /opt/jupyterhub/shared_notebooks/m-group/ALI/MG_stuff/RR_bioinformatics/TCSplayground/Alignment_Files/Full_RR_Paracoccus_1.fasta
Alignment for RR complete!
hmmalign -o /opt/jupyterhub/shared_notebooks/m-group/ALI/MG_stuff/RR_bioinformatics/TCSplayground/Al

### Concatenate Sphingomonas and Paracoccus databases with Pseudomonas putida KT2440 for cluster annotation

In [58]:
# Load the sampled Pseudomonas TCS table (iteration 3) and keep only the rows
# whose stable_id contains "PP".
# NOTE(review): presumably this is meant to select the PP_xxxx locus tags; a
# bare "PP" substring could match other ids as well -- confirm.
dfPseudmonas = pd.read_csv(os.path.join(SampledTCSpath, "TCSPlaygound_PseudomonasDB_3.csv"))
# regex=False: plain substring match. na=False: rows with a missing stable_id
# are dropped instead of yielding a NaN mask that boolean indexing rejects.
dfPP = dfPseudmonas[dfPseudmonas["stable_id"].str.contains("PP", regex=False, na=False)]

In [74]:
# Combine the Paracoccus TCS table with the "PP" rows and save the combined
# table plus the matching genome records.
#
# The Paracoccus table is loaded explicitly rather than taken from the global
# `df`: when the notebook runs top to bottom, `df` holds the last table read by
# the preceding alignment loop, not the Paracoccus one.
dfPara = pd.read_csv(os.path.join(SampledTCSpath, f"TCSPlaygound_ParacoccusDB_{iteration}.csv"))
dfPP_Para = pd.concat([dfPara, dfPP])
# NOTE(review): the output names spell the genus "Parracoccus", unlike the
# "Paracoccus_PP" name read by the genus alignment loops -- confirm which
# spelling downstream consumers expect before renaming.
dfPP_Para.to_csv(os.path.join(SampledTCSpath, f"TCSPlaygound_Parracoccus_PPDB_{iteration}.csv"), index = False)
# Genome rows whose version appears in the combined TCS table.
Bacteria[Bacteria["version"].isin(dfPP_Para["version"].unique())].to_csv(os.path.join(Genomespath, "Parracoccus_PPGenomesDB.csv"), index = False)




In [75]:
# Run Align on the combined Paracoccus + PP table for the "RR" TCS type.
Align(
    dfPP_Para,
    DBname="Parracoccus_PP",
    TCStypes=["RR"],
    iteration=iteration,
)

hmmalign -o /opt/jupyterhub/shared_notebooks/m-group/ALI/MG_stuff/RR_bioinformatics/TCSplayground/Alignment_Files/RR_Parracoccus_PP_hmmAlign_1.fasta --trim --amino --outformat afa /opt/jupyterhub/shared_notebooks/m-group/ALI/MG_stuff/RR_bioinformatics/TCSplayground/Source_Files/Response_reg.hmm /opt/jupyterhub/shared_notebooks/m-group/ALI/MG_stuff/RR_bioinformatics/TCSplayground/Alignment_Files/Full_RR_Parracoccus_PP_1.fasta
Alignment for RR complete!


Unnamed: 0,TCStype,aseq_id,domain_counts,domain_seq,env_from,env_to,gene_id,sequence,stable_id,version
0,Hybrid_HK,_K908nnV_iMo8cSO4ecVxQ,"{'RR': 2, 'GAF': 1, 'PAS': 3, 'HK_CA': 1}",KSQFIANMSHELRTPLSAIIGYAELLEEEAADLGDAGGNMAEDLSK...,619,846,21453361,MASLLRAVTIALRGAACRGQIVPITPARRNGRPRSGLRRAATPLPH...,GCF_002105555.1-B0A89_RS14450,GCF_002105555.1
1,RR,BQDSZQjfyIj0CdYmpusHhw,{'RR': 1},LLVEDTPEIWDFLSRRLARRGHEVALAHDGQAGLDAAQAAPPQVIL...,4,116,21453360,MAKLLLVEDTPEIWDFLSRRLARRGHEVALAHDGQAGLDAAQAAPP...,GCF_002105555.1-B0A89_RS14445,GCF_002105555.1
2,RR,QR6Sr-8mAJi8F4g17GR2Sg,"{'RR': 1, 'Trans_reg_C': 1}",LVVEDAPDVADAVARSGARLGWAVDCAPTLADGEAALATHDYDLAI...,3,113,21453313,MRILVVEDAPDVADAVARSGARLGWAVDCAPTLADGEAALATHDYD...,GCF_002105555.1-B0A89_RS14210,GCF_002105555.1
3,RR,Ft2RHY5neXBeYBG5aGvNRA,"{'RR': 1, 'Trans_reg_C': 1}",LIVEDDPILSGQIAAAMRQGGFVADIANDGAQAEFMGMTETYDVAI...,3,113,21453129,MRCLIVEDDPILSGQIAAAMRQGGFVADIANDGAQAEFMGMTETYD...,GCF_002105555.1-B0A89_RS13290,GCF_002105555.1
4,RR,7Ry3msPUb7czrnQLjXqA2g,"{'RR': 1, 'Trans_reg_C': 1}",LLVEDEGVQREVLTYNLDAEGFRVVSAETGDEALLLVAEESPDLVL...,8,120,21452794,MSAQQPCVLLVEDEGVQREVLTYNLDAEGFRVVSAETGDEALLLVA...,GCF_002105555.1-B0A89_RS11615,GCF_002105555.1
...,...,...,...,...,...,...,...,...,...,...
4611,HK,vEwNS7GYcG9KfogH_N4H7A,"{'PAS': 1, 'HAMP': 1, 'HK_CA': 1, 'KinB_sensor...",RSEFVLRASHELRTPVTGMHMAFGLLRERVKFPPEARENDLLETIG...,373,593,1098212,MKWPPMKLRTRLFLSISTLVTVALLGLLLGLVSMLQMATVQQRLVR...,GCF_000007565.2-PP_0132,GCF_000007565.2
4612,RR,LL7_ZQRZvNWIpNM6VW59YA,"{'RR': 1, 'Trans_reg_C': 1}",LVIEDEVKTAEYVRQGLTECGYVVDCVHTGSDGLFLAKQHEYELII...,3,113,1098122,MRILVIEDEVKTAEYVRQGLTECGYVVDCVHTGSDGLFLAKQHEYE...,GCF_000007565.2-PP_0047,GCF_000007565.2
4613,RR,ggTWR_WuSmes2mWHd95rig,{'RR': 1},LVVDDNAVNREALILYLKSRGIDAVGADGAEEARLYLHYQKRIGLM...,71,183,1098463,MVPIGRKSMSALVKDPTKGETGYASNAVLMLKRNFFDEMSMPEHTD...,GCF_000007565.2-PP_0355,GCF_000007565.2
4614,HK,BoHZ-LqfrwJDgV9bfWfjJQ,{'HK_CA': 1},LSQFSDDLAHELRAPLSNLMGKAQVALTRERSLSEYREVLESCTEE...,241,463,1098106,MRPFSLAAKLGLKVGLMSAALLLLFATFGYLMVGKALERNARADLE...,GCF_000007565.2-PP_0030,GCF_000007565.2


In [81]:
# Combine the Sphingomonas TCS table (iteration 1) with the "PP" rows and save
# the combined table plus the matching genome records.
dfSphing = pd.read_csv(os.path.join(SampledTCSpath, "TCSPlaygound_SphingomonasDB_1.csv"), index_col = False)
dfPP_Sphing = pd.concat([dfSphing, dfPP])
dfPP_Sphing.to_csv(os.path.join(SampledTCSpath, f"TCSPlaygound_Sphingomonas_PPDB_{iteration}.csv"), index = False)
# Filter genomes by the versions in the Sphingomonas+PP table. The earlier
# code filtered by dfPP_Para, which wrote the Paracoccus genome set into the
# Sphingomonas_PP file.
Bacteria[Bacteria["version"].isin(dfPP_Sphing["version"].unique())].to_csv(os.path.join(Genomespath, "Sphingomonas_PPGenomesDB.csv"), index = False)


In [82]:
# Run Align on the combined Sphingomonas + PP table for the "RR" TCS type.
Align(
    dfPP_Sphing,
    DBname="Sphingomonas_PP",
    TCStypes=["RR"],
    iteration=iteration,
)

hmmalign -o /opt/jupyterhub/shared_notebooks/m-group/ALI/MG_stuff/RR_bioinformatics/TCSplayground/Alignment_Files/RR_Sphingomonas_PP_hmmAlign_1.fasta --trim --amino --outformat afa /opt/jupyterhub/shared_notebooks/m-group/ALI/MG_stuff/RR_bioinformatics/TCSplayground/Source_Files/Response_reg.hmm /opt/jupyterhub/shared_notebooks/m-group/ALI/MG_stuff/RR_bioinformatics/TCSplayground/Alignment_Files/Full_RR_Sphingomonas_PP_1.fasta
Alignment for RR complete!


Unnamed: 0,TCStype,aseq_id,domain_counts,domain_seq,env_from,env_to,gene_id,sequence,stable_id,version
0,RR,GMKDLMVzJKukp_WWlfgtyQ,"{'RR': 1, 'LytTR': 1}",ILVDDEPLAIQGLELRLQEHEDVEIIDKCSNGREAIRAIKTHKPDL...,5,114,20484531,MTIRTILVDDEPLAIQGLELRLQEHEDVEIIDKCSNGREAIRAIKT...,GCF_000935025.1-TS85_RS23315,GCF_000935025.1
1,RR,LUKvtANh7rvr6FdGbXESNQ,"{'RR': 1, 'HTH_8': 1, 'Sigma54_activat': 1}",LIVEDDAGLQRQLRWAYEGYQIHSATDRDEAIALVRAEEPQVVTLD...,7,120,20484138,MTAVRKLLIVEDDAGLQRQLRWAYEGYQIHSATDRDEAIALVRAEE...,GCF_000935025.1-TS85_RS21370,GCF_000935025.1
2,RR,HBu_ldCCQHpSqa74w0NUZg,"{'RR': 1, 'GerE': 1}",LTVDDHAMVRDGVTALLARQPDMEAVGEAADGKEAIAQFRALAPDI...,12,124,20484587,MAEDSERTPIRVLTVDDHAMVRDGVTALLARQPDMEAVGEAADGKE...,GCF_000935025.1-TS85_RS23590,GCF_000935025.1
3,Hybrid_HK,lb4OsLpg-Ll0uukqKnXlzA,"{'RR': 1, 'HK_CA': 1}",LGQLTGGVAHDFNNLLTPITGALDLLQNKYASIDPRSGRLIANALL...,154,382,20484584,MLQEARLHCTIAADLEALTKHVDQGAGFALATEEALTGPGLATLQR...,GCF_000935025.1-TS85_RS23575,GCF_000935025.1
4,RR,eyZzgHre-dvqB1eEQi9AdQ,"{'RR': 1, 'Trans_reg_C': 1}",LIVEDEPNLGQQLRNALEGAGYAVDLATDGEEGHFLGSTEQYDAIL...,3,113,20484131,MRLLIVEDEPNLGQQLRNALEGAGYAVDLATDGEEGHFLGSTEQYD...,GCF_000935025.1-TS85_RS21335,GCF_000935025.1
...,...,...,...,...,...,...,...,...,...,...
4611,HK,vEwNS7GYcG9KfogH_N4H7A,"{'PAS': 1, 'HAMP': 1, 'HK_CA': 1, 'KinB_sensor...",RSEFVLRASHELRTPVTGMHMAFGLLRERVKFPPEARENDLLETIG...,373,593,1098212,MKWPPMKLRTRLFLSISTLVTVALLGLLLGLVSMLQMATVQQRLVR...,GCF_000007565.2-PP_0132,GCF_000007565.2
4612,RR,LL7_ZQRZvNWIpNM6VW59YA,"{'RR': 1, 'Trans_reg_C': 1}",LVIEDEVKTAEYVRQGLTECGYVVDCVHTGSDGLFLAKQHEYELII...,3,113,1098122,MRILVIEDEVKTAEYVRQGLTECGYVVDCVHTGSDGLFLAKQHEYE...,GCF_000007565.2-PP_0047,GCF_000007565.2
4613,RR,ggTWR_WuSmes2mWHd95rig,{'RR': 1},LVVDDNAVNREALILYLKSRGIDAVGADGAEEARLYLHYQKRIGLM...,71,183,1098463,MVPIGRKSMSALVKDPTKGETGYASNAVLMLKRNFFDEMSMPEHTD...,GCF_000007565.2-PP_0355,GCF_000007565.2
4614,HK,BoHZ-LqfrwJDgV9bfWfjJQ,{'HK_CA': 1},LSQFSDDLAHELRAPLSNLMGKAQVALTRERSLSEYREVLESCTEE...,241,463,1098106,MRPFSLAAKLGLKVGLMSAALLLLFATFGYLMVGKALERNARADLE...,GCF_000007565.2-PP_0030,GCF_000007565.2


In [105]:
# List the order(s) each Alphaproteobacteria genus belongs to, visiting genera
# from the most to the least represented.
Alphaproteobacteria = Proteobacteria[Proteobacteria["class"] == "Alphaproteobacteria"]
print(Alphaproteobacteria["order"].value_counts())
# Iterate the value_counts index directly: the former
# pd.DataFrame(...).reset_index()["index"] lookup relies on the reset column
# being called "index", which is no longer the case from pandas 2.0 onwards
# (the column takes the Series name, "genus").
for genus in Alphaproteobacteria["genus"].value_counts().index:
    print(genus)
    print(Alphaproteobacteria[Alphaproteobacteria["genus"] == genus]["order"].unique())

Rhodobacterales     252
Rhizobiales         189
Sphingomonadales     96
Rhodospirillales     87
Rickettsiales        38
Caulobacterales      17
Holosporales          3
Parvularculales       3
Kordiimonadales       3
Sneathiellales        2
Kiloniellales         2
Magnetococcales       2
Name: order, dtype: int64
Sphingomonas
['Sphingomonadales']
Bartonella
['Rhizobiales']
Paracoccus
['Rhodobacterales']
Rickettsia
['Rickettsiales']
Novosphingobium
['Sphingomonadales']
Rhizobium
['Rhizobiales']
Loktanella
['Rhodobacterales']
Sphingobium
['Sphingomonadales']
Roseovarius
['Rhodobacterales']
Devosia
['Rhizobiales']
Acetobacter
['Rhodospirillales']
Erythrobacter
['Sphingomonadales']
Sulfitobacter
['Rhodobacterales']
Sphingopyxis
['Sphingomonadales']
Bradyrhizobium
['Rhizobiales']
Ruegeria
['Rhodobacterales']
Wolbachia
['Rickettsiales']
Porphyrobacter
['Sphingomonadales']
Hyphomonas
['Rhodobacterales']
Brevundimonas
['Caulobacterales']
Jannaschia
['Rhodobacterales']
Roseivivax
['Rhodobacteral

In [86]:
# Display the Proteobacteria genome table. The cell previously ended in a
# dangling "." and did not parse.
Proteobacteria

Unnamed: 0,id,worker_id,accession,version,version_number,genbank_accession,genbank_version,taxonomy_id,name,refseq_category,...,family,genus,species,strain,stats,meta,biosample_id,createdAt,updatedAt,WorkerId
0,257,,GCF_000010525,GCF_000010525.1,1,GCA_000010525,GCA_000010525.1,438753,Azorhizobium caulinodans ORS 571,representative genome,...,Xanthobacteraceae,Azorhizobium,Azorhizobium caulinodans,ORS 571,{},{},5831728,2018-03-28T14:09:25.187Z,2021-03-04T04:38:47.735Z,
1,258,,GCF_000007365,GCF_000007365.1,1,GCA_000007365,GCA_000007365.1,198804,Buchnera aphidicola str. Sg (Schizaphis graminum),representative genome,...,Erwiniaceae,Buchnera,Buchnera aphidicola,str. Sg (Schizaphis graminum),{},{},2604269,2018-03-28T14:09:25.187Z,2021-03-04T04:38:49.137Z,
4,275,,GCF_000378225,GCF_000378225.1,1,GCA_000378225,GCA_000378225.1,1122236,Methylophilus methylotrophus DSM 46235 = ATCC ...,representative genome,...,Methylophilaceae,Methylophilus,Methylophilus methylotrophus,DSM 46235 = ATCC 53528,{},{},2440991,2018-03-28T14:09:25.187Z,2021-03-04T04:39:17.517Z,
5,276,,GCF_000012885,GCF_000012885.1,1,GCA_000012885,GCA_000012885.1,338963,Pelobacter carbinolicus DSM 2380,representative genome,...,Desulfuromonadaceae,Pelobacter,Pelobacter carbinolicus,DSM 2380,{},{},2598295,2018-03-28T14:09:25.187Z,2021-03-04T04:39:19.207Z,
6,277,,GCF_001375595,GCF_001375595.1,1,GCA_001375595,GCA_001375595.1,31967,Phenylobacterium immobile (ATCC 35973),representative genome,...,Caulobacteraceae,Phenylobacterium,Phenylobacterium immobile,(ATCC 35973),{},{},3458013,2018-03-28T14:09:25.187Z,2021-03-04T04:39:20.565Z,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5394,6318,,GCF_000016585,GCF_000016585.1,1,GCA_000016585,GCA_000016585.1,319224,Shewanella putrefaciens CN-32,na,...,Shewanellaceae,Shewanella,Shewanella putrefaciens,CN-32,{},{},623063,2018-09-02T02:59:24.499Z,2021-03-04T07:42:42.083Z,
5395,6332,,GCF_002305895,GCF_002305895.1,1,GCA_002305895,GCA_002305895.1,1189310,Myxococcus macrosporus DSM 14697,na,...,Myxococcaceae,Myxococcus,Myxococcus macrosporus,DSM 14697,{},{},6167052,2018-09-02T02:59:24.556Z,2021-03-04T07:43:09.505Z,
5396,6337,,GCF_002343915,GCF_002343915.1,1,GCA_002343915,GCA_002343915.1,54,Nannocystis exedens,na,...,Nannocystaceae,Nannocystis,Nannocystis exedens,,{},{},6167144,2018-09-02T02:59:24.556Z,2021-03-04T07:43:18.765Z,
5400,6871,,GCF_000829825,GCF_000829825.1,1,GCA_000829825,GCA_000829825.1,272627,Magnetospirillum magnetotacticum MS-1,na,...,Rhodospirillaceae,Magnetospirillum,Magnetospirillum magnetotacticum,MS-1,{},{},3287305,2018-09-02T02:59:24.979Z,2021-03-04T08:02:10.609Z,
