In [None]:
import os
import subprocess
from ftplib import FTP, all_errors

import ete3
import pandas as pd

# Shared ete3 taxonomy handle; the cells below use it for get_lineage,
# get_rank and translate_to_names look-ups.
ncbi = ete3.NCBITaxa()

# Every relative path used further down (gbk/, genomes/, asgard_genomes/, the
# 10k-tree files, ...) is resolved against this working directory.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
%cd /work/clusterEvo/new_tests/archaea/

In [None]:
class cd:
    """
    Context manager that temporarily switches the current working directory.

    The directory that was current when the ``with`` block was entered is
    restored on exit, whether or not the body raised.

    Parameters
    ----------
    newPath : str
        Directory to switch into; a leading ``~`` is expanded.
    """
    def __init__(self, newPath):
        self.newPath = os.path.expanduser(newPath)

    def __enter__(self):
        # Remember where we came from so __exit__ can go back there.
        self.savedPath = os.getcwd()
        os.chdir(self.newPath)
        # Return the manager so `with cd(path) as ctx:` binds something useful
        # (ctx.newPath / ctx.savedPath) instead of None.
        return self

    def __exit__(self, etype, value, traceback):
        # Implicitly returns None, so exceptions from the body propagate.
        os.chdir(self.savedPath)

In [None]:
# Column names applied to the assembly summary table, in file order.
# The first one (assembly_accession) becomes the DataFrame index.
header = [
    'assembly_accession', 'bioproject', 'biosample', 'wgs_master',
    'refseq_category', 'taxid', 'species_taxid', 'organism_name',
    'infraspecific_name', 'isolate', 'version_status', 'assembly_level',
    'release_type', 'genome_rep', 'seq_rel_date', 'asm_name', 'submitter',
    'gbrs_paired_asm', 'paired_asm_comp', 'ftp_path', 'excluded_from_refseq',
    'relation_to_type_material',
]

# Tab-separated file without a usable header row; '#' starts a comment.
genbank_summary = pd.read_csv(
    '/work/assembly_summary_genbank.txt',
    sep='\t',
    comment='#',
    header=None,
    names=header,
    index_col=0,
)

In [None]:
# Build a (taxid x rank) table holding, for every taxid in the GenBank summary,
# the taxon id found at each rank of its lineage.
# Rows are collected in plain lists and the frame is built once: growing a
# DataFrame row by row is quadratic, and DataFrame.append no longer exists in
# pandas >= 2.0.
lineage_records = []
lineage_taxids  = []
missing_taxids  = set()   # taxids for which ncbi.get_lineage raised ValueError
for taxid in genbank_summary.taxid.unique():
    try:
        taxid_lineage = ncbi.get_lineage(int(taxid))
    except ValueError:
        missing_taxids.add(taxid)
        continue

    # get_rank maps taxon -> rank; invert it so each rank becomes a column.
    # Ranks occurring more than once in a lineage keep the last taxon seen.
    lineage_records.append({rank: taxon
                            for taxon, rank in ncbi.get_rank(taxid_lineage).items()})
    lineage_taxids.append(taxid)

lineages = pd.DataFrame(lineage_records, index=lineage_taxids)

# 'no rank' lumps unrelated lineage levels together; errors='ignore' keeps the
# cell runnable when no lineage contained such a level.
lineages = lineages.drop(columns='no rank', errors='ignore')

In [None]:
# archaea_lineages = lineages.query('superkingdom ==  2157')
# archaea_genomes  = genbank_summary.query('taxid in @archaea_lineages.index')
# tree_10k
# ref_genomes      = archaea_genomes.query('refseq_category == "representative genome"')

In [None]:
# Metadata table of the 10k tree, indexed by its third column, with the
# '#genome' column renamed to a plain 'genome'.
genomes_10k = (
    pd.read_csv('10k_tree-metadata.tsv', sep='\t', index_col=2)
      .rename(columns={'#genome': 'genome'})
)

# The matching tree, parsed by ete3 from Newick with format=1.
tree_10k = ete3.Tree('10k_tree.nwk', format=1)

In [None]:
# Headerless, tab-separated list of archaea: abbreviation, organism name, phylum.
williams_columns = ['abbr', 'organism_name', 'phylum']
williams_archaea = pd.read_csv(
    'williams_archaea_names',
    sep='\t',
    header=None,
    names=williams_columns,
)
williams_archaea.head()

In [None]:
# For every organism in williams_archaea pick the GenBank assemblies to use.
#
# * no assembly name matches       -> print the organism name, pick nothing
# * exactly one match              -> take it
# * several matches                -> take all rows satisfying the first of the
#                                     criteria below that matches anything
# * several matches, no criterion  -> record and report the organism, carry on
#
# (column, required value) pairs, tried in this order.
selection_criteria = [
    ('refseq_category',           'representative genome'),
    ('assembly_level',            'Complete Genome'),
    ('relation_to_type_material', 'assembly from type material'),
]

picked_assemblies    = []   # one DataFrame of chosen rows per organism
unresolved_organisms = []   # organisms with several matches and no criterion hit
for index, row in williams_archaea.iterrows():
    # regex=False: organism names are literal text, not patterns (parentheses,
    # dots or brackets in a name must not be interpreted as regex syntax).
    # na=False: a missing organism_name never matches.
    name_matches   = genbank_summary.organism_name.str.contains(row.organism_name,
                                                                regex=False,
                                                                na=False)
    summary_search = genbank_summary[name_matches]

    if summary_search.empty:
        print(row.organism_name)
        continue

    if len(summary_search) == 1:
        picked_assemblies.append(summary_search)
        continue

    for column, wanted in selection_criteria:
        preferred = summary_search[summary_search[column] == wanted]
        if preferred.empty:
            continue
        picked_assemblies.append(preferred)
        if column == 'relation_to_type_material':
            # flag organisms that were only resolved through type material
            print('\t', row.organism_name)
        break
    else:
        # Previously the loop stopped here, silently leaving every remaining
        # organism unprocessed; record the case and keep going instead.
        unresolved_organisms.append(row.organism_name)
        print('no selection criterion matched:', row.organism_name)

# Single concatenation instead of row-wise DataFrame.append (quadratic, and
# removed in pandas >= 2.0). The empty template keeps the summary's columns
# even when nothing was picked.
selected_archaea = pd.concat(
    [pd.DataFrame(columns=genbank_summary.columns)] + picked_assemblies
)

In [None]:
# Same (taxid x rank) table as `lineages`, built for the taxids of the 10k-tree
# genomes. Rows are collected first and the frame is built once: row-wise
# DataFrame.append is quadratic and no longer exists in pandas >= 2.0.
# NOTE: this rebinds missing_taxids, replacing any value set by an earlier cell.
lineage_records_10k = []
lineage_taxids_10k  = []
missing_taxids      = set()   # taxids for which ncbi.get_lineage raised ValueError
for taxid in genomes_10k.taxid.unique():
    try:
        taxid_lineage = ncbi.get_lineage(int(taxid))
    except ValueError:
        missing_taxids.add(taxid)
        continue

    # get_rank maps taxon -> rank; invert it so each rank becomes a column.
    lineage_records_10k.append({rank: taxon
                                for taxon, rank in ncbi.get_rank(taxid_lineage).items()})
    lineage_taxids_10k.append(taxid)

lineages_10k = pd.DataFrame(lineage_records_10k, index=lineage_taxids_10k)

# errors='ignore' keeps the cell runnable when no lineage had a 'no rank' level.
lineages_10k = lineages_10k.drop(columns='no rank', errors='ignore')

In [None]:
# Phylum tally of the 10k-tree lineages with superkingdom 2157:
# one line per phylum with its name, taxid and number of lineages.
archaeal_phyla_10k = lineages_10k.query('superkingdom ==  2157')['phylum'].value_counts()
for taxon, count in archaeal_phyla_10k.items():
    phylum_name = ncbi.translate_to_names([taxon])[0]
    print(phylum_name, taxon, count)

In [None]:
# Same tally for the GenBank-derived lineages with superkingdom 2157:
# phylum name, taxid and number of lineages.
archaeal_phyla_genbank = lineages.query('superkingdom == 2157').phylum.value_counts()
for taxon, count in archaeal_phyla_genbank.items():
    phylum_name = ncbi.translate_to_names([taxon])[0]
    print(phylum_name, taxon, count)

In [None]:
# Phylum tally restricted to the currently selected assemblies
# (one lineage row per selected assembly, repeated taxids included).
selected_taxids = selected_archaea.taxid.tolist()
selected_phyla  = lineages.reindex(index=selected_taxids).phylum.value_counts()
for taxon, count in selected_phyla.items():
    phylum_name = ncbi.translate_to_names([taxon])[0]
    print(phylum_name, taxon, count)

In [None]:
# Phyla whose genomes are pulled into the selection from the 10k-tree metadata.
# The commented-out entries are not taken from here.
missing_phyla_from_10k = [
    'Candidatus Woesearchaeota',
#     'Candidatus Nanohaloarchaeota',
#     'Candidatus Diapherotrites',
]

# Index labels of the matching 10k-tree genomes, used as assembly accessions
# to look up the corresponding GenBank summary rows.
accessions_from_10k = genomes_10k.query('phylum in @missing_phyla_from_10k').index

# pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0.
selected_archaea = pd.concat([selected_archaea,
                              genbank_summary.loc[accessions_from_10k]])

In [None]:
# Diapherotrites
# .loc[[accession]] yields a one-row frame for pd.concat, which replaces
# DataFrame.append (no longer available in pandas >= 2.0).
selected_archaea = pd.concat([selected_archaea,
                              genbank_summary.loc[['GCA_002779065.1']]])

In [None]:
# Nanohaloarchaea
# .loc[[accession]] yields a one-row frame for pd.concat, which replaces
# DataFrame.append (no longer available in pandas >= 2.0).
selected_archaea = pd.concat([selected_archaea,
                              genbank_summary.loc[['GCA_013343275.1']]])

In [None]:
# Nanoarchaeota
# .loc[[accession]] yields a one-row frame for pd.concat, which replaces
# DataFrame.append (no longer available in pandas >= 2.0).
selected_archaea = pd.concat([selected_archaea,
                              genbank_summary.loc[['GCA_000008085.1']]])

In [None]:
# Methanomassiliicoccaceae
# Swap GCA_000308215.1 for GCA_009911715.1 in the selection.
selected_archaea = selected_archaea.drop(index='GCA_000308215.1')
# .loc[[accession]] yields a one-row frame for pd.concat, which replaces
# DataFrame.append (no longer available in pandas >= 2.0).
selected_archaea = pd.concat([selected_archaea,
                              genbank_summary.loc[['GCA_009911715.1']]])

In [None]:
# asgard
# Replace every selected assembly belonging to these phylum taxids with the
# assemblies listed in asgard_genomes/.
# NOTE(review): presumably the four Asgard phyla - confirm the taxids.
asgard_phyla   = [1936272, 1706441, 1936271, 1655434]
asgard_taxids  = lineages.query('phylum in @asgard_phyla').index
asgard_genomes = selected_archaea[selected_archaea.taxid.isin(asgard_taxids)].index

selected_archaea = selected_archaea.drop(index=asgard_genomes)

# Directory entries are used verbatim as assembly accessions, so skip hidden
# files (.DS_Store, .ipynb_checkpoints, ...) that would raise a KeyError, and
# sort because os.listdir returns entries in arbitrary order.
asgard_accessions = sorted(entry for entry in os.listdir('asgard_genomes/')
                           if not entry.startswith('.'))

# pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0.
selected_archaea = pd.concat([selected_archaea,
                              genbank_summary.loc[asgard_accessions]])

In [None]:
# Sanity check: (number of selected assemblies, number of summary columns).
selected_archaea.shape

In [None]:
# Human-readable taxonomy table of the selected assemblies:
# one row per selected assembly, one column per rank, cells holding taxon names.
selected_archaea_lineage = (
    lineages
    .reindex(index=selected_archaea.taxid.tolist())
    # Keep only ranks filled in for at least five selected genomes. `how` and
    # `thresh` are mutually exclusive (TypeError in pandas >= 2.0); older
    # versions let thresh win, so passing thresh alone keeps the same result.
    .dropna(axis=1, thresh=5)
    # Translate every taxon id to its name, leaving gaps as None.
    .applymap(lambda cell: ncbi.translate_to_names([cell])[0] if pd.notnull(cell) else None)
    .sort_values('phylum')
)

# Relabel the rows from taxid to organism name.
selected_archaea_lineage = selected_archaea_lineage.rename(
    index=dict(zip(selected_archaea_lineage.index,
                   ncbi.translate_to_names(selected_archaea_lineage.index)))
)

In [None]:
# Write the name-translated lineage table of the selected assemblies to an
# Excel workbook in the current working directory.
selected_archaea_lineage.to_excel('sampled_archaea_taxonomy.xlsx')

In [None]:
# Download the GenBank flat file (*_genomic.gbff.gz) of every selected assembly
# from the NCBI FTP server into gbk/<accession>.gz and decompress it to
# genomes/<accession>.gbk.
os.makedirs('gbk',     exist_ok=True)
os.makedirs('genomes', exist_ok=True)

succesful_download = []   # accessions whose .gbff.gz was retrieved
# The context manager closes the connection even if the loop raises.
with FTP('ftp.ncbi.nlm.nih.gov') as ncbi_ftp:
    ncbi_ftp.login()

    for index, row in selected_archaea.iterrows():
        ncbi_ftp.cwd('/')
        # Keep only the server-relative part of the URL, whatever scheme
        # (ftp:// or https://) the summary file uses.
        path = row['ftp_path'].split('ftp.ncbi.nlm.nih.gov/', 1)[-1]
        try:
            ncbi_ftp.cwd(path)
        except all_errors as error:
            # Best effort: skip assemblies whose directory cannot be entered,
            # but say so instead of swallowing every exception silently.
            print(f'skipping {index}: cannot enter {path} ({error})')
            continue

        gbff_files = [assembly_file for assembly_file in ncbi_ftp.nlst()
                      if assembly_file.endswith('_genomic.gbff.gz')]
        if not gbff_files:
            # Nothing to decompress: running gunzip here would only produce an
            # empty (or stale) genomes/<accession>.gbk.
            print(f'skipping {index}: no *_genomic.gbff.gz in {path}')
            continue

        with open(f'gbk/{index}.gz', 'wb') as handle:
            ncbi_ftp.retrbinary("RETR %s" % gbff_files[0], handle.write)
        succesful_download.append(index)

        gbk_path = f'genomes/{index}.gbk'
        with open(gbk_path, 'w') as stdout:
            exit_status = subprocess.call(['gunzip', '-c', f'gbk/{index}.gz'], stdout=stdout)
        if exit_status != 0:
            # Do not leave a truncated GenBank file behind for later cells.
            os.remove(gbk_path)
            print(f'gunzip failed for gbk/{index}.gz (exit status {exit_status})')

In [None]:
# (Re)create genomes/<accession>.gbk from every gbk/<accession>.gz archive.
for filename in os.listdir('gbk'):
    if filename.startswith('.') or not filename.endswith('.gz'):
        continue
    # Strip only the trailing '.gz' (str.replace would remove it anywhere).
    accession = filename[:-len('.gz')]
    gbk_path  = f'genomes/{accession}.gbk'
    with open(gbk_path, 'w') as stdout:
        exit_status = subprocess.call(['gunzip', '-c', f'gbk/{filename}'], stdout=stdout)
    if exit_status != 0:
        # A failed gunzip leaves an empty or truncated file that later cells
        # would read as a genome; remove it and report.
        os.remove(gbk_path)
        print(f'gunzip failed for gbk/{filename} (exit status {exit_status})')

In [None]:
# Raw output of `grep -L "CDS"` over genomes/*.gbk (-L lists the files that do
# NOT contain the pattern), kept as a single string with `genomes/` prefixes.
# NOTE(review): the next cell reassigns genomes_missing_cds to a list of bare
# file names, so this cell looks redundant - confirm before removing it.
genomes_missing_cds = subprocess.getoutput('grep -L "CDS" genomes/*.gbk')

In [None]:
# File names (no directory part) of the genomes/*.gbk files that never mention
# "CDS" - a pure-Python equivalent of `grep -L "CDS" *.gbk` run inside genomes/.
# Scanning in Python avoids mistaking grep's error messages (e.g. when no .gbk
# file exists) for file names, and does not break on names containing spaces.
genomes_missing_cds = []
for filename in sorted(os.listdir('genomes/')):
    # mirror the shell glob: only *.gbk, hidden files excluded
    if filename.startswith('.') or not filename.endswith('.gbk'):
        continue
    # bytes mode: only the presence of the marker matters, not the encoding
    with open(os.path.join('genomes', filename), 'rb') as handle:
        if not any(b'CDS' in line for line in handle):
            genomes_missing_cds.append(filename)

genomes_missing_cds

In [None]:
# For every genome lacking CDS annotation print:
#   phylum taxid, phylum name  |  organism name
for filename in genomes_missing_cds:
    accession = filename.replace('.gbk', '')

    # Look the taxid up in the full summary rather than in selected_archaea:
    # genomes/ can still hold files of assemblies that were later dropped from
    # the selection (e.g. GCA_000308215.1), which would raise a KeyError there.
    genome_taxid = genbank_summary.loc[accession, 'taxid']

    # .get returns None for taxids absent from the lineage table.
    phylum = lineages['phylum'].get(genome_taxid)
    if pd.notnull(phylum):
        phylum_id   = int(phylum)
        phylum_name = ncbi.translate_to_names([phylum])[0]
    else:
        # no phylum known for this lineage; casting NaN to int would be garbage
        phylum_id = phylum_name = 'NA'

    print(phylum_id,
          phylum_name,
          '\t|\t',
          ncbi.translate_to_names([genome_taxid])[0]
         )

In [None]:
# Family name of assembly GCA_000308215.1.
# The taxid is read from genbank_summary: the accession was dropped from
# selected_archaea in an earlier cell, so looking it up there raises KeyError
# when the notebook is run top to bottom.
ncbi.translate_to_names([lineages.loc[genbank_summary.loc['GCA_000308215.1', 'taxid'], 'family']])[0]

In [None]:
# Family taxid of assembly GCA_000308215.1.
# The taxid is read from genbank_summary: the accession was dropped from
# selected_archaea in an earlier cell, so looking it up there raises KeyError
# when the notebook is run top to bottom.
lineages.loc[genbank_summary.loc['GCA_000308215.1', 'taxid'], 'family']

In [None]:
# Taxids of all lineages whose family is taxon 1577788.
# NOTE(review): presumably the Methanomassiliicoccaceae family taxid - confirm.
in_family = lineages['family'] == 1577788
Methanomassiliicoccaceae = lineages.loc[in_family].index

In [None]:
# Among all GenBank assemblies of the Methanomassiliicoccaceae taxids, find
# those whose FTP directory offers a protein FASTA (*protein.faa.gz).
alternatives_with_cds = set()
# The context manager closes the connection even if the loop raises.
with FTP('ftp.ncbi.nlm.nih.gov') as ncbi_ftp:
    ncbi_ftp.login()

    for index, row in genbank_summary.query('taxid in @Methanomassiliicoccaceae').iterrows():
        ncbi_ftp.cwd('/')
        # Keep only the server-relative part of the URL, whatever scheme
        # (ftp:// or https://) the summary file uses.
        path = row['ftp_path'].split('ftp.ncbi.nlm.nih.gov/', 1)[-1]
        try:
            ncbi_ftp.cwd(path)
        except all_errors as error:
            # Same best-effort policy as the download loop: one unreachable
            # directory should not abort the whole survey.
            print(f'skipping {index}: cannot enter {path} ({error})')
            continue

        if any(assembly_file.endswith('protein.faa.gz')
               for assembly_file in ncbi_ftp.nlst()):
            print(index)
            alternatives_with_cds.add(index)

In [None]:
# Quality-related columns of the candidate replacement assemblies.
# .loc does not accept a set as indexer (TypeError in pandas >= 2.0), so pass a
# sorted list; sorting also makes the row order reproducible.
genbank_summary.loc[sorted(alternatives_with_cds), ['assembly_level', 'genome_rep', 'excluded_from_refseq']]

In [None]:
# 10k-tree metadata for the genomes labelled Candidatus Nanohaloarchaeota;
# squeeze() collapses a single-row result into a Series for easier reading.
genomes_10k.query('phylum == "Candidatus Nanohaloarchaeota"').squeeze()

In [None]:
# Taxids of the phyla to sample, keyed by the phylum name noted for each one.
phylum_taxids_by_name = {
    'Euryarchaeota':             28890,
    'Crenarchaeota':             28889,
    'Thaumarchaeota':            651137,
    'Candidatus Woesearchaeota': 1801616,
    'Candidatus Marsarchaeota':  1978152,
    'Nanoarchaeota':             192989,
    'Candidatus Diapherotrites': 743725,
}

# Plain list of taxids, in the order given above.
phyla_to_sample = list(phylum_taxids_by_name.values())

In [None]:
# archaea_genomes is otherwise only defined in a commented-out cell near the
# top of the notebook, so this cell fails with a NameError on a fresh kernel.
# Derive it here: all GenBank assemblies whose lineage has superkingdom 2157.
archaea_lineages = lineages.query('superkingdom == 2157')
archaea_genomes  = genbank_summary[genbank_summary.taxid.isin(archaea_lineages.index)]

# Candidate assemblies of phylum 1801616 (Candidatus Woesearchaeota) that are
# assembled beyond contig level and not flagged 'low...' / 'large...' in
# excluded_from_refseq.
# NOTE(review): na=True makes a missing excluded_from_refseq value count as a
# match, so assemblies with an empty field are filtered out as well - confirm
# that this is intended.
tmp_taxid = lineages.query('phylum == 1801616').index
wanted_rows = ((archaea_genomes.taxid.isin(tmp_taxid)) &
               (archaea_genomes.assembly_level != "Contig") &
               (~archaea_genomes.excluded_from_refseq.str.contains('low|large', na=True)))
archaea_genomes.loc[wanted_rows,
                    ['organism_name', 'species_taxid', 'assembly_level', 'genome_rep', 'excluded_from_refseq']]