In [1]:
from glob import glob
from tqdm import tqdm
import subprocess
import pandas as pd
import natsort as ns
import numpy as np
import os
import re

In [9]:
#Define my directories and location of files
WRK = '/workdir/users/pd378/oilPCR/bulk_process_all_OIL_for_paper/'

F_LIST = WRK+'file_lists/filelist.txt' #a list of the files used for iterating and namings6

#This table connects the second round of uniq reads witht he first round sample specific uniqs
UNIQ_TAB=WRK+"s6_cluster_otu/filtered_uniqs/combined_uniq_tab.txt"
#the parsed tabels generated from the usearch otu clustering 
P_TABLE=WRK+'s6_cluster_otu/filtered_uniqs/uniq_parsed_otu_tab.txt'
#The table of taxonomy generated from using mother to assign taxonomy
TAX_TAB=WRK+'s6_cluster_otu/filtered_uniqs/uniq_clustered_otus.V4_oil.wang.taxonomy' 
#the blast table generated from blasting the targets
BLAST_TABS=WRK+'s7_blast_targets/blast_reults/'            
#where this all will go
OUT_TAB=WRK+"s8_compile_data/"


#inport all the data

uniq_df = pd.read_csv(UNIQ_TAB, sep='\t', names=['output_uniq','cluster_num','member_num','output_size','input_first'])
parse_df = pd.read_csv(P_TABLE, sep='\t', names=['type','match'])
tax_df = pd.read_csv(TAX_TAB, sep='[\t|;]', names=['Kingdom','Phylum','Class','Order','Family','Genus','Species'])



In [10]:
#split up the uniq_df for easier reading
file_list=[]
size_list=[]
uniq_list=[]
for item in uniq_df.index:
    path=item.split("/")
    file_list.append(path[8].split(";")[0].strip('_ribo.fa'))
    size_list.append(path[8].rsplit(";")[2].strip('size='))
    uniq_list.append(path[8].split(";")[1])
uniq_df['file'] = file_list
uniq_df['in_size']=size_list
uniq_df['in_uniq']=uniq_list
uniq_df.sort_index


#update the parsed file to make more sense
index_list=[]
sample_list=[]
size_list=[]
otu_liste=[]
for item in parse_df.index:
    name=item.split(';')[0]
    index_list.append(name)
    size_list.append(item.split(';')[1].strip("size="))
    sample_list.append(name.split("_Uniq")[0])
   
parse_df.index = index_list
parse_df["size"]=size_list
parse_df["sample"]=sample_list



#parse through the ky table and pull out the otu name
#then use the otu to fetch the taxonomy from the tax_df
otu_list=[]
for row in parse_df.iterrows():
    otu = ""
    if re.findall("otu\d+", row[1][0]):
        otu = row[1][0]
    elif "chimera" in (row[1][0]) or "Chimera" in row[1][1]:
        otu = "chimera"
    else:
        otu = re.findall("otu\d+",row[1][1])[0]
    otu_list.append(otu)
    
parse_df["otu"]=otu_list

# merge the parse and taxa table together
key_df=pd.merge(parse_df, tax_df, left_on="otu", how='outer', right_on=tax_df.index)

# continue merging the parse_df and tax_df with the uniq_df
cross_df=pd.merge(key_df,uniq_df,left_on='sample',right_on='output_uniq', how='outer', indicator=True)

#clean up the resulting df 
clean_df=cross_df[['file', 'in_size','in_uniq','otu', 'Kingdom', 'Phylum', 'Class','Order', 'Family', 'Genus']]

#strip off the weight from the genus so we can consolidate results better

In [40]:
with open(F_LIST) as f:
    for line in f: 
        NAME = line.strip()
        b_tab=BLAST_TABS+NAME+'_target.blast'
        
        try:
            with open(b_tab) as fh:
                pass
        except IOError as e:
            print ("Unable to open blast table "+NAME)
            continue

        #modify the index names for the blast table 
        blast_df = pd.read_csv(b_tab, sep='\t', names=['p.ident','other','length','mismatch','gapopen','q.start','q.end','s.start','s.send','eval','bitscore','s.title'])
        index_list=[]
        for item in blast_df.index:
            index_list.append(item.strip(';'))
        blast_df.index=index_list

        #reformat the blast information so it's in a single cell
        blast_df['blast_summary']=blast_df['other'].astype(str)+";"+blast_df['length'].astype(str)+';'+blast_df['mismatch'].astype(str)+";"+blast_df['gapopen'].astype(str)+";"+blast_df['q.start'].astype(str)+";"+blast_df['q.end'].astype(str)+";"+blast_df['s.start'].astype(str)+";"+blast_df['s.send'].astype(str)+";"+blast_df['eval'].astype(str)
        uniq_list=[]
        size_list=[]
        genus_list=[]
        for item in blast_df.index:
            uniq_list.append(item.split(";")[0])
            size_list.append(item.split(";")[1].strip("size="))
        blast_df['blast_uniq']=uniq_list
        blast_df['blast_size']=size_list

        #clean up the table
        blast_clean=blast_df[['s.title','blast_summary','blast_uniq','blast_size']]
        blast_clean.rename(columns = {'s.title':'blast_hit'}, inplace = True)
        blast_clean["file"]=NAME
        
        #merge the taxa information with the blast information
        final_df=pd.merge(blast_clean, clean_df,left_on=['file', 'blast_uniq'], right_on=['file','in_uniq'], how='left' )
        
        #format and condens the final table for export
        final_df=final_df.fillna("0")
        final_df['in_size']=final_df['in_size'].astype(int)
        final_df['sum_size']=final_df.groupby(by=['Phylum', 'Class', 'Order','Family','Genus',"blast_hit"])["in_size"].transform(sum)
        reduce_df=final_df.drop_duplicates(subset=['Phylum', 'Class', 'Order','Family', 'Genus',"blast_hit"]).sort_values(by=['sum_size'],ascending=False)
        reduce_df=reduce_df.drop(['blast_uniq','in_uniq','blast_size','in_size'], axis=1)
        reduce_df.to_csv(OUT_TAB+NAME+"_combined_tab.txt", sep='\t')

Unable to open blast table 053-Single-B335-1_CTX-M-A
Unable to open blast table 054-Single-B335-2_CTX-M-A
Unable to open blast table 055-Single-B335-3_CTX-M-A
Unable to open blast table 056-Single-NEG_CTX-M-A
Unable to open blast table 057-Single-B335-1_TEM-A
Unable to open blast table 058-Single-B335-2_TEM-A
Unable to open blast table 059-Single-B335-3_TEM-A
Unable to open blast table 060-Single-NEG_TEM-A
Unable to open blast table 061-Single-B335-1_CTX-M-B
Unable to open blast table 062-Single-B335-2_CTX-M-B
Unable to open blast table 063-Single-B335-3_CTX-M-B
Unable to open blast table 064-Single-NEG_CTX-M-B
Unable to open blast table 065-Single-B335-1_TEM-B
Unable to open blast table 066-Single-B335-2_TEM-B
Unable to open blast table 067-Single-B335-3_TEM-B
Unable to open blast table 068-Single-NEG_TEM-B
Unable to open blast table 069-Single-B335-1_CTX-M-C
Unable to open blast table 070-Single-B335-2_CTX-M-C
Unable to open blast table 071-Single-B335-3_CTX-M-C
Unable to open blast 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unable to open blast table 005-B313_1-CTX
Unable to open blast table 006-B313_1-CTX
Unable to open blast table 007-B313_1-CTX
Unable to open blast table 008-B313_1-CTX
Unable to open blast table 009-B313_1-SHV
Unable to open blast table 010-B313_1-SHV
Unable to open blast table 011-B313_1-SHV
Unable to open blast table 012-B313_1-SHV
Unable to open blast table 013-B313_1-TEM
Unable to open blast table 014-B313_1-TEM
Unable to open blast table 015-B313_1-TEM
Unable to open blast table 016-B313_1-TEM
Unable to open blast table 021-B314_2-CTX
Unable to open blast table 022-B314_2-CTX
Unable to open blast table 023-B314_2-CTX
Unable to open blast table 024-B314_2-CTX
Unable to open blast table 025-B314_2-SHV
Unable to open blast table 026-B314_2-SHV
Unable to open blast table 027-B314_2-SHV
Unable to open blast table 028-B314_2-SHV
Unable to open blast table 029-B314_2-TEM
Unable to open blast table 030-B314_2-TEM
Unable to open blast table 031-B314_2-TEM
Unable to open blast table 032-B31

In [38]:
#004-B313_1-Bact_target.blast
blast_df = pd.read_csv(BLAST_TABS+'004-B313_1-Bact_target.blast', sep='\t', names=['p.ident','other','length','mismatch','gapopen','q.start','q.end','s.start','s.send','eval','bitscore','s.title'])

In [39]:
blast_df

Unnamed: 0,p.ident,other,length,mismatch,gapopen,q.start,q.end,s.start,s.send,eval,bitscore,s.title
Uniq1;size=890;,gi|1042799677|gb|CP012706.1|,100.000,106,0,0,1,106,3816568,3816463,7.230000e-47,196.0,"Bacteroides fragilis strain S14, complete genome"
Uniq2;size=865;,gi|1042799677|gb|CP012706.1|,100.000,106,0,0,1,106,3816568,3816463,7.230000e-47,196.0,"Bacteroides fragilis strain S14, complete genome"
Uniq3;size=331;,gi|938475369|gb|CP012937.1|,100.000,106,0,0,1,106,505683,505788,7.230000e-47,196.0,"Bacteroides thetaiotaomicron strain 7330, comp..."
Uniq4;size=295;,gi|938475369|gb|CP012937.1|,100.000,106,0,0,1,106,505683,505788,7.230000e-47,196.0,"Bacteroides thetaiotaomicron strain 7330, comp..."
Uniq5;size=214;,gi|938475369|gb|CP012937.1|,100.000,106,0,0,1,106,505683,505788,7.230000e-47,196.0,"Bacteroides thetaiotaomicron strain 7330, comp..."
Uniq6;size=211;,gi|938475369|gb|CP012937.1|,100.000,106,0,0,1,106,505683,505788,7.230000e-47,196.0,"Bacteroides thetaiotaomicron strain 7330, comp..."
Uniq7;size=141;,gi|938475369|gb|CP012937.1|,100.000,106,0,0,1,106,505683,505788,7.230000e-47,196.0,"Bacteroides thetaiotaomicron strain 7330, comp..."
Uniq8;size=135;,gi|1042799677|gb|CP012706.1|,100.000,106,0,0,1,106,3816568,3816463,7.230000e-47,196.0,"Bacteroides fragilis strain S14, complete genome"
Uniq9;size=127;,gi|1042799677|gb|CP012706.1|,100.000,106,0,0,1,106,3816568,3816463,7.230000e-47,196.0,"Bacteroides fragilis strain S14, complete genome"
Uniq10;size=97;,gi|938475369|gb|CP012937.1|,100.000,106,0,0,1,106,505683,505788,7.230000e-47,196.0,"Bacteroides thetaiotaomicron strain 7330, comp..."


In [37]:
NAME

'146-Taxa-NR2-Prev10'