In [3]:
from glob import glob
from tqdm import tqdm
import subprocess
import pandas as pd
import natsort as ns
import numpy as np
import os
import re

In [4]:
# Working directory and the input/output locations derived from it.
WRK = '/workdir/users/pd378/oilPCR/bulk_process_all_OIL_for_paper/'

# List of sample names, used for iterating over samples and naming outputs.
F_LIST = os.path.join(WRK, 'file_lists/filelist.txt')

# Directory holding the OTU-clustering results read below.
OTU_DIR = os.path.join(WRK, 's6_cluster_otu/filtered_uniqs/')

# Table connecting the second round of uniq reads with the first-round,
# sample-specific uniqs.
UNIQ_TAB = os.path.join(OTU_DIR, "combined_uniq_tab.txt")
# Parsed table generated from the usearch OTU clustering.
P_TABLE = os.path.join(OTU_DIR, 'uniq_parsed_otu_tab.txt')
# Taxonomy table generated by using mothur to assign taxonomy.
TAX_TAB = os.path.join(OTU_DIR, 'uniq_clustered_otus.V4_oil.wang.taxonomy')
# Directory of blast tables generated from blasting the targets
# (the directory name really is spelled "blast_reults").
BLAST_TABS = os.path.join(WRK, 's7_blast_targets/blast_reults/')
# Directory where the compiled output tables are written.
OUT_TAB = os.path.join(WRK, "s8_compile_data/")


# Import all the data.
# NOTE(review): each `names` list appears to cover one column fewer than the
# file holds, so pandas would use the leading column as the index; the later
# cells parse uniq_df.index / parse_df.index as strings. Confirm against the files.

uniq_df = pd.read_csv(
    UNIQ_TAB,
    sep='\t',
    names=['output_uniq', 'cluster_num', 'member_num', 'output_size', 'input_first'],
)
parse_df = pd.read_csv(P_TABLE, sep='\t', names=['type', 'match'])

# The separator is a regular expression, which only the python parser supports.
# Requesting it explicitly avoids the ParserWarning pandas emits when it has to
# fall back from the C parser; the parsed result is the same.
# NOTE(review): inside a character class "|" is a literal pipe, so this splits
# on tab, pipe or semicolon - confirm the pipe is intended.
tax_df = pd.read_csv(
    TAX_TAB,
    sep='[\t|;]',
    engine='python',
    names=['Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species'],
)



In [5]:
#split up the uniq_df index for easier reading
# Each index value is split on "/", and component 8 is split on ";" into
# <file>;<uniq>;<size> fields.
# NOTE(review): component 8 is a hard-coded depth; it presumably matches the
# absolute path layout under WRK - confirm if the directory ever moves.
ribo_suffix = '_ribo.fa'
file_list=[]
size_list=[]
uniq_list=[]
for item in uniq_df.index:
    fields = item.split("/")[8].split(";")
    # str.strip('_ribo.fa') removes any of those *characters* from both ends,
    # which truncates sample names that start or end with one of them.
    # Remove the literal suffix instead.
    fname = fields[0]
    if fname.endswith(ribo_suffix):
        fname = fname[:-len(ribo_suffix)]
    file_list.append(fname)
    # Same issue for 'size=': drop the literal prefix rather than a character set.
    size_list.append(fields[2].split('size=', 1)[-1])
    uniq_list.append(fields[1])
uniq_df['file'] = file_list
uniq_df['in_size']=size_list
uniq_df['in_uniq']=uniq_list
# The original ended with a bare `uniq_df.sort_index`, which only looks up the
# method without calling it. It had no effect and is dropped; row order is unchanged.


#update the parsed table to make more sense
# Each index value is split on ";": field 0 becomes the new index label and
# field 1 carries the size. The sample is whatever precedes "_Uniq" in the label.
index_list=[]
sample_list=[]
size_list=[]
for item in parse_df.index:
    fields = item.split(';')
    name = fields[0]
    index_list.append(name)
    # Drop the literal "size=" prefix; str.strip("size=") would strip a
    # character set from both ends instead of the prefix.
    size_list.append(fields[1].split('size=', 1)[-1])
    sample_list.append(name.split("_Uniq")[0])

parse_df.index = index_list
parse_df["size"]=size_list
parse_df["sample"]=sample_list



#parse through the key table and pull out the otu name
#then use the otu to fetch the taxonomy from the tax_df
# Raw string: "\d" in a plain literal is an invalid escape sequence.
otu_pattern = re.compile(r"otu\d+")

otu_list=[]
# Read the 'type' and 'match' columns by label. The original used row[1][0] /
# row[1][1], i.e. positional integer access on a label-indexed Series, which
# pandas has deprecated.
for type_val, match_val in zip(parse_df['type'], parse_df['match']):
    if otu_pattern.search(type_val):
        # The type column itself names an otu: keep the whole value.
        otu = type_val
    elif "chimera" in type_val or "Chimera" in match_val:
        otu = "chimera"
    else:
        # Otherwise take the first otu id mentioned in the match column
        # (raises IndexError when there is none, as before).
        otu = otu_pattern.findall(match_val)[0]
    otu_list.append(otu)

parse_df["otu"]=otu_list

# Join the parsed reads to the taxonomy: the "otu" column of parse_df is matched
# against the index values of tax_df, keeping rows from both sides.
key_df = parse_df.merge(
    tax_df,
    how='outer',
    left_on="otu",
    right_on=tax_df.index,
)

# Join that result to uniq_df (sample <-> output_uniq), again keeping rows from
# both sides; indicator=True adds a "_merge" column recording each row's origin.
cross_df = key_df.merge(
    uniq_df,
    how='outer',
    left_on='sample',
    right_on='output_uniq',
    indicator=True,
)

# Keep only the columns used by the per-sample blast merge.
clean_df = cross_df[['file', 'in_size', 'in_uniq', 'otu', 'Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus']]

# TODO: strip off the weight from the genus so we can consolidate results better
# (no code for this step is present in this cell).

In [None]:
#begin looping through blast files
# For every sample named in F_LIST: read its blast table, attach the taxonomy
# from clean_df, sum read counts per taxon/blast hit and write one table to OUT_TAB.

# Column labels applied to each blast table.
# NOTE(review): 'p.ident' is renamed to 'blast_hit' below and the displayed
# final_df shows hit names such as "CmR" in that column, so these labels look
# shifted by one relative to the file contents - confirm against a blast table.
blast_cols = ['p.ident','length','mismatch','gapopen','q.start','q.end','s.start','s.send','eval','bitscore','s.title']
summary_cols = ['length','mismatch','gapopen','q.start','q.end','s.start','s.send','eval','bitscore']
group_cols = ['Phylum', 'Class', 'Order','Family','Genus',"blast_hit"]

with open(F_LIST) as f:
    for line in f:
        sample = line.strip()
        if not sample:
            # Skip blank lines in the file list instead of probing "_targ.blast".
            continue
        NAME = sample
        b_tab=BLAST_TABS+NAME+'_targ.blast'

        # Skip samples whose blast table cannot be opened.
        try:
            with open(b_tab) as fh:
                pass
        except IOError as e:
            print ("Unable to open blast table "+NAME)
            continue

        # A zero-byte table makes read_csv raise; skip it like a missing one.
        if os.path.getsize(b_tab) == 0:
            print ("Blast table is empty "+NAME)
            continue

        #modify the index names for the blast table
        blast_df = pd.read_csv(b_tab, sep='\t', names=blast_cols)
        index_list=[item.strip(';') for item in blast_df.index]
        blast_df.index=index_list

        #reformat the blast information so it's in a single ";"-joined cell
        summary = blast_df[summary_cols[0]].astype(str)
        for col in summary_cols[1:]:
            summary = summary + ";" + blast_df[col].astype(str)
        blast_df['blast_summary']=summary

        # Index labels are split on ";" into <uniq>;<size> fields.
        uniq_list=[]
        size_list=[]
        for item in blast_df.index:
            fields = item.split(";")
            uniq_list.append(fields[0])
            # Drop the literal "size=" prefix (str.strip would strip a character set).
            size_list.append(fields[1].split('size=', 1)[-1])
        blast_df['blast_uniq']=uniq_list
        blast_df['blast_size']=size_list

        #clean up the table
        # rename() returns a new frame, so adding the 'file' column afterwards
        # no longer writes into a slice of blast_df (the SettingWithCopyWarning
        # the original inplace rename / assignment produced).
        blast_clean=blast_df[['p.ident','blast_summary','blast_uniq','blast_size']].rename(columns={'p.ident':'blast_hit'})
        blast_clean["file"]=NAME

        #merge the taxa information with the blast information
        final_df=pd.merge(blast_clean, clean_df,left_on=['file', 'blast_uniq'], right_on=['file','in_uniq'], how='left' )

        #format and condense the final table for export
        # Unmatched rows get "0" everywhere so that in_size converts to int and
        # the grouping keys contain no missing values.
        final_df=final_df.fillna("0")
        final_df['in_size']=final_df['in_size'].astype(int)
        # 'sum' selects the groupby sum directly; passing the builtin sum is deprecated.
        final_df['sum_size']=final_df.groupby(by=group_cols)["in_size"].transform('sum')
        # Keep one row per taxon/blast hit, largest summed size first.
        reduce_df=final_df.drop_duplicates(subset=group_cols).sort_values(by=['sum_size'],ascending=False)
        reduce_df=reduce_df.drop(['blast_uniq','in_uniq','blast_size','in_size'], axis=1)
        reduce_df.to_csv(OUT_TAB+NAME+"_combined_tab.txt", sep='\t')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [13]:
# Display the sample name currently held in NAME (assigned inside the loop over F_LIST).
NAME

'086-spike-H02-0-1p'

In [16]:
# Display final_df, the merged blast/taxonomy table built inside the loop over F_LIST.
final_df

Unnamed: 0,blast_hit,blast_summary,blast_uniq,blast_size,file,in_size,in_uniq,otu,Kingdom,Phylum,Class,Order,Family,Genus
0,CmR,73;0;0;5;77;1;73;2.08e-39,Uniq1,19606,086-spike-H02-0-1p,19606,Uniq1,otu2,Bacteria(100),Proteobacteria(100),Gammaproteobacteria(100),Enterobacteriales(100),Enterobacteriaceae(100),Escherichia-Shigella(98)
1,CmR,73;0;0;5;77;1;73;2.08e-39,Uniq2,12713,086-spike-H02-0-1p,12713,Uniq2,otu2,Bacteria(100),Proteobacteria(100),Gammaproteobacteria(100),Enterobacteriales(100),Enterobacteriaceae(100),Escherichia-Shigella(98)
2,CmR,73;0;0;5;77;1;73;2.08e-39,Uniq3,1537,086-spike-H02-0-1p,1537,Uniq3,otu2,Bacteria(100),Proteobacteria(100),Gammaproteobacteria(100),Enterobacteriales(100),Enterobacteriaceae(100),Escherichia-Shigella(98)
3,CmR,73;0;0;5;77;1;73;2.08e-39,Uniq4,1039,086-spike-H02-0-1p,1039,Uniq4,otu2,Bacteria(100),Proteobacteria(100),Gammaproteobacteria(100),Enterobacteriales(100),Enterobacteriaceae(100),Escherichia-Shigella(98)
4,CmR,73;0;0;5;77;1;73;2.08e-39,Uniq5,804,086-spike-H02-0-1p,804,Uniq5,otu2,Bacteria(100),Proteobacteria(100),Gammaproteobacteria(100),Enterobacteriales(100),Enterobacteriaceae(100),Escherichia-Shigella(98)
5,CmR,73;0;0;5;77;1;73;2.08e-39,Uniq6,681,086-spike-H02-0-1p,681,Uniq6,otu2,Bacteria(100),Proteobacteria(100),Gammaproteobacteria(100),Enterobacteriales(100),Enterobacteriaceae(100),Escherichia-Shigella(98)
6,CmR,73;0;0;5;77;1;73;2.08e-39,Uniq7,502,086-spike-H02-0-1p,502,Uniq7,otu2,Bacteria(100),Proteobacteria(100),Gammaproteobacteria(100),Enterobacteriales(100),Enterobacteriaceae(100),Escherichia-Shigella(98)
7,CmR,73;0;0;5;77;1;73;2.08e-39,Uniq8,432,086-spike-H02-0-1p,432,Uniq8,otu2,Bacteria(100),Proteobacteria(100),Gammaproteobacteria(100),Enterobacteriales(100),Enterobacteriaceae(100),Escherichia-Shigella(98)
8,CmR,73;0;0;5;77;1;73;2.08e-39,Uniq9,420,086-spike-H02-0-1p,420,Uniq9,otu2,Bacteria(100),Proteobacteria(100),Gammaproteobacteria(100),Enterobacteriales(100),Enterobacteriaceae(100),Escherichia-Shigella(98)
9,CmR,73;0;0;5;77;1;73;2.08e-39,Uniq10,371,086-spike-H02-0-1p,371,Uniq10,otu2,Bacteria(100),Proteobacteria(100),Gammaproteobacteria(100),Enterobacteriales(100),Enterobacteriaceae(100),Escherichia-Shigella(98)
