# nifH combined analysis

This notebook takes the union of the unique results from rounds one and two of the gene-finder, in which sequences were extracted from the Tara metagenomes using reference alignments for a specific gene of interest (here, nifH).

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import os

In [2]:
# Gene labels and output folders for the two snakemake gene-finder rounds that this notebook combines.
# Round 1 is the main gene directory; round 2 carries the "_rnd2" suffix.
gene1, gene2 = 'nifH', 'nifH_rnd2'
outputs1 = '/vortexfs1/omics/alexander/lblum/tara_gene_finder/jupyter_notebooks/outputs/nifH'
outputs2 = '/vortexfs1/omics/alexander/lblum/tara_gene_finder/jupyter_notebooks/outputs/nifH_rnd2'

In [3]:
# Paths to the round-1 and round-2 tables of sequences that met the e-value threshold.
# Each file is named "<gene>_threshold_hits_table.csv" and sits in that round's outputs folder.
threshold_1_path = os.path.join(
    outputs1,
    gene1 + "_threshold_hits_table.csv",
)
threshold_2_path = os.path.join(
    outputs2,
    gene2 + "_threshold_hits_table.csv",
)

In [10]:
# Load the sample info file (tab-separated), which contains the size fraction assigned to each sample.
sample_info = pd.read_csv(
    '/vortexfs1/omics/alexander/lblum/tara_gene_finder/data/metadata/SampleList_ForAssembly_metaG_python.txt',
    sep="\t",
)

In [5]:
# Use bash cell magic to concatenate the FASTA hits from both rounds, then a second one-liner to remove duplicate records.

In [6]:
%%bash
# Concatenate the round-1 and round-2 extracted hits into one combined FASTA file.
# Paths are relative to the working directory; the output file is overwritten on every run.
cat \
    outputs/nifH/nifH_extracted_hits.fasta \
    outputs/nifH_rnd2/nifH_rnd2_extracted_hits.fasta \
    > outputs/nifH/nifH_combined_gene_hits.fasta

In [7]:
%%bash
# Keep only the first record for each header ID (the first whitespace-delimited field of a ">" line).
# `keep` is recomputed on every header line and then governs the sequence lines that follow it.
awk '
    /^>/ { keep = !seen[$1]; seen[$1] = 1 }
    keep
' outputs/nifH/nifH_combined_gene_hits.fasta > outputs/nifH/nifH_combined_gene_hits_dedup.fasta

In [8]:
# Split sample_info's combined "Depth_sizefrac" label into two new columns.
# Depth: the text before the first dash.
sample_info["Depth"] = sample_info["Depth_sizefrac"].str.split("-").str[0]
# Sizefrac: the text after the first dash. Splitting only once (n=1) keeps a dashed
# size-fraction range in one piece. `n` must be passed by keyword: pandas 2.0 made every
# argument of Series.str.split after `pat` keyword-only, so the positional form
# split("-", 1) raises a TypeError there (it was already deprecated in pandas 1.5).
sample_info["Sizefrac"] = sample_info["Depth_sizefrac"].str.split("-", n=1).str[1]

In [5]:
# Load the round-1 and round-2 threshold-hit tables from the CSV files in their output folders.
threshold_1, threshold_2 = (
    pd.read_csv(csv_path) for csv_path in (threshold_1_path, threshold_2_path)
)

In [6]:
# Union of the two threshold tables: stack round 1 on top of round 2, then keep one row per contig_id.
# keep="first" is the pandas default, so a contig present in both rounds keeps its round-1 row.
threshold_union = (
    pd.concat([threshold_1, threshold_2])
    .drop_duplicates(subset=['contig_id'], keep='first')
)

In [7]:
# Write the combined table to the main (round-1) gene outputs folder;
# "combo" in the file name marks it as the union of both rounds. The row index is not written.
threshold_union.to_csv(
    os.path.join(outputs1, gene1 + "_combo_threshold_hits_table.csv"),
    index=False,
)

In [8]:
# Per-sample tallies: for each sample_id, count the non-null entries in every other column.
# These counts are later merged with the sample_info table so they can be normalized by ERR_count.
normalized_threshold = (
    threshold_union
    .groupby(by='sample_id')
    .count()
)


In [11]:
# Attach the sample metadata to the per-sample counts by matching sample_id against Assembly_group.
# This is an inner join (the pandas default): samples without a match in the other table are dropped.
normalized_threshold_info = pd.merge(
    normalized_threshold,
    sample_info,
    how='inner',
    left_on='sample_id',
    right_on='Assembly_group',
)

In [14]:
# 'normalized_count' = per-sample contig_id count divided by that sample's ERR_count.
# It is computed once here and can be reused for every graph, since it does not change across groupings.
normalized_threshold_info['normalized_count'] = (
    normalized_threshold_info['contig_id'] / normalized_threshold_info['ERR_count']
)