In [1]:
import pandas as pd

In [2]:
def update_gene_names(gene_mat_path, save=True,
                      metadata_path='../data/metagenomics_metadata_v2.txt',
                      output_prefix='../data/custom_card_db_data/gene_counts_final_custom'):

    """
    goes through the CARD gene annotations created by bowtie mapping and replaces each
    annotation with its bare gene name (the text after the last '|').
    characters that are awkward in R identifiers (quotes, dashes, spaces, parentheses,
    slashes) are removed or replaced by '_', and names that occur more than once are
    made unique by appending '_<k>' (k = 2, 3, ...).

    parameters
    gene_mat_path: str
        path to tab-separated, header-less gene matrix where CARD gene annotations are rows,
        samples are columns, and elements denote number of mapped reads
    save: bool
        if True, write the renamed matrix to '<output_prefix>.csv' and '<output_prefix>.tsv'
    metadata_path: str
        path to tab-separated metadata table; its 'sample_id' column supplies the
        column names of the matrix, in order
    output_prefix: str
        path (without extension) used for the two output files when save is True

    returns
    gene_mat_df: pandas dataframe
        updated gene count matrix with new gene names as index and sample ids as columns
    repeated_genes: list of str
        the suffixed names that were generated for genes sharing the same base name
    """

    # local import: only needed for the column/sample-count sanity warning below
    import warnings

    # load original gene count matrix (no header row; first column holds the annotations)
    gene_mat_df = pd.read_csv(gene_mat_path, sep='\t', index_col=0, header=None)

    # load metadata to replace header names; `sep` must be passed by keyword because
    # pandas >= 2.0 no longer accepts it positionally
    metadata = pd.read_csv(metadata_path, sep='\t')
    sample_id = list(metadata['sample_id'])

    # zip() below silently truncates on a length mismatch, so make that visible
    if len(sample_id) != len(gene_mat_df.columns):
        warnings.warn(
            'number of sample ids (' + str(len(sample_id)) + ') differs from number of '
            'count columns (' + str(len(gene_mat_df.columns)) + '); columns may be mislabelled'
        )

    gene_mat_df = gene_mat_df.rename(columns=dict(zip(gene_mat_df.columns, sample_id)))

    updated_gene_list, repeated_genes = _make_unique_gene_names(gene_mat_df.index)

    # assign positionally instead of renaming through a dict: a dict keyed on the old
    # labels would collapse rows whose original annotation strings are identical
    gene_mat_df.index = pd.Index(updated_gene_list, name=gene_mat_df.index.name)

    if save:
        gene_mat_df.to_csv(output_prefix + '.csv')
        gene_mat_df.to_csv(output_prefix + '.tsv', sep='\t')

    return gene_mat_df, repeated_genes


# translation table: drop single quotes, turn the other R-unfriendly characters into '_'
_R_UNSAFE_CHARS = str.maketrans({"'": None, '-': '_', ' ': '_', '(': '_', ')': '_', '/': '_'})


def _sanitize_gene_name(gene_annotation):
    """return the last '|'-separated field of the annotation with R-unfriendly characters replaced"""
    return gene_annotation.split('|')[-1].translate(_R_UNSAFE_CHARS)


def _make_unique_gene_names(gene_annotations):
    """return (unique names in input order, list of the suffixed names created for repeats)"""
    used_names = set()      # every name handed out so far (O(1) membership test)
    occurrences = {}        # base name -> how many times it has been seen
    unique_names = []
    repeated_names = []

    for gene_annotation in gene_annotations:
        base_name = _sanitize_gene_name(gene_annotation)
        seen = occurrences.get(base_name, 0) + 1
        occurrences[base_name] = seen

        if seen == 1 and base_name not in used_names:
            new_name = base_name
        else:
            # exact-match counting: the 2nd occurrence gets '_2', the 3rd '_3', ...;
            # keep bumping the suffix if that name is already taken by another gene
            suffix = max(seen, 2)
            new_name = base_name + '_' + str(suffix)
            while new_name in used_names:
                suffix += 1
                new_name = base_name + '_' + str(suffix)
            repeated_names.append(new_name)

        used_names.add(new_name)
        unique_names.append(new_name)

    return unique_names, repeated_names


In [3]:
# Path to the raw custom-CARD gene count matrix (tab-separated).
gene_mat_path = '../data/custom_card_db_data/gene_mat_all_no_len_custom.tsv'
# Rename genes and columns; with save=True the renamed matrix is also written to disk.
# Returns the renamed matrix and the list of suffixed names created for repeated genes.
gene_counts_df,repeated_genes = update_gene_names(gene_mat_path, save=True)

In [4]:
# Reload the original matrix from gene_mat_path as-is (no header row, first column as index).
gene_mat_df = pd.read_csv(gene_mat_path, sep='\t',index_col=0, header=None)