In [1]:
import glob
import os

import numpy as np
import pandas as pd

# Folder holding the per-sample mutation CSVs. The trailing slash is required
# because paths are built by plain string concatenation.
input_folder = "output_mutations/"
# Folder where the final table is written (trailing slash required as well).
output_folder = "output_final_table/"
# Field delimiter of the sample CSV files.
csv_separator = ";"

# Sorted so the order in which samples are processed is reproducible between
# runs and machines; glob itself returns files in filesystem-dependent order.
all_filenames = sorted(glob.glob(input_folder + "*.csv"))

In [3]:
# One Series of "gene:mutation" strings per sample, each named after its sample;
# these are turned into the columns of the final table after the loop.
list_of_columns = []

# The same strings named 'gene_mutation', pooled after the loop into gm_series.
# They are collected in a list and concatenated once because Series.append was
# removed in pandas 2.0 and growing a Series inside a loop is quadratic.
gene_mutation_parts = []

# Loop over all files from the input folder
for filename in all_filenames:
    # Read the csv using the defined separator
    sample_csv = pd.read_csv(filename, sep=csv_separator)

    # The column name is the part of the file name before the first '_'.
    # Example: output_mutations/66126_processada_mutacao.csv -> 66126
    # os.path.basename is used instead of splitting on '/' so this also works
    # with nested input folders and with the '\' separators glob returns on Windows.
    column_name = os.path.basename(filename).split("_")[0]

    # Build the gene:mutation pairs once, as a standalone Series
    gene_mutation = (sample_csv['gene'] + ':' + sample_csv['mutation']).rename('gene_mutation')

    # Keep a copy named after the sample. Renaming the Series (rather than adding
    # a column and taking the last one) stays correct even if the sample id
    # happens to match an existing column of the csv.
    list_of_columns.append(gene_mutation.rename(column_name))

    # Keep the 'gene_mutation'-named copy for the pooled series
    gene_mutation_parts.append(gene_mutation)

# Pool every pair from every sample into a single series. pd.concat raises on an
# empty list, so fall back to an empty series when no files were found.
if gene_mutation_parts:
    gm_series = pd.concat(gene_mutation_parts, ignore_index=True)
else:
    gm_series = pd.Series(dtype=str, name='gene_mutation')



In [None]:
# Base table: one row per distinct gene:mutation pair, with the pair kept both
# as the index and as a regular column (drop=False).
distinct_pairs = gm_series.drop_duplicates()
final_df = distinct_pairs.to_frame().set_index('gene_mutation', drop=False)

# Add one column per sample, filled with the pair itself where the sample has it
for sample_column in list_of_columns:
    # Each pair only needs to appear once per sample
    present_pairs = sample_column.drop_duplicates()

    # Re-label the series with its own values, so that looking up a pair
    # returns that same pair when the sample contains it
    pair_lookup = present_pairs.set_axis(present_pairs.tolist())

    # Pairs missing from this sample have no entry in the lookup and map to NaN
    final_df[pair_lookup.name] = final_df['gene_mutation'].map(pair_lookup)

# Show missing pairs as empty text instead of NaN
final_df = final_df.fillna('')
final_df.head(50)

In [5]:
# to_csv does not create missing directories, so make sure the output folder
# exists first (skipped when output_folder is empty, i.e. the working directory).
if output_folder:
    os.makedirs(output_folder, exist_ok=True)

# Write with the configured separator so the output uses the same delimiter as
# the input files.
# NOTE(review): final_df carries 'gene_mutation' as both index and column
# (set_index(..., drop=False)), so that field is written twice; confirm whether
# downstream consumers rely on this before passing index=False.
final_df.to_csv(output_folder + 'omicron_gene_mutation_pair_table.csv', sep=csv_separator, encoding='utf-8')