In [1]:
import os
import pandas as pd

# Define the root directory
root_dir = "data"


def add_replicate_column(file_path, replicate):
    """Add a constant ``replicate`` column to a TSV file and rewrite it in place.

    Parameters
    ----------
    file_path : str
        Path of the tab-separated file to update. The file is overwritten.
    replicate : str
        Value stored in the ``replicate`` column for every row.

    Returns
    -------
    pandas.DataFrame
        The table as written back to ``file_path``.

    Notes
    -----
    pandas' default parsing treats literal strings such as ``NA`` as missing
    values. Because the file is overwritten, any such label would be erased
    on disk. The ``gene`` values shown elsewhere in this notebook (HA, PB1,
    PB2) look like influenza segment names, so a segment called ``NA`` is
    presumably possible -- confirm against the data. Only truly empty fields
    are treated as missing here.
    """
    df = pd.read_csv(file_path, sep='\t', keep_default_na=False, na_values=[""])
    df['replicate'] = replicate
    df.to_csv(file_path, sep='\t', index=False)
    return df


# Walk every sub-directory below root_dir looking for the annotated VarScan tables
for dirpath, dirnames, filenames in os.walk(root_dir):
    for file in filenames:
        if file == "varscan-annotated.tsv":
            # Full path to the current varscan-annotated.tsv
            file_path = os.path.join(dirpath, file)

            # The replicate name is the directory one level above the folder
            # that holds the file (<replicate>/<dirpath basename>/<file>)
            replicate_folder = os.path.basename(os.path.dirname(dirpath))

            # Tag the rows and save the table back to the same file
            df = add_replicate_column(file_path, replicate_folder)

            print(f"Updated: {file_path}")


Updated: data/be_w3/replicate-2/remapping/varscan-annotated.tsv
Updated: data/be_w3/replicate-1/remapping/varscan-annotated.tsv
Updated: data/rth_w2/replicate-2/remapping/varscan-annotated.tsv
Updated: data/rth_w2/replicate-1/remapping/varscan-annotated.tsv
Updated: data/cl_c2/replicate-1/remapping/varscan-annotated.tsv
Updated: data/be_w2/replicate-1/remapping/varscan-annotated.tsv
Updated: data/rth_w3/replicate-2/remapping/varscan-annotated.tsv
Updated: data/rth_w3/replicate-1/remapping/varscan-annotated.tsv
Updated: data/ms_w1/replicate-2/remapping/varscan-annotated.tsv
Updated: data/ms_w1/replicate-1/remapping/varscan-annotated.tsv
Updated: data/kc_com3/replicate-2/remapping/varscan-annotated.tsv
Updated: data/kc_com3/replicate-1/remapping/varscan-annotated.tsv
Updated: data/kc_com4/replicate-2/remapping/varscan-annotated.tsv
Updated: data/kc_com4/replicate-1/remapping/varscan-annotated.tsv
Updated: data/kc_com5/replicate-2/remapping/varscan-annotated.tsv
Updated: data/kc_com5/repl

In [2]:
import os
import pandas as pd

# Root directory: `root_dir` is reused from the first cell
# (the override below is left disabled)
# root_dir = "data_run4"

# Build one concatenated table per sample folder
for sample_folder in os.listdir(root_dir):
    sample_path = os.path.join(root_dir, sample_folder)
    
    # Only directories are sample folders; plain files in root_dir are skipped
    if os.path.isdir(sample_path):
        concatenated_data = []  # One DataFrame per replicate that has a table
        
        # Every entry of the sample folder is tried as a replicate folder;
        # entries without remapping/varscan-annotated.tsv are ignored below
        for replicate_folder in os.listdir(sample_path):
            replicate_path = os.path.join(sample_path, replicate_folder, "remapping")
            
            # Full path to the replicate's varscan-annotated.tsv
            tsv_path = os.path.join(replicate_path, "varscan-annotated.tsv")
            if os.path.exists(tsv_path):
                # keep_default_na=False + na_values=[""]: only empty fields are
                # read as missing. With pandas' defaults a literal "NA" (e.g. a
                # gene/segment label) would be parsed as NaN and written out blank.
                df = pd.read_csv(tsv_path, sep='\t', keep_default_na=False, na_values=[""])
                concatenated_data.append(df)
        
        # Write an output only when at least one replicate table was found
        if concatenated_data:
            # Stack the replicate tables with a fresh 0..n-1 row index
            concatenated_df = pd.concat(concatenated_data, ignore_index=True)
            
            # Save the concatenated DataFrame to a new TSV file in the sample folder
            output_file = os.path.join(sample_path, f"{sample_folder}_concat_annotated_varscan.tsv")
            concatenated_df.to_csv(output_file, sep='\t', index=False)
            
            print(f"Concatenated file saved: {output_file}")


Concatenated file saved: data/be_w3/be_w3_concat_annotated_varscan.tsv
Concatenated file saved: data/rth_w2/rth_w2_concat_annotated_varscan.tsv
Concatenated file saved: data/cl_c2/cl_c2_concat_annotated_varscan.tsv
Concatenated file saved: data/be_w2/be_w2_concat_annotated_varscan.tsv
Concatenated file saved: data/rth_w3/rth_w3_concat_annotated_varscan.tsv
Concatenated file saved: data/ms_w1/ms_w1_concat_annotated_varscan.tsv
Concatenated file saved: data/kc_com3/kc_com3_concat_annotated_varscan.tsv
Concatenated file saved: data/kc_com4/kc_com4_concat_annotated_varscan.tsv
Concatenated file saved: data/kc_com5/kc_com5_concat_annotated_varscan.tsv
Concatenated file saved: data/kc_com2/kc_com2_concat_annotated_varscan.tsv
Concatenated file saved: data/cb_com1/cb_com1_concat_annotated_varscan.tsv
Concatenated file saved: data/cg_w1/cg_w1_concat_annotated_varscan.tsv
Concatenated file saved: data/g_com1/g_com1_concat_annotated_varscan.tsv
Concatenated file saved: data/gf_w1/gf_w1_concat_an

In [3]:
import os
import pandas as pd

# Root directory: `root_dir` is reused from the first cell
# (the override below is left disabled)
# root_dir = "data_run4"

# Add a sample_ID column to each per-sample concatenated table
for sample_folder in os.listdir(root_dir):
    sample_path = os.path.join(root_dir, sample_folder)
    
    # Only directories are sample folders; plain files in root_dir are skipped
    if os.path.isdir(sample_path):
        # Path to the per-sample concatenated file
        concatenated_file = os.path.join(sample_path, f"{sample_folder}_concat_annotated_varscan.tsv")
        
        if os.path.exists(concatenated_file):
            # keep_default_na=False + na_values=[""]: only empty fields are read
            # as missing. The file is overwritten below, so with pandas' defaults
            # a literal "NA" value (e.g. a gene/segment label) would be lost.
            df = pd.read_csv(concatenated_file, sep='\t', keep_default_na=False, na_values=[""])
            
            # Every row gets the sample folder name as its sample_ID
            df['sample_ID'] = sample_folder
            
            # Save the updated DataFrame back to the same file
            df.to_csv(concatenated_file, sep='\t', index=False)
            
            print(f"Updated file with sample column: {concatenated_file}")


Updated file with sample column: data/be_w3/be_w3_concat_annotated_varscan.tsv
Updated file with sample column: data/rth_w2/rth_w2_concat_annotated_varscan.tsv
Updated file with sample column: data/cl_c2/cl_c2_concat_annotated_varscan.tsv
Updated file with sample column: data/be_w2/be_w2_concat_annotated_varscan.tsv
Updated file with sample column: data/rth_w3/rth_w3_concat_annotated_varscan.tsv
Updated file with sample column: data/ms_w1/ms_w1_concat_annotated_varscan.tsv
Updated file with sample column: data/kc_com3/kc_com3_concat_annotated_varscan.tsv
Updated file with sample column: data/kc_com4/kc_com4_concat_annotated_varscan.tsv
Updated file with sample column: data/kc_com5/kc_com5_concat_annotated_varscan.tsv
Updated file with sample column: data/kc_com2/kc_com2_concat_annotated_varscan.tsv
Updated file with sample column: data/cb_com1/cb_com1_concat_annotated_varscan.tsv
Updated file with sample column: data/cg_w1/cg_w1_concat_annotated_varscan.tsv
Updated file with sample col

In [4]:
import os
import pandas as pd

# Root directory: `root_dir` is reused from the first cell
# (the override below is left disabled)
# root_dir = "data_run4"

# One DataFrame per sample folder that has a concatenated table
all_data = []

# Collect the per-sample concatenated tables
for sample_folder in os.listdir(root_dir):
    sample_path = os.path.join(root_dir, sample_folder)
    
    # Only directories are sample folders; plain files in root_dir are skipped
    if os.path.isdir(sample_path):
        # Path to the per-sample concatenated file
        concatenated_file = os.path.join(sample_path, f"{sample_folder}_concat_annotated_varscan.tsv")
        
        if os.path.exists(concatenated_file):
            # keep_default_na=False + na_values=[""]: only empty fields are read
            # as missing. With pandas' defaults a literal "NA" (e.g. a gene/segment
            # label) would become NaN and be blank in the combined table.
            df = pd.read_csv(concatenated_file, sep='\t', keep_default_na=False, na_values=[""])
            all_data.append(df)

# Stack all samples into one table, or report that nothing was found
if all_data:
    combined_df = pd.concat(all_data, ignore_index=True)
    
    # Save the combined DataFrame to the root directory
    output_file = os.path.join(root_dir, "all_samples_combined_annotated_varscan.tsv")
    combined_df.to_csv(output_file, sep='\t', index=False)
    
    print(f"All samples combined into: {output_file}")
else:
    print("No concatenated files found to combine.")


All samples combined into: data/all_samples_combined_annotated_varscan.tsv


In [26]:
# import pandas as pd

# # Load the combined TSV file
# file_path = "data_run5/all_samples_combined_annotated_varscan.tsv"
# df = pd.read_csv(file_path, sep='\t')
# df.drop('Unnamed: 9', axis=1, inplace=True)

# # Ensure column names are clean (strip leading/trailing spaces)
# df.columns = df.columns.str.strip()

# # Check for necessary columns
# required_cols = ['sample_ID', 'gene', 'reference_position', 'replicate']
# missing_cols = [col for col in required_cols if col not in df.columns]
# if missing_cols:
#     raise ValueError(f"Missing columns in the DataFrame: {missing_cols}")

# # Debug: Check how the data is grouped by sample, gene, and reference_position
# # This will print a preview of the groupings for the first sample/gene/reference_position combination
# for name, group in df.groupby(['sample_ID', 'gene', 'reference_position']):
#     print(f"Group for {name}:")
#     print(group)
#     print("-" * 40)
#     break  # Just show the first group to debug

# # Add the rep_shared column based on presence of variant in all replicates
# df['rep_shared'] = df.groupby(['sample_ID', 'gene', 'reference_position'])['replicate'].transform(
#     lambda x: 'shared' if x.nunique() == len(x) else 'not_shared'
# )

# # Save the updated DataFrame
# output_file = "data_run5/all_samples_combined_with_shared_status.tsv"
# df.to_csv(output_file, sep='\t', index=False)

# print(f"Updated file with rep_shared column saved: {output_file}")


Group for ('CB_com3', 'HA', 867):
     sample gene  reference_position reference_allele variant_allele  \
1817     ha   HA                 867                G              C   
2135     ha   HA                 867                G              C   

     coding_region_change synonymous/nonsynonymous frequency(%)  frequency  \
1817            Val285Leu            nonsynonymous        4.56%     0.0456   
2135            Val285Leu            nonsynonymous        8.19%     0.0819   

        replicate sample_ID  
1817  replicate-2   CB_com3  
2135  replicate-1   CB_com3  
----------------------------------------
Updated file with rep_shared column saved: data_run5/all_samples_combined_with_shared_status.tsv


In [31]:
# import pandas as pd

# # Load the combined TSV file
# file_path = "data_run5/all_samples_combined_annotated_varscan.tsv"
# df = pd.read_csv(file_path, sep='\t')
# df.drop('Unnamed: 9', axis=1, inplace=True)

# # Ensure column names are clean (strip leading/trailing spaces)
# df.columns = df.columns.str.strip()

# # Check for necessary columns
# required_cols = ['sample_ID', 'gene', 'reference_position', 'replicate']
# missing_cols = [col for col in required_cols if col not in df.columns]
# if missing_cols:
#     raise ValueError(f"Missing columns in the DataFrame: {missing_cols}")

# # Group by sample, gene, and reference_position to check for shared variants
# def determine_shared_status(group):
#     # Get unique replicates for the group
#     unique_replicates = group['replicate'].nunique()
#     # Check if the combination of sample, gene, and reference_position is present in all replicates
#     if unique_replicates == len(group['replicate'].unique()):
#         return 'shared'
#     else:
#         return 'not_shared'

# # Apply the function to determine shared status
# df['rep_shared'] = df.groupby(['sample_ID', 'gene', 'reference_position']).apply(
#     lambda x: 'shared' if x['replicate'].nunique() == len(x['replicate'].unique()) else 'not_shared'
# ).reset_index(level=[0, 1, 2], drop=True)

# # Save the updated DataFrame
# output_file = "data_run5/all_samples_combined_with_shared_status.tsv"
# df.to_csv(output_file, sep='\t', index=False)

# print(f"Updated file with rep_shared column saved: {output_file}")


Updated file with rep_shared column saved: data_run5/all_samples_combined_with_shared_status.tsv


  df['rep_shared'] = df.groupby(['sample_ID', 'gene', 'reference_position']).apply(


In [35]:
# Preview the last 35 rows of `df`.
# NOTE(review): the output below has a `rep_shared` column, which no active cell
# visible here creates -- presumably `df` was left in the kernel by one of the
# now commented-out cells above. Confirm this still works after Restart & Run All.
df.tail(35)

Unnamed: 0,sample,gene,reference_position,reference_allele,variant_allele,coding_region_change,synonymous/nonsynonymous,frequency(%),frequency,replicate,sample_ID,rep_shared
2470,pb1,PB1,845,T,A,Leu282Stop,stop_gained,3.17%,0.0317,replicate-1,CB_com4,
2471,pb1,PB1,1230,T,C,Gly410Gly,synonymous,6.47%,0.0647,replicate-1,CB_com4,
2472,pb1,PB1,1569,G,A,Met523Ile,nonsynonymous,1.94%,0.0194,replicate-1,CB_com4,
2473,pb2,PB2,1077,A,G,Glu358Glu,synonymous,77.04%,0.7704,replicate-1,CB_com4,
2474,pb2,PB2,1476,T,A,Thr491Thr,synonymous,99.94%,0.9994,replicate-1,CB_com4,
2475,pb2,PB2,1480,A,C,Arg493Arg,synonymous,99.88%,0.9988,replicate-1,CB_com4,
2476,pb2,PB2,1485,G,A,Val494Val,synonymous,100%,1.0,replicate-1,CB_com4,
2477,pb2,PB2,1488,C,A,Val495Val,synonymous,99.98%,0.9998,replicate-1,CB_com4,
2478,pb2,PB2,1491,G,A,Val496Val,synonymous,99.98%,0.9998,replicate-1,CB_com4,
2479,pb2,PB2,1494,T,C,Ser497Ser,synonymous,99.95%,0.9995,replicate-1,CB_com4,
