In [10]:
import pandas as pd
import os

In [11]:
def calculate_start_point(midpoint, endpoint):
    """Return the 3D point that has ``midpoint`` halfway between it and ``endpoint``.

    Each coordinate is computed as ``2 * midpoint[i] - endpoint[i]`` for the
    first three positions of the inputs.

    Args:
        midpoint: Indexable holding at least three numeric coordinates.
        endpoint: Indexable holding at least three numeric coordinates.

    Returns:
        A list ``[x, y, z]`` with the extrapolated coordinates.
    """
    return [2 * midpoint[axis] - endpoint[axis] for axis in range(3)]

def calculate_end_point(last, second_last):
    """Linearly extrapolate one step past ``last``, continuing from ``second_last``.

    Each coordinate is computed as ``2 * last[i] - second_last[i]`` for the
    first three positions of the inputs.

    Args:
        last: Indexable holding at least three numeric coordinates.
        second_last: Indexable holding at least three numeric coordinates.

    Returns:
        A list ``[x, y, z]`` with the extrapolated coordinates.
    """
    extrapolated = []
    for axis in (0, 1, 2):
        extrapolated.append(2 * last[axis] - second_last[axis])
    return extrapolated

In [16]:
def _bead_xyz(df, bead_id):
    """Return [x, y, z] of the first row whose 'id' equals ``bead_id``, or None if no row matches."""
    matches = df.loc[df['id'] == bead_id, ['x', 'y', 'z']]
    if matches.empty:
        return None
    return matches.values[0].tolist()


def add_start_and_end_ids(structure_file_path, chr_lengths, resolution):
    """Add an extrapolated id-0 row (and, if needed, an end row) to each structure CSV.

    For every sub-directory of ``structure_file_path`` whose name starts with
    'chr', every ``.csv`` file whose name contains neither '_with_id0' nor
    '_gene_info' is processed as follows:

    1. Hyphens in the file name are replaced with underscores (the file is
       renamed on disk). If the underscore name already exists, the hyphenated
       file is skipped with a warning instead of overwriting it.
    2. A row with ``id == 0`` is prepended; its coordinates come from
       ``calculate_start_point`` applied to the rows with id 1 and id 2.
    3. If ``(max id + 1) * resolution`` is smaller than the chromosome length,
       one row with ``id == max id + 1`` is appended; its coordinates come from
       ``calculate_end_point`` applied to the last two ids.
    4. The result is written next to the input as ``<name>_with_id0.csv``.

    Folders without an entry in ``chr_lengths`` are skipped before any of their
    files are renamed or read. Files that lack the rows needed for an
    extrapolation are skipped with a warning.

    Args:
        structure_file_path: Directory containing the per-chromosome folders.
        chr_lengths: Mapping (anything with ``.get``) from folder name to
            chromosome length.
        resolution: Genomic span covered by one id, in the same units as the
            values of ``chr_lengths`` (presumably base pairs; confirm).

    Returns:
        None. Output files are written to disk and progress is printed.
    """
    # Sorted so that processing and log order are reproducible between runs;
    # os.listdir returns entries in arbitrary order.
    for chr_folder_name in sorted(os.listdir(structure_file_path)):
        chr_path = os.path.join(structure_file_path, chr_folder_name)

        # Only directories whose name starts with 'chr' are processed.
        if not (os.path.isdir(chr_path) and chr_folder_name.startswith('chr')):
            continue

        # The folder name is the lookup key, so it must match the genome map.
        chr_name = chr_folder_name
        chr_length = chr_lengths.get(chr_name)
        if chr_length is None:
            # Checked once per folder, before any file is renamed or read.
            print(f"Warning: No length found for {chr_name}. Skipping chromosome.")
            continue

        # Input CSVs only: skip previously generated outputs and gene-info files.
        csv_files = sorted(
            f for f in os.listdir(chr_path)
            if f.endswith('.csv')
            and '_with_id0' not in f
            and '_gene_info' not in f
        )

        for csv_file in csv_files:
            old_csv_path = os.path.join(chr_path, csv_file)
            normalized_csv_file = csv_file.replace('-', '_')
            new_csv_path = os.path.join(chr_path, normalized_csv_file)
            if old_csv_path != new_csv_path:
                if os.path.exists(new_csv_path):
                    # os.rename may silently replace an existing file; never
                    # destroy a sibling input that already has the target name.
                    print(f"Warning: {new_csv_path} already exists. Skipping {old_csv_path}.")
                    continue
                os.rename(old_csv_path, new_csv_path)

            df = pd.read_csv(new_csv_path)

            # Add id0 at the start (extrapolated backward from ids 1 and 2).
            id1_midpoint = _bead_xyz(df, 1)
            id2_endpoint = _bead_xyz(df, 2)
            if id1_midpoint is None or id2_endpoint is None:
                print(f"Warning: {new_csv_path} has no row for id 1 and/or id 2. Skipping file.")
                continue
            startpoint = calculate_start_point(id1_midpoint, id2_endpoint)
            id0_row = pd.DataFrame({
                'id': [0],
                'x': [startpoint[0]],
                'y': [startpoint[1]],
                'z': [startpoint[2]],
            })
            df = pd.concat([id0_row, df], ignore_index=True)

            # Get last id and the genomic position it is expected to reach.
            last_id = df['id'].max()
            last_genome_end = (last_id + 1) * resolution

            # If not covering the whole chromosome, extrapolate one end bead.
            if last_genome_end < chr_length:
                last_coords = _bead_xyz(df, last_id)
                second_last_coords = _bead_xyz(df, last_id - 1)
                if last_coords is None or second_last_coords is None:
                    # Ids are not contiguous at the end, so there is no
                    # neighbouring bead to extrapolate from.
                    print(f"Warning: {new_csv_path} has no row for id {last_id - 1}. Skipping file.")
                    continue
                print(f"  Chromosome {chr_name}: Extending structure (last id {last_id}, covers {last_genome_end}, length {chr_length})")
                endpoint = calculate_end_point(last_coords, second_last_coords)
                idN_row = pd.DataFrame({
                    'id': [last_id + 1],
                    'x': [endpoint[0]],
                    'y': [endpoint[1]],
                    'z': [endpoint[2]],
                })
                df = pd.concat([df, idN_row], ignore_index=True)

            # Save new file. Strip only the trailing ".csv" (guaranteed by the
            # filter above) rather than every ".csv" occurrence in the name.
            base_name = normalized_csv_file[:-len('.csv')]
            new_file_name = f"{base_name}_with_id0.csv"
            save_file_path = os.path.join(chr_path, new_file_name)
            df.to_csv(save_file_path, index=False)
            print(f"  File saved to {save_file_path}")



In [17]:
# Input locations: the per-chromosome structure folders and the genome map CSV.
structure_file_path = 'data/green_monkey/all_structure_files'
genome_map_path = 'data/green_monkey/annotation/Chlorocebus_sabeus_mva.genome.map.csv'
# Passed as `resolution` to add_start_and_end_ids.
resolution = 100_000

# Read the genome map and key each chromosome's length by its name.
# The 'Chromosome' and 'Length' column names must match the map file's header;
# adjust them here if the file uses different ones.
genome_map = pd.read_csv(genome_map_path)
chr_lengths = {
    chrom: length
    for chrom, length in zip(genome_map['Chromosome'], genome_map['Length'])
}

add_start_and_end_ids(
    structure_file_path=structure_file_path,
    chr_lengths=chr_lengths,
    resolution=resolution,
)

  File saved to data/green_monkey/all_structure_files/chr23/structure_18hrs_untr_with_id0.csv
  File saved to data/green_monkey/all_structure_files/chr23/structure_24hrs_vacv_with_id0.csv
  File saved to data/green_monkey/all_structure_files/chr23/structure_12hrs_untr_with_id0.csv
  File saved to data/green_monkey/all_structure_files/chr23/structure_12hrs_vacv_with_id0.csv
  File saved to data/green_monkey/all_structure_files/chr23/structure_18hrs_vacv_with_id0.csv
  File saved to data/green_monkey/all_structure_files/chr23/structure_24hrs_untr_with_id0.csv
  File saved to data/green_monkey/all_structure_files/chr24/structure_18hrs_untr_with_id0.csv
  File saved to data/green_monkey/all_structure_files/chr24/structure_24hrs_vacv_with_id0.csv
  File saved to data/green_monkey/all_structure_files/chr24/structure_12hrs_vacv_with_id0.csv
  File saved to data/green_monkey/all_structure_files/chr24/structure_18hrs_vacv_with_id0.csv
  File saved to data/green_monkey/all_structure_files/chr24/

  File saved to data/green_monkey/all_structure_files/chr1/structure_12hrs_vacv_with_id0.csv
  File saved to data/green_monkey/all_structure_files/chr1/structure_18hrs_vacv_with_id0.csv
  File saved to data/green_monkey/all_structure_files/chr1/structure_24hrs_untr_with_id0.csv
