# Helper for GRCh37_hg19 Universal BED creation

## Step 1

Combine the MANE_Select and MANE_Plus_clinical BED files into a single file.

In [None]:
import pandas as pd
import os

# Merge the MANE Plus Clinical and MANE Select exon tables into one
# tab-separated file that keeps the header row.

# Define file paths
mane_plus_clinical_path = '/Users/ptpedfilven/Desktop/metrics_app/data/regions/universal_bed/GRCh38_hg38/MANE/original/MANE_Plus_clinical.bed'
mane_select_path = '/Users/ptpedfilven/Desktop/metrics_app/data/regions/universal_bed/GRCh38_hg38/MANE/original/MANE_Select.bed'
output_path = '/Users/ptpedfilven/Desktop/metrics_app/data/regions/universal_bed/GRCh38_hg38/MANE/original/MANE_Select_Plus_clinical.bed'

# Fail fast if an input is missing. Raise instead of calling exit(): exit() is
# an interactive-shell helper and, in a notebook, terminates the kernel session
# rather than just failing this cell.
for required_path in (mane_plus_clinical_path, mane_select_path):
    if not os.path.exists(required_path):
        raise FileNotFoundError(f"File not found: {required_path}")

# Load both tables; the first row of each file is the column header.
mane_plus_clinical = pd.read_csv(mane_plus_clinical_path, sep='\t', header=0)
mane_select = pd.read_csv(mane_select_path, sep='\t', header=0)

# Outer join on the eight key columns below, so rows present in either file
# are kept and rows identical on all keys are not duplicated.
merged = pd.merge(
    mane_plus_clinical,
    mane_select,
    on=['Chromosome/scaffold name', 'Exon region start (bp)', 'Exon region end (bp)',
        'Gene stable ID version', 'Gene name', 'Exon stable ID', 'Strand', 'Transcript stable ID version'],
    how='outer'  # 'outer' to include all entries from both files
)

# Sort by 'Exon region start (bp)'.
# NOTE(review): the sort key ignores 'Chromosome/scaffold name' -- confirm that
# ordering by start coordinate alone is what downstream steps expect.
merged_sorted = merged.sort_values(by='Exon region start (bp)')

# Save the resulting table, keeping the header row and dropping the index.
merged_sorted.to_csv(output_path, sep='\t', header=True, index=False)

print("The merge is complete and saved to 'MANE_Select_Plus_clinical.bed'.")


## Step 2 

Download the transcript ID to RefSeq tables for MANE_Select and MANE_Plus_clinical and combine them into a single JSON file.

In [None]:
import pandas as pd
import json

# Combine the MANE Plus Clinical and MANE Select transcript/RefSeq tables into
# a single JSON list of records.

# Paths to the .txt files
file1 = '/Users/ptpedfilven/Downloads/hg38_transcriptID_RefSeq_MANE_Plus_clinical.txt'  # Path to the first file
file2 = '/Users/ptpedfilven/Downloads/hg38_transcriptID_RefSeq_MANE_Select.txt'  # Path to the second file

# Read both tab-separated files; the first line of each is the header.
df1 = pd.read_csv(file1, sep='\t', header=0)
df2 = pd.read_csv(file2, sep='\t', header=0)

# Strip stray whitespace from column names so the renames below match.
df1.columns = df1.columns.str.strip()
df2.columns = df2.columns.str.strip()

# Give both tables a common 'RefSeq' column, then drop rows without a RefSeq
# match (same rule as the hg19 cell). Missing values would otherwise be written
# by json.dump as bare NaN tokens, which are not valid JSON.
# dropna raises KeyError if the expected source column was absent and the
# rename therefore produced no 'RefSeq' column.
df1 = (
    df1
    .rename(columns={'RefSeq match transcript (MANE Plus Clinical)': 'RefSeq'})
    .dropna(subset=['RefSeq'])
)
df2 = (
    df2
    .rename(columns={'RefSeq match transcript (MANE Select)': 'RefSeq'})
    .dropna(subset=['RefSeq'])
)

# Convert DataFrames to lists of dictionaries (one dict per row).
data1 = df1.to_dict(orient='records')
data2 = df2.to_dict(orient='records')

# Plus Clinical records first, followed by the Select records.
combined_data = data1 + data2

# Save the result to a JSON file using json.dump
with open('/Users/ptpedfilven/Downloads/hg38_transcriptID_RefSeq_MANE_Select_Plus_clinical.json', 'w') as json_file:
    json.dump(combined_data, json_file, indent=4)

print("The data has been appended and saved as hg38_transcriptID_RefSeq_MANE_Select_Plus_clinical.json.")


Do the same for the Ensembl hg19 (GRCh37) transcript ID to RefSeq table.

In [None]:
import pandas as pd
import json

# Convert the hg19 transcript/RefSeq table into a JSON list of records,
# keeping only rows that have a RefSeq value.

# Input table (tab-separated, header on the first line)
file = '/Users/ptpedfilven/Downloads/hg37_transcriptID_RefSeq.txt'

# Load, tidy the column names, expose the RefSeq column under a short name and
# discard rows where it is missing -- all in one chain.
df = (
    pd.read_csv(file, sep='\t', header=0)
    .rename(columns=str.strip)
    .rename(columns={"RefSeq mRNA ID": 'RefSeq'})
    .dropna(subset=['RefSeq'])
)

# One dictionary per remaining row
data = df.to_dict(orient='records')

# Write the records out as indented JSON
with open('/Users/ptpedfilven/Downloads/hg37_transcriptID_RefSeq.json', 'w') as json_file:
    json.dump(data, json_file, indent=4)

print("The data has been processed and saved as hg37_transcriptID_RefSeq.json.")


In [None]:
import subprocess
import sys

# Run bed_to_json.py on the merged MANE BED file, passing the hg38 RefSeq JSON
# via --refseq_file. A bare `python ...` line is a SyntaxError in a Python code
# cell, so the script is launched as a subprocess with the kernel's interpreter.
# check=True raises CalledProcessError if the script exits with a non-zero
# status, so a failed step stops the notebook instead of passing silently.
# Paths are relative to the current working directory.
subprocess.run(
    [
        sys.executable, 'bed_to_json.py',
        '../../data/regions/universal_bed/GRCh38_hg38/MANE/original/MANE_Select_Plus_clinical.bed',
        '../../data/regions/universal_bed/GRCh38_hg38/MANE/original/MANE_Select_Plus_clinical_v2.json',
        '--refseq_file',
        '../../data/regions/universal_bed/GRCh38_hg38/aux_MANE_match/hg38_transcriptID_RefSeq_MANE_Select_Plus_clinical.json',
    ],
    check=True,
)

In [None]:
import subprocess
import sys

# Run bed_to_json.py on the GRCh37 Ensembl BED file, passing the hg37 RefSeq
# JSON via --refseq_file. A bare `python ...` line is a SyntaxError in a Python
# code cell, so the script is launched as a subprocess with the kernel's
# interpreter. check=True raises CalledProcessError if the script exits with a
# non-zero status. Paths are relative to the current working directory.
subprocess.run(
    [
        sys.executable, 'bed_to_json.py',
        '../../data/regions/universal_bed/GRCh37_hg19/original/Ensembl.bed',
        '../../data/regions/universal_bed/GRCh37_hg19/original/Ensembl_v2.json',
        '--refseq_file',
        '../../data/regions/universal_bed/GRCh37_hg19/aux_MANE_match/hg37_transcriptID_RefSeq.json',
    ],
    check=True,
)

In [None]:
import subprocess
import sys

# Run universal_bed.py with the GRCh37 Ensembl JSON as input, the hg37 output
# BED path, the build label 'hg37', and the MANE JSON via --mane_file.
# A bare `python ...` line is a SyntaxError in a Python code cell, so the script
# is launched as a subprocess with the kernel's interpreter. check=True raises
# CalledProcessError if the script exits with a non-zero status.
# Paths are relative to the current working directory.
subprocess.run(
    [
        sys.executable, 'universal_bed.py',
        '../../data/regions/universal_bed/GRCh37_hg19/original/Ensembl_v2.json',
        '../../data/regions/universal_bed/GRCh37_hg19/modified/MANE_Select_Plus_clinical_hg37_v2.bed',
        'hg37',
        '--mane_file',
        '../../data/regions/universal_bed/GRCh38_hg38/MANE/original/MANE_Select_Plus_clinical_v2.json',
    ],
    check=True,
)