In [3]:
import pandas as pd

# Labels for the fourteen tab-separated fields of the BED file. They are
# supplied explicitly because the file is read without a header row.
column_names = [
    "chrom",
    "start",
    "end",
    "name",
    "score",
    "strand",
    "thickStart",
    "thickEnd",
    "itemRgb",
    "blockCount",
    "blockSizes",
    "blockStarts",
    "annotation",
    "novelty",
]

# Load the headerless, tab-delimited BED file, labelling its columns
# with the names listed above.
file_path = "./data/UHR_chr22_corrected.colored.bed"
df_bed = pd.read_table(file_path, header=None, names=column_names)

# Preview the first five records
print(df_bed.head(5))

   chrom     start       end         name  score strand  thickStart  thickEnd  \
0  chr22  10738828  10739148  PB.118859.1      0      -    10739148  10739148   
1  chr22  10941696  10961547   PB.83093.1      0      -    10961547  10961547   
2  chr22  11298750  11300438   PB.52878.1      0      +    11300438  11300438   
3  chr22  11868469  11974188   PB.96068.1      0      -    11974188  11974188   
4  chr22  11908367  11908901  PB.111548.1      0      -    11908901  11908901   

       itemRgb  blockCount                    blockSizes  \
0  233,150,122           1                          320,   
1   238,106,80           8  84,87,115,58,126,70,183,265,   
2  233,150,122           1                         1688,   
3  102,194,164           3                   477,59,258,   
4   65,182,196           1                          534,   

                                blockStarts            annotation novelty  
0                                        0,            intergenic   novel  


In [5]:
import pandas as pd

# Location of the tab-separated classification table
file_path = './data/UHR_chr22_classification.txt'

# Load the table; with no explicit names given, pandas takes the column
# labels from the first line of the file.
df_classification = pd.read_table(file_path)

# Preview the first five records
print(df_classification.head(5))

     isoform  chrom strand  length  exons      structural_category  \
0  PB.3796.1  chr22      +    5749      3     novel_not_in_catalog   
1  PB.3796.2  chr22      +    5327      8     novel_not_in_catalog   
2  PB.3797.1  chr22      +    6070      6                   fusion   
3  PB.3798.1  chr22      -    6333      3  incomplete-splice_match   
4  PB.3799.1  chr22      -    6053     21  incomplete-splice_match   

                        associated_gene associated_transcript  ref_length  \
0                    ENSG00000206195.11                 novel      4136.0   
1                    ENSG00000206195.11                 novel      2398.0   
2  ENSG00000100181.22_ENSG00000283633.1                 novel      1019.0   
3                    ENSG00000093100.13     ENST00000476405.1      4328.0   
4                     ENSG00000243156.9     ENST00000441493.7      9447.0   

   ref_exons  ...  seq_A_downstream_TTS  dist_to_CAGE_peak  within_CAGE_peak  \
0        4.0  ...  AAAAAAAAAAAAAAAAG

In [8]:
# Join each BED record to its classification row by matching the BED
# 'name' field against the classification 'isoform' field. This is an
# inner join, so only identifiers present in both frames are kept.
# Columns that exist in both frames (e.g. 'chrom', 'strand') receive the
# default '_x' (BED) / '_y' (classification) suffixes.
merged_df = df_bed.merge(
    df_classification,
    how='inner',
    left_on='name',
    right_on='isoform',
)

# Preview the first five merged records
print(merged_df.head(5))

  chrom_x     start       end         name  score strand_x  thickStart  \
0   chr22  10738828  10739148  PB.118859.1      0        -    10739148   
1   chr22  10941696  10961547   PB.83093.1      0        -    10961547   
2   chr22  11298750  11300438   PB.52878.1      0        +    11300438   
3   chr22  11868469  11974188   PB.96068.1      0        -    11974188   
4   chr22  11908367  11908901  PB.111548.1      0        -    11908901   

   thickEnd      itemRgb  blockCount  ...  seq_A_downstream_TTS  \
0  10739148  233,150,122           1  ...  AAAAAAAAAAAAAATAAAAG   
1  10961547   238,106,80           8  ...  AATTACGTGTGTGTATTCTT   
2  11300438  233,150,122           1  ...  TCAAAAAAAAAAAAAAAAGA   
3  11974188  102,194,164           3  ...  AAAAAAAAAAAAAAAGCAGC   
4  11908901   65,182,196           1  ...  CCAAAAAAAAAAAAAAAAAA   

  dist_to_CAGE_peak within_CAGE_peak dist_to_polyA_site within_polyA_site  \
0               NaN            False                NaN               NaN  

In [11]:
# Drop the 'name' column: after the merge on name == isoform it carries the
# same identifier as 'isoform'.
# errors='ignore' keeps this cell idempotent. Because the result is bound
# back to `merged_df`, a second run would otherwise raise KeyError once
# 'name' has already been removed.
merged_df = merged_df.drop(columns=['name'], errors='ignore')

# Reorder columns to make 'isoform' the first column, leaving the relative
# order of all remaining columns unchanged.
columns = ['isoform'] + [col for col in merged_df.columns if col != 'isoform']
merged_df = merged_df[columns]

# Display the first few rows of the updated dataframe
print(merged_df.head())

       isoform chrom_x     start       end  score strand_x  thickStart  \
0  PB.118859.1   chr22  10738828  10739148      0        -    10739148   
1   PB.83093.1   chr22  10941696  10961547      0        -    10961547   
2   PB.52878.1   chr22  11298750  11300438      0        +    11300438   
3   PB.96068.1   chr22  11868469  11974188      0        -    11974188   
4  PB.111548.1   chr22  11908367  11908901      0        -    11908901   

   thickEnd      itemRgb  blockCount                    blockSizes  \
0  10739148  233,150,122           1                          320,   
1  10961547   238,106,80           8  84,87,115,58,126,70,183,265,   
2  11300438  233,150,122           1                         1688,   
3  11974188  102,194,164           3                   477,59,258,   
4  11908901   65,182,196           1                          534,   

                                blockStarts            annotation novelty  \
0                                        0,            in