In [2]:
import subprocess
import os
import shutil
import re
import pandas as pd
import pyensembl
import math

In [3]:
def run_bedtools_intersect(bed_file1, bed_file2, output_file):
    """Run ``intersectBed -a bed_file1 -b bed_file2 -s -c`` and save its stdout.

    Flags used: ``-s`` (stranded) and ``-c`` (report the number of hits for
    every ``-a`` record).

    Args:
        bed_file1: Path passed to ``intersectBed`` as ``-a``.
        bed_file2: Path passed to ``intersectBed`` as ``-b``.
        output_file: Path that receives the command's standard output. The
            file is created or truncated before the command starts, so it may
            be empty or partial when the command fails.

    Returns:
        None. Success or failure is only reported with ``print``.
    """
    command = ['intersectBed', '-a', bed_file1, '-b', bed_file2, '-s', '-c']
    with open(output_file, 'w') as file:
        process = subprocess.Popen(command, stdout=file, stderr=subprocess.PIPE)
        # stdout goes straight to the file, so only stderr carries data here.
        _, stderr = process.communicate()

    if process.returncode != 0:
        # errors='replace' keeps the report readable even if the tool emits
        # bytes that are not valid UTF-8.
        message = stderr.decode('utf-8', errors='replace')
        print(f"Error running 'bedtools intersect': {message}")
    else:
        print(f"'bedtools intersect' executed successfully. Output saved to {output_file}.")

In [3]:
#  switch the start and end positions whenever the start position is greater than the end position
def switch_start_end(input_bed, output_bed):
    """Copy a BED file, swapping start and end wherever start > end.

    Every data line is stripped, split on tabs, and written back tab-joined
    with a trailing newline. Blank lines and BED header lines (starting with
    ``#``, ``track`` or ``browser``) carry no coordinates and are copied
    through instead of being parsed.

    Args:
        input_bed: Path of the BED file to read.
        output_bed: Path of the BED file to write (created or truncated).
            Must differ from ``input_bed``: the output is opened for writing
            before the input is read.

    Raises:
        IndexError: A data line has fewer than three tab-separated fields.
        ValueError: The start or end field of a data line is not an integer.
    """
    header_prefixes = ('#', 'track', 'browser')
    with open(input_bed, 'r') as input_file, open(output_bed, 'w') as output_file:
        for line in input_file:
            stripped = line.strip()
            if not stripped or stripped.startswith(header_prefixes):
                # Nothing to swap; keep the line so the file layout is preserved.
                output_file.write(stripped + '\n')
                continue
            fields = stripped.split('\t')
            if int(fields[1]) > int(fields[2]):
                # Swap start and end positions
                fields[1], fields[2] = fields[2], fields[1]
            output_file.write('\t'.join(fields) + '\n')


In [17]:
# No need to run this every time: it only regenerates the switchStartEnd file.
# Swap the start and end positions wherever the start position is greater than
# the end position, for the ".filterd.more" Arriba BED of sample TCGA-CZ-5465.
input_bed = "/private/projects/kidney_rtt/bed_arriba_output/TCGA-CZ-5465-11A-01R-1503-07.filterd.more.bed"
output_bed = "/private/projects/kidney_rtt/bed_arriba_output/TCGA-CZ-5465-11A-01R-1503-07.filterd.more.switchStartEnd.bed"
switch_start_end(input_bed, output_bed)

In [19]:
# Intersect the ARTDeco dogs BED (-a) with the switchStartEnd Arriba BED (-b)
# and store the per-record hit counts in `output`.
artdeco_bed = "/home/ls/parshas/5465_results/all_dogs_withnames.bed"
arriba_bed = "/private/projects/kidney_rtt/bed_arriba_output/TCGA-CZ-5465-11A-01R-1503-07.filterd.more.switchStartEnd.bed"
output = "/home/ls/parshas/vscode/5465_processing/5465_intersect_artdeco_arriba.txt"

run_bedtools_intersect(artdeco_bed, arriba_bed, output)

'bedtools intersect' executed successfully. Output saved to /home/ls/parshas/vscode/5464_processing/5465_intersect_artdeco_arriba.txt.


In [4]:
def count_lines_with_zero_last_column(file_path):
    """Count the records of a tab-separated file whose last column is ``0``.

    Blank lines are not records and are ignored. A file without any record
    (for example the empty output left behind by a failed intersect) yields
    ``(0, 0.0)`` instead of a division by zero.

    Args:
        file_path: Path of the tab-separated text file to scan.

    Returns:
        A ``(zero_lines, percentage)`` tuple: the number of records whose last
        tab-separated field is exactly ``'0'``, and that number as a
        percentage (0-100) of all records.
    """
    total_lines = 0
    zero_lines = 0

    with open(file_path, 'r') as file:
        for line in file:
            record = line.strip()
            if not record:
                continue
            total_lines += 1
            if record.split('\t')[-1] == '0':
                zero_lines += 1

    if total_lines == 0:
        return 0, 0.0

    percentage = (zero_lines / total_lines) * 100

    return zero_lines, percentage


In [11]:

# Summarise the artdeco-vs-arriba intersect: records whose hit count (last column) is 0.
zero_count, zero_percentage = count_lines_with_zero_last_column('/home/ls/parshas/vscode/5465_processing/5465_intersect_artdeco_arriba.txt')
print("Number of lines with 0 in the last column:", zero_count)
print("Percentage of lines with 0 in the last column: {:.2f}%".format(zero_percentage))


Number of lines with 0 in the last column: 823
Percentage of lines with 0 in the last column: 43.22%


In [2]:
# Basic-filter Arriba output.

# No need to run this every time: it only regenerates the switchStartEnd file.
# Swap the start and end positions wherever the start position is greater than the end position.
input_bed = "/private/projects/kidney_rtt/bed_arriba_output_basic_filter/TCGA-CZ-5465-11A-01R-1503-07.bed"
output_bed = "/private/projects/kidney_rtt/bed_arriba_output_basic_filter/TCGA-CZ-5465-11A-01R-1503-07_switchStartEnd.bed"
switch_start_end(input_bed, output_bed)

In [8]:
# Intersect the ARTDeco dogs BED (-a) with the basic-filter Arriba BED (-b).
artdeco_bed = "/home/ls/parshas/5465_results/all_dogs_withnames.bed"
arriba_bed = "/private/projects/kidney_rtt/bed_arriba_output_basic_filter/TCGA-CZ-5465-11A-01R-1503-07_switchStartEnd.bed"
output = "/home/ls/parshas/vscode/5465_processing/5465_intersect_artdeco_arriba_basic_filter.txt"

run_bedtools_intersect(artdeco_bed, arriba_bed, output)

'bedtools intersect' executed successfully. Output saved to /home/ls/parshas/vscode/5464_processing/5465_intersect_artdeco_arriba_basic_filter.txt.


In [12]:
# Summarise the basic-filter intersect: records whose hit count (last column) is 0.
zero_count, zero_percentage = count_lines_with_zero_last_column('/home/ls/parshas/vscode/5465_processing/5465_intersect_artdeco_arriba_basic_filter.txt')
print("Number of lines with 0 in the last column:", zero_count)
print("Percentage of lines with 0 in the last column: {:.2f}%".format(zero_percentage))


Number of lines with 0 in the last column: 797
Percentage of lines with 0 in the last column: 41.86%


In [5]:
# Unfiltered (no_filter) Arriba output: fix start/end order, intersect the
# ARTDeco dogs BED (-a) with it (-b), then report how many records have 0 hits.
input_bed = "/private/projects/kidney_rtt/no_filter_arriba_bed/TCGA-CZ-5465-11A-01R-1503-07.bed"
output_bed = "/private/projects/kidney_rtt/no_filter_arriba_bed/TCGA-CZ-5465-11A-01R-1503-07_switchStartEnd.bed"
switch_start_end(input_bed, output_bed)
artdeco_bed = "/home/ls/parshas/5465_results/all_dogs_withnames.bed"
arriba_bed = "/private/projects/kidney_rtt/no_filter_arriba_bed/TCGA-CZ-5465-11A-01R-1503-07_switchStartEnd.bed"
output = "/home/ls/parshas/vscode/5465_processing/5465_intersect_artdeco_arriba_no_filter.txt"
run_bedtools_intersect(artdeco_bed, arriba_bed, output)
# The path below is the same file as `output`.
zero_count, zero_percentage = count_lines_with_zero_last_column('/home/ls/parshas/vscode/5465_processing/5465_intersect_artdeco_arriba_no_filter.txt')
print("Number of lines with 0 in the last column:", zero_count)
print("Percentage of lines with 0 in the last column: {:.2f}%".format(zero_percentage))

'bedtools intersect' executed successfully. Output saved to /home/ls/parshas/vscode/5464_processing/5465_intersect_artdeco_arriba_no_filter.txt.
Number of lines with 0 in the last column: 0
Percentage of lines with 0 in the last column: 0.00%


In [6]:
# Intersect the eLife BED (-a) with the basic-filter Arriba BED (-b) and report
# how many eLife records have 0 hits.
# NOTE(review): the eLife file name says "4565" while the sample is 5465 - confirm this is the intended file.
elife_bed = "/home/ls/parshas/vscode/5465_processing/elife4565_hg38.bed"
arriba_bed = "/private/projects/kidney_rtt/bed_arriba_output_basic_filter/TCGA-CZ-5465-11A-01R-1503-07_switchStartEnd.bed"
output = "/home/ls/parshas/vscode/5465_processing/5465_intersect_elife_arriba.txt"
run_bedtools_intersect(elife_bed, arriba_bed, output)
# The path below is the same file as `output`.
zero_count, zero_percentage = count_lines_with_zero_last_column('/home/ls/parshas/vscode/5465_processing/5465_intersect_elife_arriba.txt')
print("Number of lines with 0 in the last column:", zero_count)
print("Percentage of lines with 0 in the last column: {:.2f}%".format(zero_percentage))

'bedtools intersect' executed successfully. Output saved to /home/ls/parshas/vscode/5464_processing/5465_intersect_elife_arriba.txt.
Number of lines with 0 in the last column: 277
Percentage of lines with 0 in the last column: 46.24%


In [7]:
# Intersect the eLife BED (-a) with the ARTDeco dogs BED (-b) and report how
# many eLife records have 0 hits.
# NOTE(review): the saved output of this cell reports 100% of lines with 0 hits;
# that may point to a strand or chromosome-naming mismatch between the two files - verify.
elife_bed = "/home/ls/parshas/vscode/5465_processing/elife4565_hg38.bed"
artdeco_bed = "/home/ls/parshas/5465_results/all_dogs_withnames.bed"
output = "/home/ls/parshas/vscode/5465_processing/5465_intersect_elife_artdeco.txt"
run_bedtools_intersect(elife_bed, artdeco_bed, output)
# The path below is the same file as `output`.
zero_count, zero_percentage = count_lines_with_zero_last_column('/home/ls/parshas/vscode/5465_processing/5465_intersect_elife_artdeco.txt')
print("Number of lines with 0 in the last column:", zero_count)
print("Percentage of lines with 0 in the last column: {:.2f}%".format(zero_percentage))

'bedtools intersect' executed successfully. Output saved to /home/ls/parshas/vscode/5464_processing/5465_intersect_elife_artdeco.txt.
Number of lines with 0 in the last column: 599
Percentage of lines with 0 in the last column: 100.00%


In [8]:
# basic2_filter Arriba output: fix start/end order, intersect the ARTDeco dogs
# BED (-a) with it (-b), then report how many records have 0 hits.
# The filters are listed in /private/projects/kidney_rtt/all_plots/python_arriba/basic2_filters.txt
input_bed = "/private/projects/kidney_rtt/bed_arriba_output_basic2_filter/TCGA-CZ-5465-11A-01R-1503-07.bed"
output_bed = "/private/projects/kidney_rtt/bed_arriba_output_basic2_filter/TCGA-CZ-5465-11A-01R-1503-07_switchStartEnd.bed"
switch_start_end(input_bed, output_bed)
artdeco_bed = "/home/ls/parshas/5465_results/all_dogs_withnames.bed"
arriba_bed = "/private/projects/kidney_rtt/bed_arriba_output_basic2_filter/TCGA-CZ-5465-11A-01R-1503-07_switchStartEnd.bed"
output = "/home/ls/parshas/vscode/5465_processing/5465_intersect_artdeco_arriba_basic2_filter.txt"
run_bedtools_intersect(artdeco_bed, arriba_bed, output)
# The path below is the same file as `output`.
zero_count, zero_percentage = count_lines_with_zero_last_column('/home/ls/parshas/vscode/5465_processing/5465_intersect_artdeco_arriba_basic2_filter.txt')
print("Number of lines with 0 in the last column:", zero_count)
print("Percentage of lines with 0 in the last column: {:.2f}%".format(zero_percentage))

'bedtools intersect' executed successfully. Output saved to /home/ls/parshas/vscode/5464_processing/5465_intersect_artdeco_arriba_basic2_filter.txt.
Number of lines with 0 in the last column: 390
Percentage of lines with 0 in the last column: 20.48%


In [5]:
# merge_dup_filter: intersect the ARTDeco dogs BED (-a) with the merged Arriba
# BED new2.bed (-b) and report the share of dogs records with and without hits.
artdeco_bed = "/home/ls/parshas/5465_results/all_dogs_withnames.bed"
arriba_bed = "/private/projects/kidney_rtt/merge_gene1_bed/new2.bed"
output = "/home/ls/parshas/vscode/5465_processing/5465_intersect_artdeco_arriba_merge_filter.txt"
run_bedtools_intersect(artdeco_bed, arriba_bed, output)
zero_count, zero_percentage = count_lines_with_zero_last_column(output)
print("Number of lines with 0 in the last column:", zero_count)
print("Percentage of lines with 0 in the last column: {:.2f}%".format(zero_percentage))
print("Percentage of lines without 0 in the last column:\t{:.2f}%\n\n".format(100 - zero_percentage))


'bedtools intersect' executed successfully. Output saved to /home/ls/parshas/vscode/5465_processing/5465_intersect_artdeco_arriba_merge_filter.txt.
Number of lines with 0 in the last column: 411
Percentage of lines with 0 in the last column: 21.59%
Percentage of lines without 0 in the last column:	78.41%




In [6]:
# merge_dup_filter, reversed direction: the merged Arriba BED new2.bed is -a and
# the ARTDeco dogs BED is -b (Arriba vs ARTDeco, not ARTDeco vs Arriba), so the
# counts below describe Arriba records.
artdeco_bed = "/home/ls/parshas/5465_results/all_dogs_withnames.bed"
arriba_bed = "/private/projects/kidney_rtt/merge_gene1_bed/new2.bed"
output = "/home/ls/parshas/vscode/5465_processing/5465_intersect_arriba_merge_vs_artdeco__filter.txt"
run_bedtools_intersect(arriba_bed, artdeco_bed , output)
zero_count, zero_percentage = count_lines_with_zero_last_column(output)
print("Number of lines with 0 in the last column:", zero_count)
print("Percentage of lines with 0 in the last column: {:.2f}%".format(zero_percentage))
print("Percentage of lines without 0 in the last column:\t{:.2f}%\n\n".format(100 - zero_percentage))


'bedtools intersect' executed successfully. Output saved to /home/ls/parshas/vscode/5465_processing/5465_intersect_arriba_merge_vs_artdeco__filter.txt.
Number of lines with 0 in the last column: 18610
Percentage of lines with 0 in the last column: 87.46%
Percentage of lines without 0 in the last column:	12.54%




In [5]:
# merge_dup with added info (new3.bed), reversed direction: the merged Arriba
# BED is -a and the ARTDeco dogs BED is -b (Arriba vs ARTDeco, not ARTDeco vs
# Arriba), so the counts below describe Arriba records.
artdeco_bed = "/home/ls/parshas/5465_results/all_dogs_withnames.bed"
arriba_bed = "/private/projects/kidney_rtt/merge_gene1_bed/new3.bed"
output = "/home/ls/parshas/vscode/5465_processing/5465_intersect_arriba_merge_vs_artdeco_add_info.txt"
run_bedtools_intersect(arriba_bed, artdeco_bed , output)
zero_count, zero_percentage = count_lines_with_zero_last_column(output)
print("Number of lines with 0 in the last column:", zero_count)
print("Percentage of lines with 0 in the last column: {:.2f}%".format(zero_percentage))
print("Percentage of lines without 0 in the last column:\t{:.2f}%\n\n".format(100 - zero_percentage))



'bedtools intersect' executed successfully. Output saved to /home/ls/parshas/vscode/5465_processing/5465_intersect_arriba_merge_vs_artdeco_add_info.txt.
Number of lines with 0 in the last column: 18610
Percentage of lines with 0 in the last column: 87.46%
Percentage of lines without 0 in the last column:	12.54%




In [9]:
# filter_3
# Arriba filtering - trying to find fusions that come from readthrough:
# - Fusions between different chromosomes are filtered out.
# - Fusions whose type contains translocation or inversion are filtered out.
# - The distance between the breakpoints is calculated; distances greater than 0.5kb are filtered out.
# - Fusions that come from the mitochondrial chromosome are filtered out.
# - Fusions that exist in STAR-Fusion on GTEx are filtered out.
# - Fusions that come from the same gene where site1 and site2 are intergenic are filtered out.
# - Fusions where site1 or site2 is an intron are filtered out.
# - Fusions whose filters contain 'blacklist' are filtered out.

# Fix start/end order, intersect the ARTDeco dogs BED (-a) with the filtered
# Arriba BED (-b), then report how many dogs records have 0 hits.
input_bed = "/private/projects/kidney_rtt/new/TCGA-CZ-5465-11A-01R-1503-07.bed"
output_bed = "/private/projects/kidney_rtt/new/TCGA-CZ-5465-11A-01R-1503-07_switchStartEnd.bed"
switch_start_end(input_bed, output_bed)
artdeco_bed = "/home/ls/parshas/5465_results/all_dogs_withnames.bed"
arriba_bed = "/private/projects/kidney_rtt/new/TCGA-CZ-5465-11A-01R-1503-07_switchStartEnd.bed"
output = "/home/ls/parshas/vscode/5465_processing/5465_intersect_artdeco_arriba_filter_3.txt"
run_bedtools_intersect(artdeco_bed, arriba_bed, output)
# The path below is the same file as `output`.
zero_count, zero_percentage = count_lines_with_zero_last_column('/home/ls/parshas/vscode/5465_processing/5465_intersect_artdeco_arriba_filter_3.txt')
print("Number of lines with 0 in the last column:", zero_count)
print("Percentage of lines with 0 in the last column: {:.2f}%".format(zero_percentage))

'bedtools intersect' executed successfully. Output saved to /home/ls/parshas/vscode/5464_processing/5465_intersect_artdeco_arriba_filter_3.txt.
Number of lines with 0 in the last column: 368
Percentage of lines with 0 in the last column: 19.33%


In [None]:
# move all the dogs files to new directory

# Function to loop over files in directory and subdirectories
def loop_files(directory, new_directory):
    """Copy every ``*.dogs.bed`` file found under ``directory`` into ``new_directory``.

    The tree is walked recursively and files are copied flat, so two files
    with the same name in different subdirectories overwrite each other in
    ``new_directory``. Every visited file name is printed, and for each copied
    file the source and destination paths are printed as well.

    Args:
        directory: Root of the tree to search.
        new_directory: Destination directory; created if it does not exist.

    Returns:
        None.
    """
    # Without this, shutil.copy fails when the destination folder is missing.
    os.makedirs(new_directory, exist_ok=True)

    # Loop over files in directory and subdirectories
    for root, _, files in os.walk(directory):
        for filename in files:
            print(filename)
            if filename.endswith(".dogs.bed"):
                file_path = os.path.join(root, filename)
                new_file_path = os.path.join(new_directory, filename)
                if os.path.abspath(file_path) == os.path.abspath(new_file_path):
                    # The destination lies inside the walked tree; the file is
                    # already in place and copying it onto itself would raise.
                    continue
                print(file_path)
                print(new_file_path)
                # Copy the file to the new directory
                shutil.copy(file_path, new_file_path)


# Collect the .dogs.bed files of both run directories into one flat folder.
loop_files("/private/projects/kidney_rtt/tcga_output", "/private/projects/kidney_rtt/dogs_all_samples")
loop_files("/private/projects/kidney_rtt/artdeco_run", "/private/projects/kidney_rtt/dogs_all_samples")


In [19]:
# Function to find files containing a specific number in their names
def find_files_with_number(directory, number):
    """Return the paths of the entries in ``directory`` whose name contains ``number``.

    The match is a case-insensitive, literal substring search: ``number`` is
    escaped, so characters such as ``.`` or ``+`` are not treated as regular
    expression syntax. Only the top level of ``directory`` is inspected.

    Note that a short identifier can also occur inside an unrelated part of a
    file name, so callers that take the first hit should check it.

    Args:
        directory: Directory whose entries are listed.
        number: Identifier to look for (converted with ``str``).

    Returns:
        A list of ``os.path.join(directory, name)`` paths, sorted by file name
        so that the result (and any ``[0]`` pick) is deterministic.
    """
    pattern = re.compile(re.escape(str(number)), re.IGNORECASE)

    return [
        os.path.join(directory, filename)
        for filename in sorted(os.listdir(directory))
        if pattern.search(filename)
    ]

In [25]:
# For every dogs BED file, find the matching Arriba BED file, intersect the two
# and append the zero-hit statistics of the result to a summary file.

# Directories holding the Arriba BED files and the collected dogs BED files.
directory_arriba = '/private/projects/kidney_rtt/filters_arriba_bed'
directory_artdeco = "/private/projects/kidney_rtt/dogs_all_samples"
# Sample identifiers for which no Arriba file was found.
samples_not_found_in_arriba = []
# Loop over files in the directory
for filename in os.listdir(directory_artdeco):
    if filename.endswith(".bed"):  # skip anything that is not a BED file
        file_path_artdeco = os.path.join(directory_artdeco, filename)
        artdeco_bed = file_path_artdeco
        # The sample identifier is the third dash-separated field of the file
        # name, e.g. "A8L7" in "TCGA-G6-A8L7-01A-11R-A37O-07.dogs.bed".
        split_list = filename.split("-")
        number_sample = split_list[2]
        print("number sample : ", number_sample)
        # Look up the Arriba BED file(s) whose name contains that identifier.
        arriba_bed_files = find_files_with_number(directory_arriba, number_sample)
        print(arriba_bed_files)
        if(len(arriba_bed_files)==0):
            samples_not_found_in_arriba.append(number_sample)
            continue
        # Only the first match is used.
        arriba_bed = arriba_bed_files[0]

        new_filename = f"intersect_{filename}"
        output_path = os.path.join("/private/projects/kidney_rtt/intersect_all_samples", new_filename)

        # dogs BED is -a, Arriba BED is -b.
        run_bedtools_intersect(artdeco_bed, arriba_bed, output_path)
        zero_count, zero_percentage = count_lines_with_zero_last_column(output_path)

        # The summary is opened in append mode, so re-running this cell adds
        # the same samples to the file again.
        summary_path = "/private/projects/kidney_rtt/intersect_all_samples/summary_intersect.txt"
        with open(summary_path, 'a') as sammary:
            sammary.write("sample: "+str(number_sample)+"\n")
            sammary.write("Number of lines with 0 in the last column:\t" + str(zero_count) + "\n")
            sammary.write("Percentage of lines with 0 in the last column:\t{:.2f}%\n".format(zero_percentage))
            sammary.write("Percentage of lines without 0 in the last column:\t{:.2f}%\n\n".format(100 - zero_percentage))
print("samples not found in arriba: ", samples_not_found_in_arriba)





number sample :  A8L7
['/private/projects/kidney_rtt/filters_arriba_bed/TCGA-G6-A8L7-01A-11R-A37O-07.bed']
'bedtools intersect' executed successfully. Output saved to /private/projects/kidney_rtt/intersect_all_samples/intersect_TCGA-G6-A8L7-01A-11R-A37O-07.dogs.bed.
number sample :  3778
['/private/projects/kidney_rtt/filters_arriba_bed/TCGA-AS-3778-01A-01R-A32Z-07.bed']
'bedtools intersect' executed successfully. Output saved to /private/projects/kidney_rtt/intersect_all_samples/intersect_TCGA-AS-3778-01A-01R-A32Z-07.dogs.bed.
number sample :  A8OX
['/private/projects/kidney_rtt/filters_arriba_bed/TCGA-A3-A8OX-01A-11R-A37O-07.bed']
'bedtools intersect' executed successfully. Output saved to /private/projects/kidney_rtt/intersect_all_samples/intersect_TCGA-A3-A8OX-01A-11R-A37O-07.dogs.bed.
number sample :  3372
['/private/projects/kidney_rtt/filters_arriba_bed/TCGA-A3-3372-01A-02R-1325-07.bed']
'bedtools intersect' executed successfully. Output saved to /private/projects/kidney_rtt/int

In [8]:
# Final Arriba filter (as of 21.6.23) against the filtered readthrough BED of sample 5465:
# the readthrough BED is -a, the filtered Arriba BED is -b.
artdeco_bed = "/home/ls/parshas/vscode/5465_processing/for_filter_by_fpkm/readthrough_filtered.bed"
arriba_bed = "/private/projects/kidney_rtt/filters_arriba_bed/TCGA-CZ-5465-11A-01R-1503-07.bed"
output = "/home/ls/parshas/vscode/5465_processing/5465_intersect_readthrough_filter_arriba_filter.txt"
run_bedtools_intersect(artdeco_bed, arriba_bed, output)
zero_count, zero_percentage = count_lines_with_zero_last_column(output)
print("Number of lines with 0 in the last column:", zero_count)
print("Percentage of lines with 0 in the last column: {:.2f}%".format(zero_percentage))
print("Percentage of lines without 0 in the last column: {:.2f}%".format(100-zero_percentage))

'bedtools intersect' executed successfully. Output saved to /home/ls/parshas/vscode/5465_processing/5465_intersect_readthrough_filter_arriba_filter.txt.
Number of lines with 0 in the last column: 2325
Percentage of lines with 0 in the last column: 50.92%
Percentage of lines without 0 in the last column: 49.08%


In [9]:
# Final Arriba filter (as of 21.6.23) against the filtered read-in BED of sample 5465:
# the read-in BED is -a, the filtered Arriba BED is -b.
artdeco_bed = "/home/ls/parshas/vscode/5465_processing/for_filter_by_fpkm/read_in_filtered.bed"
arriba_bed = "/private/projects/kidney_rtt/filters_arriba_bed/TCGA-CZ-5465-11A-01R-1503-07.bed"
output = "/home/ls/parshas/vscode/5465_processing/5465_intersect_read_in_filter_arriba_filter.txt"
run_bedtools_intersect(artdeco_bed, arriba_bed, output)
zero_count, zero_percentage = count_lines_with_zero_last_column(output)
print("Number of lines with 0 in the last column:", zero_count)
print("Percentage of lines with 0 in the last column: {:.2f}%".format(zero_percentage))
print("Percentage of lines without 0 in the last column: {:.2f}%".format(100-zero_percentage))

'bedtools intersect' executed successfully. Output saved to /home/ls/parshas/vscode/5465_processing/5465_intersect_read_in_filter_arriba_filter.txt.
Number of lines with 0 in the last column: 2627
Percentage of lines with 0 in the last column: 42.12%
Percentage of lines without 0 in the last column: 57.88%


In [5]:
def find_rows_with_gene_id(bed_file, gene_id):
    """Return the first row of ``bed_file`` whose 4th column equals ``gene_id``.

    Rows are split on tabs without stripping, so the last element of the
    returned list still carries the line's trailing newline. The 4th column is
    compared without its line ending, so files with exactly four columns match
    as well.

    Args:
        bed_file: Path of the tab-separated BED file to search.
        gene_id: Value to look for in the 4th column.

    Returns:
        The list of column strings of the first matching row.

    Raises:
        IndexError: No row of ``bed_file`` has ``gene_id`` in its 4th column.
    """
    matching_rows = []

    with open(bed_file, 'r') as file:
        for line in file:
            columns = line.split('\t')
            if len(columns) >= 4 and columns[3].rstrip('\r\n') == gene_id:
                matching_rows.append(columns)

    if not matching_rows:
        # Same exception type as the bare `matching_rows[0]` lookup used to
        # raise, but with a message that names the gene and the file.
        raise IndexError(f"gene id {gene_id!r} not found in {bed_file}")
    if len(matching_rows) > 1:
        print(f"found more than one line with gene id {gene_id!r} in {bed_file}; using the first")
    return matching_rows[0]

In [6]:
def readthrough_readin_filter_and_to_bed(txt_file, readthrough_or_readin_bed_file):
    """Write one filtered BED file per sample from a readthrough / read-in table.

    ``txt_file`` is read as a tab-separated table. The first column holds the
    gene id that is looked up both in the GTF (for the gene name) and in
    ``readthrough_or_readin_bed_file`` (for the coordinates); from column 2
    onward every sample owns four consecutive columns. Within a sample's group
    the first three columns are the gene count, the gene FPKM and the
    read-in/readthrough count; a row is skipped for that sample when any of
    the three is 0.
    NOTE(review): the fourth column is used as a log2 offset relative to the
    gene FPKM (``fpkm = 2 ** (value + log2(gene FPKM))``) - confirm its meaning.

    One file named ``<sample>.readthrough.bed`` is written per sample into
    /private/projects/kidney_rtt/readthrough_all_samples/ with the columns
    chromosome, start, end, gene name, computed FPKM, strand. Each record ends
    with exactly one newline, regardless of how many columns the source BED
    row has.

    Args:
        txt_file: Path of the tab-separated readthrough / read-in table.
        readthrough_or_readin_bed_file: Path of the BED file that provides the
            coordinates and strand for each gene id.

    Returns:
        None.
    """
    data = pyensembl.Genome(reference_name='GRCh38', annotation_name='my_genome_features',
                            gtf_path_or_url='/data01/private/resources/gencode.v34.GRCh38.annotation.gtf')
    data.index()  # parse GTF and construct database of genomic features
    df = pd.read_csv(txt_file, sep='\t')
    # count number of samples in this result
    number_of_samples = int((df.shape[1] - 2)/4)
    # create bed file for every sample
    column_list = df.columns.tolist()
    for i in range(number_of_samples):
        first_column = (i * 4) + 2  # position of this sample's first column
        # The sample name is the header text before the first space.
        sample_name = column_list[first_column].split(" ")[0]
        output_name = str(sample_name+".readthrough.bed")
        outputpath = os.path.join("/private/projects/kidney_rtt/readthrough_all_samples/",output_name)
        # Open the file for writing
        with open(outputpath, 'w') as bed_file:
            # Iterate over each row of the dataframe by position
            for index in range(len(df)):
                gene_count = df.iloc[index, first_column]
                gene_fpkm = df.iloc[index, first_column + 1]
                event_count = df.iloc[index, first_column + 2]
                # filter gene count = 0 or gene FPKM = 0 or read_in/readthrough count = 0
                if gene_count == 0 or gene_fpkm == 0 or event_count == 0:
                    continue
                ens_id = df.iloc[index, 0]
                name = data.gene_name_of_gene_id(ens_id)
                bed_info = find_rows_with_gene_id(readthrough_or_readin_bed_file, ens_id)
                # calculate readthrough / read-in FPKM
                z = df.iloc[index, first_column + 3]
                fpkm = 2 ** (z + math.log2(gene_fpkm))
                # Write the chromosome name, start position, end position, gene name,
                # score and strand information to the BED file.
                # bed_info[5] may or may not carry the source line's newline, so
                # strip it and terminate the record explicitly.
                bed_file.write('\t'.join([
                    str(bed_info[0]),
                    str(bed_info[1]),
                    str(bed_info[2]),
                    str(name),
                    str(fpkm), # readthrough / read-in FPKM
                    str(bed_info[5]).strip()
                ]) + '\n')


In [16]:
# Scratch checks - this cell should be deleted.
# It prints, for each sample in one readthrough.txt table, the derived names and
# the values of row 1 that readthrough_readin_filter_and_to_bed would use.
txt_file = "/private/projects/kidney_rtt/tcga_output/output_10/readthrough/readthrough.txt" # overwritten by the next line
txt_file = "/private/projects/kidney_rtt/tcga_5465_output_one_sample_running_before_downsample/readthrough/readthrough.txt"
df = pd.read_csv(txt_file, sep='\t')
# count number of samples in this result
number_of_samples = int((df.shape[1] - 2)/4)
print(number_of_samples)
column_list = df.columns.tolist()
for i in range(number_of_samples):
    # The sample name is the header text before the first space.
    sample_column = column_list[(i * 4) + 2]
    split_list = sample_column.split(" ")
    sample_name = split_list[0]
    output_name = str(sample_name+".readthrough.bed")
    outputpath = os.path.join("/private/projects/kidney_rtt/readthrough_all_samples/",output_name)
    print(sample_column)
    print(sample_name)
    print(output_name)
    print(outputpath)
    # First three columns of this sample's group, taken from row 1.
    print(df.iloc[1, (i * 4) + 2])
    print(df.iloc[1, (i * 4) + 3])
    print(df.iloc[1, (i * 4) + 4])
    # calculate readthrough / read-in FPKM
    z = df.iloc[1, (i * 4) + 5]
    y = df.iloc[1, (i * 4) + 3]
    fpkm = 2 ** (z + math.log2(y))
    print("FPKM, ", fpkm, "\n")

1
TCGA_CZ_5465_11A_01R_1503_07_half Gene Count
TCGA_CZ_5465_11A_01R_1503_07_half
TCGA_CZ_5465_11A_01R_1503_07_half.readthrough.bed
/private/projects/kidney_rtt/readthrough_all_samples/TCGA_CZ_5465_11A_01R_1503_07_half.readthrough.bed
40.2106270943035
0.335
0.0
FPKM,  0.008128971180987115 



In [10]:

# Find all the readthrough.txt files, create BED files from them and put them in one directory.

# NOTE(review): these two names are not used anywhere else in this cell.
directory_arriba = '/private/projects/kidney_rtt/filters_arriba_bed'
samples_not_found_in_arriba = []
# Function to loop over files in directory and subdirectories
def loop_files(directory):
    """Convert every ``readthrough.txt`` found under ``directory`` to BED files.

    For each file named exactly ``readthrough.txt`` the matching
    ``preprocess_files/readthrough.bed`` is looked up next to the folder that
    contains the text file (i.e. under that folder's parent). The three paths
    are printed and then handed to ``readthrough_readin_filter_and_to_bed``.
    """
    for current_dir, _subdirs, names in os.walk(directory):
        for name in names:
            if name != "readthrough.txt":
                continue
            txt_path = os.path.join(current_dir, name)
            # The BED file lives beside the folder holding readthrough.txt.
            run_dir = os.path.dirname(current_dir)
            bed_path = os.path.join(run_dir, "preprocess_files/readthrough.bed")
            for shown in (txt_path, run_dir):
                print(shown)
            print(bed_path, "\n\n")
            readthrough_readin_filter_and_to_bed(txt_path, bed_path)
                


# Process the readthrough.txt files of both run directories.
loop_files("/private/projects/kidney_rtt/tcga_output")
loop_files("/private/projects/kidney_rtt/artdeco_run")

/private/projects/kidney_rtt/tcga_output/small_files/output_1/readthrough/readthrough.txt
/private/projects/kidney_rtt/tcga_output/small_files/output_1
/private/projects/kidney_rtt/tcga_output/small_files/output_1/preprocess_files/readthrough.bed 


/private/projects/kidney_rtt/tcga_output/small_files/output_2/readthrough/readthrough.txt
/private/projects/kidney_rtt/tcga_output/small_files/output_2
/private/projects/kidney_rtt/tcga_output/small_files/output_2/preprocess_files/readthrough.bed 


/private/projects/kidney_rtt/tcga_output/small_files/output_3/readthrough/readthrough.txt
/private/projects/kidney_rtt/tcga_output/small_files/output_3
/private/projects/kidney_rtt/tcga_output/small_files/output_3/preprocess_files/readthrough.bed 


/private/projects/kidney_rtt/tcga_output/small_files/output_4/readthrough/readthrough.txt
/private/projects/kidney_rtt/tcga_output/small_files/output_4
/private/projects/kidney_rtt/tcga_output/small_files/output_4/preprocess_files/readthrough.bed 




In [8]:

# Count how many samples there are.
# NOTE(review): these two names are not used anywhere else in this cell.
directory_arriba = '/private/projects/kidney_rtt/filters_arriba_bed'
samples_not_found_in_arriba = []
# Function to loop over files in directory and subdirectories

def loop_files(directory):
    """Print every ``*.dogs.bed`` file under ``directory`` and a sample total.

    Each matching file is read with pandas as a tab-separated table (its first
    line is consumed as the header) and contributes
    ``int((number_of_columns - 2) / 4)`` to the total, which is printed at the
    end as ``count  <total>``.
    NOTE(review): the ``(columns - 2) / 4`` formula is the one used for the
    readthrough.txt tables; confirm it is also what is wanted for .dogs.bed files.

    Args:
        directory: Root of the tree to search recursively.

    Returns:
        None.
    """
    count = 0
    # Loop over files in directory and subdirectories
    for root, _, files in os.walk(directory):
        for filename in files:
            if filename.endswith(".dogs.bed"):
                file_path = os.path.join(root, filename)
                df = pd.read_csv(file_path, sep='\t')
                print(file_path)
                # count number of samples in this result
                number_of_samples = int((df.shape[1] - 2)/4)
                count += number_of_samples
    print("count ", count)
                


# Run the count separately for each of the three directories that hold .dogs.bed files.
loop_files("/private/projects/kidney_rtt/tcga_output")
loop_files("/private/projects/kidney_rtt/artdeco_run")
loop_files("/private/projects/kidney_rtt/dogs/only_dogs")

/private/projects/kidney_rtt/tcga_output/small_files/output_1/dogs/TCGA-G6-A8L7-01A-11R-A37O-07.dogs.bed
/private/projects/kidney_rtt/tcga_output/small_files/output_1/dogs/TCGA-AS-3778-01A-01R-A32Z-07.dogs.bed
/private/projects/kidney_rtt/tcga_output/small_files/output_1/dogs/TCGA-A3-A8OX-01A-11R-A37O-07.dogs.bed
/private/projects/kidney_rtt/tcga_output/small_files/output_1/dogs/TCGA-A3-3372-01A-02R-1325-07.dogs.bed
/private/projects/kidney_rtt/tcga_output/small_files/output_1/dogs/TCGA-A3-3326-01A-01R-0864-07.dogs.bed
/private/projects/kidney_rtt/tcga_output/small_files/output_1/dogs/TCGA-A3-3320-01A-02R-1325-07.dogs.bed
/private/projects/kidney_rtt/tcga_output/small_files/output_1/dogs/TCGA-A3-3323-01A-02R-1325-07.dogs.bed
/private/projects/kidney_rtt/tcga_output/small_files/output_1/dogs/TCGA-A3-3387-11A-01R-1541-07.dogs.bed
/private/projects/kidney_rtt/tcga_output/small_files/output_1/dogs/TCGA-A3-3322-01A-02R-1325-07.dogs.bed
/private/projects/kidney_rtt/tcga_output/small_files/ou

In [None]:
# For every dogs BED file, find the matching Arriba BED file, intersect the two
# and append the zero-hit statistics of the result to a summary file.
# NOTE(review): this cell repeats the per-sample intersect loop that already
# appears earlier in the notebook; consider keeping only one copy.

# Directories holding the Arriba BED files and the collected dogs BED files.
directory_arriba = '/private/projects/kidney_rtt/filters_arriba_bed'
directory_artdeco = "/private/projects/kidney_rtt/dogs_all_samples"
# Sample identifiers for which no Arriba file was found.
samples_not_found_in_arriba = []
# Loop over files in the directory
for filename in os.listdir(directory_artdeco):
    if filename.endswith(".bed"):  # skip anything that is not a BED file
        file_path_artdeco = os.path.join(directory_artdeco, filename)
        artdeco_bed = file_path_artdeco
        # The sample identifier is the third dash-separated field of the file name.
        split_list = filename.split("-")
        number_sample = split_list[2]
        print("number sample : ", number_sample)
        # Look up the Arriba BED file(s) whose name contains that identifier.
        arriba_bed_files = find_files_with_number(directory_arriba, number_sample)
        print(arriba_bed_files)
        if(len(arriba_bed_files)==0):
            samples_not_found_in_arriba.append(number_sample)
            continue
        # Only the first match is used.
        arriba_bed = arriba_bed_files[0]

        new_filename = f"intersect_{filename}"
        output_path = os.path.join("/private/projects/kidney_rtt/intersect_all_samples", new_filename)

        # dogs BED is -a, Arriba BED is -b.
        run_bedtools_intersect(artdeco_bed, arriba_bed, output_path)
        zero_count, zero_percentage = count_lines_with_zero_last_column(output_path)

        # The summary is opened in append mode, so re-running this cell adds
        # the same samples to the file again.
        summary_path = "/private/projects/kidney_rtt/intersect_all_samples/summary_intersect.txt"
        with open(summary_path, 'a') as sammary:
            sammary.write("sample: "+str(number_sample)+"\n")
            sammary.write("Number of lines with 0 in the last column:\t" + str(zero_count) + "\n")
            sammary.write("Percentage of lines with 0 in the last column:\t{:.2f}%\n".format(zero_percentage))
            sammary.write("Percentage of lines without 0 in the last column:\t{:.2f}%\n\n".format(100 - zero_percentage))
print("samples not found in arriba: ", samples_not_found_in_arriba)



