# Create .bedgraph and chrom.sizes files from .bed file.

Here we read, filter and categorize the fragments from a .bed file and create .bedgraph and chrom.sizes files, which can subsequently be used to create bigWig files.

### Imports

In [None]:
import csv
import numpy as np
import pandas as pd
import pyBigWig as pbw

### File Paths

In [None]:
# All directory constants are joined to file names with plain string
# concatenation further down, so each one must end with a path separator.

# Directory that contains the input .bed file.
BED_FILE_PATH ="/home/stud5/testfiles/fragments/"
# Directory the .bedgraph files are written to.
# NOTE: the name is spelled "BEDGRAH" (missing "P"); the cells that write the
# .bedgraph files reference it under exactly this spelling.
BEDGRAH_FILE_PATH="/home/stud5/testfiles/fragments/"
# Directory the chrom.sizes file is written to.
CHROM_SIZES_PATH="/home/stud5/testfiles/fragments/"
# Name of the input .bed file; the part before the first "." is reused as the
# stem of every .bedgraph output file name.
BED_FILE_NAME ="stomach_SM-JF1O3_rep1_fragments.bed"

### Read file into a DataFrame

In [None]:
# Open the input .bed file for reading (text mode).
# The handle is deliberately left open here: the following cell iterates over it.
bed_file = open(f"{BED_FILE_PATH}{BED_FILE_NAME}", mode="r")

In [None]:
# Parse the .bed file into a list of 6-field records:
# [chromosome, start, end, name, score, strand] (all still strings here).
frag_list = []
# Using the already opened handle as a context manager guarantees that the
# file is closed once all lines are consumed, even if parsing fails.
with bed_file:
    for line_number, line in enumerate(bed_file, start=1):
        # split() without arguments ignores leading/trailing whitespace.
        fields = line.split()
        if not fields:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            continue
        if len(fields) != 6:
            raise ValueError(
                f"line {line_number}: expected 6 whitespace-separated fields, "
                f"got {len(fields)}"
            )
        frag_list.append(fields)

In [None]:
# Build a DataFrame with one row per fragment.
# The column names correspond, in order, to the six fields parsed from each line.
fragment_columns = ['Chromosome', 'Start', 'End', 'Name', 'Score', 'Strand']
frag_df = pd.DataFrame(frag_list, columns=fragment_columns)

### Set Score and data type

In [None]:
# Every fragment gets a score of 1, so that the score can later serve as a
# fragment counter when overlapping fragments are merged.
frag_df['Score'] = np.full(len(frag_df), 1)

# The coordinates were read as strings; convert them to the smallest integer
# dtype that can hold the values.
for coordinate in ('Start', 'End'):
    frag_df[coordinate] = pd.to_numeric(frag_df[coordinate], downcast='integer')

# Store the score as the smallest float dtype that can hold the values.
frag_df['Score'] = pd.to_numeric(frag_df['Score'], downcast='float')

In [None]:
# Order the fragments by chromosome first and by end position second.
# 'Chromosome' holds strings, so chromosomes are ordered lexicographically.
sort_keys = ['Chromosome', 'End']
frag_df_sorted = frag_df.sort_values(sort_keys)

## Create chrom.sizes file

In [None]:
# The largest fragment end position seen on a chromosome is used as that
# chromosome's size. The maximum is computed for every column; the 'End'
# column of the result is the one that carries the chromosome size.
max_per_chrom = frag_df_sorted.groupby('Chromosome').max()

In [None]:
# Write the chrom.sizes file: one tab-separated "<chromosome>\t<size>" line
# per chromosome, where size is the maximal fragment end position.
chrom_ends = max_per_chrom["End"]
with open(CHROM_SIZES_PATH+"chrom.sizes", 'w', newline='') as tsvfile:
    writer = csv.writer(tsvfile, delimiter='\t', lineterminator='\n')
    writer.writerows((chrom, int(end)) for chrom, end in chrom_ends.items())

## Create .bedgraph files using different Categorization strategies

Categorization strategies based on the fragment length distribution.

### Categorization Strategy 0

Without any categorization, we write the data into the .bedgraph file.

In [None]:
# Write all fragments, uncategorized, as tab-separated bedGraph lines:
# chromosome, start, end, score.
# itertuples yields plain tuples column-wise and avoids building one pandas
# Series per fragment, which made the row-wise iterrows loop very slow on
# large fragment files. The written output is unchanged.
bedgraph_columns = ['Chromosome', 'Start', 'End', 'Score']
with open(BEDGRAH_FILE_PATH+BED_FILE_NAME.split(".")[0] + ".bedgraph", "w", newline='') as tsvfile:
    writer = csv.writer(tsvfile, delimiter='\t', lineterminator='\n')
    writer.writerows(frag_df_sorted[bedgraph_columns].itertuples(index=False, name=None))

### Categorization Strategy 1

Here we categorize on the basis of whether the fragment contains a nucleosome or not.

In [None]:
# Split the fragments by length (End - Start) at 200 bp:
#   ohne_nucleusome_df ("ohne" = without): length <= 200
#   mit_nucleosome_df  ("mit"  = with):    length  > 200
# The second category is the exact complement of the first, so every fragment
# lands in exactly one category. (Previously both conditions included the
# value 200, so fragments of length 200 were written to both files.)
fragment_length = frag_df_sorted['End'] - frag_df_sorted['Start']
is_without_nucleosome = fragment_length <= 200
ohne_nucleusome_df = frag_df_sorted.loc[is_without_nucleosome]
mit_nucleosome_df = frag_df_sorted.loc[~is_without_nucleosome]

In [None]:
# Write the fragments without a nucleosome ("ohne nucleosome") as
# tab-separated bedGraph lines: chromosome, start, end, score.
# itertuples yields plain tuples column-wise and avoids building one pandas
# Series per fragment, which made the row-wise iterrows loop very slow on
# large fragment files. The written output is unchanged.
bedgraph_columns = ['Chromosome', 'Start', 'End', 'Score']
with open(BEDGRAH_FILE_PATH+BED_FILE_NAME.split(".")[0] + "_ohne_nucleosome.bedgraph", "w", newline='') as tsvfile:
    writer = csv.writer(tsvfile, delimiter='\t', lineterminator='\n')
    writer.writerows(ohne_nucleusome_df[bedgraph_columns].itertuples(index=False, name=None))

In [None]:
# Write the fragments with a nucleosome ("mit nucleosome") as
# tab-separated bedGraph lines: chromosome, start, end, score.
# itertuples yields plain tuples column-wise and avoids building one pandas
# Series per fragment, which made the row-wise iterrows loop very slow on
# large fragment files. The written output is unchanged.
bedgraph_columns = ['Chromosome', 'Start', 'End', 'Score']
with open(BEDGRAH_FILE_PATH+BED_FILE_NAME.split(".")[0] + "_mit_nucleosome.bedgraph", "w", newline='') as tsvfile:
    writer = csv.writer(tsvfile, delimiter='\t', lineterminator='\n')
    writer.writerows(mit_nucleosome_df[bedgraph_columns].itertuples(index=False, name=None))

### Categorization Strategy 2

Define categorization characteristics:
Categorization based on whether the fragment length lies in the neighbourhood of a minimum or a maximum.

The positions of maxima and minima should be determined in the previous stages of analysis.
Hardcoding these here for testing.

Calculating under the assumption that the first peak lies at 50bp and the second peak at 210bp. (-> period = 160bp)
Create neighbourhood borders based on the proximity to a maximum or a minimum.
Here we take a distance of 40bp from a maximum or a minimum.

max_border_1 = 90, 
min_border_1 = 170, 
max_border_2 = 250, 
min_border_2 = 330, 
max_border_3 = 410, 
min_border_3 = 490, 
etc.

In [None]:
# Categorize fragments by whether their length (End - Start) lies near a
# maximum ("auf maxima") or near a minimum ("auf minima").
# With peaks assumed at 50 bp and 210 bp (period 160 bp) and a neighbourhood
# of 40 bp around each extremum, the length axis is cut into 80 bp windows
# that start at 10 bp:
#   [10, 90)   -> maximum at 50     [90, 170)  -> minimum at 130
#   [170, 250) -> maximum at 210    [250, 330) -> minimum at 290   ...
# Even window indices are maxima, odd ones are minima. The window width must
# therefore be 80 (half the period); the previous divisor of 90 shifted every
# border after the first one.
# Lengths below 10 bp get window index -1 and thus count as "auf minima".
fragment_length = frag_df_sorted['End'] - frag_df_sorted['Start']
window_index = (fragment_length - 10) // 80
auf_maxima_df = frag_df_sorted.loc[window_index % 2 == 0]
auf_minima_df = frag_df_sorted.loc[window_index % 2 == 1]

In [None]:
# Write the fragments near a maximum ("auf maxima") as tab-separated
# bedGraph lines: chromosome, start, end, score.
# itertuples yields plain tuples column-wise and avoids building one pandas
# Series per fragment, which made the row-wise iterrows loop very slow on
# large fragment files. The written output is unchanged.
bedgraph_columns = ['Chromosome', 'Start', 'End', 'Score']
with open(BEDGRAH_FILE_PATH+BED_FILE_NAME.split(".")[0] + "_auf_maxima.bedgraph", "w", newline='') as tsvfile:
    writer = csv.writer(tsvfile, delimiter='\t', lineterminator='\n')
    writer.writerows(auf_maxima_df[bedgraph_columns].itertuples(index=False, name=None))

In [None]:
# Write the fragments near a minimum ("auf minima") as tab-separated
# bedGraph lines: chromosome, start, end, score.
# itertuples yields plain tuples column-wise and avoids building one pandas
# Series per fragment, which made the row-wise iterrows loop very slow on
# large fragment files. The written output is unchanged.
bedgraph_columns = ['Chromosome', 'Start', 'End', 'Score']
with open(BEDGRAH_FILE_PATH+BED_FILE_NAME.split(".")[0] + "_auf_minima.bedgraph", "w", newline='') as tsvfile:
    writer = csv.writer(tsvfile, delimiter='\t', lineterminator='\n')
    writer.writerows(auf_minima_df[bedgraph_columns].itertuples(index=False, name=None))

### Categorization Strategy 3

Here we consider a small neighbourhood around the minima and label the fragments inside it as lying definitely on a minimum (in other words: there is a high probability of these fragments lying on a nucleosome). 

The positions of maxima and minima should be determined in the previous stages of analysis.
Hardcoding these here for testing.
Calculating assuming the first peak lies on 50bp and the second peak on 210bp. (-> period =160bp)

Categories: def_auf_minima and all_others
Considering 10bp left or right of a minimum as def_auf_minima.

In [None]:
# Position of each fragment length within the 160 bp period, measured from
# the first peak at 50 bp. A minimum then sits at phase 80, so the
# "definitely on a minimum" band of +/- 10 bp is the inclusive range 70..90.
length_phase = (frag_df_sorted['End'] - frag_df_sorted['Start'] - 50) % 160
is_def_auf_minima = (length_phase >= 70) & (length_phase <= 90)

def_auf_minima_df = frag_df_sorted.loc[is_def_auf_minima]
# Everything outside the band (phase < 70 or phase > 90).
all_others_df = frag_df_sorted.loc[~is_def_auf_minima]

In [None]:
# Write the fragments that lie definitely on a minimum as tab-separated
# bedGraph lines: chromosome, start, end, score.
# itertuples yields plain tuples column-wise and avoids building one pandas
# Series per fragment, which made the row-wise iterrows loop very slow on
# large fragment files. The written output is unchanged.
bedgraph_columns = ['Chromosome', 'Start', 'End', 'Score']
with open(BEDGRAH_FILE_PATH+BED_FILE_NAME.split(".")[0] + "_def_auf_minima.bedgraph", "w", newline='') as tsvfile:
    writer = csv.writer(tsvfile, delimiter='\t', lineterminator='\n')
    writer.writerows(def_auf_minima_df[bedgraph_columns].itertuples(index=False, name=None))

In [None]:
# Write all fragments that are not definitely on a minimum as tab-separated
# bedGraph lines: chromosome, start, end, score.
# itertuples yields plain tuples column-wise and avoids building one pandas
# Series per fragment, which made the row-wise iterrows loop very slow on
# large fragment files. The written output is unchanged.
bedgraph_columns = ['Chromosome', 'Start', 'End', 'Score']
with open(BEDGRAH_FILE_PATH+BED_FILE_NAME.split(".")[0] + "_all_others.bedgraph", "w", newline='') as tsvfile:
    writer = csv.writer(tsvfile, delimiter='\t', lineterminator='\n')
    writer.writerows(all_others_df[bedgraph_columns].itertuples(index=False, name=None))