# Experimental Counting Optimization

In [1]:
bamfile = "/mnt/workspace2/jdetlef/data/public_data/heart_left_ventricle_194_CB_tagged.bam"
fragments_file = "/mnt/workspace2/jdetlef/data/public_data/fragments_heart_left_ventricle_194_sorted.bed"
h5ad_file = "/mnt/workspace2/jdetlef/data/public_data/heart_lv_SM-JF1NY.h5ad"

In [2]:
fragments_file = "/home/jan/Workspace/bio_data/fragments_heart_left_ventricle_194_sorted.bed"
h5ad_file = "/home/jan/Workspace/bio_data/heart_lv_SM-JF1NY.h5ad"

In [2]:
import sctoolbox.tools as tools

Unable to determine R home: [Errno 2] No such file or directory: 'R'


In [3]:
# individual imports
import episcanpy as epi
import pandas as pd
import gzip
import datetime
import multiprocessing as mp
from multiprocessing import Manager, Lock, Pool

from beartype import beartype
from beartype.typing import Any, Optional

In [4]:
adata = epi.read_h5ad(h5ad_file)
adata

AnnData object with n_obs × n_vars = 9110 × 1154611
    obs: 'logUMI', 'tsse', 'tissue', 'cell type', 'Life stage', 'closest Cell Ontology term(s)', 'Cell Ontology ID'
    var: 'Chromosome', 'hg38_Start', 'hg38_End', 'Class', 'Present in fetal tissues', 'Present in adult tissues', 'CRE module'

In [5]:
adata_barcodes = adata.obs.index.tolist()

In [6]:
%%time
# The obs index has the form '<prefix>+<cell barcode>'; keep only the cell barcode part
barcodes = [obs_name.split('+')[1] for obs_name in adata_barcodes]

CPU times: user 1.5 ms, sys: 1.07 ms, total: 2.57 ms
Wall time: 2.57 ms


In [100]:
# individual imports
import episcanpy as epi
import pandas as pd
import numpy as np
import gzip
import datetime
from multiprocessing import Manager, Lock, Pool
from tqdm import tqdm

from beartype import beartype
from beartype.typing import Any, Optional

@beartype
def _is_gz_file(filepath: str) -> bool:
    """
    Tell whether a file is gzip-compressed by inspecting its first two bytes.

    Parameters
    ----------
    filepath : str
        Path to file.

    Returns
    -------
    bool
        True if the file starts with the gzip magic number, i.e. is a compressed .gz file.
    """

    gzip_magic = b'\x1f\x8b'

    # Read only the leading bytes; the extension is deliberately not trusted
    with open(filepath, 'rb') as handle:
        leading_bytes = handle.read(2)

    return leading_bytes == gzip_magic

def init_pool_processes(the_lock):
    """
    Pool initializer: publish the shared lock as the module-level name ``lock``.

    Parameters
    ----------
    the_lock
        Lock object handed to every worker process through ``initargs``.
    """
    # Worker code refers to the lock through this global name
    global lock
    lock = the_lock

class MPFragmentCounter():
    """
    """

    def __init__(self):
        """Init class variables."""
        pass

    def _check_in_list(self, element: Any, alist: list[Any] | set[Any]) -> bool:
        """
        Check if element is in list.

        TODO Do we need this function?

        Parameters
        ----------
        element : Any
            Element that is checked for.
        alist : list[Any] | set[Any]
            List or set in which the element is searched for.

        Returns
        -------
        bool
            True if element is in list else False
        """

        return element in alist

    def _check_true(element: Any, alist: Optional[list[Any]] = None) -> bool:  # true regardless of input
        """
        Return True regardless of input

        Parameters
        ----------
        element : Any
            Element that is checked for.
        alist: Optional[list[Any]]
            List or set in which the element is searched for.

        Returns
        -------
        bool
            True if element is in list else False
        """

        return True

    def insertsize_from_fragments(self, fragments: str,
                                  barcodes: Optional[list[str]] = None,
                                  n_threads: int = 8) -> pd.DataFrame:

        print('Count insertsizes from fragments...')
        # Open fragments file
        if _is_gz_file(fragments):
            f = gzip.open(fragments, "rt")
        else:
            f = open(fragments, "r")

        # Prepare function for checking against barcodes list
        if barcodes is not None:
            barcodes = set(barcodes)
            check_in = self._check_in_list
        else:
            check_in = self._check_true

        iterator = pd.read_csv(fragments,
                               delimiter='\t',
                               header=None,
                               names=['chr', 'start', 'stop', 'barcode', 'count'],
                               iterator=True,
                               chunksize=5000000)

        # start timer
        start_time = datetime.datetime.now()

        # Initialize multiprocessing
        m = Manager()
        lock = Lock()
        managed_dict = m.dict()
        managed_dict['output'] = {}
        pool = Pool(processes=n_threads, initializer=init_pool_processes, initargs=(lock,), maxtasksperchild=48)
        jobs = []
        print('Starting counting fragments...')
        # split fragments into chunks
        for chunk in tqdm(iterator, desc="Processing Chunks"):
            # apply async job wit callback function
            job = pool.apply_async(self._count_fragments_worker, args=(chunk, barcodes, check_in, managed_dict))
            jobs.append(job)
        # monitor progress
        # utils.monitor_jobs(jobs, description="Progress")
        # close pool
        pool.close()
        # wait for all jobs to finish
        pool.join()
        # reset settings
        count_dict = managed_dict['output']

        # Close file and print elapsed time
        end_time = datetime.datetime.now()
        f.close()

        elapsed = end_time - start_time
        print("Done reading file - elapsed time: {0}".format(str(elapsed).split(".")[0]))

        # Convert dict to pandas dataframe
        print("Converting counts to dataframe...")
        table = pd.DataFrame.from_dict(count_dict, orient="index")
        #table = table[["insertsize_count", "mean_insertsize"] + sorted(table.columns[2:])]
        table["mean_insertsize"] = table["mean_insertsize"].round(2)

        print("Done getting insertsizes from fragments!")

        return table

    def _count_fragments_worker(self, chunk, barcodes, check_in, managed_dict):
        """
        Worker function for counting fragments.
        Parameters
        ----------
        chunk
        barcodes
        check_in
        managed_dict

        Returns
        -------

        """

        # Initialize count_dict
        count_dict = {}
        for row in chunk.itertuples():
            start = int(row[2])
            end = int(row[3])
            barcode = row[4]
            count = int(row[5])
            size = end - start - 9  # length of insertion (-9 due to to shifted cutting of Tn5)

            # Only add fragment if check is true
            if check_in(barcode, barcodes) is True:
                count_dict = self._add_fragment(count_dict, barcode, size, count)

        with lock:
            latest = managed_dict['output']
            managed_dict['output'] = self._update_count_dict(latest, count_dict)

    def _add_fragment(self, count_dict: dict[str, int],
                      barcode: str,
                      size: int,
                      count: int = 1,
                      max_size=1000):
        """
        Add fragment of size 'size' to count_dict.

        Parameters
        ----------
        count_dict : dict[str, int]
            Dictionary containing the counts per insertsize.
        barcode : str
            Barcode of the read.
        size : int
            Insertsize to add to count_dict.
        count : int, default 1
            Number of reads to add to count_dict.
        """

        # Initialize if barcode is seen for the first time
        if barcode not in count_dict:
            count_dict[barcode] = {"mean_insertsize": 0, "insertsize_count": 0}

        # Add read to dict
        if size >= 0 and size <= max_size:  # do not save negative insertsize, and set a cap on the maximum insertsize to limit outlier effects

            count_dict[barcode]["insertsize_count"] += count

            # Update mean
            mu = count_dict[barcode]["mean_insertsize"]
            total_count = count_dict[barcode]["insertsize_count"]
            diff = (size - mu) / total_count
            count_dict[barcode]["mean_insertsize"] = mu + diff

            # Save to distribution
            if size not in count_dict[barcode]:  # first time size is seen
                count_dict[barcode]['dist'] = np.zeros(max_size)
            count_dict[barcode]['dist'][size] += count

        return count_dict

    def _update_count_dict(self, count_dict_1, count_dict_2):
        """
        updates
        """
        # Check if count_dict_1 is empty:
        if len(count_dict_1) == 0:
            return count_dict_2

        # make Dataframes for computation
        df1 = pd.DataFrame(count_dict_1).T
        df2 = pd.DataFrame(count_dict_2).T

        # merge distributions
        combined_dists = df1['dist'].combine(df2['dist'], func=self._update_dist)

        # merge counts
        merged_counts = pd.merge(df1["insertsize_count"], df2["insertsize_count"], left_index=True, right_index=True,
                                 how='outer').fillna(0)
        # sum total counts/barcode
        updated_counts = merged_counts.sum(axis=1)

        # calculate scaling factors
        x_scaling_factor = merged_counts["insertsize_count_x"] / updated_counts
        y_scaling_factor = merged_counts["insertsize_count_y"] / updated_counts

        # merge mean insertsizes
        merged_mean_insertsizes = pd.merge(df1["mean_insertsize"], df2["mean_insertsize"], left_index=True,
                                           right_index=True, how='outer').fillna(0)

        # scale mean insertsizes
        merged_mean_insertsizes["mean_insertsize_x"] = merged_mean_insertsizes["mean_insertsize_x"] * x_scaling_factor
        merged_mean_insertsizes["mean_insertsize_y"] = merged_mean_insertsizes["mean_insertsize_y"] * y_scaling_factor

        # sum the scaled means
        updated_means = merged_mean_insertsizes.sum(axis=1)

        # build the updated dictionary
        updated_dict = pd.DataFrame(
            {'mean_insertsize': updated_means, 'insertsize_count': updated_counts, 'dist': combined_dists}).T.to_dict()

        return updated_dict


    def _update_dist(self, dist_1, dist_2):
        """Updates the Insertsize Distributions"""
        if not np.isnan(dist_1).any() and not np.isnan(dist_2).any():
            updated_dist = dist_1 + dist_2
            return updated_dist.astype(int)
        elif np.isnan(dist_1).any():
            print('triggered 2')
            return dist_2.astype(int)
        elif np.isnan(dist_2).any():
            print('triggered 3')
            return dist_1.astype(int)


In [80]:
def get_dist_df(dist):
    """
    Expand the per-barcode 'dist' arrays of a count table into a wide table.

    Parameters
    ----------
    dist : pd.DataFrame
        Table indexed by barcode with a column 'dist' holding one
        sequence of counts per row.

    Returns
    -------
    pd.DataFrame
        One row per barcode (index converted to str) and one column per
        position in the 'dist' sequence.
    """
    # barcode -> {position: count}
    per_barcode = {
        str(label): dict(enumerate(record['dist']))
        for label, record in dist.iterrows()
    }

    return pd.DataFrame(per_barcode).T

In [88]:
%%time
adata_barcodes = adata.obs.index.tolist()
# The obs index has the form '<prefix>+<cell barcode>'; keep only the cell barcode part
barcodes = [obs_name.split('+')[1] for obs_name in adata_barcodes]

CPU times: user 6.06 ms, sys: 2.99 ms, total: 9.05 ms
Wall time: 8.87 ms


In [101]:
%%time
counter = MPFragmentCounter()
table_mp = counter.insertsize_from_fragments(fragments_file, barcodes, n_threads=10)
print(table_mp)

Count insertsizes from fragments...
Starting counting fragments...


Processing Chunks: 17it [01:12,  4.28s/it]


Done reading file - elapsed time: 0:01:38
Converting counts to dataframe...
Done getting insertsizes from fragments!
                        mean_insertsize  insertsize_count  \
AAATCCGCATAAACGTCCCGTT           203.35              4381   
AAATCCGCATAAATGCTACGGG           163.33             11798   
AAATCCGCATAACATGAAGCGC           192.19              2123   
AAATCCGCATAACCGCTAATGA           172.88              3251   
AAATCCGCATAACTTCGACCAG           185.90             13291   
...                                 ...               ...   
TTCGTCCGACTGCCGTCTCAAC           187.90              2838   
TTCGTCCGACTGCGTCGAGTAC           194.37              8575   
TTCGTCCGACTTGAGTGCTGTG           187.27              3059   
TTCGTCCGACTTTCGAAGAAGG           190.14              4343   
TTCGTCCGACTTTGTGTTACCG           189.01              3491   

                                                                     dist  
AAATCCGCATAAACGTCCCGTT  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 

In [76]:
table_mp.loc['AAATCCGCATAAACGTCCCGTT']['dist'].sum()

21

In [93]:
np.isnan(np.nan).any()

True

In [94]:
np.isnan([1,3,1,0,4]).any()

False

In [70]:
%%time
table_sctoolbox = tools._insertsize_from_fragments(fragments_file, barcodes)
print(table_sctoolbox)

[INFO] Counting fragment lengths from fragments file...
[INFO] Done reading file - elapsed time: 0:04:26
[INFO] Converting counts to dataframe...
[INFO] Done getting insertsizes from fragments!
                        insertsize_count  mean_insertsize  0  1  2  3  4  5  \
AGGGATAAACCACCGAAGGTCA              6999           185.35  0  0  0  0  0  0   
AGCGTGTCATTCGCGAGATAGT             18041           183.39  0  4  1  0  0  0   
AAATCCGCATGTCCAGATTTCC             15322           174.45  0  0  0  1  0  0   
TGATTACGCGCCTTTCCGTATC              3066           174.37  0  0  0  0  0  0   
ACCTTCAAGCACTCGTTCCGAT              7780           199.32  0  0  0  0  3  0   
...                                  ...              ... .. .. .. .. .. ..   
TCGGCTTATTTGGACATACTGG              1985           195.80  0  0  0  0  0  0   
CTTCACAGTCCGATATCGATCC              3039           195.34  0  0  0  0  0  0   
GCTCTTGGAAGAATCGGTGTTC              1707           210.02  0  0  0  0  0  0   
ACCCTTATCTTAGATA

In [78]:
table_sctoolbox.loc['AAATCCGCATAAACGTCCCGTT'][[c for c in table_sctoolbox.columns if isinstance(c, int)]].sum()

5761.0

In [112]:
array_1 = np.array([1,1,2,1,0,1])

In [116]:
array_2 = np.array([1,2,2,1,1,1])

In [121]:
array_2[3] += 1
array_2

array([1, 2, 2, 5, 1, 1])

In [122]:
array_1 + array_2

array([2, 3, 4, 6, 1, 2])

In [45]:
table_sctoolbox = table_sctoolbox[[c for c in table_sctoolbox.columns if isinstance(c, int)]]

In [57]:
table_mp = get_dist_df(table_mp)

In [48]:
table_sctoolbox.shape

(9110, 1001)

In [62]:
sorted_sct_table = table_sctoolbox.sort_index()

In [63]:
sorted_table_mp = table_mp.sort_index()

In [68]:
sorted_sct_table.sum()

0          0
1       4962
2       2075
3       1464
4       1045
        ... 
996        7
997       11
998       10
999        5
1000       4
Length: 1001, dtype: int64

In [67]:
sorted_table_mp.sum()

0       0
1      32
2      11
3       4
4      12
       ..
995     0
996     0
997     0
998     0
999     0
Length: 1000, dtype: int64

In [60]:
table_sctoolbox == table_mp

ValueError: Can only compare identically-labeled (both index and columns) DataFrame objects

In [None]:
def count_lines(filename):
    """
    Count the lines of a text file.

    Parameters
    ----------
    filename : str
        Path to the file, opened in text mode.

    Returns
    -------
    int
        Number of lines in the file.
    """
    total = 0
    with open(filename, 'r') as handle:
        # stream the file instead of loading it into memory
        for _ in handle:
            total += 1
    return total

In [None]:
%%time
# Replace 'yourfile.txt' with the path to your file
number_of_lines = count_lines(fragments_file)
print(f"Total number of lines: {number_of_lines}")

In [None]:
#small_fragments = '/mnt/workspace2/jdetlef/data/public_data/cropped_heart_fragments.bed'

In [25]:
small_fragments = '/home/jan/Workspace/bio_data/small_fragments.bed'

In [None]:

class MPFragmentCounter():
    """
    """

    def __init__(self):
        """Init class variables."""
        
        self.m = Manager()
        self.d = self.m.dict()
        self.d['output'] = {}
        self.lock = Lock()


        
    def _check_in_list(element: Any, alist: list[Any] | set[Any]) -> bool:
        """
        Check if element is in list.

        TODO Do we need this function?

        Parameters
        ----------
        element : Any
            Element that is checked for.
        alist : list[Any] | set[Any]
            List or set in which the element is searched for.

        Returns
        -------
        bool
            True if element is in list else False
        """

        return element in alist


    
    def _check_true(element: Any, alist: Optional[list[Any]] = None) -> bool:  # true regardless of input
        """
        Return True regardless of input

        Parameters
        ----------
        element : Any
            Element that is checked for.
        alist: Optional[list[Any]]
            List or set in which the element is searched for.

        Returns
        -------
        bool
            True if element is in list else False
        """

        return True

    
    def insertsize_from_fragments(self, fragments: str,
                                  barcodes: Optional[list[str]] = None,
                                  n_threads: int = 8) -> pd.DataFrame:
        # Open fragments file
        if _is_gz_file(fragments):
            f = gzip.open(fragments, "rt")
        else:
            f = open(fragments, "r")

        # Prepare function for checking against barcodes list
        if barcodes is not None:
            barcodes = set(barcodes)
            check_in = self._check_in_list
        else:
            check_in = self._check_true

        iterator = pd.read_csv(fragments,
                               delimiter='\t',
                               header=None,
                               names=['chr', 'start', 'stop', 'barcode', 'count'],
                               iterator=True,
                               chunksize=1000)

        # start timer
        start_time = datetime.datetime.now()

        pool = Pool(n_threads, maxtasksperchild=48)
        jobs = []
        # split fragments into chunks
        for chunk in iterator:
            # apply async job wit callback function
            job = pool.apply_async(self._count_fragments_worker, args=(chunk, barcodes, check_in))
            jobs.append(job)
        # monitor progress
        # utils.monitor_jobs(jobs, description="Progress")
        # close pool
        pool.close()
        # wait for all jobs to finish
        pool.join()
        # reset settings
        count_dict = self.d
        print('what is going on')
        print(count_dict)
        # Fill missing sizes with 0
        max_fragment_size = 1001

        for barcode in count_dict:
            for size in range(max_fragment_size):
                if size not in count_dict[barcode]:
                    count_dict[barcode][size] = 0

        # Close file and print elapsed time
        end_time = datetime.datetime.now()
        f.close()

        elapsed = end_time - start_time
        print("Done reading file - elapsed time: {0}".format(str(elapsed).split(".")[0]))

        # Convert dict to pandas dataframe
        print("Converting counts to dataframe...")
        table = pd.DataFrame.from_dict(count_dict, orient="index")
        table = table[["insertsize_count", "mean_insertsize"] + sorted(table.columns[2:])]
        table["mean_insertsize"] = table["mean_insertsize"].round(2)

        print("Done getting insertsizes from fragments!")

        return table

    
    def _count_fragments_worker(self, chunk, barcodes, check_in):
        
        count_dict = {}
        
        for i in range(len(chunk)):
            row = chunk.iloc[i]
            start = int(row['start'])
            end = int(row['stop'])
            barcode = row['barcode']
            count = int(row['count'])
            size = end - start - 9  # length of insertion (-9 due to to shifted cutting of Tn5)

            # Only add fragment if check is true
            if check_in(barcode, barcodes) is True:
                count_dict = self._add_fragment(count_dict, barcode, size, count)
                
        with self.lock:
            self.d['output'] = update_count_dict(self.d['output'], count_dict)


    def _add_fragment(count_dict: dict[str, int],
                      barcode: str,
                      size: int,
                      count: int = 1):
        """
        Add fragment of size 'size' to count_dict.

        Parameters
        ----------
        count_dict : dict[str, int]
            Dictionary containing the counts per insertsize.
        barcode : str
            Barcode of the read.
        size : int
            Insertsize to add to count_dict.
        count : int, default 1
            Number of reads to add to count_dict.
        """

        # Initialize if barcode is seen for the first time
        if barcode not in count_dict:
            count_dict[barcode] = {"mean_insertsize": 0, "insertsize_count": 0}

        # Add read to dict
        if size >= 0 and size <= 1000:  # do not save negative insertsize, and set a cap on the maximum insertsize to limit outlier effects

            count_dict[barcode]["insertsize_count"] += count

            # Update mean
            mu = count_dict[barcode]["mean_insertsize"]
            total_count = count_dict[barcode]["insertsize_count"]
            diff = (size - mu) / total_count
            count_dict[barcode]["mean_insertsize"] = mu + diff

            # Save to distribution
            if size not in count_dict[barcode]:  # first time size is seen
                count_dict[barcode][size] = 0
            count_dict[barcode][size] += count
            
        return count_dict
    

    def _log_result(self, result: Any) -> None:
        """Log results from mp_counter."""

        if self.merged_dict:
            self.merged_dict = dict(Counter(self.merged_dict) + Counter(result))
            # print('merging')
        else:
            self.merged_dict = result

In [None]:
 mpc = MPFragmentCounter()

In [None]:
%%time
counts = mpc.insertsize_from_fragments(small_fragments, barcodes)

In [None]:
some_dict = {}

In [None]:
some_dict['another'] = {'test': 'Hallo'}

In [None]:
some_dict['another']

In [None]:
count_dict={}

In [None]:
count_dict_1={}
count_dict_1['ACGTT'] = {"mean_insertsize": 10, "insertsize_count": 5, 'dist': np.array([0,1,0,2,1,1,0])}
count_dict_1['GTCCT'] = {"mean_insertsize": 10, "insertsize_count": 20, 'dist': np.array([0,0,0,1,2,2,1])}
count_dict_1['GCGCG'] = {"mean_insertsize": 10, "insertsize_count": 20, 'dist': np.array([0,0,0,1,2,2,1])}

count_dict_2={}
count_dict_2['ACGTT'] = {"mean_insertsize": 20, "insertsize_count": 20, 'dist': np.array([2,1,1,0,1,1,0])}
count_dict_2['GTCCT'] = {"mean_insertsize": 20, "insertsize_count": 5, 'dist': np.array([1,0,2,2,1,1,0])}
count_dict_2['TTTAA'] = {"mean_insertsize": 20, "insertsize_count": 5, 'dist': np.array([1,0,2,2,1,1,0])}

In [None]:
# make Dataframes for computation
df1 = pd.DataFrame(count_dict_1).T
df2 = pd.DataFrame(count_dict_2).T

# merge counts
combined_dists = df1['dist'].combine(df2['dist'], func=update_dist)

In [None]:
merged_counts = pd.merge(df1["insertsize_count"], df2["insertsize_count"], left_index=True, right_index=True, how='outer').fillna(0)


In [None]:
merged_counts

In [None]:
    # merge counts
    merged_counts = pd.merge(df1["insertsize_count"], df2["insertsize_count"], left_index=True, right_index=True, how='outer').fillna(0)
    # sum total counts/barcode
    updated_counts = merged_counts.sum(axis=1)

In [None]:
df_dists= pd.DataFrame({'combined_dists' : combined_dists})

In [None]:
df_counts = pd.DataFrame({'insertsize_counts' : updated_counts})

In [None]:
df_counts = pd.DataFrame({'insertsize_counts': {'TTTAA':20, 'ACGTT':25, 'GCGCG': 25, 'GTCCT': 5}})

In [None]:
df_counts

In [None]:
some_dict = {}

In [None]:
len(some_dict)

In [32]:
def update_count_dict(count_dict_1, count_dict_2):
    """
    Merge two per-barcode count dictionaries.

    Distributions are combined with `update_dist`, counts are summed and the
    mean insertsizes are combined weighted by their counts.

    Parameters
    ----------
    count_dict_1 : dict
        Counts collected so far; per barcode a dict with the keys
        'mean_insertsize', 'insertsize_count' and 'dist'.
    count_dict_2 : dict
        Counts to add, same layout.

    Returns
    -------
    dict
        Merged counts in the same layout. If one input is empty, the other
        input is returned unchanged.
    """
    # Nothing to merge if one side is empty. Without the second check an
    # empty count_dict_2 ends in a KeyError on the 'dist' column below.
    if len(count_dict_1) == 0:
        return count_dict_2
    if len(count_dict_2) == 0:
        return count_dict_1

    # make Dataframes for computation (one row per barcode)
    df1 = pd.DataFrame(count_dict_1).T
    df2 = pd.DataFrame(count_dict_2).T

    # merge distributions
    combined_dists = df1['dist'].combine(df2['dist'], func=update_dist)

    # merge counts; barcodes missing on one side count as 0
    merged_counts = pd.merge(df1["insertsize_count"], df2["insertsize_count"], left_index=True, right_index=True, how='outer').fillna(0)
    # sum total counts/barcode
    updated_counts = merged_counts.sum(axis=1)

    # calculate scaling factors: share of each side in the total count
    x_scaling_factor = merged_counts["insertsize_count_x"] / updated_counts
    y_scaling_factor = merged_counts["insertsize_count_y"] / updated_counts

    # merge mean insertsizes
    merged_mean_insertsizes = pd.merge(df1["mean_insertsize"], df2["mean_insertsize"], left_index=True, right_index=True, how='outer').fillna(0)

    # scale mean insertsizes
    merged_mean_insertsizes["mean_insertsize_x"] = merged_mean_insertsizes["mean_insertsize_x"] * x_scaling_factor
    merged_mean_insertsizes["mean_insertsize_y"] = merged_mean_insertsizes["mean_insertsize_y"] * y_scaling_factor

    # sum the scaled means
    updated_means = merged_mean_insertsizes.sum(axis=1)

    # build the updated dictionary
    updated_dict = pd.DataFrame({'mean_insertsize': updated_means, 'insertsize_count' : updated_counts, 'dist': combined_dists}).T.to_dict()

    return updated_dict


def update_dist(dist_1, dist_2):
    """
    Combine two insertsize distributions.

    A side that contains NaN is treated as missing and the other side is
    returned instead; otherwise the element-wise sum is returned.
    """
    if np.isnan(dist_1).any():
        # first side is missing
        return dist_2
    if np.isnan(dist_2).any():
        # second side is missing
        return dist_1
    return dist_1 + dist_2

In [None]:
pd.DataFrame({'mean_insertsizes': updated_means, 'insertsize_counts' : updated_counts})

In [None]:
np.array([1,3,21,0]) / 10

In [None]:
# Inner join of the per-barcode counts: only barcodes present in both tables are kept
merged_insertsizes = pd.merge(df1["insertsize_count"], df2["insertsize_count"], left_index=True, right_index=True)
# display the merged table (the name `merged` used before was never defined)
merged_insertsizes

In [None]:
x_scaling_factor = merged_insertsizes["insertsize_count_x"] / merged_insertsizes.sum(axis=1)
y_scaling_factor = merged_insertsizes["insertsize_count_y"] / merged_insertsizes.sum(axis=1)

In [None]:
merged_mean_insertsizes = pd.merge(df1["mean_insertsize"], df2["mean_insertsize"], left_index=True, right_index=True)
merged_mean_insertsizes

In [None]:
merged_mean_insertsizes["mean_insertsize_x"] = merged_mean_insertsizes["mean_insertsize_x"] * x_scaling_factor
merged_mean_insertsizes["mean_insertsize_y"] = merged_mean_insertsizes["mean_insertsize_y"] * y_scaling_factor

In [None]:
merged_mean_insertsizes.sum(axis=1)

In [None]:
merged_mean_insertsizes * 

In [None]:
import pandas as pd

# Create two example DataFrames that share the same index and column name
df1 = pd.DataFrame({'Werte1': [1, 2, 3]}, index=['a', 'b', 'c'])
df2 = pd.DataFrame({'Werte1': [4, 5, 6]}, index=['a', 'b', 'c'])

# Merge the DataFrames on the index
merged_df = pd.merge(df1, df2, left_index=True, right_index=True)

# Sum the values of each row
summed_df = merged_df.sum(axis=1)

print(summed_df)


In [None]:
merged_df

In [None]:
%%time
count_table = tools._insertsize_from_fragments(small_fragments, barcodes)

In [None]:
@beartype
def _insertsize_from_fragments(fragments: str,
                               barcodes: Optional[list[str]] = None) -> pd.DataFrame:
    """
    Get fragment insertsize distributions per barcode from fragments file.

    Parameters
    ----------
    fragments : str
        Path to fragments.bed(.gz) file.
    barcodes : Optional[list[str]], default None
        Only collect fragment sizes for the barcodes in barcodes

    Returns
    -------
    pd.DataFrame
        DataFrame with insertsize distributions per barcode: the columns
        'insertsize_count' and 'mean_insertsize' followed by one column per
        insertsize from 0 to 1000.
    """

    # Pick the opener for compressed or plain fragments files
    opener = gzip.open if _is_gz_file(fragments) else open

    # A set makes the membership test per line cheap; None keeps every barcode
    selected = set(barcodes) if barcodes is not None else None

    # Read fragments file and add to dict
    print("Counting fragment lengths from fragments file...")
    start_time = datetime.datetime.now()
    count_dict = {}
    # The context manager closes the file even if a line cannot be parsed
    with opener(fragments, "rt") as f:
        for line in f:
            # skip blank lines and comment/header lines
            if not line.strip() or line.startswith("#"):
                continue

            columns = line.rstrip().split("\t")
            start = int(columns[1])
            end = int(columns[2])
            barcode = columns[3]
            count = int(columns[4])
            size = end - start - 9  # length of insertion (-9 due to shifted cutting of Tn5)

            # Only add fragment if the barcode is selected
            if selected is None or barcode in selected:
                count_dict = _add_fragment(count_dict, barcode, size, count)

    # Fill missing sizes with 0
    max_fragment_size = 1001

    for barcode in count_dict:
        for size in range(max_fragment_size):
            if size not in count_dict[barcode]:
                count_dict[barcode][size] = 0

    # Print elapsed time
    end_time = datetime.datetime.now()
    elapsed = end_time - start_time
    print("Done reading file - elapsed time: {0}".format(str(elapsed).split(".")[0]))

    # Convert dict to pandas dataframe
    print("Converting counts to dataframe...")
    # Explicit column order; reindex also yields the expected columns for an empty result
    column_order = ["insertsize_count", "mean_insertsize"] + list(range(max_fragment_size))
    table = pd.DataFrame.from_dict(count_dict, orient="index").reindex(columns=column_order)
    table["mean_insertsize"] = table["mean_insertsize"].astype(float).round(2)

    print("Done getting insertsizes from fragments!")

    return table

In [None]:
@beartype
def _add_fragment(count_dict: dict[str, dict],
                  barcode: str,
                  size: int,
                  count: int = 1) -> dict[str, dict]:
    """
    Add fragment of size 'size' to count_dict.

    Parameters
    ----------
    count_dict : dict[str, dict]
        Dictionary containing per barcode the keys 'mean_insertsize' and
        'insertsize_count' plus one integer key per insertsize seen.
    barcode : str
        Barcode of the read.
    size : int
        Insertsize to add to count_dict.
    count : int, default 1
        Number of reads to add to count_dict.

    Returns
    -------
    dict[str, dict]
        Updated count_dict
    """

    # Initialize if barcode is seen for the first time
    if barcode not in count_dict:
        count_dict[barcode] = {"mean_insertsize": 0, "insertsize_count": 0}

    # Add read to dict
    if size >= 0 and size <= 1000:  # do not save negative insertsize, and set a cap on the maximum insertsize to limit outlier effects
        entry = count_dict[barcode]
        entry["insertsize_count"] += count

        # Update the running mean; the fragment stands for `count` reads,
        # so its weight is `count`, consistent with the counts stored below
        mu = entry["mean_insertsize"]
        total_count = entry["insertsize_count"]
        entry["mean_insertsize"] = mu + count * (size - mu) / total_count

        # Save to distribution
        entry[size] = entry.get(size, 0) + count

    return count_dict

# HELPERS

In [None]:
@beartype
def _is_gz_file(filepath: str) -> bool:
    """
    Tell whether a file is gzip-compressed by inspecting its first two bytes.

    Parameters
    ----------
    filepath : str
        Path to file.

    Returns
    -------
    bool
        True if the file starts with the gzip magic number, i.e. is a compressed .gz file.
    """

    gzip_magic = b'\x1f\x8b'

    # Read only the leading bytes; the extension is deliberately not trusted
    with open(filepath, 'rb') as handle:
        leading_bytes = handle.read(2)

    return leading_bytes == gzip_magic

In [None]:
@beartype
def gunzip_file(f_in: str, f_out: str) -> None:
    """
    Decompress file.

    Parameters
    ----------
    f_in : str
        Path to compressed input file.
    f_out : str
        Destination to decompressed output file.
    """
    # imported here because this notebook does not import shutil anywhere else
    import shutil

    # stream the content, so the file never has to fit into memory
    with gzip.open(f_in, 'rb') as h_in:
        with open(f_out, 'wb') as h_out:
            shutil.copyfileobj(h_in, h_out)

In [37]:
iterator = pd.read_csv(fragments_file,
                       delimiter='\t',
                       header=None,
                       names=['chr', 'start', 'stop', 'barcode', 'count'],
                       iterator=True,
                       chunksize=100000)
updated = {}

In [38]:
chunk = next(iterator)

In [39]:
chunk

Unnamed: 0,chr,start,stop,barcode,count
0,chr1,10067,10279,AGGGATAAACCACCGAAGGTCA,1
1,chr1,10072,10316,AGCGTGTCATTCGCGAGATAGT,1
2,chr1,10073,10327,AAATCCGCATGTCCAGATTTCC,1
3,chr1,10084,10279,TGATTACGCGCCTTTCCGTATC,2
4,chr1,10091,10273,ACCTTCAAGCACTCGTTCCGAT,1
...,...,...,...,...,...
99995,chr1,3157139,3157224,GATCGGGTGAAACATGAAGCGC,3
99996,chr1,3157140,3157193,AACGACCAAAGTATGCCGAGAA,2
99997,chr1,3157140,3157224,TAGTGCTGTCCTTAGCGTGAGT,1
99998,chr1,3157141,3157168,AGAAAGGCGGGCGTAACCAGTA,2


In [79]:
%%time
# Timing of counting one chunk row by row with DataFrame.itertuples()
check_in = _check_true

count_dict = {}

# itertuples() puts the index at position 0, so the data columns start at position 1
for row in chunk.itertuples():
    start = int(row[2])
    end = int(row[3])
    barcode = row[4]
    count = int(row[5])
    size = end - start - 9  # length of insertion (-9 due to shifted cutting of Tn5)

    # Only add fragment if check is true
    if check_in(barcode, barcodes) is True:
        count_dict = _add_fragment(count_dict, barcode, size, count)
        

# merge the counts of this chunk into the counts collected so far
updated = update_count_dict(updated, count_dict)

CPU times: user 776 ms, sys: 3.93 ms, total: 780 ms
Wall time: 779 ms


In [70]:
def wrap_add_fragments(row, count_dict):
    """
    Add the fragment described by a single row to count_dict.

    Written to be passed to ``DataFrame.apply(..., axis=1)``. Besides its arguments the
    function reads the notebook globals ``check_in`` and ``barcodes`` and calls ``_add_fragment``.

    Parameters
    ----------
    row : pd.Series or sequence
        One fragment; positions 1 to 4 hold start, stop, barcode and count.
    count_dict : dict
        Counts per barcode; filled in place by ``_add_fragment``.
    """
    # Fields are read by position. Plain `row[1]` on a label-indexed pandas Series treats
    # the integer as a position, which pandas has deprecated; `.iloc` is the explicit
    # positional accessor. Objects without `.iloc` (tuples, lists) are indexed directly.
    fields = row.iloc if hasattr(row, 'iloc') else row

    start = int(fields[1])
    end = int(fields[2])
    barcode = str(fields[3])
    count = int(fields[4])
    size = end - start - 9  # length of insertion (-9 due to shifted cutting of Tn5)

    # Only add fragment if check is true
    if check_in(barcode, barcodes) is True:
        _add_fragment(count_dict, barcode, size, count)

In [77]:
# Start with an empty count dict so the apply-based run fills a fresh one
count_dict = {}

In [80]:
%%time
# Benchmark: the same counting driven by a row-wise DataFrame.apply.
# The result of apply is discarded; count_dict is passed to wrap_add_fragments for every row.
_ = chunk.apply(lambda row: wrap_add_fragments(row, count_dict), axis=1)

CPU times: user 531 ms, sys: 277 µs, total: 531 ms
Wall time: 527 ms


In [76]:
# Display the count dict to inspect the collected per-barcode entries
count_dict

{'AGGGATAAACCACCGAAGGTCA': {'mean_insertsize': 129.91666666666666,
  'insertsize_count': 6,
  'dist': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 2., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

In [None]:
%%time
# Benchmark: count the fragments of one chunk with an index loop and chunk.iloc[i].
# _check_true accepts every barcode, so no fragment is filtered out here.
check_in = _check_true

# Counts of this chunk only
count_dict = {}

for i in range(len(chunk)):
    # Positional row access; the fields are then read by column label
    row = chunk.iloc[i]
    start = int(row['start'])
    end = int(row['stop'])
    barcode = row['barcode']
    count = int(row['count'])
    size = end - start - 9  # length of insertion (-9 due to shifted cutting of Tn5)

    # Only add fragment if check is true
    if check_in(barcode, barcodes) is True:
        count_dict = _add_fragment(count_dict, barcode, size, count)


# Hand the counts of this chunk to update_count_dict (defined outside this section),
# presumably merging them into the accumulated result
updated = update_count_dict(updated, count_dict)

In [None]:
# Number of barcodes collected for the current chunk
len(count_dict)

In [None]:
# Number of barcodes in the accumulated result
len(updated)

In [36]:
# Show the accumulated result as a table: one column per barcode, one row per statistic
pd.DataFrame(updated)

Unnamed: 0,AGGGATAAACCACCGAAGGTCA,AGCGTGTCATTCGCGAGATAGT,AAATCCGCATGTCCAGATTTCC,TGATTACGCGCCTTTCCGTATC,ACCTTCAAGCACTCGTTCCGAT,TAAGGTAGGGCAGAATCCTTCC,AGGGATAAACACGTCGAGTCCA,CCGCTACTCATCAAGGCCTGGA,ACTTGCTTCTTCAGGGAAAGCG,AAGTCCTTAGGGGCTCCGAATA,...,GGTGACATACATCCGTGGTTGC,AAGTCCTTAGATAAGGTCCTGA,GACCCTCAAACCGTGAAAGACC,CAGGAAAGCAGATGGCATCAAA,CTTTCGCGTGTAGATACATCCC,GCTGGTAGGACGGAACTATACT,CTAGTGTTGCTGGGTTTGTACT,AGGGATAAACTAGTCCACCAGT,GAGGTAATCTGACGGCATAATG,AGCCATAGGGACTAACGGTATC
mean_insertsize,154.666667,235.0,245.0,93.0,173.0,108.0,202.0,120.0,50.0,98.333333,...,0.533981,56.0,28.5,9.666667,8.428571,2.857143,63.0,5.818182,0.77381,0.814815
insertsize_count,3,1,1,2,1,2,1,3,1,3,...,103,1,2,6,7,21,1,11,84,81
dist,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [34]:
# Reset the accumulated result
updated = {}

In [None]:
# Show the counts of the current chunk as a table (barcodes as columns)
pd.DataFrame(count_dict)

In [None]:
# Transpose so that every barcode becomes a row and every statistic a column
df = pd.DataFrame(count_dict).T

In [None]:
# Inspect the 'dist' entry of every barcode
df['dist']

In [30]:
    def _count_fragments_worker(self, chunk, barcodes, check_in):
        """
        Count the fragments of one chunk and store the result on the instance.

        Parameters
        ----------
        chunk : pd.DataFrame
            Fragments with the columns 'start', 'stop', 'barcode' and 'count'.
        barcodes : list
            Passed on to check_in as second argument.
        check_in : callable
            Called as check_in(barcode, barcodes); a fragment is only counted if the result is True.
        """
        chunk_counts = {}

        n_fragments = len(chunk)
        for idx in range(n_fragments):
            fragment = chunk.iloc[idx]
            start, end = int(fragment['start']), int(fragment['stop'])
            barcode = fragment['barcode']
            count = int(fragment['count'])
            # length of insertion (-9 due to shifted cutting of Tn5)
            size = end - start - 9

            # Skip the fragment unless the check returns exactly True
            if check_in(barcode, barcodes) is not True:
                continue
            chunk_counts = self._add_fragment(chunk_counts, barcode, size, count)

        # self.d is only replaced while self.lock is held
        with self.lock:
            self.d = update_count_dict(self.d, chunk_counts)
            
    def _check_true(element: Any, alist: Optional[list[Any]] = None) -> bool:  # true regardless of input

        return True

    def _add_fragment(count_dict: dict[str, int],
                      barcode: str,
                      size: int,
                      count: int = 1,
                      max_size=1000):

        # Initialize if barcode is seen for the first time
        if barcode not in count_dict:
            count_dict[barcode] = {"mean_insertsize": 0, "insertsize_count": 0}

        # Add read to dict
        if size >= 0 and size <= max_size:  # do not save negative insertsize, and set a cap on the maximum insertsize to limit outlier effects

            count_dict[barcode]["insertsize_count"] += count

            # Update mean
            mu = count_dict[barcode]["mean_insertsize"]
            total_count = count_dict[barcode]["insertsize_count"]
            diff = (size - mu) / total_count
            count_dict[barcode]["mean_insertsize"] = mu + diff

            # Save to distribution
            if size not in count_dict[barcode]:  # first time size is seen
                sizes = np.arange(0,max_size+1)
                count_dict[barcode]['dist'] = np.zeros(max_size)
            count_dict[barcode]['dist'][size] += count
            
        return count_dict

In [None]:
# Scratch check: adding two arrays of equal length sums them element-wise
np.arange(0,1001) + np.arange(0,1001)

In [None]:
import numpy as np

In [None]:
count_dict[']