In [None]:
import gzip
import pandas as pd

def read_tsv_gz_to_dataframe_skipping_comments_and_empty_lines(tsv_gz_file, comment_character='#'):
    """Load a gzip-compressed, tab-separated file into a DataFrame.

    Parameters
    ----------
    tsv_gz_file : path of the ``.tsv.gz`` file to read.
    comment_character : single character that starts a comment; passed to
        ``pd.read_csv`` as ``comment`` (default ``'#'``).

    Returns
    -------
    pandas.DataFrame parsed with ``skip_blank_lines=True``; the header row is
    handled by pandas' defaults.
    """
    print("tsv_gz_file is {}".format(tsv_gz_file))

    # Parsing options collected in one place so the read call stays short.
    parser_options = {
        'delimiter': '\t',
        'comment': comment_character,
        'skip_blank_lines': True,
    }

    # Text mode ('rt') lets pandas consume decoded lines from the gzip stream.
    with gzip.open(tsv_gz_file, 'rt') as handle:
        frame = pd.read_csv(handle, **parser_options)

    return frame

# # Specify the path to the TSV.gz file
# tsv_gz_file = '/path/to/file.tsv.gz'

# # Read the TSV.gz file and create a DataFrame
# dataframe = read_tsv_gz_to_dataframe_skipping_comments_and_empty_lines(tsv_gz_file)

# # Display the DataFrame
# print(dataframe)


In [None]:
import pandas as pd
import gzip

def read_large_tsv_skip_comments_and_empty_lines(file_path, column_names, chunk_size=10000):
    """Read a gzip-compressed TSV file in chunks and return one combined DataFrame.

    Parameters
    ----------
    file_path : path of the gzip-compressed, tab-separated file.
    column_names : list of column labels passed to ``pd.read_csv`` as ``names``.
    chunk_size : number of rows parsed per chunk (default 10000, the value that
        was previously hard-coded).

    Returns
    -------
    pandas.DataFrame holding all chunks concatenated in file order. Lines
    starting with ``#`` and blank lines are skipped by the parser.
    """
    # The chunk iterator reads lazily, so it must be fully consumed while the
    # gzip handle is still open; the ``with`` block then guarantees the handle
    # is closed even if parsing fails part-way through.
    with gzip.open(file_path, 'rt') as handle:
        chunks = pd.read_csv(
            handle,
            sep='\t',
            chunksize=chunk_size,
            comment='#',
            skip_blank_lines=True,
            names=column_names
        )
        dataframes = list(chunks)

    combined_df = pd.concat(dataframes)
    return combined_df

In [None]:
import gzip
import pandas as pd

def read_tsv_gz_to_dataframe_skipping_comments_and_empty_lines_with_col_names(tsv_gz_file, col_names,comment_character='#', n_lines=1000):
    """Load a gzip-compressed TSV file into a DataFrame with explicit column names.

    Parameters
    ----------
    tsv_gz_file : path of the ``.tsv.gz`` file to read.
    col_names : list of column labels passed to ``pd.read_csv`` as ``names``.
    comment_character : single character that starts a comment (default ``'#'``).
    n_lines : maximum number of data rows to read. The default of 1000 keeps
        the row cap this function has always applied (it was a hard-coded
        debugging limit); pass ``None`` to read the entire file.

    Returns
    -------
    pandas.DataFrame with at most ``n_lines`` rows; blank lines are skipped.
    """
    # Open the compressed file in text mode so pandas receives decoded lines.
    with gzip.open(tsv_gz_file, 'rt') as file:
        df = pd.read_csv(
            file,
            delimiter='\t',
            comment=comment_character,
            skip_blank_lines=True,
            names=col_names,
            nrows=n_lines,
        )
    return df

In [8]:

def split_column_to_cell_id_and_atac_dataset(df):
    """Split the 'cell_id_atac_dataset' column and add a recombined column.

    The text before the first underscore is stored in 'cell_id' and the rest
    in 'atac_dataset'. A third column, 'atac_dataset_cell_id', holds
    ``atac_dataset + '_' + cell_id``.

    Parameters
    ----------
    df : DataFrame with a string column named 'cell_id_atac_dataset'.

    Returns
    -------
    A new DataFrame containing all three added columns.

    Notes
    -----
    'cell_id' and 'atac_dataset' are also written into the passed-in ``df``
    (in place); only the returned frame carries 'atac_dataset_cell_id'.
    """
    # n=1 caps the split at the first underscore, so exactly two columns are
    # produced. An unbounded split yields three or more columns whenever a
    # value contains several underscores, which makes the two-column
    # assignment below fail.
    df[['cell_id', 'atac_dataset']] = df['cell_id_atac_dataset'].str.split('_', n=1, expand=True)

    # Recombine in the opposite order: dataset first, then cell id.
    df = df.assign(atac_dataset_cell_id=df['atac_dataset'] + '_' + df['cell_id'])
    return df

In [4]:
import os
# prefix_file_name: prefix for the output file names, e.g. split_fragment_file_
def split_df_based_on_col_name(df,fld_name,col_name,prefix_file_name):
    """Write one TSV file per distinct value of ``col_name`` and report the paths.

    For every group produced by ``df.groupby(col_name)`` a sub-folder
    ``<fld_name>/<group_id>/`` is created (if missing) and the group's rows
    are saved to ``<prefix_file_name>_<group_id>.tsv`` inside it, tab-separated
    and without the index.

    Parameters
    ----------
    df : DataFrame to split.
    fld_name : root output folder.
    col_name : column whose values define the groups.
    prefix_file_name : prefix used for every output file name.

    Returns
    -------
    dict mapping each group id (as produced by ``groupby``) to the path of the
    file written for it.
    """
    files_created = dict()
    for group_id, split_df in df.groupby(col_name):
        print("group_id is {}".format(group_id))
        # Path components must be strings; group ids can be ints or other
        # non-string values, which os.path.join rejects with a TypeError.
        group_dir = os.path.join(fld_name, str(group_id))
        file_name = os.path.join(group_dir, "{}_{}.tsv".format(prefix_file_name, group_id))
        print("file_name is {}".format(file_name))
        # One folder per group id keeps the file names unique.
        os.makedirs(group_dir, exist_ok=True)
        files_created[group_id] = file_name
        split_df.to_csv(file_name, sep='\t', index=False)
        print(f"Split DataFrame for atac_dataset {group_id} saved to {file_name}")
    return files_created

In [None]:
import pandas as pd

def read_tsv_file(filepath):
    """Return the tab-separated file at ``filepath`` as a pandas DataFrame.

    All parsing options other than the tab delimiter are pandas' defaults.
    """
    return pd.read_csv(filepath, delimiter='\t')

# # Provide the filepath of the TSV file
# tsv_filepath = '/path/to/file.tsv'

# # Call the function to read the TSV file and return a dataframe
# dataframe = read_tsv_file(tsv_filepath)

# # Print the dataframe
# print(dataframe)


In [3]:
import pandas as pd

def filter_dataframe(df, column, values):
    """Keep only the rows of ``df`` whose ``column`` entry is one of ``values``.

    Parameters
    ----------
    df : DataFrame to filter.
    column : label of the column to test.
    values : collection of accepted values (anything ``Series.isin`` accepts).

    Returns
    -------
    The matching rows, with their original index labels. The column name and
    the shape of the result are printed as progress information.
    """
    print("filter_dataframe column is {}".format(column))

    # Boolean mask: True where the cell value is among the accepted values.
    keep_mask = df[column].isin(values)
    filtered_df = df.loc[keep_mask]

    print("****filtered_df size is: ",filtered_df.shape)
    return filtered_df

In [10]:
def convert_to_tagAlign(df):
    """Convert fragment rows to tagAlign-style rows, sorted by chromosome and start.

    The first four columns of ``df`` are read by position (presumably
    chromosome, start, end and barcode/name -- confirm against the caller).
    Every input row yields two output rows:

    * ``[col0, col1,     col1 + 1, col3, '1', '+']``
    * ``[col0, col2 - 1, col2,     col3, '1', '-']``

    Parameters
    ----------
    df : DataFrame with at least four columns; columns 1 and 2 must be numeric.

    Returns
    -------
    DataFrame with columns ``0..5``. Column 0 is an ordered categorical
    (chr1..chr22, chrX, chrY, then any other label seen in the data) and the
    rows are sorted by column 0, then column 1. Index labels are those of the
    unsorted, interleaved rows. The first 10 sorted rows are printed.
    """
    import pandas as pd

    # Custom chromosome order. Both spellings of the sex chromosomes are
    # listed: real data normally uses 'chrX'/'chrY', and a value that is not
    # one of the categories would be turned into NaN by pd.Categorical.
    custom_order = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9',
                    'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19',
                    'chr20', 'chr21', 'chr22', 'chrX', 'chrx', 'chrY', 'chry', 'NaN']

    # .iloc selects by position regardless of how the columns are labelled;
    # zip avoids the per-row Series construction that iterrows performs.
    rows = []
    for chrom, start, end, name in zip(df.iloc[:, 0], df.iloc[:, 1], df.iloc[:, 2], df.iloc[:, 3]):
        rows.append([chrom, start, start + 1, name, '1', '+'])
        rows.append([chrom, end - 1, end, name, '1', '-'])

    # Explicit column labels keep the six columns present for empty input.
    df_tag = pd.DataFrame(rows, columns=range(6))

    # Labels outside the custom order (e.g. chrM, unplaced contigs) are
    # appended as extra categories so they sort last instead of becoming NaN.
    known = set(custom_order)
    extras = sorted((c for c in pd.unique(df_tag[0].dropna()) if c not in known), key=str)
    df_tag[0] = pd.Categorical(df_tag[0], categories=custom_order + extras, ordered=True)

    # Sort by chromosome (custom order), then by the start coordinate.
    sorted_df = df_tag.sort_values(by=[0, 1])
    print("convert_to_tagAlign: sorted_df is: {}".format(sorted_df.head(10)))

    return sorted_df

In [None]:
def convert_to_tagAlign_unsorted(df):
    """Convert fragment rows to tagAlign-style rows, keeping the input order.

    The first four columns of ``df`` are read by position (presumably
    chromosome, start, end and barcode/name -- confirm against the caller).
    Every input row yields two consecutive output rows:

    * ``[col0, col1,     col1 + 1, col3, '1', '+']``
    * ``[col0, col2 - 1, col2,     col3, '1', '-']``

    Parameters
    ----------
    df : DataFrame with at least four columns; columns 1 and 2 must be numeric.

    Returns
    -------
    DataFrame with columns ``0..5`` and two rows per input row, in input order.
    """
    import pandas as pd

    # .iloc selects by position regardless of how the columns are labelled;
    # zip avoids the per-row Series construction that iterrows performs.
    rows = []
    for chrom, start, end, name in zip(df.iloc[:, 0], df.iloc[:, 1], df.iloc[:, 2], df.iloc[:, 3]):
        rows.append([chrom, start, start + 1, name, '1', '+'])
        rows.append([chrom, end - 1, end, name, '1', '-'])

    # Explicit column labels keep the six columns present for empty input.
    df_tag = pd.DataFrame(rows, columns=range(6))

    return df_tag

In [None]:
def convert_fragment_line_to_tagAlign(r):
    """Convert one tab-separated fragment line into two tagAlign-style lines.

    Example input line::

        chr1	10007	10175	ENCSR023FME_GAAGGTTCAAAGTGTCAGTCAA	1

    Only the first four fields are used. The result is two newline-terminated,
    tab-separated lines:

    * ``field0, field1,     field1 + 1, field3, 1, +``
    * ``field0, field2 - 1, field2,     field3, 1, -``

    Parameters
    ----------
    r : str, one line of a fragment file, with or without a trailing newline.

    Returns
    -------
    str containing both output lines.

    Raises
    ------
    IndexError if the line has fewer than four fields; ValueError if field 1
    or field 2 is not an integer.
    """
    # Drop the line terminator first: otherwise, on a four-field line, the
    # newline stays attached to the last used field and breaks the output row.
    fields = r.rstrip("\r\n").split("\t")
    chrom, start, end, name = fields[0], fields[1], fields[2], fields[3]

    plus_row = [chrom, start, str(int(start) + 1), name, '1', '+']
    minus_row = [chrom, str(int(end) - 1), end, name, '1', '-']

    return "\t".join(plus_row) + "\n" + "\t".join(minus_row) + "\n"

In [None]:
def split_fragment_line_string(string):
    """Split a fragment line into fields and expand its underscore-joined field.

    The line is split on tabs. The second-to-last field is split on ``_``;
    its first two parts are inserted just before the last field, and
    ``part2 + '_' + part1`` is appended at the end.

    For ``"chr1\\t10007\\t10175\\tA_B\\t1\\n"`` the result is
    ``['chr1', '10007', '10175', 'A_B', 'A', 'B', '1', 'B_A']``.

    Parameters
    ----------
    string : str, one tab-separated line, with or without a line terminator.

    Returns
    -------
    list of str: the original fields plus the three derived values.

    Raises
    ------
    IndexError if the line has fewer than two fields or the second-to-last
    field contains no underscore.

    Notes
    -----
    Only the first two underscore-separated parts are used; further parts of
    the second-to-last field are ignored.
    """
    # Strip both CR and LF so files with Windows line endings do not leave a
    # stray '\r' on the last field.
    fields = string.replace("\r", "").replace("\n", "").split("\t")

    parts = fields[-2].split("_")
    first_part, second_part = parts[0], parts[1]

    # insert(-1, ...) places each part immediately before the last field.
    fields.insert(-1, first_part)
    fields.insert(-1, second_part)

    # Build the swapped value from the parts themselves rather than from fixed
    # list positions, so the result is correct for any number of columns.
    fields.append(second_part + "_" + first_part)

    return fields
