In [66]:
import pandas as pd
from pandas import DataFrame
import sys

In [65]:
df = pd.read_csv('../data/raw/fastqc_data1.txt', names=['raw'])

Use str.contains to get access to the lines of interest

In [61]:
index_start = df[df['raw'].str.contains('Basic Statistics')].index.tolist()
index_ends = df[df['raw'].str.contains('END_MODULE')].index.tolist()

# from Basic Statistics TO first END_MODULE
df.iloc[index_start[0]:index_ends[0]]

Unnamed: 0,raw
1,>>Basic Statistics\tpass
2,#Measure\tValue
3,Filename\t4_age21_S12_L001_R1_001_concat.fastq.gz
4,File type\tConventional base calls
5,Encoding\tSanger / Illumina 1.9
6,Total Sequences\t37287903
7,Sequences flagged as poor quality\t0
8,Sequence length\t75
9,%GC\t55


If I had a mapper object that told me the structure of the FASTQ file, then I can use that to pull out the correct END_MODULE. I need a dict like structure with keys as title name and values as their order

In [75]:
fastq_order_map = dict([
    ('Basic Statistics', 0),
    ('Per base sequence quality', 1),
    ('Per tile sequence quality', 2),
    ('Per sequence quality scores', 3),
    ('Per base sequence content', 4),
    ('Per sequence GC content', 5),
    ('Per base N content', 6),
    ('Sequence Length Distribution', 7),
    ('Sequence Duplication Levels', 8),
    ('Overrepresented sequences', 9),
    ('Adapter Content', 10),
    ('Kmer Content	fail', 11),

])

In [62]:
fastq_order_map
type(df)

pandas.core.frame.DataFrame

In [283]:
def extract_raw(filepath:str, search_string: str) -> DataFrame:
    '''
    Extracts a section of a CSV file based on a search string and returns it as a DataFrame.

    This function reads a CSV file into a DataFrame, searches for a specified string within the 'raw' column,
    and extracts rows from the first occurrence of the search string to the corresponding 'END_MODULE' marker.
    The function uses a predefined mapping to determine the correct 'END_MODULE' for each search string.

    Parameters:
    filepath (str): The path to the CSV file to be read.
    search_string (str): The string to search for within the 'raw' column of the DataFrame.

    Returns:
    DataFrame: A DataFrame containing the rows from the first occurrence of the search string to the corresponding 'END_MODULE'.

    Raises:
    FileNotFoundError: If the specified file does not exist, the function prints an error message and exits the program.

    Example:
    >>> extract_raw('path/to/your/file.csv', 'Basic Statistics')
    '''

    fastq_order_map = dict([
    ('Basic Statistics', 0),
    ('Per base sequence quality', 1),
    ('Per tile sequence quality', 2),
    ('Per sequence quality scores', 3),
    ('Per base sequence content', 4),
    ('Per sequence GC content', 5),
    ('Per base N content', 6),
    ('Sequence Length Distribution', 7),
    ('Sequence Duplication Levels', 8),
    ('Overrepresented sequences', 9),
    ('Adapter Content', 10),
    ('Kmer Content', 11),

])

    try:
        df = pd.read_csv(filepath, names=['raw'])
    except FileNotFoundError as e:
        print(f'The file {filepath} could not be found: {e}')
        sys.exit(1)

    index_start = df[df['raw'].str.contains(search_string)].index.tolist()
    index_ends = df[df['raw'].str.contains('END_MODULE')].index.tolist()

    # from search_string TO first END_MODULE
    return df.iloc[index_start[0]+1:index_ends[fastq_order_map[search_string]]]

In [268]:
extract_raw('../data/raw/fastqc_data1.txt', 'Sequence Length Distribution')


Unnamed: 0,raw
8784,#Length\tCount
8785,75\t3.7287903E7


Good its working, I need to tab separate the output

In [209]:
op = extract_raw('../data/raw/fastqc_data1.txt', 'Sequence Duplication Levels')

# split = op['raw'].str.split('\t')
# header = split.head(1) # header
# values = split[1:] # vales
# op = pd.DataFrame(values)
# op.columns

In [238]:
df_op = pd.DataFrame(op)

op_values_raw = df_op[~df_op['raw'].str.startswith('#')]
op_values_raw


Unnamed: 0,raw
8790,1\t67.90783572533351\t16.30078473389498
8791,2\t15.113985471979557\t7.256005761881316
8792,3\t5.9511408968300845\t4.285584968692337
8793,4\t3.0257991756399845\t2.9052848162972165
8794,5\t1.7810060638290006\t2.1375881108671266
8795,6\t1.1719438383374363\t1.687899844685502
8796,7\t0.794715444467579\t1.3353598554009634
8797,8\t0.6092826471110954\t1.1700311360623195
8798,9\t0.4788782119343553\t1.0345612559736035
8799,>10\t2.744527094678331\t12.23008174185119


In [245]:
op_values_list = op_values_raw['raw'].apply(lambda x: x.split('\t'))
op_values_list

8790            [1, 67.90783572533351, 16.30078473389498]
8791           [2, 15.113985471979557, 7.256005761881316]
8792           [3, 5.9511408968300845, 4.285584968692337]
8793          [4, 3.0257991756399845, 2.9052848162972165]
8794          [5, 1.7810060638290006, 2.1375881108671266]
8795           [6, 1.1719438383374363, 1.687899844685502]
8796           [7, 0.794715444467579, 1.3353598554009634]
8797          [8, 0.6092826471110954, 1.1700311360623195]
8798          [9, 0.4788782119343553, 1.0345612559736035]
8799          [>10, 2.744527094678331, 12.23008174185119]
8800         [>50, 0.22606768563306923, 3.76557958541675]
8801        [>100, 0.1636436290040782, 7.629341777834772]
8802     [>500, 0.015018455462908557, 2.4740827793651854]
8803       [>1k, 0.010320496799441697, 5.084695654027676]
8804        [>5k, 0.00238818161467711, 4.066128575301802]
8805    [>10k+, 0.0034469813453151137, 26.63698940244748]
Name: raw, dtype: object

In [241]:
op_headers_raw = df_op[df_op['raw'].str.startswith('#')]
op_headers_raw

Unnamed: 0,raw
8788,#Total Deduplicated Percentage\t24.00427661960...
8789,#Duplication Level\tPercentage of deduplicated...


In [273]:
op_headers_list = op_headers_raw['raw'].apply(lambda x: x.split('\t'))
op_headers_list.tolist()[0]

['#Total Deduplicated Percentage', '24.004276619603495']

In [265]:
pd.DataFrame(op_values_list.tolist(), columns=op_headers_list.tolist()[-1])

Unnamed: 0,#Duplication Level,Percentage of deduplicated,Percentage of total
0,1,67.90783572533351,16.30078473389498
1,2,15.113985471979555,7.256005761881316
2,3,5.9511408968300845,4.285584968692337
3,4,3.0257991756399845,2.9052848162972165
4,5,1.7810060638290006,2.137588110867126
5,6,1.1719438383374363,1.687899844685502
6,7,0.794715444467579,1.3353598554009634
7,8,0.6092826471110954,1.1700311360623197
8,9,0.4788782119343553,1.0345612559736037
9,>10,2.744527094678331,12.23008174185119


That was pretty ugly, but the general steps are
1. extract header(s) by finding all rows with '#'. Some types have more than one row with '#'. Sequence Duplication Levels. Seems pretty unusual but I can handle it later.
2. extract values as rows without '#'
3. Split data on \t
4. convert to list
5. convert to dataframe with values and data and headers as column names

In [292]:
def parse_raw(raw_data: DataFrame) -> DataFrame:
    """
    Parses raw data from extract_raw().

    This function processes a DataFrame containing raw data, where some lines start with '#'.
    It extracts lines that do not start with '#', splits them on tab characters, and converts them into a list.
    It extracts header lines that start with '#', splits them on tab characters, and uses the last header line as column names.
    The function returns a formatted DataFrame with the extracted values and headers.

    Parameters:
    raw_data (DataFrame): The output from extract_raw()

    Returns:
    DataFrame: A formatted DataFrame with the extracted values and headers.

    Example:
    >>> raw_data = extract_raw('../data/raw/fastqc_data1.txt', 'Sequence Duplication Levels')
    >>> formatted_df = parse_raw(raw_data)
    >>> print(formatted_df)
    """

    # make sure incoming object is DataFrame
    df_raw = pd.DataFrame(raw_data)

    # extract lines NOT '#'
    values_raw = df_raw[~df_raw['raw'].str.startswith('#')]
    # convert to list by splitting on '\t'
    values_list = values_raw['raw'].apply(lambda x: x.split('\t'))
    values_list = values_list.tolist()

    # extract line '#
    headers_raw = df_raw[df_raw['raw'].str.startswith('#')]
    headers_list = headers_raw['raw'].apply(lambda x: x.split('\t'))    # sometimes len > 1
    if len(headers_list) > 1:
        headers_list = headers_list.tolist()[-1]                        # headers
        # misc_out = headers_list.tolist()[0]                             # misc
    else:
        headers_list = headers_list.tolist()

    formatted_df = pd.DataFrame(values_list, columns=headers_list)

    return formatted_df

In [298]:
op_test = extract_raw('../data/raw/fastqc_data1.txt', 'All the above')
parse_raw(op_test)

IndexError: list index out of range