In [3]:
import os
import pandas as pd

In [5]:
def create_output_directory(input_filepath, suffix="_clean_data"):
   file_name = os.path.basename(input_filepath)
   base_name = os.path.splitext(file_name)[0]
   new_dir_name = base_name + suffix
   input_dir = os.path.dirname(input_filepath)
   if not input_dir:
       new_dir_path = new_dir_name
   else: 
       new_dir_path = os.path.join(input_dir, new_dir_name)
   os.makedirs(new_dir_path, exist_ok=True)
   return new_dir_path

In [19]:
SERIES_MATRIX_PATH = "E:/Biogrademy project/GSE69657/GSE69657_series_matrix.csv"
output_dir_path = create_output_directory(SERIES_MATRIX_PATH)
print(f"\nFinal directory path variable: {output_dir_path}")


Final directory path variable: E:/Biogrademy project/GSE69657\GSE69657_series_matrix_clean_data


In [23]:
def clean_series_matrix_csv(input_filepath, output_dir_path):
    print(f"Processing: {input_filepath}")
    start_row = 0
    with open(input_filepath, 'r', encoding='latin-1') as f:
        for i, line in enumerate(f):
            if line.startswith('"ID_REF"') or line.startswith('ID_REF') or '!Series_matrix_table_begin' in line:
                starts_row = i+1
                if line.startswith('!'):
                    start_row = i+1
                else:
                    start_row = i
                print(f"Found data table header/maker on line index {i}. Starting read at row: {start_row}")
                break

    if start_row == 0:
        print("Warning: Could not find clear start marker. Assuming data starts at row 0.")

    try:
        expression_df = pd.read_csv(
            input_filepath,
            sep='\t',
            skiprows=start_row,
            index_col=0,
            skipinitialspace=True,
            encoding='latin-1'
        )
    except:
        expression_df = pd.read_csv(
           input_filepath,
            sep=',',
            skiprows=start_row,
            index_col=0,
            skipinitialspace=True
        )
    
    expression_df.columns = expression_df.columns.str.strip('"') 
    expression_df.index = expression_df.index.str.strip('"')

    if expression_df.index[-1].startswith('!Series_matrix_table_end'):
        expression_df = expression_df.iloc[:-1]

    base_file_name = os.path.basename(input_filepath).split('.')[0]
    output_filepath = os.path.join(output_dir_path, f"{base_file_name}_clean.csv")

    expression_df.to_csv(output_filepath)

    print(f"\n✅ Successfully cleaned and saved data to: {output_filepath}")
    print(f"Cleaned Data Shape (Genes x Samples): {expression_df.shape}")

    return expression_df 

clean_data_df = clean_series_matrix_csv(SERIES_MATRIX_PATH, output_dir_path)

Processing: E:/Biogrademy project/GSE69657/GSE69657_series_matrix.csv
Found data table header/maker on line index 63. Starting read at row: 63

✅ Successfully cleaned and saved data to: E:/Biogrademy project/GSE69657\GSE69657_series_matrix_clean_data\GSE69657_series_matrix_clean.csv
Cleaned Data Shape (Genes x Samples): (54676, 0)
