# .imzML 파일 뷰어 및 내보내기 도구

이 노트북은 단일 `.imzML` 파일을 열어 내부 메타데이터와 스펙트럼 정보를 확인하고, 트러블슈팅을 위해 데이터를 CSV로 내보내는 도구입니다.

**사용 방법:**
1. `--- 1. 설정 ---` 셀에서 `imzml_filepath` 변수에 검사할 파일의 경로를 입력합니다.
2. `lxml` 파서로 오류가 발생한 파일의 경우 `parse_mode = 'xml'`로 설정합니다.
3. 상단 메뉴의 [Kernel] -> [Restart Kernel and Run All Cells...]를 실행합니다.

In [1]:
import os
import numpy as np
import pandas as pd
from pyimzml.ImzMLParser import ImzMLParser
import warnings
from IPython.display import display, HTML

# pyimzml의 특정 UserWarning을 무시합니다.
warnings.filterwarnings('ignore', category=UserWarning, module='pyimzml')

print("라이브러리 임포트 완료.")

라이브러리 임포트 완료.


---

In [4]:
# --- 1. 설정 --- 
# 검사할 .imzML 파일의 경로를 지정하세요.
# (프로젝트 root 폴더 기준)

# 예시: 이전에 오류가 발생했던 파일
imzml_filepath = "data/het 5-2 hippocampus-total ion count.imzML"

# --- 파서(Parser) 모드 설정 ---
# 'lxml': (기본값) 빠르지만 엄격합니다. 파일이 완벽해야 합니다.
# 'xml':  느리지만 관대합니다. 파일에 사소한 오류가 있어도 (대부분) 읽어냅니다.
parse_mode = 'xml' # 또는 'lxml'

print(f"  - 대상 파일: {imzml_filepath}")
print(f"  - 파서 모드: {parse_mode}")

# .ibd 파일이 존재하는지 확인
ibd_filepath = os.path.splitext(imzml_filepath)[0] + '.ibd'
if not os.path.exists(ibd_filepath):
    print(f"경고: .ibd 파일이 없습니다! {ibd_filepath}")
if not os.path.exists(imzml_filepath):
    print(f"오류: .imzML 파일이 없습니다! {imzml_filepath}")


  - 대상 파일: data/het 5-2 hippocampus-total ion count.imzML
  - 파서 모드: xml


---

In [5]:
# --- 2. 파일 파싱 및 기본 정보 --- 

p = None # 파서 객체를 저장할 변수
try:
    print(f"'{imzml_filepath}' 파일 파싱 시도 (모드: {parse_mode})...")
    p = ImzMLParser(imzml_filepath, parse_lib=parse_mode)
    print("파일 파싱 성공.")
    
except FileNotFoundError:
    print(f"오류: 파일을 찾을 수 없습니다. {imzml_filepath}")
except Exception as e:
    print(f"데이터 파싱 중 심각한 오류 발생: {e}")
    print("  -> 'lxml' 모드에서 오류가 났다면, 'xml' 모드로 변경해보세요.")

'data/het 5-2 hippocampus-total ion count.imzML' 파일 파싱 시도 (모드: xml)...
파일 파싱 성공.


## 3. 메타데이터 확인

파일에서 읽어온 주요 메타데이터를 테이블 형식으로 표시합니다.

In [6]:
if p is not None:
    # 1. 이미지 기본 정보 (좌표 기준)
    if p.coordinates:
        num_spectra = len(p.coordinates)
        # x, y 좌표만 추출
        coords_x = [coord[0] for coord in p.coordinates]
        coords_y = [coord[1] for coord in p.coordinates]
        
        max_x = max(coords_x)
        max_y = max(coords_y)
        
        print("--- 이미지 차원 정보 (Image Dimensions) ---")
        df_dims = pd.DataFrame([
            {'Total Spectra (Pixels)': num_spectra,
             'Max X Index': max_x,
             'Max Y Index': max_y,
             'Image Size (pixels)': f"{max_x} x {max_y}"}
        ])
        display(df_dims)
        
    else:
        print("오류: 파일에 좌표 데이터가 없습니다.")

    # 2. 소프트웨어 및 기기 정보
    print("\n--- 소프트웨어 및 기기 정보 (Software & Instrument) ---")
    try:
        software_name = p.metadata.software.name if p.metadata.software else 'N/A'
        instrument_name = p.metadata.instrument_configuration_list[0].name if p.metadata.instrument_configuration_list else 'N/A'
        
        df_meta = pd.DataFrame([
            {'Parameter': 'Generating Software', 'Value': software_name},
            {'Parameter': 'Instrument Configuration', 'Value': instrument_name}
        ])
        display(df_meta.set_index('Parameter'))
    except Exception as e:
        print(f"메타데이터(소프트웨어/기기) 읽기 중 오류: {e}")

--- 이미지 차원 정보 (Image Dimensions) ---


Unnamed: 0,Total Spectra (Pixels),Max X Index,Max Y Index,Image Size (pixels)
0,750,48,26,48 x 26



--- 소프트웨어 및 기기 정보 (Software & Instrument) ---
메타데이터(소프트웨어/기기) 읽기 중 오류: 'Metadata' object has no attribute 'software'


## 4. 첫 번째 스펙트럼 확인

가장 첫 번째 픽셀(Index 0)의 m/z와 Intensity 데이터를 확인합니다.

In [7]:
if p is not None and len(p.coordinates) > 0:
    print(f"--- 첫 번째 스펙트럼 (Index 0) 상세 정보 ---")
    print(f"  - 좌표 (x, y, z): {p.coordinates[0]}")
    
    try:
        mzs, intensities = p.getspectrum(0)
        
        df_spec = pd.DataFrame({
            'm/z': mzs,
            'intensity': intensities
        })
        
        print(f"  - m/z 값 개수: {len(mzs)}")
        print(f"  - m/z 범위: {mzs.min():.4f} ~ {mzs.max():.4f}")
        print(f"  - Intensity 합계: {intensities.sum():.2f}")
        
        print("\n--- 데이터 (상위 20개) ---")
        display(df_spec.head(20))
        
    except Exception as e:
        print(f"스펙트럼(Index 0)을 읽는 중 오류 발생: {e}")
else:
    print("파일이 파싱되지 않았거나 스펙트럼이 없습니다.")

--- 첫 번째 스펙트럼 (Index 0) 상세 정보 ---
  - 좌표 (x, y, z): (17, 1, 1)
  - m/z 값 개수: 56
  - m/z 범위: 933.3122 ~ 2612.9590
  - Intensity 합계: 14219.99

--- 데이터 (상위 20개) ---


Unnamed: 0,m/z,intensity
0,933.312161,244.024734
1,933.340161,76.535027
2,1079.371882,349.399048
3,1095.366779,230.714294
4,1136.393015,194.11058
5,1257.428938,1402.032959
6,1282.458693,592.314575
7,1298.453674,207.421021
8,1339.476734,292.829681
9,1403.485785,106.483521


## 5. 전체 데이터 CSV로 내보내기 (Binned)

`src/parsing.py`에서 사용된 것과 유사한 'm/z 반올림(precision)' 방식을 사용하여, 현재 파일의 **모든 픽셀** 데이터를 'Wide Format' CSV로 저장합니다. 

**경고:** 이 작업은 파일 크기에 따라 매우 오래 걸릴 수 있습니다.

In [10]:
def export_binned_data_to_csv(p, output_filename, mz_precision=2):
    """
    ImzMLParser 객체(p)를 받아 m/z precision 기준으로 비닝(binning)된
    'wide' format CSV 파일을 저장합니다.
    """
    if p is None:
        print("오류: 파서 객체(p)가 없습니다. (파일이 열리지 않음)")
        return

    try:
        num_spectra = len(p.coordinates)
        if num_spectra == 0:
            print("오류: 파일에 스펙트럼 데이터가 없습니다.")
            return

        print(f"1단계: m/z 값을 소수점 {mz_precision}자리까지 반올림하여 마스터 축 생성 중...")
        master_mzs_set = set()
        for i, _ in enumerate(p.coordinates):
            if (i+1) % 1000 == 0:
                print(f"  - 마스터 축 생성 중... {i+1}/{num_spectra}")
            mzs, _ = p.getspectrum(i)
            master_mzs_set.update(np.round(mzs, mz_precision))
        
        master_mzs = sorted(list(master_mzs_set))
        num_master_mzs = len(master_mzs)
        mz_to_index_map = {mz: i for i, mz in enumerate(master_mzs)}
        print(f"  - 고유 m/z 값 {num_master_mzs}개를 기준으로 데이터 테이블을 생성합니다.")

        all_aligned_intensities = []
        coordinates_list = []
        print("2단계: 각 스펙트럼을 마스터 m/z 축에 정렬하는 중...")
        
        for i, (x, y, z) in enumerate(p.coordinates):
            if (i+1) % 1000 == 0:
                print(f"  - 스펙트럼 처리 중... {i+1}/{num_spectra}")
            
            mzs, intensities = p.getspectrum(i)
            aligned_intensities = np.zeros(num_master_mzs, dtype=np.float32)
            rounded_mzs = np.round(mzs, mz_precision)

            for mz, intensity in zip(rounded_mzs, intensities):
                if mz in mz_to_index_map:
                    index = mz_to_index_map[mz]
                    aligned_intensities[index] += intensity
            
            all_aligned_intensities.append(aligned_intensities)
            coordinates_list.append({'x': x, 'y': y, 'z': z})
        
        print("3단계: DataFrame 생성 및 CSV 저장...")
        
        # m/z 컬럼 이름을 문자열로 변환 (예: 72.99)
        formatted_headers = [f"{mz:.{mz_precision}f}" for mz in master_mzs]
        df_coords = pd.DataFrame(coordinates_list)
        df_intensities = pd.DataFrame(all_aligned_intensities, columns=formatted_headers)
        
        df_full = pd.concat([df_coords, df_intensities], axis=1)
        
        df_full.to_csv(output_filename, index=False, float_format='%.4f')
        print(f"\nCSV 파일 저장 완료: {output_filename}")
        print(f"  - 형태 (행, 열): {df_full.shape}")

    except Exception as e:
        print(f"CSV 내보내기 중 오류 발생: {e}")
        traceback.print_exc() # 상세 오류 출력

    return df_full


### CSV 내보내기 실행

아래 셀을 실행하여 위에서 정의한 함수를 실행합니다. `mz_precision` 값을 조절하여 m/z bin의 정밀도를 설정할 수 있습니다.

In [13]:
# --- CSV 내보내기 설정 ---

# 소수점 몇째 자리까지 m/z 값을 그룹화할지 설정 (예: 2 -> 72.99)
MZ_PRECISION = 2

# 저장할 파일 이름 설정
output_csv_filename = f"{os.path.splitext(imzml_filepath)[0]}_binned_export.csv"

print("--- 5. CSV 내보내기 시작 ---")
print(f"  - 정밀도(precision): {MZ_PRECISION}")
print(f"  - 저장 파일: {output_csv_filename}")

# 함수 실행
df_full = export_binned_data_to_csv(p, output_csv_filename, mz_precision=MZ_PRECISION)

print("--- 작업 완료 ---")
display(df_full.head(50))

--- 5. CSV 내보내기 시작 ---
  - 정밀도(precision): 2
  - 저장 파일: data/het 5-2 hippocampus-total ion count_binned_export.csv
1단계: m/z 값을 소수점 2자리까지 반올림하여 마스터 축 생성 중...
  - 고유 m/z 값 562개를 기준으로 데이터 테이블을 생성합니다.
2단계: 각 스펙트럼을 마스터 m/z 축에 정렬하는 중...
3단계: DataFrame 생성 및 CSV 저장...

CSV 파일 저장 완료: data/het 5-2 hippocampus-total ion count_binned_export.csv
  - 형태 (행, 열): (750, 565)
--- 작업 완료 ---


Unnamed: 0,x,y,z,933.27,933.28,933.29,933.30,933.31,933.32,933.33,...,2978.05,2978.06,2978.08,2978.09,2978.11,2978.12,3019.10,3124.11,3124.12,3124.14
0,17,1,1,0.0,0.0,0.0,0.0,244.024734,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,18,1,1,0.0,0.0,0.0,0.0,0.0,184.930664,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,19,1,1,0.0,0.0,0.0,0.0,0.0,192.719421,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,20,1,1,0.0,0.0,0.0,0.0,0.0,110.831329,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,21,1,1,0.0,0.0,0.0,0.0,0.0,160.117447,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,22,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,23,1,1,0.0,0.0,0.0,0.0,0.0,158.366837,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,24,1,1,0.0,0.0,0.0,0.0,0.0,183.715439,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,13,2,1,0.0,72.387489,0.0,0.0,0.0,316.695251,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,14,2,1,0.0,0.0,0.0,0.0,0.0,234.004578,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
