# RR 전처리(Vent 사용 고려)

# RR 전처리 PySpark 코드 정리 (new_cohort 기준, 조건부 보존 포함)

## 1. Spark 세션 생성 & cohort 로딩

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp, expr, when, lit, concat, max as spark_max  

# 1. Create (or reuse) the Spark session used by the whole notebook.
spark = (
    SparkSession.builder
    .appName("MIMIC Final - RR Preprocessing")
    .getOrCreate()
)

# 2. Load the cohort file and keep one row per
#    (subject_id, hadm_id, stay_id) combination.
cohort = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/Users/skku_aws165/Documents/MIMIC/MIMIC-IV-Project/notebooks/final/new_cohort.csv")
    .select("subject_id", "hadm_id", "stay_id")
    .distinct()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/06 21:11:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# 2. RR 데이터 로딩 및 필터링

In [2]:
# Run this cell only once: it scans the full chartevents file, keeps the
# rows with itemid 220210 (the RR rows this notebook works on) and caches
# them as parquet for later cells.
rr_raw = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/Users/skku_aws165/Documents/MIMIC/icu/chartevents.csv.gz")
)
rr_only = rr_raw.where(col("itemid") == 220210)
rr_only.write.parquet(
    "/Users/skku_aws165/Documents/MIMIC/preprocessed/rr_only.parquet",
    mode="overwrite",
)

                                                                                

In [3]:
from pyspark.sql.functions import col, to_timestamp
import os

# Location of the cached RR-only parquet data
rr_parquet_path = "/Users/skku_aws165/Documents/MIMIC/preprocessed/rr_only.parquet"

# Load RR data: rebuild the cache from the raw chartevents file when it is
# missing, otherwise read the cached parquet directly.
if not os.path.exists(rr_parquet_path):
    print("🔄 RR 필터링 중 (chartevents.csv.gz → RR only)...")
    rr_raw = spark.read.csv(
        "/Users/skku_aws165/Documents/MIMIC/icu/chartevents.csv.gz",
        header=True,
        inferSchema=True,
    )
    rr_df = rr_raw.where(col("itemid") == 220210)
    rr_df.write.parquet(rr_parquet_path, mode="overwrite")
    print("✅ 저장 완료:", rr_parquet_path)
else:
    print("✅ 캐시된 RR parquet 파일 로딩 중...")
    rr_df = spark.read.parquet(rr_parquet_path)

# Shared post-processing for both branches: parse charttime into a
# timestamp, then keep only rows belonging to the cohort.
rr_df = (
    rr_df
    .withColumn("charttime", to_timestamp("charttime"))
    .join(cohort, on=["subject_id", "hadm_id", "stay_id"], how="inner")
)

# Preview a few rows
rr_df.show(3)


✅ 캐시된 RR parquet 파일 로딩 중...
+----------+--------+--------+------------+-------------------+-------------------+------+-----+--------+--------+-------+
|subject_id| hadm_id| stay_id|caregiver_id|          charttime|          storetime|itemid|value|valuenum|valueuom|warning|
+----------+--------+--------+------------+-------------------+-------------------+------+-----+--------+--------+-------+
|  10000690|25860671|37081114|        8787|2150-11-06 08:00:00|2150-11-06 09:07:00|220210|   23|    23.0|insp/min|      0|
|  10000690|25860671|37081114|        8787|2150-11-06 09:00:00|2150-11-06 09:07:00|220210|   26|    26.0|insp/min|      0|
|  10000690|25860671|37081114|        8787|2150-11-06 10:00:00|2150-11-06 13:15:00|220210|   27|    27.0|insp/min|      0|
+----------+--------+--------+------------+-------------------+-------------------+------+-----+--------+--------+-------+
only showing top 3 rows


# 3. chart 기반 Ventilator 정보 로딩

In [4]:
# 5. Load the ventilator-related itemids from chart events.
#
# chartevents is read explicitly here instead of reusing `rr_raw`: that name
# is only bound by the run-once extraction cell or by the cache-miss branch
# of the RR loading cell, so in a fresh session where the parquet cache
# already exists it would be undefined and this cell would raise NameError.
vent_itemids = [223848, 223849, 223870]
chartevents_df = spark.read.csv(
    "/Users/skku_aws165/Documents/MIMIC/icu/chartevents.csv.gz",
    header=True,
    inferSchema=True,
)
chart_vent_df = chartevents_df.filter(col("itemid").isin(vent_itemids))
# Parse charttime so it can be compared with the RR timestamps.
chart_vent_df = chart_vent_df.withColumn("charttime", to_timestamp("charttime"))
# Restrict to stays that belong to the cohort.
chart_vent_df = chart_vent_df.join(cohort, on=["subject_id", "hadm_id", "stay_id"], how="inner")

# 4. procedureevents 기반 Ventilation 정보 로딩

In [7]:
# 6. Load procedureevents, keep the ventilation itemids (225792, 225794),
#    parse starttime into a timestamp and restrict to the cohort.
proc_vent_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/Users/skku_aws165/Documents/MIMIC/icu/procedureevents.csv.gz")
    .where(col("itemid").isin([225792, 225794]))
    .withColumn("starttime", to_timestamp("starttime"))
    .join(cohort, on=["subject_id", "hadm_id", "stay_id"], how="inner")
)


                                                                                

# 5. RR 조건부 필터링 함수 정의

In [10]:
def filter_rr_with_ventilator(rr_df, chart_vent_df, proc_vent_df, cohort):
    """
    Keep low RR readings only when there is ventilator evidence for them.

    Rows with ``valuenum`` > 6 are always kept. Rows with
    0 < ``valuenum`` <= 6 are kept only if at least one of these holds:

    * a ventilator chart event (itemid 223848, 223849 or 223870) exists for
      the same subject_id / hadm_id / stay_id at exactly the same
      ``charttime``;
    * a procedure event for the same subject_id / hadm_id / stay_id has a
      ``starttime`` within 3600 seconds (before or after) of the RR
      ``charttime``.

    Rows whose ``valuenum`` is 0, negative or null satisfy neither range
    filter and are therefore not part of the result. Exact duplicate rows
    are removed from the result.

    Args:
        rr_df: RR measurements; must contain subject_id, hadm_id, stay_id,
            charttime and valuenum.
        chart_vent_df: Chart events; must contain subject_id, hadm_id,
            stay_id, charttime and itemid.
        proc_vent_df: Procedure events; must contain subject_id, hadm_id,
            stay_id and starttime.
        cohort: Not used by this function; kept so existing callers that
            pass four arguments keep working.

    Returns:
        A DataFrame with the same columns as ``rr_df``.
    """
    from pyspark.sql.functions import unix_timestamp, abs as spark_abs

    stay_keys = ["subject_id", "hadm_id", "stay_id"]

    # Step 1: low RR readings (0 < RR <= 6) that need ventilator evidence.
    rr_low_df = rr_df.filter((col("valuenum") > 0) & (col("valuenum") <= 6))

    # Step 2: distinct (stay, charttime) pairs with a ventilator chart event.
    vent_itemids = [223848, 223849, 223870]
    chart_vent_flag = (
        chart_vent_df
        .filter(col("itemid").isin(vent_itemids))
        .select(*stay_keys, "charttime")
        .dropDuplicates()
    )

    # Step 3: distinct (stay, starttime) pairs from procedure events; the
    # start time is exposed as "charttime" so both sources share one name.
    proc_vent_flag = (
        proc_vent_df
        .select(*stay_keys, col("starttime").alias("charttime"))
        .dropDuplicates()
    )

    # Step 4: keep low RR rows with a chart event at the same timestamp.
    rr_keep_chart = rr_low_df.join(
        chart_vent_flag,
        on=stay_keys + ["charttime"],
        how="leftsemi"
    )

    # Step 5: keep low RR rows with a procedure start within one hour.
    # A left-semi join keeps each RR row at most once, instead of producing
    # one copy per matching procedure event that must be deduplicated later.
    same_stay = (
        (col("rr.subject_id") == col("pv.subject_id"))
        & (col("rr.hadm_id") == col("pv.hadm_id"))
        & (col("rr.stay_id") == col("pv.stay_id"))
    )
    within_one_hour = spark_abs(
        unix_timestamp("rr.charttime") - unix_timestamp("pv.charttime")
    ) <= 3600
    rr_keep_proc = rr_low_df.alias("rr").join(
        proc_vent_flag.alias("pv"),
        on=same_stay & within_one_hour,
        how="leftsemi"
    )

    # Step 6: union of both kept sets. union() matches columns by position,
    # so both sides are projected to rr_df's column order first.
    common_cols = rr_df.columns
    rr_keep_all = rr_keep_chart.select(*common_cols).union(
        rr_keep_proc.select(*common_cols)
    )

    # Step 7: readings above 6 are always kept.
    rr_normal_df = rr_df.filter(col("valuenum") > 6)

    # Step 8: final merge; a single dropDuplicates here also removes rows
    # that were kept by both the chart-event and procedure-event rules.
    rr_final = rr_normal_df.union(rr_keep_all).dropDuplicates()

    return rr_final


# 6. 함수 적용 및 저장

In [11]:
# 7. Apply the ventilator-aware RR filter.
rr_final = filter_rr_with_ventilator(
    rr_df=rr_df,
    chart_vent_df=chart_vent_df,
    proc_vent_df=proc_vent_df,
    cohort=cohort,
)

# 8. Persist the filtered RR data as parquet, replacing any previous output.
rr_final.write.parquet(
    "/Users/skku_aws165/Documents/MIMIC/MIMIC-IV-Project/outputs/rr_filtered_with_vent.parquet",
    mode="overwrite",
)


25/08/07 13:43:26 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
25/08/07 13:43:26 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
25/08/07 13:43:26 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 76.00% for 10 writers
25/08/07 13:43:27 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
25/08/07 13:43:27 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
25/08/07 13:43:27 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
25/08/07 13:43:27 WARN MemoryManager: Total allocation exceeds 95.00%

In [12]:
# Also save as CSV with a header row (note: may take a long time for a
# large dataset). The target path becomes a directory of part files.
rr_final.write.csv(
    "/Users/skku_aws165/Documents/MIMIC/MIMIC-IV-Project/outputs/rr_filtered_with_vent.csv",
    mode="overwrite",
    header=True,
)

                                                                                

In [13]:
from pyspark.sql import SparkSession
import shutil
import os

# Spark session (reuses the existing one when there is one)
spark = SparkSession.builder.getOrCreate()

# Source directory of part files and the single merged output file
csv_dir = "/Users/skku_aws165/Documents/MIMIC/MIMIC-IV-Project/outputs/rr_filtered_with_vent.csv"
merged_csv_path = "/Users/skku_aws165/Documents/MIMIC/MIMIC-IV-Project/outputs/rr_filtered_with_vent_merged.csv"

# Load the CSV directory
df = spark.read.option("header", True).csv(csv_dir)

# Spark writes one part file per partition, so rewrite the data with a
# single partition into a temporary directory and move that one part file
# to the final location.
tmp_path = csv_dir + "_tmp"

df.coalesce(1).write.option("header", True).mode("overwrite").csv(tmp_path)

try:
    # Locate the part-*.csv file inside the temporary directory
    part_files = [
        filename
        for filename in os.listdir(tmp_path)
        if filename.startswith("part-") and filename.endswith(".csv")
    ]
    if not part_files:
        # Without this check the success message below would be printed
        # even though no merged file was produced.
        raise FileNotFoundError(f"no part-*.csv file found in {tmp_path}")
    shutil.move(os.path.join(tmp_path, part_files[0]), merged_csv_path)
finally:
    # Remove the temporary directory whether or not the move succeeded
    shutil.rmtree(tmp_path)

print("✅ 단일 CSV 저장 완료:", merged_csv_path)


[Stage 44:>                                                         (0 + 1) / 1]

✅ 단일 CSV 저장 완료: /Users/skku_aws165/Documents/MIMIC/MIMIC-IV-Project/outputs/rr_filtered_with_vent_merged.csv


                                                                                