In [1]:
ORIGIN_DATA = "origin_data"
DATA_CSV = "origin_data_csv"
LOGS = "logs"
MATCH = "Z:"
import pandas as pd

import numpy as np
import wfdb                                      # 讀取 WFDB header / record :contentReference[oaicite:4]{index=4}
from pathlib import Path                         # 物件導向檔案操作 :contentReference[oaicite:5]{index=5}
from datetime import datetime, timedelta, date, time
from tqdm import tqdm                            # 進度列（可省略）
import logging, os                               # 紀錄檔與系統路徑
from collections import defaultdict
import logging
import os
import ast
import re
from typing import List, Optional, Tuple,Set
from  tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

try:
    alive_yuran = pd.read_csv("./experiment_data_from_yuran/alive_42731_withHRV.csv")
    dead_yuran = pd.read_csv("./experiment_data_from_yuran/dead_42731_withHRV.csv")

    alive_set = set(alive_yuran['SUBJECT_ID'].to_list())
    dead_set = set(dead_yuran["SUBJECT_ID"].to_list())

    total_set = alive_set | dead_set

    mort_stage2_filtered = pd.read_csv(os.path.join(LOGS, "mort_stage2_filtered.csv"))
    surv_stage2_filtered = pd.read_csv(os.path.join(LOGS, "surv_stage2_filtered.csv"))

    mort_set = set(mort_stage2_filtered['SUBJECT_ID'].to_list())
    surv_set = set(surv_stage2_filtered['SUBJECT_ID'].to_list())
    
except Exception as e:
    print(e)

In [2]:
def extract_wave_id(row):
    header_path = os.path.join(MATCH, row['PREFIX'], row['FOLDER'], row['HEADER'])
    
    try:
        hdr = wfdb.rdheader(header_path, rd_segments=True)
        for seg_name in hdr.seg_name:
            if seg_name == '~':
                continue
            wave_id = seg_name.split("_")[0]
            if re.fullmatch(r'\d+', wave_id):
                return wave_id
    except Exception as e:
        print(f"Error reading {header_path}: {e}")
    
    return None

def read_wave_id(df: pd.DataFrame, max_workers=8) -> pd.DataFrame:
    rows = list(df.iterrows())

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        wave_ids = list(tqdm(
            executor.map(lambda x: extract_wave_id(x[1]), rows),
            total=len(rows),
            desc="Extracting wave_id"
        ))

    df['wave_id'] = wave_ids
    return df

# mort_stage2_filtered_with_wave_id=read_wave_id(mort_stage2_filtered)
# print(mort_stage2_filtered_with_wave_id.info())

# mort_stage2_filtered_with_wave_id = mort_stage2_filtered_with_wave_id.rename(columns={"AGE_YEARS":"age"})
# mort_stage2_filtered_with_wave_id.to_csv(os.path.join(LOGS,"mort_stage2_filtered_with_wave_id.csv"),index = False)

surv_stage2_filtered_with_wave_id = read_wave_id(surv_stage2_filtered)
surv_stage2_filtered_with_wave_id = surv_stage2_filtered_with_wave_id.rename(columns={"AGE_YEARS":"age"})
surv_stage2_filtered_with_wave_id.to_csv(os.path.join(LOGS,"surv_stage2_filtered_with_wave_id.csv"),index = False)


Extracting wave_id: 100%|██████████| 1450/1450 [06:41<00:00,  3.61it/s]


In [None]:
# Mort: ['SUBJECT_ID', 'HADM_ID','wave_id'] 嚴格交叉比對

cols = ['SUBJECT_ID', 'HADM_ID','wave_id']
mort_final_same = mort_stage2_filtered_with_wave_id[mort_stage2_filtered_with_wave_id['SUBJECT_ID'].isin(dead_set & mort_set)]
dead_yuran = dead_yuran[dead_yuran['SUBJECT_ID'].isin(dead_set & mort_set)]

# 前處理

dead_yuran['wave_id'] = dead_yuran['wave_id'].astype(str).str.strip()
mort_final_same['wave_id'] = mort_final_same['wave_id'].astype(str).str.strip()


# 1. 建立每組 row 的 tuple set
set_mort = set(tuple(row) for row in mort_final_same[cols].values)
set_yuran = set(tuple(row) for row in dead_yuran[cols].values)

# 2. 找兩組有交集（完全重複的 row）
common = set_mort & set_yuran

print(f"共有 {len(common)} 組 row 在兩個表格都出現。")
# if common:
#     print("重複的 row：")
#     for t in common:
#         print(t)
# else:
#     print("沒有任何 row 同時存在兩表。")

# 3. 若要找 dead_yuran 中有哪些 row 沒有出現在 mort_final_same：
only_in_yuran = set_yuran - set_mort
print("dead_yuran 有、mort_final_same 沒有的 row：")
for t in only_in_yuran:
    print(t)

共有 96 組 row 在兩個表格都出現。
dead_yuran 有、mort_final_same 沒有的 row：


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mort_final_same['wave_id'] = mort_final_same['wave_id'].astype(str).str.strip()


In [24]:
# Surv: ['SUBJECT_ID', 'HADM_ID','wave_id'] 嚴格交叉比對

cols = ['SUBJECT_ID', 'HADM_ID','wave_id']
surv_final_same = surv_stage2_filtered_with_wave_id[surv_stage2_filtered_with_wave_id['SUBJECT_ID'].isin(alive_set & surv_set)]
alive_yuran = alive_yuran[alive_yuran['SUBJECT_ID'].isin(alive_set & surv_set)]

# 前處理

alive_yuran['wave_id'] = alive_yuran['wave_id'].astype(str).str.strip()
surv_final_same['wave_id'] = surv_final_same['wave_id'].astype(str).str.strip()

# 1. 建立每組 row 的 tuple set
set_surv = set(tuple(row) for row in surv_final_same[cols].values)
set_yuran = set(tuple(row) for row in alive_yuran[cols].values)

common = set_surv & set_yuran

print(f"共有 {len(common)} 組 row 在兩個表格都出現。")

only_in_yuran = set_yuran - set_surv
print("alive_yuran 有、surv_final_same 沒有的 row：")
for t in only_in_yuran:
    print(t)

共有 361 組 row 在兩個表格都出現。
alive_yuran 有、surv_final_same 沒有的 row：


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  surv_final_same['wave_id'] = surv_final_same['wave_id'].astype(str).str.strip()


In [3]:
import ast
from typing import List, Union
import pandas as pd
import numpy as np

def _safe_literal(val: str):
    """安全地轉為 Python 物件；失敗就回 None。"""
    try:
        return ast.literal_eval(val)
    except Exception:
        return None

def _parse_icd9(val: str) -> List[int]:
    """將 '[1,2,3]' → [1,2,3]；不合法或空值回 []."""
    if pd.isna(val) or val.strip() in ("", "[]"):
        return []
    items = _safe_literal(val)
    if items is None:
        return []
    return [int(x) for x in items]

def _parse_ecg_list(val: str) -> List[pd.Timestamp]:
    """將 '[2160/7/5 18:27,...]' 轉 List[pd.Timestamp]；空值回 []."""
    if pd.isna(val) or val.strip() in ("", "[]"):
        return []
    # 去掉頭尾中括號再以逗號切
    ts_list = val.strip("[]").split(",")
    return [pd.to_datetime(ts.strip(), errors="coerce") for ts in ts_list]

def _parse_datetime(val: str) -> pd.Timestamp:
    return pd.to_datetime(val, errors="coerce")

def load_patient_csv(path: str) -> pd.DataFrame:
    date_cols = ["ECG_DATETIME", "ADMITTIME", "DISCHTIME", "DEATHTIME", "DOB"]
    converters = {
        **{c: _parse_datetime for c in date_cols},
        "ICD9_CODE": _parse_icd9,
        "ECG_DATETIME_y": _parse_ecg_list,
    }
    df = pd.read_csv(path, converters=converters)

    # age 若為 object → 轉 numeric
    if df["age"].dtype == "O":
        df.loc[:, "age"] = pd.to_numeric(df["age"], errors="coerce")

    return df


alive_42731 = load_patient_csv(".\experiment_data_from_yuran\活_ICD9_427_20250211_with_age.csv")
dead_42731 = load_patient_csv(".\experiment_data_from_yuran\死_ICD9_427_20250211_with_age.csv")

print(alive_42731.info())
print(dead_42731.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2793 entries, 0 to 2792
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   SUBJECT_ID      2793 non-null   int64         
 1   ECG_DATETIME    2793 non-null   datetime64[ns]
 2   HADM_ID         2793 non-null   int64         
 3   ICD9_CODE       2793 non-null   object        
 4   ADMITTIME       2793 non-null   datetime64[ns]
 5   DISCHTIME       2793 non-null   datetime64[ns]
 6   DEATHTIME       0 non-null      datetime64[ns]
 7   ECG_DATETIME_y  2793 non-null   object        
 8   GENDER          2793 non-null   object        
 9   DOB             2793 non-null   datetime64[ns]
 10  wave_id         2793 non-null   int64         
 11  age             2793 non-null   int64         
 12  death           2793 non-null   int64         
dtypes: datetime64[ns](5), int64(5), object(3)
memory usage: 283.8+ KB
None
<class 'pandas.core.frame.DataFrame'>

In [7]:
# Surv and alive_42731 分析
from datetime import timedelta
alive_42731_match = alive_42731[alive_42731['SUBJECT_ID'].isin(alive_set & surv_set)].reset_index()

alive_42731_set = set(alive_42731_match['SUBJECT_ID'].to_list())
print(f"Cover all alive :{(alive_set & surv_set)- alive_42731_set}")
print(f"Total : {len(alive_42731_match)}, subject id: {alive_42731_match['SUBJECT_ID'].nunique()}")

surv_stage2_filtered['T0'] = pd.to_datetime(surv_stage2_filtered['T0'])
tolerance = timedelta(seconds=60)

is_same_time = []

for _, row in alive_42731_match.iterrows():
    subject_id = row['SUBJECT_ID']
    ecg_time = pd.to_datetime(row['ECG_DATETIME'], errors='coerce')

    match_times = surv_stage2_filtered[
        surv_stage2_filtered['SUBJECT_ID'] == subject_id
    ]['T0']

    # 檢查是否存在任一時間差在 60 秒內
    is_match = ((match_times - ecg_time).abs() <= tolerance).any()
    is_same_time.append(is_match)

# 過濾不符合條件的
alive_42731_not_match = alive_42731_match[~pd.Series(is_same_time).values]
print(alive_42731_not_match)

print(surv_set-set(alive_42731['SUBJECT_ID'].to_list()))

print(set(alive_42731['SUBJECT_ID'].to_list()) & set(dead_42731['SUBJECT_ID'].to_list()))



Cover all alive :set()
Total : 361, subject id: 361
Empty DataFrame
Columns: [index, SUBJECT_ID, ECG_DATETIME, HADM_ID, ICD9_CODE, ADMITTIME, DISCHTIME, DEATHTIME, ECG_DATETIME_y, GENDER, DOB, wave_id, age, death]
Index: []
{47233, 82565, 20742, 21771, 40460, 24461, 1038, 81807, 75793, 76435, 30484, 12821, 72467, 88851, 25115, 90269, 43422, 78238, 80030, 75170, 76196, 8105, 20013, 92846, 12461, 62254, 63028, 59457, 65732, 29511, 49739, 91853, 59215, 50385, 32210, 16210, 73686, 83288, 5343, 15079, 95849, 15470, 82159, 52592, 58993, 55922, 57208, 24825, 23034, 75772}
set()


In [28]:
# Mort and dead_42731 分析
dead_42731_match = dead_42731[dead_42731['SUBJECT_ID'].isin(dead_set & mort_set)].reset_index()

dead_42731_set = set(dead_42731_match['SUBJECT_ID'].to_list())
print(f"Cover all alive :{(dead_set & mort_set)- dead_42731_set}")
print(f"Total : {len(dead_42731_match)}, subject id: {dead_42731_match['SUBJECT_ID'].nunique()}")

mort_stage2_filtered['T0'] = pd.to_datetime(mort_stage2_filtered['T0'])
tolerance = timedelta(seconds=60)

is_same_time = []

for _, row in dead_42731_match.iterrows():
    subject_id = row['SUBJECT_ID']
    ecg_time = pd.to_datetime(row['ECG_DATETIME'], errors='coerce')

    match_times = mort_stage2_filtered[
        mort_stage2_filtered['SUBJECT_ID'] == subject_id
    ]['T0']

    # 檢查是否存在任一時間差在 60 秒內
    is_match = ((match_times - ecg_time).abs() <= tolerance).any()
    is_same_time.append(is_match)

# 過濾不符合條件的
dead_42731_not_match = dead_42731_match[~pd.Series(is_same_time).values]
print(dead_42731_not_match)


Cover all alive :set()
Total : 96, subject id: 96
Empty DataFrame
Columns: [index, SUBJECT_ID, ECG_DATETIME, HADM_ID, ICD9_CODE, ADMITTIME, DISCHTIME, DEATHTIME, ECG_DATETIME_y, GENDER, DOB, wave_id, age, death]
Index: []


In [6]:
# 劃分出不在 活_ICD9_427 底下的紀錄
not_in_427 = set(surv_stage2_filtered_with_wave_id['SUBJECT_ID'].to_list()) - set(alive_42731['SUBJECT_ID'].to_list())

print(len(not_in_427),not_in_427)
surv_stage2_filtered_with_wave_id_not_in_427 = surv_stage2_filtered_with_wave_id[surv_stage2_filtered_with_wave_id['SUBJECT_ID'].isin(not_in_427)]
surv_stage2_filtered_with_wave_id_not_in_427.to_csv(os.path.join(LOGS,"surv_stage2_filtered_with_wave_id_not_in_427.csv"),index=False)

50 {47233, 82565, 20742, 21771, 40460, 24461, 1038, 81807, 75793, 76435, 30484, 12821, 72467, 88851, 25115, 90269, 43422, 78238, 80030, 75170, 76196, 8105, 20013, 92846, 12461, 62254, 63028, 59457, 65732, 29511, 49739, 91853, 59215, 50385, 32210, 16210, 73686, 83288, 5343, 15079, 95849, 15470, 82159, 52592, 58993, 55922, 57208, 24825, 23034, 75772}


In [8]:
in_427_not_in_final = (set(surv_stage2_filtered_with_wave_id['SUBJECT_ID'].to_list()) & set(alive_42731['SUBJECT_ID'].to_list())) - alive_set
print(len(in_427_not_in_final),in_427_not_in_final)
surv_stage2_filtered_with_wave_id_addition = surv_stage2_filtered_with_wave_id[surv_stage2_filtered_with_wave_id['SUBJECT_ID'].isin(in_427_not_in_final)]
print(surv_stage2_filtered_with_wave_id_addition['SUBJECT_ID'])

1039 {65537, 10241, 51202, 75779, 61441, 96259, 40967, 96261, 83981, 30733, 16399, 10257, 63512, 43033, 86041, 79900, 43037, 45088, 24609, 53282, 22565, 63525, 98344, 4136, 92203, 69675, 90158, 94256, 69681, 77873, 43060, 88117, 30775, 69693, 96321, 18498, 67653, 59462, 86086, 63559, 63563, 59469, 69709, 4175, 8274, 4180, 6229, 98390, 22616, 94297, 82010, 18524, 94301, 53342, 63582, 71774, 47203, 67684, 57445, 82021, 49255, 96361, 53355, 2157, 26734, 12400, 47216, 22642, 14458, 92287, 98434, 45186, 63621, 28808, 22664, 73868, 20620, 45199, 57489, 77975, 26781, 57511, 65703, 4266, 8363, 51377, 86193, 55473, 71857, 55477, 18614, 10423, 63669, 78010, 96445, 98494, 6335, 71872, 12482, 82115, 75972, 14532, 41163, 22731, 47309, 88269, 98517, 214, 63701, 61656, 217, 65753, 67803, 61658, 14551, 43233, 76001, 73955, 49380, 28897, 78050, 86245, 41194, 84206, 69871, 22766, 65779, 41204, 45300, 63733, 22774, 98555, 22782, 92415, 41217, 76034, 82179, 98565, 262, 63750, 51466, 63755, 80142, 4369, 90

In [None]:
surv_stage2_filtered_with_wave_id_addition['T1_lead2'] = pd.to_datetime(surv_stage2_filtered_with_wave_id_addition['T1_lead2'])
surv_stage2_filtered_with_wave_id_addition['OUTTIME'] = pd.to_datetime(surv_stage2_filtered_with_wave_id_addition['OUTTIME'])
surv_stage2_filtered_with_wave_id_addition['DISCHTIME'] = pd.to_datetime(surv_stage2_filtered_with_wave_id_addition['DISCHTIME'])
surv_stage2_filtered_with_wave_id_addition['T0'] = pd.to_datetime(surv_stage2_filtered_with_wave_id_addition['T0'])
surv_stage2_filtered_with_wave_id_addition['INTIME'] = pd.to_datetime(surv_stage2_filtered_with_wave_id_addition['INTIME'])
surv_stage2_filtered_with_wave_id_addition['ADMITTIME'] = pd.to_datetime(surv_stage2_filtered_with_wave_id_addition['ADMITTIME'])


mask = (
    (surv_stage2_filtered_with_wave_id_addition['T1_lead2'] <= surv_stage2_filtered_with_wave_id_addition['OUTTIME'])&
    (surv_stage2_filtered_with_wave_id_addition['T1_lead2'] <= surv_stage2_filtered_with_wave_id_addition['DISCHTIME'])&
    (surv_stage2_filtered_with_wave_id_addition['INTIME'] <= surv_stage2_filtered_with_wave_id_addition['T0'])&
    (surv_stage2_filtered_with_wave_id_addition['ADMITTIME']<= surv_stage2_filtered_with_wave_id_addition['T0'])

)   
surv_stage2_filtered_with_wave_id_addition=surv_stage2_filtered_with_wave_id_addition[mask]
print(surv_stage2_filtered_with_wave_id_addition)

      SUBJECT_ID  HADM_ID  ICUSTAY_ID              INTIME             OUTTIME  \
0            214   197273      241941 2188-10-28 21:26:20 2188-11-03 11:45:06   
1            217   155173      226300 2126-09-30 10:59:50 2126-10-01 12:16:46   
2            262   106019      243312 2153-09-25 18:02:24 2153-09-27 13:59:18   
4            625   145523      274000 2178-04-24 21:30:15 2178-05-03 19:11:31   
5            695   177128      289542 2178-08-05 07:39:02 2178-08-13 14:07:01   
...          ...      ...         ...                 ...                 ...   
1445       99759   157932      216511 2161-03-10 09:43:03 2161-03-11 16:52:32   
1446       99776   136231      262109 2171-07-08 04:15:10 2171-07-18 17:15:38   
1447       99796   144804      285715 2115-02-12 09:56:20 2115-02-13 17:03:25   
1448       99830   176834      211489 2187-08-20 20:46:45 2187-09-04 15:48:21   
1449       99922   123563      259919 2107-04-07 10:52:03 2107-04-08 10:55:22   

               ADMITTIME   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  surv_stage2_filtered_with_wave_id_addition['T1_lead2'] = pd.to_datetime(surv_stage2_filtered_with_wave_id_addition['T1_lead2'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  surv_stage2_filtered_with_wave_id_addition['OUTTIME'] = pd.to_datetime(surv_stage2_filtered_with_wave_id_addition['OUTTIME'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guid

In [26]:
cols = ['SUBJECT_ID','HADM_ID','wave_id']
modified_alive_427 = load_patient_csv(".\experiment_data_from_yuran\modified_活人_ICD9_427_20250211_with_age+TIME_LENGTH.csv")
modified_alive_427 = modified_alive_427[cols]
modified_alive_427_rename = modified_alive_427.rename(columns={
    'wave_id':'wave_id_test'
}).reset_index()

# Step 1: 統一型別 + 去除空白
for col in ['SUBJECT_ID', 'HADM_ID']:
    modified_alive_427_rename[col] = modified_alive_427_rename[col].astype(str).str.strip()
    surv_stage2_filtered_with_wave_id_addition[col] = surv_stage2_filtered_with_wave_id_addition[col].astype(str).str.strip()

modified_alive_427_rename['wave_id_test'] = modified_alive_427_rename['wave_id_test'].astype(int)
print(modified_alive_427_rename.head())
surv_stage2_filtered_with_wave_id_addition_merge = surv_stage2_filtered_with_wave_id_addition.merge(
    modified_alive_427_rename,
    on=['SUBJECT_ID','HADM_ID'],
    how="left"
)
# 檢查有哪些是空值
null_rows = surv_stage2_filtered_with_wave_id_addition_merge[surv_stage2_filtered_with_wave_id_addition_merge['wave_id_test'].isnull()]
print("有 NaN 的筆數：", len(null_rows))
print(null_rows[['SUBJECT_ID', 'HADM_ID', 'wave_id_test']])

surv_stage2_filtered_with_wave_id_addition_merge['wave_id_test'] = (
    surv_stage2_filtered_with_wave_id_addition_merge['wave_id_test']
    .fillna(-1)  # 用 -1 代表無對應資料
    .astype(int)
)
surv_stage2_filtered_with_wave_id_addition_merge['wave_id'] = surv_stage2_filtered_with_wave_id_addition_merge['wave_id'].astype(int)
print(surv_stage2_filtered_with_wave_id_addition_merge.info())


mask_equal_wave = (
    surv_stage2_filtered_with_wave_id_addition_merge['wave_id'] == (surv_stage2_filtered_with_wave_id_addition_merge['wave_id_test'])
)

print(surv_stage2_filtered_with_wave_id_addition_merge[mask_equal_wave])
surv_stage2_filtered_with_wave_id_addition_merge[mask_equal_wave].to_csv(os.path.join(LOGS, "surv_stage2_filtered_with_wave_id_427_not_in_final.csv"),index=False)

   index SUBJECT_ID HADM_ID  wave_id_test
0      0         30  104557       3524877
1      1         79  181542       3887555
2      2         85  112077       3647298
3      3        194  124794       3400942
4      4        214  197273       3502927
有 NaN 的筆數： 6
    SUBJECT_ID HADM_ID  wave_id_test
72        8109  192031           NaN
152      16122  166498           NaN
265      25313  148253           NaN
300      29660  167837           NaN
512      54197  154990           NaN
796      79556  148982           NaN
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1039 entries, 0 to 1038
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   SUBJECT_ID            1039 non-null   object        
 1   HADM_ID               1039 non-null   object        
 2   ICUSTAY_ID            1039 non-null   int64         
 3   INTIME                1039 non-null   datetime64[ns]
 4   OUTTIME         

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  surv_stage2_filtered_with_wave_id_addition[col] = surv_stage2_filtered_with_wave_id_addition[col].astype(str).str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  surv_stage2_filtered_with_wave_id_addition[col] = surv_stage2_filtered_with_wave_id_addition[col].astype(str).str.strip()
