In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Get segment

In [3]:
data = pd.read_csv('noninvasive_data.csv')
data['time'] = (data['time'] * 3 - 360).round()

In [4]:
def split_dataframe_by_id(data):
    # Dictionary to hold the smaller DataFrames
    dataframes = {}
    unique_ids = data['ID'].unique()
    for uid in unique_ids:
        dataframes[uid] = data[data['ID'] == uid]
    return dataframes

In [5]:
def find_segments_in_df(df, target_duration_minutes, missing_data_threshold, training_segment_minutes, testing_segment_minutes):
    df['time'] = pd.to_numeric(df['time'], errors='coerce')
    df = df.dropna(subset=['time'])

    segments = []
    current_segment = []
    current_start = df['time'].iloc[0] if not df['time'].empty else None

    #因為原始資料其實是20秒測量一次，所以只需timestamp長度乘3
    target_duration = target_duration_minutes * 3
    training_segment_seconds = training_segment_minutes * 3
    testing_segment_seconds = testing_segment_minutes * 3

    for timestamp in df['time']:
        if timestamp - current_start <= target_duration:
            current_segment.append(timestamp)
        else:
            # 檢查前幾分鐘和最後幾分鐘的缺失數據比例
            first_period = [t for t in current_segment if t - current_start <= training_segment_seconds]
            last_period = [t for t in current_segment if timestamp - t <= testing_segment_seconds]

            if len(first_period) >= (training_segment_seconds * (1 - missing_data_threshold)) and len(last_period) >= (testing_segment_seconds * (1 - missing_data_threshold)):
                segments.append((current_segment[0], current_segment[-1]))

            current_segment = [timestamp]
            current_start = timestamp

    # 檢查最後一個片段
    first_period = [t for t in current_segment if t - current_start <= training_segment_seconds]
    last_period = [t for t in current_segment if timestamp - t <= testing_segment_seconds]

    if len(current_segment) >= (target_duration * (1 - missing_data_threshold)) and len(first_period) >= (training_segment_seconds * (1 - missing_data_threshold)) and len(last_period) >= (testing_segment_seconds * (1 - missing_data_threshold)):
        segments.append((current_segment[0], current_segment[-1]))

    return segments

def calculate_segment_counts(results):
    # 計算每個工作表中的片段數量
    segment_counts = {sheet: len(segments) for sheet, segments in results.items()}
    df_segment_counts = pd.DataFrame.from_dict(segment_counts, orient='index', columns=['Segment Count'])
    total_segments = sum(segment_counts.values())
    return df_segment_counts, total_segments



In [6]:
def process_segment(df, segment_number):
    # 新增一列表示 segment 編號
    df['Segment_Number'] = segment_number
    return df

# 定義保存處理後的 segment 到 DataFrame 的函數
def save_processed_segments_to_df(df, segments, combined_df, segment_start_number):
    for i, (start, end) in enumerate(segments):
        segment_df = df[(df['time'] >= start) & (df['time'] <= end)].copy()
        segment_number = segment_start_number + i
        segment_df = process_segment(segment_df, segment_number)
        combined_df = pd.concat([combined_df, segment_df], ignore_index=True)
    return combined_df, segment_start_number + len(segments)

# 初始化一個空的 DataFrame 用於保存合併結果
 

In [7]:
def adjust_timestamps_intervals(df):
    segments = df['Segment_Number'].unique()
    for segment in segments:
        segment_df = df[df['Segment_Number'] == segment]
        if not segment_df.empty:
            # 計算新時間戳並保持間隔
            new_timestamps = segment_df['time'] - segment_df['time'].iloc[0] + 1
            df.loc[df['Segment_Number'] == segment, 'time'] = new_timestamps
    return df

In [8]:
def fill_and_extend_segments(df, target_duration_minutes, fill_method, fill_value):
    target_length = target_duration_minutes * 3
    df_list = []
    segments = df['Segment_Number'].unique()

    # Convert time to integers
    df['time'] = df['time'].astype(int)

    for segment in segments:
        segment_df = df[df['Segment_Number'] == segment].copy()
        segment_df = segment_df.drop_duplicates(subset='time')  # Drop duplicates
        segment_df.set_index('time', inplace=True)

        # Create a complete index range
        idx = pd.RangeIndex(start=segment_df.index.min(), stop=segment_df.index.max(), step=1)

        # Reindex and fill missing values based on fill_method
        if fill_method == 'ffill':
            segment_df = segment_df.reindex(idx).ffill()
        elif fill_method == 'bfill':
            segment_df = segment_df.reindex(idx).bfill()
        elif fill_method == 'pad':
            segment_df = segment_df.reindex(idx).pad()
        elif fill_method == 'interpolate':
            segment_df = segment_df.reindex(idx).interpolate(method='linear')
        elif fill_method == 'fill_value' and fill_value is not None:
            segment_df = segment_df.reindex(idx).fillna(fill_value)
        elif fill_method == 'nan':
            segment_df = segment_df.reindex(idx)
        else:
            raise ValueError(f"Unsupported fill_method: {fill_method}")

        # Check if the segment length is less than the target length
        if len(segment_df) < target_length:
            last_valid_index = segment_df.index[-1]
            extend_idx = pd.RangeIndex(start=last_valid_index + 1, stop=last_valid_index + 1 + target_length - len(segment_df))

            # Create an extension DataFrame with the same columns, filled with NaN values
            if fill_method == 'nan':
                extend_df = pd.DataFrame(index=extend_idx, columns=segment_df.columns)
            else:
                last_valid_row = segment_df.iloc[-1]
                extend_df = pd.DataFrame([last_valid_row] * len(extend_idx), index=extend_idx)

            # Concatenate the original and extension DataFrames
            segment_df = pd.concat([segment_df, extend_df])

        # Ensure the 'Segment_Number' column is correctly set
        segment_df['Segment_Number'] = segment
        df_list.append(segment_df.reset_index())

    return pd.concat(df_list, ignore_index=True)

In [9]:
def split_segments(df, training_segment_minutes, testing_segment_minutes):
    training_length = training_segment_minutes * 3
    testing_length = testing_segment_minutes * 3
    
    X_set = pd.DataFrame()
    Y_set = pd.DataFrame()
    window_set = pd.DataFrame()
    
    segments = df['Segment_Number'].unique()
    
    for segment in segments:
        segment_df = df[df['Segment_Number'] == segment].copy()
        
        X_set = pd.concat([X_set, segment_df.head(training_length)], ignore_index=True)
        Y_set = pd.concat([Y_set, segment_df.tail(testing_length)], ignore_index=True)
        
        if len(segment_df) > training_length + testing_length:
            window_set = pd.concat([window_set, segment_df.iloc[training_length:-testing_length]], ignore_index=True)
        else:
            window_set = pd.concat([window_set, segment_df.iloc[training_length:]], ignore_index=True)
    
    return X_set, Y_set, window_set

In [10]:
def Get_Y_Set_Label(Y_set, threshold=65):
    # 获取所有唯一的 Segment_Number
    segments = Y_set['Segment_Number'].unique()
    
    # 初始化一个 numpy 数组来存储每个 segment 的结果
    np_Y_set = np.zeros(len(segments), dtype=int)
    
    for i, segment in enumerate(segments):
        segment_df = Y_set[Y_set['Segment_Number'] == segment]
        if (segment_df['MAP'] < threshold).any():
            np_Y_set[i] = 1
    
    # 统计有多少个1
    count_of_ones = np.sum(np_Y_set)
    
    return np_Y_set, count_of_ones

In [11]:
#以ID分割df
split_dfs=split_dataframe_by_id(data)

#設定需求，獲取符合需求的segment
target_duration_minutes=20   #觀察區間+window+預測區間的分鐘數
missing_data_threshold=0.40  #觀察區間及預測區間容許的missing data比例
training_segment_minutes=10  #觀察區間的長度
testing_segment_minutes=10   #預測區間的長度
segment_start_number=1
fill_method='ffill'
fill_value=None
selected_segments = {}

for key, df in split_dfs.items():
    try:
        selected_segments[key] = find_segments_in_df(df,
                                                     target_duration_minutes, 
                                                     missing_data_threshold,
                                                     training_segment_minutes,
                                                     testing_segment_minutes)
    except Exception as e:
        selected_segments[key] = str(e)


# 計算 segment 的數量
df_segment_counts, total_segments = calculate_segment_counts(selected_segments)

print(df_segment_counts)
print("Total number of segments:", total_segments)

combined_df = pd.DataFrame()

# 處理並合併所有區間到一個 DataFrame
for key, segments in selected_segments.items():
    if isinstance(segments, list):  # 確認結果是 segment 的列表
        combined_df, segment_start_number = save_processed_segments_to_df(split_dfs[key], segments, combined_df, segment_start_number)



readjusted_df = adjust_timestamps_intervals(combined_df)
extended_df = fill_and_extend_segments(readjusted_df,
                                      target_duration_minutes,
                                      fill_method,
                                      fill_value=None)
extended_df.rename(columns={'index': 'timepoint'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['time'] = pd.to_numeric(df['time'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['time'] = pd.to_numeric(df['time'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['time'] = pd.to_numeric(df['time'], errors='coerce')
A value is trying to be set on a copy

      Segment Count
#_1               2
#_2              22
#_3               4
#_4              24
#_5               0
#_6               2
#_7              12
#_8               5
#_9              22
#_10              8
#_11             10
#_12             20
#_13             51
#_14             33
#_15             49
#_16             29
#_17              1
#_18              7
#_19             47
#_20             29
#_21             23
#_22              9
#_23              3
#_24              2
#_25              2
#_26              2
#_27              1
#_28              3
Total number of segments: 422


In [12]:
extended_df

Unnamed: 0,timepoint,si1m,CO,CI,SV,SVI,SVV,SVR,SVRI,PR,SYS,DIA,MAP,ID,Segment_Number
0,1,Y,4.3,2.6,44.0,27.0,5.0,1673.0,2745.0,98.0,132.0,78.0,97.0,#_1,1
1,2,Y,4.9,3.0,52.0,31.0,10.0,1525.0,2500.0,96.0,143.0,80.0,102.0,#_1,1
2,3,Y,4.9,3.0,51.0,31.0,10.0,1520.0,2495.0,96.0,137.0,86.0,101.0,#_1,1
3,4,N,5.2,3.2,51.0,31.0,10.0,1598.0,2620.0,101.0,157.0,86.0,111.0,#_1,1
4,5,N,5.1,3.1,52.0,32.0,7.0,1678.0,2750.0,98.0,158.0,85.0,114.0,#_1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25315,56,Y,6.2,3.4,58.0,32.0,15.0,660.0,1195.0,108.0,80.0,51.0,59.0,#_28,422
25316,57,Y,5.9,3.2,56.0,31.0,11.0,705.0,1275.0,106.0,81.0,52.0,60.0,#_28,422
25317,58,Y,5.8,3.2,55.0,30.0,14.0,753.0,1360.0,106.0,82.0,54.0,63.0,#_28,422
25318,59,Y,6.2,3.4,58.0,32.0,7.0,703.0,1270.0,107.0,83.0,54.0,63.0,#_28,422


In [13]:
X_set, Y_set, window_set = split_segments(extended_df, training_segment_minutes, testing_segment_minutes)

In [14]:
np_Y_set, count_of_ones = Get_Y_Set_Label(Y_set, threshold=65)

features = ['CO', 'CI', 'SV', 'SVI', 'SVV', 'SVR', 'SVRI', 'PR', 'SYS', 'DIA', 'MAP']

grouped_data = X_set.groupby('Segment_Number')[features]

# Collect the feature values for each group into a list
np_segments = [group.values for _, group in grouped_data]

# Stack these lists into a 3D numpy array
np_X_set = np.array(np_segments)
np_X_set = np_X_set.transpose(0, 2, 1)

# Display the shape of the resulting array

print(np_Y_set.shape)
print(np_X_set.shape)
print("Count of 1:", count_of_ones)

(422,)
(422, 11, 30)
Count of 1: 112


In [15]:
shuffle_indices = np.random.permutation(np_X_set.shape[0])

# 使用打乱的索引来打乱 array_3d 和 y_set
np_X_set_shuffled = np_X_set[shuffle_indices].astype(np.float32)  # 转换为 float32 类型
np_Y_set_shuffled = np_Y_set[shuffle_indices].astype(np.int32) 

In [16]:
n_total = np_X_set_shuffled.shape[0]
n_train = int(0.6 * n_total)
n_val = int(0.2 * n_total)
n_test = n_total - n_train - n_val

# 切割训练集、验证集和测试集
X_train = np_X_set_shuffled[:n_train]
Y_train = np_Y_set_shuffled[:n_train]

X_val = np_X_set_shuffled[n_train:n_train+n_val]
Y_val = np_Y_set_shuffled[n_train:n_train+n_val]

X_test = np_X_set_shuffled[n_train+n_val:]
Y_test = np_Y_set_shuffled[n_train+n_val:]

# 对每个特征进行缩放
scalers = {}
for i in range(X_train.shape[1]):
    scalers[i] = MinMaxScaler()
    
    # Reshape each feature's time points for scaling
    X_train_feature = X_train[:, i, :].reshape(-1, 1)
    X_val_feature = X_val[:, i, :].reshape(-1, 1)
    X_test_feature = X_test[:, i, :].reshape(-1, 1)
    
    # Fit the scaler on the training data for this feature
    scalers[i].fit(X_train_feature)
    
    # Transform the training, validation, and testing data for this feature
    X_train[:, i, :] = scalers[i].transform(X_train_feature).reshape(n_train, -1)
    X_val[:, i, :] = scalers[i].transform(X_val_feature).reshape(n_val, -1)
    X_test[:, i, :] = scalers[i].transform(X_test_feature).reshape(n_test, -1)

# 保存切分后的数据集
np.save('X_train_ffill.npy', X_train)
np.save('Y_train_ffill.npy', Y_train)
np.save('X_val_ffill.npy', X_val)
np.save('Y_val_ffill.npy', Y_val)
np.save('X_test_ffill.npy', X_test)
np.save('Y_test_ffill.npy', Y_test)
