In [1]:
# !pip install imbalanced-learn

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import ADASYN

# 資料前處理

In [2]:
def data_preprocessing(train_data_path):
    # 檢查有無缺失值
    train_data = pd.read_csv(train_data_path, index_col=0)
    missing_data = check_missing_data(train_data)
    total_missing = missing_data.sum()
    print(f"Total missing values: {total_missing}")

    # 檢查與處理異常值
    train_data = handle_outliers(train_data)

    return train_data

# 檢查有幾個缺失值
def check_missing_data(data):
    return data.isnull().sum()

# 檢查異常值
# 使用 IQR 方式偵測異常值
def detect_outliers(data):
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1
    outliers = (data < (Q1 - 1.5 * IQR)) | (data > (Q3 + 1.5 * IQR))
    return outliers

# 處理異常值
# 平均數填補異常值
def handle_outliers(data):
    outliers = detect_outliers(data)
    for column in data.columns:
        data.loc[outliers[column], column] = data[column].mean()
    return data


# 資料擴增

In [3]:
# RandomOverSampler

def data_oversampling_RandomOverSampler(data_path, label_path):
  #處理label
  label=pd.read_csv(label_path, index_col=0)
  label = label.squeeze()
  label_encoder = LabelEncoder()
  encoded_label = label_encoder.fit_transform(label)

  #RandomOverSampler
  randomOS = RandomOverSampler(random_state=42)
  data=pd.read_csv(data_path, index_col=0)
  resampled_data, resampled_label = randomOS.fit_resample(data, encoded_label)

  #將擴增後的資料轉換回原本的格式與名稱
  resampled_data = pd.DataFrame(resampled_data, columns=data.columns)
  resampled_label = pd.Series(resampled_label)

  resampled_label = label_encoder.inverse_transform(resampled_label)
  resampled_label = pd.Series(resampled_label, name='class')

  #將data與label合併在一起
  combined_data = resampled_data.copy()
  combined_data['class'] = resampled_label

  #存入新的csv檔
  resampled_data.to_csv(f'resampled_train_data_ROS.csv', index=False)
  print(f"Resampled data saved to 'resampled_train_data_ROS.csv'")

  resampled_label.to_csv(f'resampled_train_label_ROS.csv', index=False)
  print(f"Resampled data saved to 'resampled_train_label_ROS.csv'")

In [4]:
def data_oversampling_SMOTE(data_path, label_path):
    # 處理label
    label = pd.read_csv(label_path, index_col=0)
    label = label.squeeze()
    label_encoder = LabelEncoder()
    encoded_label = label_encoder.fit_transform(label)

    # SMOTE
    smote = SMOTE(random_state=42)
    data = pd.read_csv(data_path, index_col=0)
    resampled_data, resampled_label = smote.fit_resample(data, encoded_label)

    # 將擴增後的數據轉換回原本的格式與名稱
    resampled_data = pd.DataFrame(resampled_data, columns=data.columns)

    # 將擴增後的標籤解碼回原始標籤
    resampled_label = label_encoder.inverse_transform(resampled_label)
    resampled_label = pd.Series(resampled_label, name='class')

    # 儲存擴增後的數據
    resampled_data.to_csv('resampled_train_data_SMOTE.csv', index=False)
    print(f"Resampled data saved to 'resampled_train_data_SMOTE.csv'")

    # 儲存擴增後的標籤
    resampled_label.to_csv('resampled_train_label_SMOTE.csv', index=False)
    print(f"Resampled labels saved to 'resampled_train_label_SMOTE.csv'")


In [5]:
# BorderlineSmote

def data_oversampling_BorderlineSmote(data_path, label_path):
  #處理label
  label=pd.read_csv(label_path, index_col=0)
  label = label.squeeze()
  label_encoder = LabelEncoder()
  encoded_label = label_encoder.fit_transform(label)

  #BordelineSmote
  data=pd.read_csv(data_path, index_col=0)
  bsmote = BorderlineSMOTE(random_state=42)
  resampled_data, resampled_label = bsmote.fit_resample(data, encoded_label)

  #將擴增後的資料轉換回原本的格式與名稱
  resampled_data = pd.DataFrame(resampled_data, columns=data.columns)
  resampled_label = pd.Series(resampled_label)

  resampled_label = label_encoder.inverse_transform(resampled_label)
  resampled_label = pd.Series(resampled_label, name='class')

  #將data與label合併在一起
  combined_data = resampled_data.copy()
  combined_data['class'] = resampled_label

  #存入新的csv檔
  resampled_data.to_csv(f'resampled_train_data_bSMOTE.csv', index=False)
  print(f"Resampled data saved to 'resampled_train_data_bSMOTE.csv'")
    #存入新的csv檔
  resampled_label.to_csv(f'resampled_train_label_bSMOTE.csv', index=False)
  print(f"Resampled data saved to 'resampled_train_data_bSMOTE.csv'")


In [10]:
# ADASYN

def data_oversampling_ADASYN(data_path, label_path):
  #處理label
  label=pd.read_csv(label_path, index_col=0)
  label = label.squeeze()
  label_encoder = LabelEncoder()
  encoded_label = label_encoder.fit_transform(label)

  #ADASYN
  adasyn = ADASYN(random_state=42)
  data=pd.read_csv(data_path, index_col=0)
  resampled_data, resampled_label = adasyn.fit_resample(data, encoded_label)

  #將擴增後的資料轉換回原本的格式與名稱
  resampled_data = pd.DataFrame(resampled_data, columns=data.columns)
  resampled_label = pd.Series(resampled_label)

  resampled_label = label_encoder.inverse_transform(resampled_label)
  resampled_label = pd.Series(resampled_label, name='class')

  #將data與label合併在一起
  combined_data = resampled_data.copy()
  combined_data['class'] = resampled_label

  #存入新的csv檔
  resampled_data.to_csv(f'resampled_train_data_ADASYN.csv', index=False)
  print(f"Resampled data saved to 'resampled_train_data_ADASYN.csv'")

  #存入新的csv檔
  resampled_label.to_csv(f'resampled_train_label_ADASYN.csv', index=False)
  print(f"Resampled data saved to 'resampled_train_data_ADASYN.csv'")

# 執行

In [8]:
train_path = "dataset/gene expression cancer RNA-Seq Data Set/train_data.csv"
label_path = "dataset/gene expression cancer RNA-Seq Data Set/train_label.csv"
#資料前處理
data_preprocessing('train_data.csv')

#資料擴增
data_oversampling_RandomOverSampler('test_data.csv', 'test_label.csv')
data_oversampling_SMOTE('test_data.csv', 'test_label.csv')
data_oversampling_BorderlineSmote('test_data.csv', 'test_label.csv')
data_oversampling_ADASYN('test_data.csv', 'test_label.csv')

Total missing values: 0
Resampled data saved to 'resampled_train_data_ROS.csv'
Resampled data saved to 'resampled_train_data_SMOTE.csv'
Resampled labels saved to 'resampled_train_label_SMOTE.csv'
Resampled data saved to 'resampled_train_data_bSMOTE.csv'


In [11]:
train_path = "dataset/gene expression cancer RNA-Seq Data Set/train_data.csv"
label_path = "dataset/gene expression cancer RNA-Seq Data Set/train_label.csv"
# data_oversampling_RandomOverSampler(train_path, label_path)
# data_oversampling_BorderlineSmote(train_path, label_path)
data_oversampling_ADASYN(train_path, label_path)

Resampled data saved to 'resampled_train_data_ADASYN.csv'
Resampled data saved to 'resampled_train_data_ADASYN.csv'


In [9]:
# 設置顯示的最大行數和列數
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

# 查看數據
newdata = pd.read_csv('resampled_train_data_SMOTE.csv')
print(newdata)
newlabel = pd.read_csv('resampled_train_label_SMOTE.csv')
print(newlabel)

       gene_0    gene_1    gene_2    gene_3    gene_4  gene_5    gene_6  \
0    0.000000  0.000000  2.249354  6.873001  9.621334     0.0  9.786846   
1    0.000000  4.094565  4.704501  7.545559  9.752299     0.0  8.595821   
2    0.000000  4.143565  3.043834  6.211337  8.791908     0.0  7.691960   
3    0.000000  2.215150  1.868845  6.287153  9.273963     0.0  8.317725   
4    0.000000  2.988103  1.811471  5.763507  8.604753     0.0  7.335855   
..        ...       ...       ...       ...       ...     ...       ...   
675  0.000000  3.888510  2.965087  6.583370  9.387170     0.0  8.675558   
676  0.000000  4.357514  2.948582  5.878367  9.808163     0.0  8.097849   
677  1.167396  3.873845  4.250375  6.007119  9.809416     0.0  7.290269   
678  0.000000  3.899760  3.873176  6.628683  9.103905     0.0  7.336151   
679  0.000000  2.699529  3.274111  6.828970  9.558878     0.0  5.287159   

       gene_7  gene_8  gene_9   gene_10   gene_11   gene_12   gene_13  \
0    0.701150     0.0     