## 实验：调整不均衡数据

### 本地环境配置

In [1]:
from matplotlib import pyplot as plt
%matplotlib inline
plt.rcParams['font.family'] = 'sans-serif'    # 用来正常显示中文
plt.rcParams['font.sans-serif'] = 'SimHei'
plt.rcParams['axes.unicode_minus'] = False   # 设置正常显示符号

# Configure input and output locations.
import os

# The data root can be overridden with the BASE_PATH environment variable;
# it falls back to ../data/ when the variable is not set.
base_path = os.environ.get("BASE_PATH", "../data/")
# Pass the components separately so os.path.join inserts a separator when
# BASE_PATH has no trailing slash. The trailing "/" on "lab1/" is kept so
# data_path can still be used as a plain string prefix for file names.
data_path = os.path.join(base_path, "lab1/")
# Results are written to a local directory, created on first run.
result_path = "result/"
os.makedirs(result_path, exist_ok=True)

# 忽略第三方支援库更新兼容性提示
import warnings
warnings.simplefilter('ignore') 

### 指令格式

我有一份不均衡数据，需要在上面训练机器学习模型，预测变量是 **<font color="#0000dd">[ 目标变量 ]</font>**，如何在 Python 中对我的数据执行过采样或者欠采样？

### 指令示例

我有一份不均衡数据，需要在上面训练机器学习模型，预测变量是 **[ smoking ]**，如何在 Python 中对我的数据执行过采样或者欠采样？

<img src="./img/6.png" width=80%>

### ChatGPT 代码测试

补充代码 ——

In [2]:
import pandas as pd

# Read the smoking dataset from the lab data directory, then show
# per-column summary statistics as the cell output.
smoking_csv = data_path + "smoking.csv"
df = pd.read_csv(smoking_csv)
df.describe()

Unnamed: 0,ID,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,...,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,smoking
count,55692.0,55692.0,55692.0,55692.0,55692.0,55692.0,55692.0,55692.0,55692.0,55692.0,...,55692.0,55692.0,55692.0,55692.0,55692.0,55692.0,55692.0,55692.0,55692.0,55692.0
mean,27845.5,44.182917,164.649321,65.864936,82.046418,1.012623,1.007443,1.025587,1.026144,121.494218,...,57.290347,114.964501,14.622592,1.087212,0.885738,26.182935,27.036037,39.952201,0.213334,0.367288
std,16077.039933,12.071418,9.194597,12.820306,9.274223,0.486873,0.485964,0.157902,0.159564,13.675989,...,14.738963,40.926476,1.564498,0.404882,0.221524,19.35546,30.947853,50.290539,0.409665,0.48207
min,0.0,20.0,130.0,30.0,51.0,0.1,0.1,1.0,1.0,71.0,...,4.0,1.0,4.9,1.0,0.1,6.0,1.0,1.0,0.0,0.0
25%,13922.75,40.0,160.0,55.0,76.0,0.8,0.8,1.0,1.0,112.0,...,47.0,92.0,13.6,1.0,0.8,19.0,15.0,17.0,0.0,0.0
50%,27845.5,40.0,165.0,65.0,82.0,1.0,1.0,1.0,1.0,120.0,...,55.0,113.0,14.8,1.0,0.9,23.0,21.0,25.0,0.0,0.0
75%,41768.25,55.0,170.0,75.0,88.0,1.2,1.2,1.0,1.0,130.0,...,66.0,136.0,15.8,1.0,1.0,28.0,31.0,43.0,0.0,1.0
max,55691.0,85.0,190.0,135.0,129.0,9.9,9.9,2.0,2.0,240.0,...,618.0,1860.0,21.1,6.0,11.6,1311.0,2914.0,999.0,1.0,1.0


In [3]:
from sklearn.utils import resample

# Split the rows by label. In this dataset smoking == 0 is the larger
# (majority) group and smoking == 1 the smaller (minority) group.
smoking_label = df["smoking"]
df_majority = df.loc[smoking_label == 0]
df_minority = df.loc[smoking_label == 1]

过采样 ——

In [4]:
# Oversampling: draw minority rows with replacement until the minority
# class has as many rows as the majority class. The fixed seed keeps the
# draw reproducible.
majority_size = len(df_majority)
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=majority_size,
    random_state=123,
)

# Stack the untouched majority rows on top of the enlarged minority sample.
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Cell output: number of rows per class after oversampling.
df_upsampled["smoking"].value_counts()

1    35237
0    35237
Name: smoking, dtype: int64

欠采样 ——

In [5]:
from sklearn.utils import resample

# Re-derive the two class subsets so this cell can run on its own.
smoking_label = df["smoking"]
df_majority = df.loc[smoking_label == 0]
df_minority = df.loc[smoking_label == 1]

# Undersampling: draw majority rows without replacement until only as
# many remain as there are minority rows. The fixed seed keeps the draw
# reproducible.
minority_size = len(df_minority)
df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=minority_size,
    random_state=123,
)

# Stack the reduced majority sample on top of the untouched minority rows.
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Cell output: number of rows per class after undersampling.
df_downsampled["smoking"].value_counts()


1    20455
0    20455
Name: smoking, dtype: int64