# 样本不均衡的处理

## 1. 容易出现样本不均衡的业务场景
1）异常检测场景；
2）客户流失场景；
3）罕见事件场景；
4）发生频率低的场景。

## 2. 样本不均衡问题的处理方法
1）通过过抽样和欠抽样解决样本不均衡（适合用于大数据分布不平衡的情况）；

2）通过正负样本的惩罚权重解决样本不均衡；

3）通过集成方法解决样本不均衡（模型的时效性可能较低）；

4）通过特征选择解决样本不均衡。

In [1]:
!pip install imbalanced-learn

Collecting imbalanced-learn
[?25l  Downloading https://files.pythonhosted.org/packages/c8/73/36a13185c2acff44d601dc6107b5347e075561a49e15ddd4e69988414c3e/imbalanced_learn-0.6.2-py3-none-any.whl (163kB)
[K    100% |████████████████████████████████| 163kB 590kB/s ta 0:00:01
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.6.2


In [2]:
import pandas as pd
import numpy as np
# 加载过抽样处理库
from imblearn.over_sampling import SMOTE
# 加载欠抽样处理库
from imblearn.under_sampling import RandomUnderSampler
# 加载svm中的分类算法svc
from sklearn.svm import SVC
# 加载简单集成处理库
from imblearn.ensemble import EasyEnsembleClassifier

In [6]:
df = pd.read_table('./data/data_imbalance.txt', sep='\t', names=['col1', 'col2', 'col3', 'col4', 'col5', 'label'])
x = df.iloc[:, :-1]
y = df.iloc[:, -1:]
df.head()

Unnamed: 0,col1,col2,col3,col4,col5,label
0,1,-1,1,-1,-1,1
1,-1,0,1,-1,-1,0
2,0,1,0,0,1,0
3,0,1,2,-2,1,1
4,-2,1,1,-2,1,1


In [15]:
group_data_original = df.groupby('label').count()
print(group_data_original)

       col1  col2  col3  col4  col5
label                              
0       475   475   475   475   475
1       525   525   525   525   525


In [13]:
# 使用SMOTE方法进行过抽样处理
model_smote = SMOTE()
x_smote_resampled, y_smote_resampled = model_smote.fit_sample(x, y)
type(x_smote_resampled)

pandas.core.frame.DataFrame

In [19]:
df_smote = pd.concat((x_smote_resampled, y_smote_resampled), axis=1)

In [20]:
group_data_smote = df_smote.groupby('label').count()
print(group_data_smote)

       col1  col2  col3  col4  col5
label                              
0       525   525   525   525   525
1       525   525   525   525   525


In [21]:
# 使用RandomUnderSampler方法进行欠抽样
model_random_unsample = RandomUnderSampler()
x_random_unsample_resampled, y_random_unsample_resampled = model_random_unsample.fit_sample(x, y)
type(x_random_unsample_resampled)

pandas.core.frame.DataFrame

In [22]:
df_random_unsample = pd.concat((x_random_unsample_resampled, y_random_unsample_resampled), axis=1)

In [25]:
group_data_random_unsample = df_random_unsample.groupby('label').count()
print(group_data_random_unsample)

       col1  col2  col3  col4  col5
label                              
0       475   475   475   475   475
1       475   475   475   475   475


In [28]:
# 使用svm的权重调节样本不平衡
model_svm = SVC(class_weight='balanced', verbose=True)
model_svm.fit(x, y)

[LibSVM]

  y = column_or_1d(y, warn=True)


SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=True)

In [29]:
dir(model_svm)

['C',
 '__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_proba',
 '_compute_kernel',
 '_decision_function',
 '_dense_decision_function',
 '_dense_fit',
 '_dense_predict',
 '_dense_predict_proba',
 '_dual_coef_',
 '_estimator_type',
 '_gamma',
 '_get_coef',
 '_get_param_names',
 '_get_tags',
 '_impl',
 '_intercept_',
 '_more_tags',
 '_n_support',
 '_pairwise',
 '_predict_log_proba',
 '_predict_proba',
 '_sparse',
 '_sparse_decision_function',
 '_sparse_fit',
 '_sparse_kernels',
 '_sparse_predict',
 '_sparse_predict_proba',
 '_validate_for_predict',
 '_validate_targets',
 '_warn_from_fit_status'

In [33]:
model_svm.decision_function(x)

array([ 0.78712569, -0.99993913, -0.97702444, -0.9996362 ,  0.92637626,
        0.90911895, -1.00048358,  0.070914  , -0.84303836, -0.84303836,
       -0.32472368, -0.30436802, -0.80572525, -0.84303836, -1.00033519,
       -0.84303836, -0.09530146, -0.97702444,  0.99965475, -0.999746  ,
       -0.30436802,  0.78712569,  0.1797976 ,  0.42603493, -0.77251816,
       -0.85455126, -0.84303836, -0.9996362 ,  0.03763785, -0.3216061 ,
       -0.84303836,  0.03763785,  0.03893904, -0.99993913, -0.999746  ,
        0.19759631,  0.99994054, -0.30436802, -1.00005541, -0.9996362 ,
       -0.9996362 , -0.09530146,  0.2285374 ,  1.00004595,  0.070914  ,
       -0.84303836, -0.22995243, -0.99995557,  0.19759631, -0.47056508,
       -0.999746  ,  0.08957967,  1.00004595, -0.84303836, -0.703534  ,
       -0.22995243,  0.90911895,  0.03763785, -1.0002273 , -0.97702444,
        0.78712569,  0.03763785, -0.84303836,  0.070914  ,  1.00012763,
       -0.97702444, -0.75151785, -0.30436802, -0.99993913,  1.00

In [39]:
from imblearn.ensemble import _easy_ensemble as easy_ensemble
# 使用集成方法EasyEnsample处理不平衡问题
model_easy_ensample = EasyEnsembleClassifier()

In [41]:
model_easy_ensample.fit(x, y)

  y = column_or_1d(y, warn=True)


EasyEnsembleClassifier(base_estimator=None, n_estimators=10, n_jobs=None,
                       random_state=None, replacement=False,
                       sampling_strategy='auto', verbose=0, warm_start=False)