In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import confusion_matrix,classification_report
# from xgboost import XGBClassifier
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.impute import SimpleImputer
import time

ALL = False
NO_SMOTE = True
SUB_PROCESSED_DIR = 'processed_62'
TEST_RATIO = 0.1

dtypes = {
    'msisdn': 'str',
    'start_time': 'str',
    'end_time': 'str',
    'call_event': 'category',
    'other_party': 'str',
    'ismultimedia': 'category',
    'home_area_code': 'str',
    'visit_area_code': 'str',
    'called_home_code': 'str',
    'called_code': 'str',
    'a_serv_type': 'int',
    'long_type1': 'int',
    'roam_type': 'int',
    'a_product_id': 'str',
    'open_datetime': 'str',
    'call_duration': 'int32',
    'cfee': 'float64',
    'lfee': 'float64',
    'hour': 'int8',
    'dayofweek': 'int',
    'phone1_type': 'int',
    'phone2_type': 'int',
    'phone1_loc_city': 'str',
    'phone1_loc_province': 'str',
    'phone2_loc_city': 'str',
    'phone2_loc_province': 'str',
    'update_time': 'str',
    'date': 'str',
    'date_c': 'str'
}

# 判断 processed 文件夹是否存在
import os
if not os.path.exists(f'../self_data/{SUB_PROCESSED_DIR}'):
    print("Creating processed data folder...")
    # 读取CSV文件
    labeled_data = pd.read_csv('../self_data/all_trainSet_res.csv', dtype=dtypes)
    labels = pd.read_csv('../self_data/all_trainSet_ans.csv', dtype=dtypes)

    validation_data = pd.read_csv('../self_data/sorted_validationSet_res_with_head.csv', dtype=dtypes)

    # 按照 msisdn 切分 train_data 和 test_data
    train_data_msisdn, test_data_msisdn = train_test_split(labels['msisdn'], test_size=TEST_RATIO, random_state=42, stratify=labels['is_sa'])
    train_data = labeled_data[labeled_data['msisdn'].isin(train_data_msisdn)]
    train_labels = labels[labels['msisdn'].isin(train_data_msisdn)]
    assert len(train_data['msisdn'].unique()) == len(train_data_msisdn)

    test_data = labeled_data[labeled_data['msisdn'].isin(test_data_msisdn)]
    test_labels = labels[labels['msisdn'].isin(test_data_msisdn)]
    assert len(test_data['msisdn'].unique()) == len(test_data_msisdn)


    # # 遍历 groupby('msisdn') 的结果，对每个 msisdn 进行数据增强
    # # ------
    from tqdm import tqdm
    import os
    import sys
    from utils.augmentation import Augmentation

    addition_train_data = []
    addition_train_labels = []

    times = 2
    ratio_range = 0.1
    pbar = tqdm(train_data.groupby('msisdn'))
    for msisdn, group in pbar:
        if msisdn == 0:
            continue
        # print(f"Augmenting msisdn {msisdn}")
        pbar.set_description(f"Augmenting msisdn {msisdn}")
        label = train_labels[train_labels['msisdn'] == msisdn].iloc[0]['is_sa']
        aug = Augmentation(group, label, 'msisdn', 'is_sa')
        # 对正负样本进行平衡 样本比 1:4
        if label == 1:
            res_df, res_labels = aug.times(ratio=ratio_range, times=3+4*times, method='mask')

            addition_train_data.append(res_df)
            addition_train_labels.append(res_labels)

            # res_df, res_labels = aug.times(window_size=100, step_size=80, times=1, method='sliding_window')

            # addition_train_data.append(res_df)
            # addition_train_labels.append(res_labels)
        else:
            res_df, res_labels = aug.times(ratio=ratio_range, times=times, method='mask')

            addition_train_data.append(res_df)
            addition_train_labels.append(res_labels)

            # res_df, res_labels = aug.times(window_size=100, step_size=80, times=1, method='sliding_window')

            # addition_train_data.append(res_df)
            # addition_train_labels.append(res_labels)
            
    addition_train_data = pd.concat(addition_train_data)
    addition_train_labels = pd.concat(addition_train_labels)

    # 将新数据加入到train_data中
    train_data = pd.concat([train_data, addition_train_data], ignore_index=True).reset_index(drop=True)
    train_labels = pd.concat([train_labels, addition_train_labels], ignore_index=True).reset_index(drop=True)

    # 按照 msisdn, start_time 排序
    sort_start_time = time.time()
    train_data = train_data.sort_values(by=['msisdn', 'start_time']).reset_index(drop=True)
    train_labels = train_labels.sort_values(by=['msisdn']).reset_index(drop=True)
    print('sort time:', time.time() - sort_start_time)

    labels_aug = pd.concat([train_labels, test_labels], ignore_index=True).reindex()
    # ------------------

    # save
    print("Saving processed data...")
    os.makedirs(f'../self_data/{SUB_PROCESSED_DIR}', exist_ok=True)
    train_data.to_csv(f'../self_data/{SUB_PROCESSED_DIR}/train_data.csv', index=False)
    train_labels.to_csv(f'../self_data/{SUB_PROCESSED_DIR}/train_labels.csv', index=False)
    test_data.to_csv(f'../self_data/{SUB_PROCESSED_DIR}/test_data.csv', index=False)
    test_labels.to_csv(f'../self_data/{SUB_PROCESSED_DIR}/test_labels.csv', index=False)

    labels_aug.to_csv(f'../self_data/{SUB_PROCESSED_DIR}/labels_aug.csv', index=False)

    validation_data.to_csv(f'../self_data/{SUB_PROCESSED_DIR}/validation_data.csv', index=False)
    # TODO: test_data

else:
    print("Reading processed data...")
    train_data = pd.read_csv(f'../self_data/{SUB_PROCESSED_DIR}/train_data.csv', dtype=dtypes)
    train_labels = pd.read_csv(f'../self_data/{SUB_PROCESSED_DIR}/train_labels.csv', dtype=dtypes)
    test_data = pd.read_csv(f'../self_data/{SUB_PROCESSED_DIR}/test_data.csv', dtype=dtypes)
    test_labels = pd.read_csv(f'../self_data/{SUB_PROCESSED_DIR}/test_labels.csv', dtype=dtypes)

    labels_aug = pd.read_csv(f'../self_data/{SUB_PROCESSED_DIR}/labels_aug.csv', dtype=dtypes)

    validation_data = pd.read_csv(f'../self_data/{SUB_PROCESSED_DIR}/validation_data.csv', dtype=dtypes)

labeled_data_aug = pd.concat([train_data, test_data], ignore_index=True).reindex()
assert len(labeled_data_aug['msisdn'].unique()) == len(labels_aug['msisdn'].unique())

# 转换时间格式
labeled_data_aug['start_time'] = pd.to_datetime(labeled_data_aug['start_time'], format='%Y%m%d%H%M%S')
labeled_data_aug['end_time'] = pd.to_datetime(labeled_data_aug['end_time'], format='%Y%m%d%H%M%S')
labeled_data_aug['open_datetime'] = pd.to_datetime(labeled_data_aug['open_datetime'], format='%Y%m%d%H%M%S')
labeled_data_aug['update_time'] = pd.to_datetime(labeled_data_aug['update_time'])
labeled_data_aug['date'] = pd.to_datetime(labeled_data_aug['date'])

validation_data['start_time'] = pd.to_datetime(validation_data['start_time'], format='%Y%m%d%H%M%S')
validation_data['end_time'] = pd.to_datetime(validation_data['end_time'], format='%Y%m%d%H%M%S')
validation_data['open_datetime'] = pd.to_datetime(validation_data['open_datetime'], format='%Y%m%d%H%M%S',errors='coerce')
validation_data['update_time'] = pd.to_datetime(validation_data['update_time'])
validation_data['date'] = pd.to_datetime(validation_data['date'])

Reading processed data...


In [2]:
# 为每条记录添加start_time_diff，记录 start_time 与上一条记录的 start_time 之差 (单位：秒)
start_time_diff = labeled_data_aug.groupby('msisdn')['start_time'].diff().dt.total_seconds().fillna(0).reset_index(drop=True)
# 将该列加入到数据集中
labeled_data_aug['start_time_diff'] = start_time_diff.copy()
# time_diff_start2end = train_data.groupby('msisdn')['end_time'].diff().dt.total_seconds().fillna(0)
start_time_diff = validation_data.groupby('msisdn')['start_time'].diff().dt.total_seconds().fillna(0).reset_index(drop=True)
validation_data['start_time_diff'] = start_time_diff.copy()

数据特征处理

In [3]:
# 聚合特征
def aggregate_features(data):
    return data.groupby('msisdn').agg({
    'call_duration': [
        ('sum', 'sum'), 
        ('mean', 'mean'), 
        ('max', 'max'), 
        ('std', 'std'),
        ('quantile_25', lambda x: x.quantile(0.25)), 
        ('quantile_50', lambda x: x.quantile(0.50)), 
        ('quantile_75', lambda x: x.quantile(0.75)),
    ],
    'cfee': [
        ('sum', 'sum'),
        ('std', 'std'), 
        ('mean', 'mean'),
    ],
    'lfee': [
        ('sum', 'sum'), 
        ('mean', 'mean'),
        ('std', 'std'),
    ],
    'hour': [
        ('mean', 'mean'), 
        ('std', 'std'), 
        ('max', 'max'), 
        ('min', 'min'),
    ],
    'dayofweek': [
        ('std', 'std'), 
        ('magic', lambda x: x.value_counts().mean()), 
        ('work_day_num', lambda x: x[x.isin([1,2,3,4,5])].count()), 
        ('weekend_num', lambda x: x[x.isin([6,7])].count()),
        ('mode', lambda x: x.mode().values[0]),
        ('work_day_weekend_diff', lambda x: (x[x.isin([1,2,3,4,5])].count() - x[x.isin([6,7])].count()) / (x[x.isin([1,2,3,4,5])].count() + x[x.isin([6,7])].count())),
    ],
    # 'home_area_code': [
    #     ('home_area_code_nunique', 'nunique')
    # ],
    'visit_area_code': [
        ('nunique', 'nunique'),
        ('times_not_at_home_area', lambda x: x[x != x.shift()].count()/x.count())
    ],
    'called_home_code': [
        ('nunique', 'nunique'),
        ('called_diff_home_code', lambda x: x[x != x.shift()].count() / x.count())
    ],
    'called_code': [
        ('nunique', 'nunique'),
        ('diff', lambda x: x[x != x.shift()].count()/ x.count())
    ],
    'open_datetime': [
        ('open_count', 'nunique')
    ],
    'other_party': [
        ('account_person_num', 'nunique'),
        ('called_diff_home_code', lambda x: x[x != x.shift()].count() / x.count())
    ],
    'a_serv_type': [
        ('call_num', lambda x: x[x.isin([1, 3])].count()), 
        ('called_num', lambda x: x[x == 2].count()),
        ('call_called_normalized_diff', lambda x: (x[x.isin([1, 3])].count() - x[x == 2].count()) /  (x[x.isin([1, 3])].count() + x[x == 2].count())),
    ],
    'start_time_diff': [
        ('start_time_diff_mean', 'mean'), 
        ('start_time_diff_std', 'std'), 
        ('max', 'max'), 
        ('coefficient_of_variation', lambda x: x.std() / x.mean()),
    ], 
    # 'phone1_type': [
    #     ('nunique', 'nunique'),
    #     ('mode', lambda x: x.mode().values[0])
    # ],
    # 'distance': [
    #     ('sum', 'sum'), 
    #     ('std', 'std'), 
    #     ('max', 'max'), 
    #     ('quantile_25', lambda x: x.quantile(0.25)), 
    #     ('quantile_50', lambda x: x.quantile(0.50)), 
    #     ('quantile_75', lambda x: x.quantile(0.75)),
    # ]
})

labeled_aug_features = aggregate_features(labeled_data_aug)
validation_features = aggregate_features(validation_data)

labeled_aug_features.columns = ['+'.join(col).strip() for col in labeled_aug_features.columns.values]
validation_features.columns = ['+'.join(col).strip() for col in validation_features.columns.values]

labeled_aug_features.columns = labeled_aug_features.columns.str.replace('[', '').str.replace(']', '').str.replace('<', '').str.replace('>', '').str.replace('(', '').str.replace(')', '').str.replace(',', '').str.replace(' ', '_')
validation_features.columns = validation_features.columns.str.replace('[', '').str.replace(']', '').str.replace('<', '').str.replace('>', '').str.replace('(', '').str.replace(')', '').str.replace(',', '').str.replace(' ', '_')

# 重置索引
labeled_aug_features = labeled_aug_features.reset_index()
validation_features = validation_features.reset_index()

# 合并标签数据
labeled_aug_features = labeled_aug_features.merge(labels_aug, on='msisdn', how='left')
# 打印结果
# labeled_aug_features

# # 添加 ae 的编码特征
# labeled_ae = pd.read_csv('../data/ae/train.csv', dtype=dtypes)
# valid_ae = pd.read_csv('../data/ae/val.csv', dtype=dtypes)
# labeled_aug_features = labeled_aug_features.merge(labeled_ae, on='msisdn', how='left')
# validation_features = validation_features.merge(valid_ae, on='msisdn', how='left')

  ('called_diff_home_code', lambda x: x[x != x.shift()].count() / x.count())
  ('diff', lambda x: x[x != x.shift()].count()/ x.count())


In [4]:
labeled_aug_features.columns

Index(['msisdn', 'call_duration+sum', 'call_duration+mean',
       'call_duration+max', 'call_duration+std', 'call_duration+quantile_25',
       'call_duration+quantile_50', 'call_duration+quantile_75', 'cfee+sum',
       'cfee+std', 'cfee+mean', 'lfee+sum', 'lfee+mean', 'lfee+std',
       'hour+mean', 'hour+std', 'hour+max', 'hour+min', 'dayofweek+std',
       'dayofweek+magic', 'dayofweek+work_day_num', 'dayofweek+weekend_num',
       'dayofweek+mode', 'dayofweek+work_day_weekend_diff',
       'visit_area_code+nunique', 'visit_area_code+times_not_at_home_area',
       'called_home_code+nunique', 'called_home_code+called_diff_home_code',
       'called_code+nunique', 'called_code+diff', 'open_datetime+open_count',
       'other_party+account_person_num', 'other_party+called_diff_home_code',
       'a_serv_type+call_num', 'a_serv_type+called_num',
       'a_serv_type+call_called_normalized_diff',
       'start_time_diff+start_time_diff_mean',
       'start_time_diff+start_time_diff_std

In [5]:
len(labeled_aug_features.columns)

41

In [6]:
def get_nan(train):
    # 获取 train 中的 nan值
    train_nan = train[train.isnull().T.any()]
    # 统计 每列含有的 nan 数量
    for col in train.columns:
        if train[col].isnull().sum() > 0:
            print(col, train[col].isnull().sum())

    return train_nan
get_nan(labeled_aug_features), get_nan(validation_features)

call_duration+std 146
cfee+std 146
lfee+std 146
hour+std 146
dayofweek+std 146
called_home_code+called_diff_home_code 1
called_code+diff 1
start_time_diff+start_time_diff_std 146
start_time_diff+coefficient_of_variation 146
call_duration+std 78
cfee+std 78
lfee+std 78
hour+std 78
dayofweek+std 78
start_time_diff+start_time_diff_std 78
start_time_diff+coefficient_of_variation 78


(         msisdn  call_duration+sum  call_duration+mean  call_duration+max  \
 330     1005436                 25                25.0                 25   
 945     1010686                 16                16.0                 16   
 1755    1013277                351               351.0                351   
 1950    1013990                 56                56.0                 56   
 2127    1015735                 66                66.0                 66   
 ...         ...                ...                 ...                ...   
 128202  2419249                  2                 2.0                  2   
 129292  2423456                 14                14.0                 14   
 132326  2500562                 23                23.0                 23   
 132693  2502357                 74                74.0                 74   
 132764  2502602                507               507.0                507   
 
         call_duration+std  call_duration+quantile_25  \
 330 

In [7]:
# 一般只有 std 会出现 nan 值故所有的 nan 值填充为 0
labeled_aug_features = labeled_aug_features.fillna(0)
validation_features = validation_features.fillna(0)

def get_nan(train):
    # 获取 train 中的 nan值
    train_nan = train[train.isnull().T.any()]
    # 统计 每列含有的 nan 数量
    for col in train.columns:
        if train[col].isnull().sum() > 0:
            print(col, train[col].isnull().sum())

    return train_nan
get_nan(labeled_aug_features), get_nan(validation_features)

(Empty DataFrame
 Columns: [msisdn, call_duration+sum, call_duration+mean, call_duration+max, call_duration+std, call_duration+quantile_25, call_duration+quantile_50, call_duration+quantile_75, cfee+sum, cfee+std, cfee+mean, lfee+sum, lfee+mean, lfee+std, hour+mean, hour+std, hour+max, hour+min, dayofweek+std, dayofweek+magic, dayofweek+work_day_num, dayofweek+weekend_num, dayofweek+mode, dayofweek+work_day_weekend_diff, visit_area_code+nunique, visit_area_code+times_not_at_home_area, called_home_code+nunique, called_home_code+called_diff_home_code, called_code+nunique, called_code+diff, open_datetime+open_count, other_party+account_person_num, other_party+called_diff_home_code, a_serv_type+call_num, a_serv_type+called_num, a_serv_type+call_called_normalized_diff, start_time_diff+start_time_diff_mean, start_time_diff+start_time_diff_std, start_time_diff+max, start_time_diff+coefficient_of_variation, is_sa]
 Index: []
 
 [0 rows x 41 columns],
 Empty DataFrame
 Columns: [msisdn, call_du

In [9]:
# # # 交叉特征
# # # 将所有特征两两相乘
# from itertools import combinations
# from tqdm import tqdm

# def cross_features(data):
#     cross_features = []
#     new_features = []
#     cross_cols = data.columns.tolist()
#     rm_cols = ['msisdn', 'is_sa']
#     for col in rm_cols:
#         if col in cross_cols:
#             cross_cols.remove(col)

#     for i, j in tqdm(combinations(cross_cols, 2), total=len(cross_cols) * (len(cross_cols) - 1) // 2):
#         new_features.append(data[i] * data[j])
#         cross_features.append(f'{i}_cross_{j}')
#     new_features = pd.concat(new_features, axis=1)
#     new_features.columns = cross_features
#     data = pd.concat([data, new_features], axis=1)
#     return data, cross_features

# labeled_aug_features, _ = cross_features(labeled_aug_features)
# validation_features, _ = cross_features(validation_features)

In [8]:
X = labeled_aug_features.drop(['msisdn'], axis=1)
y = labeled_aug_features['is_sa']
X_validation = validation_features.drop(['msisdn'], axis=1)

n_sample = y.shape[0]
n_pos_sample = y[y ==1].shape[0]
n_neg_sample = y[y == 0].shape[0]
print('样本个数：{}; 正样本占{:.2%}; 负样本占{:.2%}'.format(n_sample,
                                                   n_pos_sample / n_sample,
                                                   n_neg_sample / n_sample))
print('特征维数：', X.shape[1])

样本个数：133591; 正样本占29.31%; 负样本占70.69%
特征维数： 40


In [9]:
labeled_aug_features.columns

Index(['msisdn', 'call_duration+sum', 'call_duration+mean',
       'call_duration+max', 'call_duration+std', 'call_duration+quantile_25',
       'call_duration+quantile_50', 'call_duration+quantile_75', 'cfee+sum',
       'cfee+std', 'cfee+mean', 'lfee+sum', 'lfee+mean', 'lfee+std',
       'hour+mean', 'hour+std', 'hour+max', 'hour+min', 'dayofweek+std',
       'dayofweek+magic', 'dayofweek+work_day_num', 'dayofweek+weekend_num',
       'dayofweek+mode', 'dayofweek+work_day_weekend_diff',
       'visit_area_code+nunique', 'visit_area_code+times_not_at_home_area',
       'called_home_code+nunique', 'called_home_code+called_diff_home_code',
       'called_code+nunique', 'called_code+diff', 'open_datetime+open_count',
       'other_party+account_person_num', 'other_party+called_diff_home_code',
       'a_serv_type+call_num', 'a_serv_type+called_num',
       'a_serv_type+call_called_normalized_diff',
       'start_time_diff+start_time_diff_mean',
       'start_time_diff+start_time_diff_std

In [10]:
# TODO use all_X to impute
imputer = SimpleImputer(strategy='most_frequent')
X = imputer.fit_transform(X)

In [11]:
imputer2 = SimpleImputer(strategy='most_frequent')
X_validation = imputer2.fit_transform(X_validation)

In [12]:
X.shape

(133591, 40)

In [13]:
X_validation.shape

(13005, 39)

In [14]:
y.shape

(133591,)

In [15]:
assert len(train_labels) + len(test_labels) == len(labeled_aug_features)
len(train_labels), len(test_labels)

(129732, 3859)

In [16]:
from sklearn.utils import shuffle
# 将 msisdn 和 is_sa 并入 X 再划分
train_data_msisdn = train_labels['msisdn']
test_data_msisdn = test_labels['msisdn']
X_df = pd.DataFrame(X, columns=labeled_aug_features.drop(['msisdn'], axis=1).columns)
X_df = pd.concat([labeled_aug_features[['msisdn']], X_df], axis=1)
train_set = X_df[X_df['msisdn'].isin(train_data_msisdn)][X_df.columns[1:]]
test_set = X_df[X_df['msisdn'].isin(test_data_msisdn)][X_df.columns[1:]]

print(f"1 samples / 0 samples in train set: {len(train_set[train_set['is_sa'] == 1])} / {len(train_set[train_set['is_sa'] == 0])}")
print(f"1 samples / 0 samples in test set: {len(test_set[test_set['is_sa'] == 1])} / {len(test_set[test_set['is_sa'] == 0])}")

if ALL:
    # if not NO_SMOTE:
    #     smote = SMOTE(random_state=42)    # 处理过采样的方法
    #     X, y = smote.fit_resample(X, y)
    # X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42, shuffle=True)
    train_len = len(test_set) + len(train_set)
    test_len = 0
else:
    # X_train,X_test,y_train,y_test = train_test_split(X,y,stratify = y,test_size= 0.3,random_state=42, shuffle=True)
    # X_train, y_train = shuffle(X_train, y_train, random_state=42)
    train_len, test_len = len(train_set), len(test_set)

    # if not NO_SMOTE:
    #     smote = SMOTE(random_state=42)    # 处理过采样的方法
    #     X_train, y_train = smote.fit_resample(X_train, y_train)
    #     print('通过SMOTE方法平衡正负样本后')
    #     n_sample = y_train.shape[0]
    #     n_pos_sample = y_train[y_train == 1].shape[0]
    #     n_neg_sample = y_train[y_train == 0].shape[0]
    #     print('样本个数：{}; 正样本占{:.2%}; 负样本占{:.2%}'.format(n_sample,
    #                                                     n_pos_sample / n_sample,
    #                                                     n_neg_sample / n_sample))
    #     print('特征维数：', X.shape[1])

1 samples / 0 samples in train set: 38768 / 90964
1 samples / 0 samples in test set: 384 / 3475


In [17]:
columns = labeled_aug_features.columns.tolist()
columns.remove('msisdn')
valid_set = np.c_[X_validation, np.zeros(X_validation.shape[0])]
valid_set = pd.DataFrame(valid_set, columns=columns)
valid_set['is_sa'] = -1

In [18]:
train_set.head()

Unnamed: 0,call_duration+sum,call_duration+mean,call_duration+max,call_duration+std,call_duration+quantile_25,call_duration+quantile_50,call_duration+quantile_75,cfee+sum,cfee+std,cfee+mean,...,other_party+account_person_num,other_party+called_diff_home_code,a_serv_type+call_num,a_serv_type+called_num,a_serv_type+call_called_normalized_diff,start_time_diff+start_time_diff_mean,start_time_diff+start_time_diff_std,start_time_diff+max,start_time_diff+coefficient_of_variation,is_sa
0,6369.0,68.483871,1129.0,128.481906,20.0,37.0,78.0,0.0,0.0,0.0,...,44.0,0.860215,62.0,31.0,0.333333,13200.580645,24019.16918,105436.0,1.819554,0.0
1,6127.0,72.940476,1129.0,134.267612,21.75,41.0,78.5,0.0,0.0,0.0,...,40.0,0.869048,55.0,29.0,0.309524,14614.928571,25121.045843,105436.0,1.718862,0.0
2,6067.0,72.22619,1129.0,134.270752,21.75,41.0,78.5,0.0,0.0,0.0,...,41.0,0.869048,55.0,29.0,0.309524,14614.928571,25133.971996,105436.0,1.719746,0.0
3,375.0,20.833333,156.0,39.165186,3.5,8.0,13.5,0.0,0.0,0.0,...,6.0,0.333333,13.0,5.0,0.444444,14917.611111,23149.372122,74415.0,1.551815,1.0
4,370.0,21.764706,156.0,40.164551,3.0,8.0,15.0,0.0,0.0,0.0,...,6.0,0.352941,13.0,4.0,0.529412,15795.117647,26285.288742,74415.0,1.66414,1.0


In [19]:
test_set.describe()

Unnamed: 0,call_duration+sum,call_duration+mean,call_duration+max,call_duration+std,call_duration+quantile_25,call_duration+quantile_50,call_duration+quantile_75,cfee+sum,cfee+std,cfee+mean,...,other_party+account_person_num,other_party+called_diff_home_code,a_serv_type+call_num,a_serv_type+called_num,a_serv_type+call_called_normalized_diff,start_time_diff+start_time_diff_mean,start_time_diff+start_time_diff_std,start_time_diff+max,start_time_diff+coefficient_of_variation,is_sa
count,3859.0,3859.0,3859.0,3859.0,3859.0,3859.0,3859.0,3859.0,3859.0,3859.0,...,3859.0,3859.0,3859.0,3859.0,3859.0,3859.0,3859.0,3859.0,3859.0,3859.0
mean,10406.521897,98.551688,1019.947396,161.949909,22.429386,44.574112,101.2454,118.738015,2.351438,1.248072,...,35.999223,0.723654,61.325214,56.421353,-0.020429,26362.870725,41034.896147,167544.5,1.821371,0.099508
std,12760.882888,92.2224,1033.736896,173.551887,16.044677,37.889841,113.887055,700.271881,21.844227,22.155837,...,58.313255,0.163836,84.329243,65.66806,0.383948,34537.860324,50357.713939,138566.6,0.744165,0.299381
min,9.0,7.5,9.0,0.0,3.0,7.5,8.75,0.0,0.0,0.0,...,1.0,0.020408,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0
25%,2636.0,49.842071,327.5,58.460836,15.0,28.0,54.0,0.0,0.0,0.0,...,10.0,0.651446,15.0,17.0,-0.262935,8164.690789,15486.485752,79475.0,1.424679,0.0
50%,6210.0,74.507692,702.0,108.303908,20.0,37.5,75.5,0.0,0.0,0.0,...,21.0,0.755814,36.0,37.0,-0.012658,15687.1,24991.270244,125400.0,1.694879,0.0
75%,13446.5,114.641912,1437.0,206.710735,25.25,50.0,111.0,0.0,0.0,0.0,...,42.0,0.833333,74.0,70.0,0.201156,30406.071646,46539.805125,203318.5,2.030944,0.0
max,148852.0,1585.351351,12031.0,2985.972392,557.5,1078.0,2462.75,13060.0,1158.022021,1345.5,...,1182.0,1.0,1188.0,918.0,1.0,419491.666667,724139.196063,1255654.0,14.622485,1.0


In [20]:
all_set = pd.concat([train_set, test_set, valid_set], axis=0).reset_index(drop=True)
labeled_data_len = train_set.shape[0] + test_set.shape[0]

In [21]:
test_set.shape, train_set.shape, valid_set.shape, all_set.shape

((3859, 40), (129732, 40), (13005, 40), (146596, 40))

数据增强
NOTE: 因为有些聚类不能处理字符串数据，所以在这里把转换为数值格式的数据进行处理，而不是一开始针对原数据做增强

In [22]:
NO_SMOTE = True
NO_ADASYN = True
CLUSTER = True
SHUFFLE = True

In [24]:
from sklearn.cluster import DBSCAN, KMeans
from sklearn.neighbors import NearestNeighbors
from imblearn.over_sampling import ADASYN

def cluster_oversample(data, label_col='is_sa', cluster_method='kmeans++', k=5, sample_ratio=1.0):
    # 1. 提取标签为0和1的样本
    normal_data = data[data[label_col] == 0].drop(columns=[label_col])
    malicious_count = len(data[data[label_col] == 1])
    normal_count = len(normal_data)
    
    # 计算需要生成的新样本总数并分成四份
    target_sample_count = int((normal_count - malicious_count) * sample_ratio)
    samples_per_cluster_per_method = (target_sample_count // 4) // k  # 每种方法生成的样本数量
    new_samples = []

    # 2. 聚类
    if cluster_method == 'kmeans++':
        kmeans = KMeans(n_clusters=k, init='k-means++', random_state=0)
        labels = kmeans.fit_predict(normal_data)
        clusters = {i: normal_data[labels == i] for i in range(k)}
    elif cluster_method == 'other_methods':
        pass
        # TODO: 其他聚类方法

    # 3. 逐方法生成新样本

    # 方法1：聚类中心加噪声
    for cluster_id, cluster_data in clusters.items():
        cluster_center = cluster_data.mean(axis=0).values
        noise = np.random.normal(0, 0.01, size=(samples_per_cluster_per_method, cluster_data.shape[1]))
        synthetic_samples = cluster_center + noise
        synthetic_samples_df = pd.DataFrame(synthetic_samples, columns=normal_data.columns)
        new_samples.append(synthetic_samples_df)
    
    # 方法2：聚类内样本插值 (C-SMOTE)
    for cluster_id, cluster_data in clusters.items():
        nn = NearestNeighbors(n_neighbors=2).fit(cluster_data)
        interpolated_samples = []
        for _ in range(samples_per_cluster_per_method):
            sample_idx = np.random.randint(0, len(cluster_data))
            sample_point = cluster_data.iloc[[sample_idx]]
            _, neighbors = nn.kneighbors(sample_point)  # 直接传入 DataFrame 保持一致性
            neighbor_idx = neighbors[0, 1]
            sample_a = cluster_data.iloc[sample_idx].values
            sample_b = cluster_data.iloc[neighbor_idx].values
            interpolated_sample = sample_a + np.random.rand() * (sample_b - sample_a)
            interpolated_samples.append(interpolated_sample)
        interpolated_samples_df = pd.DataFrame(interpolated_samples, columns=normal_data.columns)
        new_samples.append(interpolated_samples_df)

    # 方法3：自适应聚类过采样 (ACO)
    for cluster_id, cluster_data in clusters.items():
        cluster_center = cluster_data.mean(axis=0).values
        noise = np.random.normal(0, 0.01, size=(samples_per_cluster_per_method, cluster_data.shape[1]))
        adaptive_samples = cluster_center + noise
        adaptive_samples_df = pd.DataFrame(adaptive_samples, columns=normal_data.columns)
        new_samples.append(adaptive_samples_df)

    # 方法4：基于聚类的 SMOTE (Cluster SMOTE)
    for cluster_id, cluster_data in clusters.items():
        smote_samples = []
        for _ in range(samples_per_cluster_per_method):
            sample_pair = cluster_data.sample(2)
            interpolated_sample = sample_pair.mean().values.reshape(1, -1)
            smote_samples.append(interpolated_sample[0])
        smote_samples_df = pd.DataFrame(smote_samples, columns=normal_data.columns)
        new_samples.append(smote_samples_df)

    # 4. 合并生成的新样本并添加标签
    new_samples = pd.concat(new_samples, ignore_index=True)
    new_samples[label_col] = 1

    # 打印新生成样本的总数
    print(f"Total new samples generated: {len(new_samples)} (Target: {target_sample_count})")

    return data, new_samples


def fly_augmentation(train_set: pd.DataFrame, ratio: float, mode: str) -> pd.DataFrame:
    # Calculate the sample size for each class based on the ratio
    num_samples = int(ratio * len(train_set))
    
    # Separate the dataset by class
    class_0 = train_set[train_set['is_sa'] == 0]
    class_1 = train_set[train_set['is_sa'] == 1]
    
    # Sample from each class without replacement to avoid changing the total number of rows
    sampled_class_0 = class_0.sample(n=num_samples, replace=False, random_state=42)
    sampled_class_1 = class_1.sample(n=num_samples, replace=False, random_state=42)
    
    # Concatenate the samples from both classes
    augmented_data = pd.concat([sampled_class_0, sampled_class_1])
    
    # Get feature columns (excluding the last column 'is_sa')
    features = augmented_data.columns[:-1]
    
    # Mode "intra-class": Shuffle features within each class
    if mode == "intra-class":
        for col in features:
            augmented_data.loc[augmented_data['is_sa'] == 0, col] = np.random.permutation(augmented_data[augmented_data['is_sa'] == 0][col].values)
            augmented_data.loc[augmented_data['is_sa'] == 1, col] = np.random.permutation(augmented_data[augmented_data['is_sa'] == 1][col].values)
    
    # Mode "inter-class": Shuffle features between classes
    elif mode == "inter-class":
        for col in features:
            combined_values = np.concatenate((augmented_data[augmented_data['is_sa'] == 0][col].values,
                                              augmented_data[augmented_data['is_sa'] == 1][col].values))
            shuffled_values = np.random.permutation(combined_values)
            
            # Split the shuffled values back into the two classes
            augmented_data.loc[augmented_data['is_sa'] == 0, col] = shuffled_values[:num_samples]
            augmented_data.loc[augmented_data['is_sa'] == 1, col] = shuffled_values[num_samples:]
    
    # Replace the sampled original data with the augmented data in train_set
    train_set.update(augmented_data)
    
    return train_set.reset_index(drop=True)

In [26]:
labeled_set, valid_set = all_set.iloc[:labeled_data_len].copy(), all_set.iloc[labeled_data_len:].copy()
labeled_set.reset_index(drop=True, inplace=True)
valid_set.reset_index(drop=True, inplace=True)
# 有一些值在SMOTE后对数变换后为 NaN，需要删除这些数据
print(labeled_set.isnull().sum().sum())
labeled_set = labeled_set.dropna()
print(labeled_set.isnull().sum().sum())
assert valid_set.shape[0] == validation_features.shape[0]

# 重新划分训练集和测试集
if not ALL:
    train_set, test_set = labeled_set.iloc[:train_len].copy(), labeled_set.iloc[train_len:].copy()
    train_set.reset_index(drop=True, inplace=True)
    test_set.reset_index(drop=True, inplace=True)

    # remove_columns = ['distance_distance_std', "start_time_diff_start_time_diff_max", "distance_distance_quantile_75"]

    # remove_columns = ["lfee_lfee_std", "lfee_lfee_mean", 'call_duration_call_duration_max', "distance_distance_quantile_50", "call_duration_call_duration_quantile_25"]
    # remove_columns = ["7", "6", "lfee_lfee_mean", "hour_hour_std", "1", "call_duration_call_duration_quantile_75", "3", "cfee_cfee_std", "start_time_diff_start_time_diff_max", "call_duration_call_duration_max", "dayofweek_dayofweek_mode", "distance_distance_quantile_75", "cfee_cfee_mean"] # , "visit_area_code_visit_area_code_nunique", "visit_area_code_visit_area_code_nunique"
    # remove_columns = ['visit_area_code+nunique_cross_start_time_diff+max', "distance+std"]
    # remove_columns = ['dayofweek+std', 'start_time_diff+max', 'distance+quantile_75', 'lfee+mean', 'lfee+std', 'lfee+sum', 'cfee+sum', '6', 'visit_area_code+nunique']
    remove_columns = ['cfee+std', 'start_time_diff+start_time_diff_std', 'lfee+mean', 'lfee+sum', 'lfee+std']
    train_set = train_set.drop(remove_columns, axis=1)
    test_set = test_set.drop(remove_columns, axis=1)
    valid_set = valid_set.drop(remove_columns, axis=1)
    
    # 对采样数据做 smote
    if not NO_SMOTE:
        smote = SMOTE(random_state=42)    # 处理过采样的方法
        X_train, y_train = smote.fit_resample(train_set.drop(['is_sa'], axis=1), train_set['is_sa'])
        train_set = pd.concat([X_train, y_train], axis=1)
        print('通过SMOTE方法平衡正负样本后')
        n_sample = y_train.shape[0]
        n_pos_sample = y_train[y_train == 1].shape[0]
        n_neg_sample = y_train[y_train == 0].shape[0]
        print('样本个数：{}; 正样本占{:.2%}; 负样本占{:.2%}'.format(n_sample,
                                                        n_pos_sample / n_sample,
                                                        n_neg_sample / n_sample))
    elif not NO_ADASYN:
        adasyn = ADASYN(random_state=42)
        X_train, y_train = adasyn.fit_resample(train_set.drop(['is_sa'], axis=1), train_set['is_sa'])
        train_set = pd.concat([X_train, y_train], axis=1)
        print('通过ADASYN方法平衡正负样本后')
        n_sample = y_train.shape[0]
        n_pos_sample = y_train[y_train == 1].shape[0]
        n_neg_sample = y_train[y_train == 0].shape[0]
        print('样本个数：{}; 正样本占{:.2%}; 负样本占{:.2%}'.format(n_sample,
                                                        n_pos_sample / n_sample,
                                                        n_neg_sample / n_sample))
    elif CLUSTER:
        train_set, new_sample = cluster_oversample(train_set, label_col='is_sa', cluster_method='kmeans++', k=5, sample_ratio=1.0)
        if SHUFFLE:
            # 只针对原数据做shuffle
            train_set = fly_augmentation(train_set, ratio=0.01, mode='intra-class')
        train_set = pd.concat([train_set, new_sample], ignore_index=True)
        print('通过cluster sampling方法平衡正负样本后')
        n_sample = len(train_set)
        n_pos_sample = len(train_set[train_set['is_sa'] == 1])
        n_neg_sample = len(train_set[train_set['is_sa'] == 0])
        print('样本个数：{}; 正样本占{:.2%}; 负样本占{:.2%}'.format(n_sample,
                                                        n_pos_sample / n_sample,
                                                        n_neg_sample / n_sample))
    assert train_set.shape[1] == test_set.shape[1] == valid_set.shape[1]
else:
    if not NO_SMOTE:
        # BUG:
        # 对 all_set 做 smote
        smote = SMOTE(random_state=42)    # 处理过采样的方法
        X_train, y_train = smote.fit_resample(labeled_set.drop(['is_sa'], axis=1), labeled_set['is_sa'])
        labeled_set = pd.concat([X_train, y_train], axis=1)
        print('通过SMOTE方法平衡正负样本后')
        n_sample = y_train.shape[0]
        n_pos_sample = y_train[y_train == 1].shape[0]
        n_neg_sample = y_train[y_train == 0].shape[0]
        print('样本个数：{}; 正样本占{:.2%}; 负样本占{:.2%}'.format(n_sample,
                                                        n_pos_sample / n_sample,
                                                        n_neg_sample / n_sample))
    elif not NO_ADASYN:
        adasyn = ADASYN(random_state=42)
        X_train, y_train = adasyn.fit_resample(labeled_set.drop(['is_sa'], axis=1), labeled_set['is_sa'])
        labeled_set = pd.concat([X_train, y_train], axis=1)
        print('通过ADASYN方法平衡正负样本后')
        n_sample = y_train.shape[0]
        n_pos_sample = y_train[y_train == 1].shape[0]
        n_neg_sample = y_train[y_train == 0].shape[0]
        print('样本个数：{}; 正样本占{:.2%}; 负样本占{:.2%}'.format(n_sample,
                                                        n_pos_sample / n_sample,
                                                        n_neg_sample / n_sample))
    elif CLUSTER:
        labeled_set, new_sample = cluster_oversample(labeled_set, label_col='is_sa', cluster_method='kmeans++', k=5, sample_ratio=1.0)
        if SHUFFLE:
            # 只针对原数据做shuffle
            labeled_set = fly_augmentation(labeled_set, ratio=0.01, mode='intra-class')
        labeled_set = pd.concat([labeled_set, new_sample], ignore_index=True)
        print('通过cluster sampling方法平衡正负样本后')
        n_sample = len(train_set)
        n_pos_sample = len(train_set[train_set['is_sa'] == 1])
        n_neg_sample = len(train_set[train_set['is_sa'] == 0])
        print('样本个数：{}; 正样本占{:.2%}; 负样本占{:.2%}'.format(n_sample,
                                                        n_pos_sample / n_sample,
                                                        n_neg_sample / n_sample))
        
    remove_columns = ['0', '1', '2', '3', '4', '5', '6', '7', 'cfee+std', 'start_time_diff+start_time_diff_std', 'lfee+mean', 'lfee+sum', 'lfee+std']
    labeled_set = labeled_set.drop(remove_columns, axis=1)
    valid_set = valid_set.drop(remove_columns, axis=1)
    test_set = test_set.drop(remove_columns, axis=1)
    assert labeled_set.shape[1] == valid_set.shape[1] == test_set.shape[1]
    

0
0


  super()._check_params_vs_input(X, default_n_init=10)


Total new samples generated: 52180 (Target: 52196)
通过cluster sampling方法平衡正负样本后
样本个数：181912; 正样本占50.00%; 负样本占50.00%


In [27]:
train_set.tail()

Unnamed: 0,call_duration+sum,call_duration+mean,call_duration+max,call_duration+std,call_duration+quantile_25,call_duration+quantile_50,call_duration+quantile_75,cfee+sum,cfee+mean,hour+mean,...,open_datetime+open_count,other_party+account_person_num,other_party+called_diff_home_code,a_serv_type+call_num,a_serv_type+called_num,a_serv_type+call_called_normalized_diff,start_time_diff+start_time_diff_mean,start_time_diff+max,start_time_diff+coefficient_of_variation,is_sa
181907,792.5,52.861111,321.5,82.759453,15.125,24.5,43.125,0.0,0.0,15.019841,...,1.0,7.0,0.634921,7.0,9.0,-0.15873,78023.896825,491457.5,1.678701,1.0
181908,1320.5,336.95098,856.5,426.525167,90.5,174.5,495.25,0.0,0.0,14.333333,...,1.0,4.0,0.627451,2.5,7.5,-0.705882,155626.617647,479804.0,1.496259,1.0
181909,1060.5,75.75,215.0,68.400737,28.75,48.5,99.125,0.0,0.0,13.571429,...,1.0,9.5,0.75,2.0,12.0,-0.714286,80028.678571,556940.0,1.882867,1.0
181910,4024.5,244.291667,1286.0,376.860831,30.125,70.75,231.75,0.0,0.0,13.760417,...,1.0,9.0,0.921131,14.5,4.0,0.589286,56870.799107,414895.5,1.820382,1.0
181911,1263.5,52.674731,180.0,43.160178,18.5,43.0,75.125,0.0,0.0,15.822581,...,1.0,12.0,0.509857,1.5,23.0,-0.833333,53228.601254,492249.5,2.069673,1.0


In [28]:
new_sample.tail()

Unnamed: 0,call_duration+sum,call_duration+mean,call_duration+max,call_duration+std,call_duration+quantile_25,call_duration+quantile_50,call_duration+quantile_75,cfee+sum,cfee+mean,hour+mean,...,open_datetime+open_count,other_party+account_person_num,other_party+called_diff_home_code,a_serv_type+call_num,a_serv_type+called_num,a_serv_type+call_called_normalized_diff,start_time_diff+start_time_diff_mean,start_time_diff+max,start_time_diff+coefficient_of_variation,is_sa
52175,792.5,52.861111,321.5,82.759453,15.125,24.5,43.125,0.0,0.0,15.019841,...,1.0,7.0,0.634921,7.0,9.0,-0.15873,78023.896825,491457.5,1.678701,1
52176,1320.5,336.95098,856.5,426.525167,90.5,174.5,495.25,0.0,0.0,14.333333,...,1.0,4.0,0.627451,2.5,7.5,-0.705882,155626.617647,479804.0,1.496259,1
52177,1060.5,75.75,215.0,68.400737,28.75,48.5,99.125,0.0,0.0,13.571429,...,1.0,9.5,0.75,2.0,12.0,-0.714286,80028.678571,556940.0,1.882867,1
52178,4024.5,244.291667,1286.0,376.860831,30.125,70.75,231.75,0.0,0.0,13.760417,...,1.0,9.0,0.921131,14.5,4.0,0.589286,56870.799107,414895.5,1.820382,1
52179,1263.5,52.674731,180.0,43.160178,18.5,43.0,75.125,0.0,0.0,15.822581,...,1.0,12.0,0.509857,1.5,23.0,-0.833333,53228.601254,492249.5,2.069673,1


In [117]:
train_set.shape, test_set.shape, valid_set.shape
# ((129732, 35), (3859, 35), (13005, 35))
# ((181928, 35), (3859, 35), (13005, 35))

((181912, 35), (3859, 35), (13005, 35))

In [None]:
# 使用 autogluon 训练
from autogluon.tabular import TabularPredictor
# import ray
# 使用防止过拟合的超参数
# hyperparameters = {
#     'GBM': {'lambda_l1': 1e-2, 'lambda_l2': 1e-2},
#     'FASTAI': {'dropout_prob': 0.2}
# }
# ray.shutdown()
# ray.init(include_dashboard=True, object_store_memory=10**9)  # Increase object store memory

# 输入数据X_train, y_train
if not ALL:
    # 交叉验证训练
    model = TabularPredictor(label='is_sa', eval_metric='f1', problem_type='binary').fit(train_set, presets='medium_quality', time_limit=3600)
    # , excluded_model_types=['KNN']
    # model = TabularPredictor(label='is_sa', eval_metric='f1', problem_type='binary').fit(train_set, presets='best_quality', time_limit=3600)
else:
    model = TabularPredictor(label='is_sa', eval_metric='f1', problem_type='binary').fit(labeled_set, presets='best_quality', num_bag_folds=10, time_limit=3600)

In [None]:
if not ALL:
    print(model.evaluate(test_set))

In [None]:
feature_importance = model.feature_importance(test_set if not ALL else labeled_set)
print(feature_importance)
feature_importance

In [None]:
# leaderboard
if not ALL:
    leaderboard = model.leaderboard(test_set, silent=True)
    print(leaderboard)
else:
    leaderboard = model.leaderboard(labeled_set, silent=True)
    print(leaderboard)
leaderboard

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report

# 假设 model 已经训练好，并且 test_set 已经定义
if not ALL:
    y_pred = model.predict(test_set)
    y_true = test_set['is_sa']
    
    # 打印分类报告
    print(classification_report(y_true, y_pred))
    
    # 计算混淆矩阵
    cm = confusion_matrix(y_true, y_pred)
    
    # 可视化混淆矩阵
    plt.figure(figsize=(10, 7))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Predicted Negative', 'Predicted Positive'], yticklabels=['Actual Negative', 'Actual Positive'])
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.tight_layout()
    os.makedirs("../vis", exist_ok=True)
    plt.savefig("../vis/confusion_matrix.png")
    plt.show()

In [None]:
# 模型决策阈值微调
threadhold = 0.2
if not ALL:
    y_pred_proba = model.predict_proba(test_set)
    # print(y_pred_proba)
    y_pred = (y_pred_proba.iloc[:, 1] > threadhold).astype(int)
    print(classification_report(y_true, y_pred))
    print(confusion_matrix(y_true, y_pred))

In [None]:
valid_set

In [None]:
test_set.shape

In [None]:
# 预测
y_validation_pred = model.predict(valid_set.drop('is_sa', axis=1))

# 将预测结果与 msisdn 对应起来
validation_results = validation_features[['msisdn']].copy()
validation_results['is_sa'] = y_validation_pred.astype(int)

print(validation_results.describe())

# 保存结果到CSV文件
import time
time_str = time.strftime("%Y%m%d%H%M%S", time.localtime())
file_name = './valid_large_data_{}.csv'.format(time_str) if ALL else './valid_small_data_{}.csv'.format(time_str)
validation_results.to_csv(file_name, index=False)
print(file_name)

In [36]:
# import numpy as np
# from sklearn.metrics import f1_score, classification_report, confusion_matrix

# # 自动调整最佳阈值
# if not ALL:
#     y_true = test_set['is_sa']
#     y_pred_proba = model.predict_proba(test_set)

#     thresholds = np.arange(0.0, 1.0, 0.01)
#     f1_scores = []

#     for threshold in thresholds:
#         y_pred = (y_pred_proba.iloc[:, 1] >= threshold).astype(int)
#         f1 = f1_score(y_true, y_pred)
#         f1_scores.append(f1)

#     best_threshold = thresholds[np.argmax(f1_scores)]
#     print(f'最佳阈值：{best_threshold}')
#     print(f'最佳 F1 分数：{max(f1_scores)}')

#     # 使用最佳阈值进行预测
#     y_pred = (y_pred_proba.iloc[:, 1] >= best_threshold).astype(int)
#     print(classification_report(y_true, y_pred))
#     print(confusion_matrix(y_true, y_pred))

In [37]:
# # 阈值微调版结果
# best_threshold = 0.3
# # 使用最佳决策阈值进行预测
# y_validation_pred_proba = model.predict_proba(valid_set.drop('is_sa', axis=1))
# y_validation_pred = (y_validation_pred_proba.iloc[:, 1] >= best_threshold).astype(int)

# # 将预测结果与 msisdn 对应起来
# validation_results = validation_features[['msisdn']].copy()
# validation_results['is_sa'] = y_validation_pred.astype(int)

# print(validation_results.describe())

# # 保存结果到CSV文件
# import time
# time_str = time.strftime("%Y%m%d%H%M%S", time.localtime())
# file_name = './valid_large_data_{}.csv'.format(time_str) if ALL else './valid_small_data_{}.csv'.format(time_str)
# validation_results.to_csv(file_name, index=False)
# print(file_name)