In [411]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import confusion_matrix,classification_report
from xgboost import XGBClassifier
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
import time

ALL = False
NO_SMOTE = True
TEST_RATIO = 0.1

dtypes = {
    'msisdn': 'str',
    'start_time': 'str',
    'end_time': 'str',
    'call_event': 'category',
    'other_party': 'str',
    'ismultimedia': 'category',
    'home_area_code': 'str',
    'visit_area_code': 'str',
    'called_home_code': 'str',
    'called_code': 'str',
    'a_serv_type': 'int',
    'long_type1': 'int',
    'roam_type': 'int',
    'a_product_id': 'str',
    'open_datetime': 'str',
    'call_duration': 'int32',
    'cfee': 'float64',
    'lfee': 'float64',
    'hour': 'int8',
    'dayofweek': 'int',
    'phone1_type': 'int',
    'phone2_type': 'int',
    'phone1_loc_city': 'str',
    'phone1_loc_province': 'str',
    'phone2_loc_city': 'str',
    'phone2_loc_province': 'str',
    'update_time': 'str',
    'date': 'str',
    'date_c': 'str'
}

# 判断 processed 文件夹是否存在
import os
if not os.path.exists('../self_data/processed'):
    print("Creating processed data folder...")
    # 读取CSV文件
    labeled_data = pd.read_csv('../self_data/sorted_trainSet_res_with_head.csv', dtype=dtypes)
    labels = pd.read_csv('../self_data/trainSet_ans_with_head.csv', dtype=dtypes)

    validation_data = pd.read_csv('../self_data/sorted_validationSet_res_with_head.csv', dtype=dtypes)

    # 按照 msisdn 切分 train_data 和 test_data
    train_data_msisdn, test_data_msisdn = train_test_split(labels['msisdn'], test_size=TEST_RATIO, random_state=42, stratify=labels['is_sa'])
    train_data = labeled_data[labeled_data['msisdn'].isin(train_data_msisdn)]
    train_labels = labels[labels['msisdn'].isin(train_data_msisdn)]
    assert len(train_data['msisdn'].unique()) == len(train_data_msisdn)

    test_data = labeled_data[labeled_data['msisdn'].isin(test_data_msisdn)]
    test_labels = labels[labels['msisdn'].isin(test_data_msisdn)]
    assert len(test_data['msisdn'].unique()) == len(test_data_msisdn)


    # # 遍历 groupby('msisdn') 的结果，对每个 msisdn 进行数据增强
    # # ------
    from tqdm import tqdm
    import os
    import sys
    from utils.augmentation import Augmentation

    addition_train_data = []
    addition_train_labels = []

    times = 1
    ratio_range = 0.1
    pbar = tqdm(train_data.groupby('msisdn'))
    for msisdn, group in pbar:
        if msisdn == 0:
            continue
        # print(f"Augmenting msisdn {msisdn}")
        pbar.set_description(f"Augmenting msisdn {msisdn}")
        label = train_labels[train_labels['msisdn'] == msisdn].iloc[0]['is_sa']
        aug = Augmentation(group, label, 'msisdn', 'is_sa')
        # 对正负样本进行平衡 样本比 1:4
        if label == 1:
            res_df, res_labels = aug.times(ratio=ratio_range, times=8+9*times, method='mask')

            addition_train_data.append(res_df)
            addition_train_labels.append(res_labels)

            # res_df, res_labels = aug.times(window_size=100, step_size=80, times=1, method='sliding_window')

            # addition_train_data.append(res_df)
            # addition_train_labels.append(res_labels)
        else:
            res_df, res_labels = aug.times(ratio=ratio_range, times=times, method='mask')

            addition_train_data.append(res_df)
            addition_train_labels.append(res_labels)

            # res_df, res_labels = aug.times(window_size=100, step_size=80, times=1, method='sliding_window')

            # addition_train_data.append(res_df)
            # addition_train_labels.append(res_labels)
            
    addition_train_data = pd.concat(addition_train_data)
    addition_train_labels = pd.concat(addition_train_labels)

    # 将新数据加入到train_data中
    train_data = pd.concat([train_data, addition_train_data], ignore_index=True).reset_index(drop=True)
    train_labels = pd.concat([train_labels, addition_train_labels], ignore_index=True).reset_index(drop=True)

    # 按照 msisdn, start_time 排序
    sort_start_time = time.time()
    train_data = train_data.sort_values(by=['msisdn', 'start_time']).reset_index(drop=True)
    train_labels = train_labels.sort_values(by=['msisdn']).reset_index(drop=True)
    print('sort time:', time.time() - sort_start_time)

    labels_aug = pd.concat([train_labels, test_labels], ignore_index=True).reindex()
    # ------------------

    # save
    print("Saving processed data...")
    os.makedirs('../self_data/processed', exist_ok=True)
    train_data.to_csv('../self_data/processed/train_data.csv', index=False)
    train_labels.to_csv('../self_data/processed/train_labels.csv', index=False)
    test_data.to_csv('../self_data/processed/test_data.csv', index=False)
    test_labels.to_csv('../self_data/processed/test_labels.csv', index=False)

    labels_aug.to_csv('../self_data/processed/labels_aug.csv', index=False)

    validation_data.to_csv('../self_data/processed/validation_data.csv', index=False)
    # TODO: test_data

else:
    print("Reading processed data...")
    train_data = pd.read_csv('../self_data/processed/train_data.csv', dtype=dtypes)
    train_labels = pd.read_csv('../self_data/processed/train_labels.csv', dtype=dtypes)
    test_data = pd.read_csv('../self_data/processed/test_data.csv', dtype=dtypes)
    test_labels = pd.read_csv('../self_data/processed/test_labels.csv', dtype=dtypes)

    labels_aug = pd.read_csv('../self_data/processed/labels_aug.csv', dtype=dtypes)

    validation_data = pd.read_csv('../self_data/processed/validation_data.csv', dtype=dtypes)

labeled_data_aug = pd.concat([train_data, test_data], ignore_index=True).reindex()
assert len(labeled_data_aug['msisdn'].unique()) == len(labels_aug['msisdn'].unique())

# 转换时间格式
labeled_data_aug['start_time'] = pd.to_datetime(labeled_data_aug['start_time'], format='%Y%m%d%H%M%S')
labeled_data_aug['end_time'] = pd.to_datetime(labeled_data_aug['end_time'], format='%Y%m%d%H%M%S')
labeled_data_aug['open_datetime'] = pd.to_datetime(labeled_data_aug['open_datetime'], format='%Y%m%d%H%M%S')
labeled_data_aug['update_time'] = pd.to_datetime(labeled_data_aug['update_time'])
labeled_data_aug['date'] = pd.to_datetime(labeled_data_aug['date'])

validation_data['start_time'] = pd.to_datetime(validation_data['start_time'], format='%Y%m%d%H%M%S')
validation_data['end_time'] = pd.to_datetime(validation_data['end_time'], format='%Y%m%d%H%M%S')
validation_data['open_datetime'] = pd.to_datetime(validation_data['open_datetime'], format='%Y%m%d%H%M%S',errors='coerce')
validation_data['update_time'] = pd.to_datetime(validation_data['update_time'])
validation_data['date'] = pd.to_datetime(validation_data['date'])

Creating processed data folder...


Augmenting msisdn 2522865: 100%|██████████| 31357/31357 [03:46<00:00, 138.62it/s]


sort time: 12.179929733276367
Saving processed data...


In [412]:
# 为每条记录添加start_time_diff，记录 start_time 与上一条记录的 start_time 之差 (单位：秒)
start_time_diff = labeled_data_aug.groupby('msisdn')['start_time'].diff().dt.total_seconds().fillna(0).reset_index(drop=True)
# 将该列加入到数据集中
labeled_data_aug['start_time_diff'] = start_time_diff.copy()
# time_diff_start2end = train_data.groupby('msisdn')['end_time'].diff().dt.total_seconds().fillna(0)
start_time_diff = validation_data.groupby('msisdn')['start_time'].diff().dt.total_seconds().fillna(0).reset_index(drop=True)
validation_data['start_time_diff'] = start_time_diff.copy()

数据特征处理

In [413]:
# 聚合特征
def aggregate_features(data):
    return data.groupby('msisdn').agg({
    'call_duration': [
        ('sum', 'sum'), 
        ('mean', 'mean'), 
        ('max', 'max'), 
        ('std', 'std'),
        ('quantile_25', lambda x: x.quantile(0.25)), 
        ('quantile_50', lambda x: x.quantile(0.50)), 
        ('quantile_75', lambda x: x.quantile(0.75)),
    ],
    'cfee': [
        ('sum', 'sum'),
        ('std', 'std'), 
        ('mean', 'mean'),
    ],
    'lfee': [
        ('sum', 'sum'), 
        ('mean', 'mean'),
        ('std', 'std'),
    ],
    'hour': [
        ('mean', 'mean'), 
        ('std', 'std'), 
        ('max', 'max'), 
        ('min', 'min'),
    ],
    'dayofweek': [
        ('std', 'std'), 
        ('magic', lambda x: x.value_counts().mean()), 
        ('work_day_num', lambda x: x[x.isin([1,2,3,4,5])].count()), 
        ('weekend_num', lambda x: x[x.isin([6,7])].count()),
        ('mode', lambda x: x.mode().values[0]),
        ('work_day_weekend_diff', lambda x: (x[x.isin([1,2,3,4,5])].count() - x[x.isin([6,7])].count()) / (x[x.isin([1,2,3,4,5])].count() + x[x.isin([6,7])].count())),
    ],
    # 'home_area_code': [
    #     ('home_area_code_nunique', 'nunique')
    # ],
    'visit_area_code': [
        ('nunique', 'nunique'),
        ('times_not_at_home_area', lambda x: x[x != x.shift()].count()/x.count())
    ],
    'called_home_code': [
        ('nunique', 'nunique'),
        ('called_diff_home_code', lambda x: x[x != x.shift()].count() / x.count())
    ],
    'called_code': [
        ('nunique', 'nunique'),
        ('diff', lambda x: x[x != x.shift()].count()/ x.count())
    ],
    'open_datetime': [
        ('open_count', 'nunique')
    ],
    'other_party': [
        ('account_person_num', 'nunique'),
        ('called_diff_home_code', lambda x: x[x != x.shift()].count() / x.count())
    ],
    'a_serv_type': [
        ('call_num', lambda x: x[x.isin([1, 3])].count()), 
        ('called_num', lambda x: x[x == 2].count()),
        ('call_called_normalized_diff', lambda x: (x[x.isin([1, 3])].count() - x[x == 2].count()) /  (x[x.isin([1, 3])].count() + x[x == 2].count())),
    ],
    'start_time_diff': [
        ('start_time_diff_mean', 'mean'), 
        ('start_time_diff_std', 'std'), 
        ('max', 'max'), 
        ('coefficient_of_variation', lambda x: x.std() / x.mean()),
    ], 
    # 'phone1_type': [
    #     ('nunique', 'nunique'),
    #     ('mode', lambda x: x.mode().values[0])
    # ],
    # 'distance': [
    #     ('sum', 'sum'), 
    #     ('std', 'std'), 
    #     ('max', 'max'), 
    #     ('quantile_25', lambda x: x.quantile(0.25)), 
    #     ('quantile_50', lambda x: x.quantile(0.50)), 
    #     ('quantile_75', lambda x: x.quantile(0.75)),
    # ]
})

labeled_aug_features = aggregate_features(labeled_data_aug)
validation_features = aggregate_features(validation_data)

labeled_aug_features.columns = ['+'.join(col).strip() for col in labeled_aug_features.columns.values]
validation_features.columns = ['+'.join(col).strip() for col in validation_features.columns.values]

labeled_aug_features.columns = labeled_aug_features.columns.str.replace('[', '').str.replace(']', '').str.replace('<', '').str.replace('>', '').str.replace('(', '').str.replace(')', '').str.replace(',', '').str.replace(' ', '_')
validation_features.columns = validation_features.columns.str.replace('[', '').str.replace(']', '').str.replace('<', '').str.replace('>', '').str.replace('(', '').str.replace(')', '').str.replace(',', '').str.replace(' ', '_')

# 重置索引
labeled_aug_features = labeled_aug_features.reset_index()
validation_features = validation_features.reset_index()

# 合并标签数据
labeled_aug_features = labeled_aug_features.merge(labels_aug, on='msisdn', how='left')
# 打印结果
# labeled_aug_features

# # 添加 ae 的编码特征
# labeled_ae = pd.read_csv('../data/ae/train.csv', dtype=dtypes)
# valid_ae = pd.read_csv('../data/ae/val.csv', dtype=dtypes)
# labeled_aug_features = labeled_aug_features.merge(labeled_ae, on='msisdn', how='left')
# validation_features = validation_features.merge(valid_ae, on='msisdn', how='left')

In [414]:
labeled_aug_features.columns

Index(['msisdn', 'call_duration+sum', 'call_duration+mean',
       'call_duration+max', 'call_duration+std', 'call_duration+quantile_25',
       'call_duration+quantile_50', 'call_duration+quantile_75', 'cfee+sum',
       'cfee+std', 'cfee+mean', 'lfee+sum', 'lfee+mean', 'lfee+std',
       'hour+mean', 'hour+std', 'hour+max', 'hour+min', 'dayofweek+std',
       'dayofweek+magic', 'dayofweek+work_day_num', 'dayofweek+weekend_num',
       'dayofweek+mode', 'dayofweek+work_day_weekend_diff',
       'visit_area_code+nunique', 'visit_area_code+times_not_at_home_area',
       'called_home_code+nunique', 'called_home_code+called_diff_home_code',
       'called_code+nunique', 'called_code+diff', 'open_datetime+open_count',
       'other_party+account_person_num', 'other_party+called_diff_home_code',
       'a_serv_type+call_num', 'a_serv_type+called_num',
       'a_serv_type+call_called_normalized_diff',
       'start_time_diff+start_time_diff_mean',
       'start_time_diff+start_time_diff_std

In [415]:
len(labeled_aug_features.columns)

41

In [416]:
def get_nan(train):
    # 获取 train 中的 nan值
    train_nan = train[train.isnull().T.any()]
    # 统计 每列含有的 nan 数量
    for col in train.columns:
        if train[col].isnull().sum() > 0:
            print(col, train[col].isnull().sum())

    return train_nan
get_nan(labeled_aug_features), get_nan(validation_features)

call_duration+std 122
cfee+std 122
lfee+std 122
hour+std 122
dayofweek+std 122
start_time_diff+start_time_diff_std 122
start_time_diff+coefficient_of_variation 122
call_duration+std 78
cfee+std 78
lfee+std 78
hour+std 78
dayofweek+std 78
start_time_diff+start_time_diff_std 78
start_time_diff+coefficient_of_variation 78


(         msisdn  call_duration+sum  call_duration+mean  call_duration+max  \
 911     1013277                351               351.0                351   
 1506    1017415                  9                 9.0                  9   
 1533    1017498                 10                10.0                 10   
 2660    1022208                  7                 7.0                  7   
 3350    1025250                  9                 9.0                  9   
 ...         ...                ...                 ...                ...   
 102178  2419249                  2                 2.0                  2   
 103169  2423456                 14                14.0                 14   
 105731  2500562                 23                23.0                 23   
 105981  2502357                 74                74.0                 74   
 106029  2502602                507               507.0                507   
 
         call_duration+std  call_duration+quantile_25  \
 911 

In [417]:
# 一般只有 std 会出现 nan 值故所有的 nan 值填充为 0
labeled_aug_features = labeled_aug_features.fillna(0)
validation_features = validation_features.fillna(0)

def get_nan(train):
    # 获取 train 中的 nan值
    train_nan = train[train.isnull().T.any()]
    # 统计 每列含有的 nan 数量
    for col in train.columns:
        if train[col].isnull().sum() > 0:
            print(col, train[col].isnull().sum())

    return train_nan
get_nan(labeled_aug_features), get_nan(validation_features)

(Empty DataFrame
 Columns: [msisdn, call_duration+sum, call_duration+mean, call_duration+max, call_duration+std, call_duration+quantile_25, call_duration+quantile_50, call_duration+quantile_75, cfee+sum, cfee+std, cfee+mean, lfee+sum, lfee+mean, lfee+std, hour+mean, hour+std, hour+max, hour+min, dayofweek+std, dayofweek+magic, dayofweek+work_day_num, dayofweek+weekend_num, dayofweek+mode, dayofweek+work_day_weekend_diff, visit_area_code+nunique, visit_area_code+times_not_at_home_area, called_home_code+nunique, called_home_code+called_diff_home_code, called_code+nunique, called_code+diff, open_datetime+open_count, other_party+account_person_num, other_party+called_diff_home_code, a_serv_type+call_num, a_serv_type+called_num, a_serv_type+call_called_normalized_diff, start_time_diff+start_time_diff_mean, start_time_diff+start_time_diff_std, start_time_diff+max, start_time_diff+coefficient_of_variation, is_sa]
 Index: []
 
 [0 rows x 41 columns],
 Empty DataFrame
 Columns: [msisdn, call_du

In [418]:
# # # 交叉特征
# # # 将所有特征两两相乘
# from itertools import combinations
# from tqdm import tqdm

# def cross_features(data):
#     cross_features = []
#     new_features = []
#     cross_cols = data.columns.tolist()
#     rm_cols = ['msisdn', 'is_sa']
#     for col in rm_cols:
#         if col in cross_cols:
#             cross_cols.remove(col)

#     for i, j in tqdm(combinations(cross_cols, 2), total=len(cross_cols) * (len(cross_cols) - 1) // 2):
#         new_features.append(data[i] * data[j])
#         cross_features.append(f'{i}_cross_{j}')
#     new_features = pd.concat(new_features, axis=1)
#     new_features.columns = cross_features
#     data = pd.concat([data, new_features], axis=1)
#     return data, cross_features

# labeled_aug_features, _ = cross_features(labeled_aug_features)
# validation_features, _ = cross_features(validation_features)

In [419]:
X = labeled_aug_features.drop(['msisdn'], axis=1)
y = labeled_aug_features['is_sa']
X_validation = validation_features.drop(['msisdn'], axis=1)

n_sample = y.shape[0]
n_pos_sample = y[y ==1].shape[0]
n_neg_sample = y[y == 0].shape[0]
print('样本个数：{}; 正样本占{:.2%}; 负样本占{:.2%}'.format(n_sample,
                                                   n_pos_sample / n_sample,
                                                   n_neg_sample / n_sample))
print('特征维数：', X.shape[1])

样本个数：106586; 正样本占44.65%; 负样本占55.35%
特征维数： 40


In [420]:
labeled_aug_features.columns

Index(['msisdn', 'call_duration+sum', 'call_duration+mean',
       'call_duration+max', 'call_duration+std', 'call_duration+quantile_25',
       'call_duration+quantile_50', 'call_duration+quantile_75', 'cfee+sum',
       'cfee+std', 'cfee+mean', 'lfee+sum', 'lfee+mean', 'lfee+std',
       'hour+mean', 'hour+std', 'hour+max', 'hour+min', 'dayofweek+std',
       'dayofweek+magic', 'dayofweek+work_day_num', 'dayofweek+weekend_num',
       'dayofweek+mode', 'dayofweek+work_day_weekend_diff',
       'visit_area_code+nunique', 'visit_area_code+times_not_at_home_area',
       'called_home_code+nunique', 'called_home_code+called_diff_home_code',
       'called_code+nunique', 'called_code+diff', 'open_datetime+open_count',
       'other_party+account_person_num', 'other_party+called_diff_home_code',
       'a_serv_type+call_num', 'a_serv_type+called_num',
       'a_serv_type+call_called_normalized_diff',
       'start_time_diff+start_time_diff_mean',
       'start_time_diff+start_time_diff_std

In [421]:
# TODO use all_X to impute
imputer = SimpleImputer(strategy='most_frequent')
X = imputer.fit_transform(X)

In [422]:
imputer2 = SimpleImputer(strategy='most_frequent')
X_validation = imputer2.fit_transform(X_validation)

In [423]:
X.shape

(106586, 40)

In [424]:
X_validation.shape

(13005, 39)

In [425]:
y.shape

(106586,)

In [426]:
assert len(train_labels) + len(test_labels) == len(labeled_aug_features)
len(train_labels), len(test_labels)

(103101, 3485)

In [427]:
from sklearn.utils import shuffle
# 将 msisdn 和 is_sa 并入 X 再划分
train_data_msisdn = train_labels['msisdn']
test_data_msisdn = test_labels['msisdn']
X_df = pd.DataFrame(X, columns=labeled_aug_features.drop(['msisdn'], axis=1).columns)
X_df = pd.concat([labeled_aug_features[['msisdn']], X_df], axis=1)
train_set = X_df[X_df['msisdn'].isin(train_data_msisdn)][X_df.columns[1:]]
test_set = X_df[X_df['msisdn'].isin(test_data_msisdn)][X_df.columns[1:]]

print(f"1 samples / 0 samples in train set: {len(train_set[train_set['is_sa'] == 1])} / {len(train_set[train_set['is_sa'] == 0])}")
print(f"1 samples / 0 samples in test set: {len(test_set[test_set['is_sa'] == 1])} / {len(test_set[test_set['is_sa'] == 0])}")

if ALL:
    # if not NO_SMOTE:
    #     smote = SMOTE(random_state=42)    # 处理过采样的方法
    #     X, y = smote.fit_resample(X, y)
    # X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42, shuffle=True)
    train_len = len(test_set) + len(train_set)
    test_len = 0
else:
    # X_train,X_test,y_train,y_test = train_test_split(X,y,stratify = y,test_size= 0.3,random_state=42, shuffle=True)
    # X_train, y_train = shuffle(X_train, y_train, random_state=42)
    train_len, test_len = len(train_set), len(test_set)

    # if not NO_SMOTE:
    #     smote = SMOTE(random_state=42)    # 处理过采样的方法
    #     X_train, y_train = smote.fit_resample(X_train, y_train)
    #     print('通过SMOTE方法平衡正负样本后')
    #     n_sample = y_train.shape[0]
    #     n_pos_sample = y_train[y_train == 1].shape[0]
    #     n_neg_sample = y_train[y_train == 0].shape[0]
    #     print('样本个数：{}; 正样本占{:.2%}; 负样本占{:.2%}'.format(n_sample,
    #                                                     n_pos_sample / n_sample,
    #                                                     n_neg_sample / n_sample))
    #     print('特征维数：', X.shape[1])

1 samples / 0 samples in train set: 47281 / 55820
1 samples / 0 samples in test set: 308 / 3177


In [428]:
columns = labeled_aug_features.columns.tolist()
columns.remove('msisdn')
valid_set = np.c_[X_validation, np.zeros(X_validation.shape[0])]
valid_set = pd.DataFrame(valid_set, columns=columns)
valid_set['is_sa'] = -1

In [429]:
train_set.head()

Unnamed: 0,call_duration+sum,call_duration+mean,call_duration+max,call_duration+std,call_duration+quantile_25,call_duration+quantile_50,call_duration+quantile_75,cfee+sum,cfee+std,cfee+mean,...,other_party+account_person_num,other_party+called_diff_home_code,a_serv_type+call_num,a_serv_type+called_num,a_serv_type+call_called_normalized_diff,start_time_diff+start_time_diff_mean,start_time_diff+start_time_diff_std,start_time_diff+max,start_time_diff+coefficient_of_variation,is_sa
0,50338.0,149.815476,3203.0,334.553766,32.75,62.5,142.0,0.0,0.0,0.0,...,88.0,0.872024,211.0,125.0,0.255952,3747.264881,7927.046295,48952.0,2.115422,0.0
1,43371.0,143.138614,3203.0,305.958694,33.0,62.0,142.0,0.0,0.0,0.0,...,87.0,0.884488,187.0,116.0,0.234323,4155.382838,8659.361523,50345.0,2.08389,0.0
2,7295.0,117.66129,1800.0,282.01546,17.0,44.0,82.75,0.0,0.0,0.0,...,21.0,0.83871,26.0,36.0,-0.16129,19985.5,26985.692469,110742.0,1.350264,0.0
3,6208.0,110.857143,1800.0,286.432694,16.75,41.5,80.25,0.0,0.0,0.0,...,19.0,0.785714,26.0,30.0,-0.071429,22106.196429,28629.042189,110742.0,1.295069,0.0
4,4249.0,184.73913,1531.0,360.264451,21.0,33.0,97.5,0.0,0.0,0.0,...,12.0,0.73913,12.0,11.0,0.043478,47118.478261,60066.827501,227180.0,1.274804,0.0


In [430]:
test_set.describe()

Unnamed: 0,call_duration+sum,call_duration+mean,call_duration+max,call_duration+std,call_duration+quantile_25,call_duration+quantile_50,call_duration+quantile_75,cfee+sum,cfee+std,cfee+mean,...,other_party+account_person_num,other_party+called_diff_home_code,a_serv_type+call_num,a_serv_type+called_num,a_serv_type+call_called_normalized_diff,start_time_diff+start_time_diff_mean,start_time_diff+start_time_diff_std,start_time_diff+max,start_time_diff+coefficient_of_variation,is_sa
count,3485.0,3485.0,3485.0,3485.0,3485.0,3485.0,3485.0,3485.0,3485.0,3485.0,...,3485.0,3485.0,3485.0,3485.0,3485.0,3485.0,3485.0,3485.0,3485.0,3485.0
mean,9804.094692,97.324593,990.799713,156.926369,23.398852,46.075036,99.936872,122.167001,2.070676,0.85915,...,31.816643,0.723246,56.826973,54.088379,-0.019748,28695.673056,43060.409023,170406.7,1.749504,0.088379
std,12274.417132,92.804423,1053.773678,164.080622,35.728454,59.290552,112.926398,769.352833,12.274767,4.592863,...,42.385809,0.165465,74.210569,67.108748,0.381513,39155.13904,54178.585989,140844.2,0.658823,0.283886
min,7.0,4.935484,7.0,0.0,1.0,2.0,3.0,0.0,0.0,0.0,...,1.0,0.011136,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0
25%,2436.0,50.428571,315.0,57.03132,15.25,28.0,53.5,0.0,0.0,0.0,...,10.0,0.647059,15.0,16.0,-0.254902,8595.212329,15862.56166,79670.0,1.397543,0.0
50%,5792.0,74.318519,675.0,109.181838,20.0,38.0,76.0,0.0,0.0,0.0,...,20.0,0.753521,33.0,36.0,-0.012987,16427.697368,26186.300882,131402.0,1.653368,0.0
75%,12563.0,113.38806,1357.0,199.55531,25.5,50.0,109.0,0.0,0.0,0.0,...,39.0,0.833333,70.0,70.0,0.2,32864.459459,49118.532476,210387.0,1.968172,0.0
max,206591.0,1967.333333,14400.0,2521.727466,1773.5,2106.0,2230.5,19200.0,347.486038,97.510204,...,1043.0,1.0,1222.0,1347.0,1.0,605595.5,856441.369412,1211191.0,13.787435,1.0


In [431]:
all_set = pd.concat([train_set, test_set, valid_set], axis=0).reset_index(drop=True)
labeled_data_len = train_set.shape[0] + test_set.shape[0]

In [432]:
test_set.shape, train_set.shape, valid_set.shape, all_set.shape

((3485, 40), (103101, 40), (13005, 40), (119591, 40))

In [433]:
labeled_set, valid_set = all_set.iloc[:labeled_data_len].copy(), all_set.iloc[labeled_data_len:].copy()
labeled_set.reset_index(drop=True, inplace=True)
valid_set.reset_index(drop=True, inplace=True)
# 有一些值在SMOTE后对数变换后为 NaN，需要删除这些数据
print(labeled_set.isnull().sum().sum())
labeled_set = labeled_set.dropna()
print(labeled_set.isnull().sum().sum())
assert valid_set.shape[0] == validation_features.shape[0]

# 重新划分训练集和测试集
if not ALL:
    train_set, test_set = labeled_set.iloc[:train_len].copy(), labeled_set.iloc[train_len:].copy()
    train_set.reset_index(drop=True, inplace=True)
    test_set.reset_index(drop=True, inplace=True)

    # remove_columns = ['distance_distance_std', "start_time_diff_start_time_diff_max", "distance_distance_quantile_75"]

    # remove_columns = ["lfee_lfee_std", "lfee_lfee_mean", 'call_duration_call_duration_max', "distance_distance_quantile_50", "call_duration_call_duration_quantile_25"]
    # remove_columns = ["7", "6", "lfee_lfee_mean", "hour_hour_std", "1", "call_duration_call_duration_quantile_75", "3", "cfee_cfee_std", "start_time_diff_start_time_diff_max", "call_duration_call_duration_max", "dayofweek_dayofweek_mode", "distance_distance_quantile_75", "cfee_cfee_mean"] # , "visit_area_code_visit_area_code_nunique", "visit_area_code_visit_area_code_nunique"
    # remove_columns = ['visit_area_code+nunique_cross_start_time_diff+max', "distance+std"]
    # remove_columns = ['dayofweek+std', 'start_time_diff+max', 'distance+quantile_75', 'lfee+mean', 'lfee+std', 'lfee+sum', 'cfee+sum', '6', 'visit_area_code+nunique']
    remove_columns = ['cfee+std', 'start_time_diff+start_time_diff_std', 'lfee+mean', 'lfee+sum', 'lfee+std']
    train_set = train_set.drop(remove_columns, axis=1)
    test_set = test_set.drop(remove_columns, axis=1)
    valid_set = valid_set.drop(remove_columns, axis=1)

    # 对采样数据做 smote
    if not NO_SMOTE:
        smote = SMOTE(random_state=42)    # 处理过采样的方法
        X_train, y_train = smote.fit_resample(train_set.drop(['is_sa'], axis=1), train_set['is_sa'])
        train_set = pd.concat([X_train, y_train], axis=1)
        print('通过SMOTE方法平衡正负样本后')
        n_sample = y_train.shape[0]
        n_pos_sample = y_train[y_train == 1].shape[0]
        n_neg_sample = y_train[y_train == 0].shape[0]
        print('样本个数：{}; 正样本占{:.2%}; 负样本占{:.2%}'.format(n_sample,
                                                        n_pos_sample / n_sample,
                                                        n_neg_sample / n_sample))
    assert train_set.shape[1] == test_set.shape[1] == valid_set.shape[1]
else:
    if not NO_SMOTE:
        # BUG:
        # 对 all_set 做 smote
        smote = SMOTE(random_state=42)    # 处理过采样的方法
        X_train, y_train = smote.fit_resample(labeled_set.drop(['is_sa'], axis=1), labeled_set['is_sa'])
        labeled_set = pd.concat([X_train, y_train], axis=1)
        print('通过SMOTE方法平衡正负样本后')
        n_sample = y_train.shape[0]
        n_pos_sample = y_train[y_train == 1].shape[0]
        n_neg_sample = y_train[y_train == 0].shape[0]
        print('样本个数：{}; 正样本占{:.2%}; 负样本占{:.2%}'.format(n_sample,
                                                        n_pos_sample / n_sample,
                                                        n_neg_sample / n_sample))
    remove_columns = ['0', '1', '2', '3', '4', '5', '6', '7', 'cfee+std', 'start_time_diff+start_time_diff_std', 'lfee+mean', 'lfee+sum', 'lfee+std']
    labeled_set = labeled_set.drop(remove_columns, axis=1)
    valid_set = valid_set.drop(remove_columns, axis=1)
    test_set = test_set.drop(remove_columns, axis=1)
    assert labeled_set.shape[1] == valid_set.shape[1] == test_set.shape[1]
    

0
0


In [434]:
train_set.shape, test_set.shape, valid_set.shape

((103101, 35), (3485, 35), (13005, 35))

In [435]:
# 使用 autogluon 训练
from autogluon.tabular import TabularPredictor
# import ray
# 使用防止过拟合的超参数
# hyperparameters = {
#     'GBM': {'lambda_l1': 1e-2, 'lambda_l2': 1e-2},
#     'FASTAI': {'dropout_prob': 0.2}
# }
# ray.shutdown()
# ray.init(include_dashboard=True, object_store_memory=10**9)  # Increase object store memory

# 输入数据X_train, y_train
if not ALL:
    # 交叉验证训练
    model = TabularPredictor(label='is_sa', eval_metric='f1', problem_type='binary').fit(train_set, presets='medium_quality', time_limit=3600)
    # , excluded_model_types=['KNN']
    # model = TabularPredictor(label='is_sa', eval_metric='f1', problem_type='binary').fit(train_set, presets='best_quality', time_limit=3600)
else:
    model = TabularPredictor(label='is_sa', eval_metric='f1', problem_type='binary').fit(labeled_set, presets='best_quality', num_bag_folds=10, time_limit=3600)

No path specified. Models will be saved in: "AutogluonModels\ag-20241026_081412"
Presets specified: ['medium_quality']
Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "AutogluonModels\ag-20241026_081412"
AutoGluon Version:  1.0.0
Python Version:     3.10.14
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19041
CPU Count:          48
Memory Avail:       79.93 GB / 127.90 GB (62.5%)
Disk Space Avail:   440.55 GB / 3726.01 GB (11.8%)
Train Data Rows:    103101
Train Data Columns: 34
Label Column:       is_sa
Problem Type:       binary
Preprocessing data ...
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    81842.36 MB
	Train Data (Original)  Memory Usage: 26.74 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manuall

[1000]	valid_set's binary_logloss: 0.307886	valid_set's f1: 0.883362
[2000]	valid_set's binary_logloss: 0.22209	valid_set's f1: 0.933219
[3000]	valid_set's binary_logloss: 0.168001	valid_set's f1: 0.960447
[4000]	valid_set's binary_logloss: 0.128864	valid_set's f1: 0.975314
[5000]	valid_set's binary_logloss: 0.102131	valid_set's f1: 0.983066
[6000]	valid_set's binary_logloss: 0.0816987	valid_set's f1: 0.988266
[7000]	valid_set's binary_logloss: 0.0654806	valid_set's f1: 0.991289
[8000]	valid_set's binary_logloss: 0.0531248	valid_set's f1: 0.993464
[9000]	valid_set's binary_logloss: 0.043851	valid_set's f1: 0.996075
[10000]	valid_set's binary_logloss: 0.0367146	valid_set's f1: 0.996075


	0.9969	 = Validation score   (f1)
	70.13s	 = Training   runtime
	0.15s	 = Validation runtime
Fitting model: LightGBM ... Training model for up to 3527.55s of the 3527.55s of remaining time.


[1000]	valid_set's binary_logloss: 0.220865	valid_set's f1: 0.92906
[2000]	valid_set's binary_logloss: 0.126602	valid_set's f1: 0.973043
[3000]	valid_set's binary_logloss: 0.0793142	valid_set's f1: 0.986911
[4000]	valid_set's binary_logloss: 0.0519063	valid_set's f1: 0.992567
[5000]	valid_set's binary_logloss: 0.0355525	valid_set's f1: 0.99476
[6000]	valid_set's binary_logloss: 0.025189	valid_set's f1: 0.997378
[7000]	valid_set's binary_logloss: 0.0189064	valid_set's f1: 0.996939


	0.9978	 = Validation score   (f1)
	57.62s	 = Training   runtime
	0.07s	 = Validation runtime
Fitting model: RandomForestGini ... Training model for up to 3469.23s of the 3469.22s of remaining time.
	0.9917	 = Validation score   (f1)
	4.9s	 = Training   runtime
	0.15s	 = Validation runtime
Fitting model: RandomForestEntr ... Training model for up to 3463.87s of the 3463.86s of remaining time.
	0.9908	 = Validation score   (f1)
	5.64s	 = Training   runtime
	0.13s	 = Validation runtime
Fitting model: CatBoost ... Training model for up to 3457.74s of the 3457.73s of remaining time.
	0.9917	 = Validation score   (f1)
	395.53s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: ExtraTreesGini ... Training model for up to 3062.15s of the 3062.15s of remaining time.
	0.9925	 = Validation score   (f1)
	3.22s	 = Training   runtime
	0.12s	 = Validation runtime
Fitting model: ExtraTreesEntr ... Training model for up to 3058.31s of the 3058.3s of remaining time.
	0.993	 = Validation 

[1000]	valid_set's binary_logloss: 0.0907586	valid_set's f1: 0.986445
[2000]	valid_set's binary_logloss: 0.0312138	valid_set's f1: 0.996503


	0.9965	 = Validation score   (f1)
	24.91s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ... Training model for up to 360.0s of the 2356.13s of remaining time.
	Ensemble Weights: {'LightGBM': 1.0}
	0.9978	 = Validation score   (f1)
	3.47s	 = Training   runtime
	0.01s	 = Validation runtime
AutoGluon training complete, total runtime = 1247.52s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels\ag-20241026_081412")


In [436]:
if not ALL:
    print(model.evaluate(test_set))

{'f1': 0.3603238866396761, 'accuracy': 0.909325681492109, 'balanced_accuracy': 0.6292145452910325, 'mcc': 0.32634558004122227, 'roc_auc': 0.7855109165307466, 'precision': 0.478494623655914, 'recall': 0.288961038961039}


In [437]:
feature_importance = model.feature_importance(test_set if not ALL else labeled_set)
print(feature_importance)
feature_importance

Computing feature importance via permutation shuffling for 34 features using 3485 rows with 5 shuffle sets...
	27.2s	= Expected runtime (5.44s per shuffle set)
	8.55s	= Actual runtime (Completed 5 of 5 shuffle sets)


                                          importance    stddev   p_value  n  \
other_party+account_person_num              0.067638  0.011225  0.000088  5   
a_serv_type+call_num                        0.055238  0.019711  0.001655  5   
a_serv_type+called_num                      0.052602  0.015196  0.000750  5   
visit_area_code+times_not_at_home_area      0.047843  0.008584  0.000119  5   
called_home_code+nunique                    0.046502  0.018793  0.002607  5   
call_duration+std                           0.041007  0.017624  0.003252  5   
dayofweek+magic                             0.038490  0.029082  0.020789  5   
call_duration+mean                          0.033043  0.008392  0.000459  5   
dayofweek+work_day_num                      0.031716  0.008099  0.000469  5   
start_time_diff+start_time_diff_mean        0.029990  0.007931  0.000536  5   
hour+min                                    0.029955  0.011824  0.002394  5   
start_time_diff+coefficient_of_variation    0.022076

Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
other_party+account_person_num,0.067638,0.011225,8.8e-05,5,0.09075,0.044526
a_serv_type+call_num,0.055238,0.019711,0.001655,5,0.095823,0.014654
a_serv_type+called_num,0.052602,0.015196,0.00075,5,0.083891,0.021313
visit_area_code+times_not_at_home_area,0.047843,0.008584,0.000119,5,0.065518,0.030167
called_home_code+nunique,0.046502,0.018793,0.002607,5,0.085197,0.007807
call_duration+std,0.041007,0.017624,0.003252,5,0.077295,0.004719
dayofweek+magic,0.03849,0.029082,0.020789,5,0.09837,-0.02139
call_duration+mean,0.033043,0.008392,0.000459,5,0.050321,0.015765
dayofweek+work_day_num,0.031716,0.008099,0.000469,5,0.048392,0.01504
start_time_diff+start_time_diff_mean,0.02999,0.007931,0.000536,5,0.046319,0.013661


In [438]:
# leaderboard
if not ALL:
    leaderboard = model.leaderboard(test_set, silent=True)
    print(leaderboard)
else:
    leaderboard = model.leaderboard(labeled_set, silent=True)
    print(leaderboard)
leaderboard

                  model  score_test  score_val eval_metric  pred_time_test  \
0            LightGBMXT    0.383142   0.996945          f1        0.284588   
1              CatBoost    0.379182   0.991721          f1        0.085360   
2         LightGBMLarge    0.372188   0.996503          f1        0.124593   
3              LightGBM    0.360324   0.997816          f1        0.144055   
4   WeightedEnsemble_L2    0.360324   0.997816          f1        0.150942   
5        NeuralNetTorch    0.356633   0.967686          f1        0.090710   
6               XGBoost    0.352000   0.996078          f1        0.506597   
7       NeuralNetFastAI    0.337514   0.935403          f1        0.258186   
8      RandomForestEntr    0.331126   0.990777          f1        0.452391   
9      RandomForestGini    0.300683   0.991656          f1        0.495867   
10       ExtraTreesGini    0.275862   0.992527          f1        0.550454   
11       ExtraTreesEntr    0.270270   0.992976          f1      

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,LightGBMXT,0.383142,0.996945,f1,0.284588,0.147289,70.128597,0.284588,0.147289,70.128597,1,True,3
1,CatBoost,0.379182,0.991721,f1,0.08536,0.014973,395.525809,0.08536,0.014973,395.525809,1,True,7
2,LightGBMLarge,0.372188,0.996503,f1,0.124593,0.03843,24.905385,0.124593,0.03843,24.905385,1,True,13
3,LightGBM,0.360324,0.997816,f1,0.144055,0.067579,57.616534,0.144055,0.067579,57.616534,1,True,4
4,WeightedEnsemble_L2,0.360324,0.997816,f1,0.150942,0.074316,61.083804,0.006886,0.006737,3.467269,2,True,14
5,NeuralNetTorch,0.356633,0.967686,f1,0.09071,0.054067,471.605707,0.09071,0.054067,471.605707,1,True,12
6,XGBoost,0.352,0.996078,f1,0.506597,0.087234,103.302012,0.506597,0.087234,103.302012,1,True,11
7,NeuralNetFastAI,0.337514,0.935403,f1,0.258186,0.044569,97.351106,0.258186,0.044569,97.351106,1,True,10
8,RandomForestEntr,0.331126,0.990777,f1,0.452391,0.130725,5.639832,0.452391,0.130725,5.639832,1,True,6
9,RandomForestGini,0.300683,0.991656,f1,0.495867,0.14554,4.903846,0.495867,0.14554,4.903846,1,True,5


In [439]:
# 在testset 上计算指标
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

if not ALL:
    y_pred = model.predict(test_set)
    y_true = test_set['is_sa']
    print(classification_report(y_true, y_pred))
    print(confusion_matrix(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.93      0.97      0.95      3177
         1.0       0.48      0.29      0.36       308

    accuracy                           0.91      3485
   macro avg       0.71      0.63      0.66      3485
weighted avg       0.89      0.91      0.90      3485

[[3080   97]
 [ 219   89]]


In [440]:
# 模型决策阈值微调
threadhold = 0.2
if not ALL:
    y_pred_proba = model.predict_proba(test_set)
    # print(y_pred_proba)
    y_pred = (y_pred_proba.iloc[:, 1] > threadhold).astype(int)
    print(classification_report(y_true, y_pred))
    print(confusion_matrix(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.94      0.93      0.93      3177
         1.0       0.34      0.39      0.37       308

    accuracy                           0.88      3485
   macro avg       0.64      0.66      0.65      3485
weighted avg       0.89      0.88      0.88      3485

[[2944  233]
 [ 187  121]]


In [441]:
valid_set

Unnamed: 0,call_duration+sum,call_duration+mean,call_duration+max,call_duration+std,call_duration+quantile_25,call_duration+quantile_50,call_duration+quantile_75,cfee+sum,cfee+mean,hour+mean,...,open_datetime+open_count,other_party+account_person_num,other_party+called_diff_home_code,a_serv_type+call_num,a_serv_type+called_num,a_serv_type+call_called_normalized_diff,start_time_diff+start_time_diff_mean,start_time_diff+max,start_time_diff+coefficient_of_variation,is_sa
0,28781.0,150.685864,3433.0,368.413006,26.00,56.0,128.00,0.0,0.0,13.963351,...,1.0,29.0,0.691099,78.0,113.0,-0.183246,6604.434555,70979.0,1.677037,-1.0
1,45.0,15.000000,22.0,7.000000,11.50,15.0,18.50,0.0,0.0,16.666667,...,1.0,3.0,1.000000,2.0,1.0,0.333333,151026.333333,358005.0,1.227899,-1.0
2,10441.0,105.464646,1018.0,187.808104,22.00,47.0,72.50,0.0,0.0,16.969697,...,1.0,45.0,0.777778,40.0,59.0,-0.191919,12536.474747,89245.0,1.637631,-1.0
3,2784.0,49.714286,225.0,37.122752,24.75,39.5,66.50,0.0,0.0,13.714286,...,1.0,6.0,0.517857,30.0,26.0,0.071429,22129.500000,141632.0,1.444801,-1.0
4,19280.0,61.012658,439.0,67.178085,20.00,41.0,72.25,0.0,0.0,14.196203,...,1.0,178.0,0.715190,148.0,168.0,-0.063291,3864.389241,94609.0,3.541749,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13000,14677.0,119.325203,2491.0,357.738972,15.00,27.0,57.00,0.0,0.0,12.601626,...,1.0,39.0,0.723577,54.0,69.0,-0.121951,10255.146341,78981.0,1.548514,-1.0
13001,4440.0,80.727273,359.0,81.001247,20.00,50.0,108.00,0.0,0.0,13.181818,...,1.0,21.0,0.745455,14.0,41.0,-0.490909,22530.509091,328175.0,2.135565,-1.0
13002,14668.0,70.181818,1141.0,101.785622,22.00,40.0,82.00,0.0,0.0,15.655502,...,1.0,54.0,0.693780,100.0,109.0,-0.043062,5955.488038,75474.0,2.048014,-1.0
13003,3505.0,79.659091,368.0,81.336208,27.00,62.0,79.25,0.0,0.0,12.522727,...,1.0,21.0,0.704545,12.0,32.0,-0.454545,28175.477273,239119.0,1.656892,-1.0


In [442]:
test_set.shape

(3485, 35)

In [443]:
# 预测
y_validation_pred = model.predict(valid_set.drop('is_sa', axis=1))

# 将预测结果与 msisdn 对应起来
validation_results = validation_features[['msisdn']].copy()
validation_results['is_sa'] = y_validation_pred.astype(int)

print(validation_results.describe())

# 保存结果到CSV文件
import time
time_str = time.strftime("%Y%m%d%H%M%S", time.localtime())
file_name = './valid_large_data_{}.csv'.format(time_str) if ALL else './valid_small_data_{}.csv'.format(time_str)
validation_results.to_csv(file_name, index=False)
print(file_name)

              is_sa
count  13005.000000
mean       0.068897
std        0.253288
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        1.000000
./valid_small_data_20241026163513.csv


In [444]:
# import numpy as np
# from sklearn.metrics import f1_score, classification_report, confusion_matrix

# # 自动调整最佳阈值
# if not ALL:
#     y_true = test_set['is_sa']
#     y_pred_proba = model.predict_proba(test_set)

#     thresholds = np.arange(0.0, 1.0, 0.01)
#     f1_scores = []

#     for threshold in thresholds:
#         y_pred = (y_pred_proba.iloc[:, 1] >= threshold).astype(int)
#         f1 = f1_score(y_true, y_pred)
#         f1_scores.append(f1)

#     best_threshold = thresholds[np.argmax(f1_scores)]
#     print(f'最佳阈值：{best_threshold}')
#     print(f'最佳 F1 分数：{max(f1_scores)}')

#     # 使用最佳阈值进行预测
#     y_pred = (y_pred_proba.iloc[:, 1] >= best_threshold).astype(int)
#     print(classification_report(y_true, y_pred))
#     print(confusion_matrix(y_true, y_pred))

In [445]:
# # 阈值微调版结果
# best_threshold = 0.3
# # 使用最佳决策阈值进行预测
# y_validation_pred_proba = model.predict_proba(valid_set.drop('is_sa', axis=1))
# y_validation_pred = (y_validation_pred_proba.iloc[:, 1] >= best_threshold).astype(int)

# # 将预测结果与 msisdn 对应起来
# validation_results = validation_features[['msisdn']].copy()
# validation_results['is_sa'] = y_validation_pred.astype(int)

# print(validation_results.describe())

# # 保存结果到CSV文件
# import time
# time_str = time.strftime("%Y%m%d%H%M%S", time.localtime())
# file_name = './valid_large_data_{}.csv'.format(time_str) if ALL else './valid_small_data_{}.csv'.format(time_str)
# validation_results.to_csv(file_name, index=False)
# print(file_name)