In [75]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import confusion_matrix,classification_report
from xgboost import XGBClassifier
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer

# Run-mode switches read by the later cells.
# ALL: when True, SMOTE (if enabled) runs on the whole labelled set before the
# split and the final model is fitted on every labelled row; when False, a 30%
# test split is held out and evaluated.
ALL = False
# NO_SMOTE: when True, the SMOTE oversampling step is skipped.
NO_SMOTE = False

# Load the processed (augmented) data from the cache, or build and cache it.
import os

PROCESSED_DIR = '../data/processed'
PROCESSED_PATHS = {
    'train_data': '../data/processed/train_data.csv',
    'train_labels': '../data/processed/train_labels.csv',
    'validation_data': '../data/processed/validation_data.csv',
}

# Rebuild whenever any cached file is missing. Checking only for the folder
# would send a run down the "read" branch after an interrupted save and fail
# there with a missing-file error.
if not all(os.path.exists(path) for path in PROCESSED_PATHS.values()):
    print("Creating processed data folder...")
    # Read the raw CSV files; msisdn is forced to str in every frame.
    train_data = pd.read_csv('../data/raw/trainSet_res_with_distances.csv', dtype={'msisdn': 'str'})
    train_labels = pd.read_csv('../data/raw/trainSet_ans.csv', dtype={'msisdn': 'str'})

    validation_data = pd.read_csv('../data/raw/validationSet_res_with_distances.csv', dtype={'msisdn': 'str'})

    # Augment the call records of every msisdn.
    from tqdm import tqdm
    import sys
    # Make the project-local `utils` package importable from the notebook folder.
    sys.path.append(os.path.join(os.path.dirname('./'), '../'))
    from utils.augmentation import Augmentation

    # One label per msisdn (first occurrence wins), indexed once so the loop
    # does a lookup instead of scanning train_labels for every group.
    # A msisdn without a label raises KeyError.
    label_by_msisdn = (
        train_labels
        .drop_duplicates(subset='msisdn', keep='first')
        .set_index('msisdn')['is_sa']
    )

    addition_train_data = []
    addition_train_labels = []

    times = 4
    ratio_range = 0.1
    pbar = tqdm(train_data.groupby('msisdn'))
    # The former `msisdn == 0` guard was removed: group keys are strings here,
    # so it could never match.
    for msisdn, group in pbar:
        pbar.set_description(f"Augmenting msisdn {msisdn}")
        label = label_by_msisdn.at[msisdn]
        aug = Augmentation(group, label, 'msisdn', 'is_sa')
        # Balance the classes: positives request 4x as many augmentation
        # rounds as negatives (original note: sample ratio 1:4).
        n_rounds = times * 4 if label == 1 else times
        res_df, res_labels = aug.times(ratio=ratio_range, times=n_rounds, method='mask')
        addition_train_data.append(res_df)
        addition_train_labels.append(res_labels)
    addition_train_data = pd.concat(addition_train_data)
    addition_train_labels = pd.concat(addition_train_labels)

    # Append the augmented rows to the original data and labels.
    train_data = pd.concat([train_data, addition_train_data], ignore_index=True)
    train_labels = pd.concat([train_labels, addition_train_labels], ignore_index=True)

    # Cache the result for later runs.
    print("Saving processed data...")
    os.makedirs(PROCESSED_DIR, exist_ok=True)
    train_data.to_csv(PROCESSED_PATHS['train_data'], index=False)
    train_labels.to_csv(PROCESSED_PATHS['train_labels'], index=False)
    validation_data.to_csv(PROCESSED_PATHS['validation_data'], index=False)

else:
    print("Reading processed data...")
    train_data = pd.read_csv(PROCESSED_PATHS['train_data'], dtype={'msisdn': 'str'})
    train_labels = pd.read_csv(PROCESSED_PATHS['train_labels'], dtype={'msisdn': 'str'})

    # Read the validation set.
    validation_data = pd.read_csv(PROCESSED_PATHS['validation_data'], dtype={'msisdn': 'str'})


# Convert the timestamp columns of both frames to datetimes.
_COMPACT_TS_FORMAT = '%Y%m%d%H%M%S'

# Only the validation frame coerces unparsable open_datetime values to NaT;
# the training frame keeps the default and raises on them.
for _frame, _open_datetime_errors in ((train_data, 'raise'), (validation_data, 'coerce')):
    for _column in ('start_time', 'end_time'):
        _frame[_column] = pd.to_datetime(_frame[_column], format=_COMPACT_TS_FORMAT)
    _frame['open_datetime'] = pd.to_datetime(
        _frame['open_datetime'], format=_COMPACT_TS_FORMAT, errors=_open_datetime_errors
    )
    # These two columns are parsed without an explicit format.
    for _column in ('update_time', 'date'):
        _frame[_column] = pd.to_datetime(_frame[_column])

Reading processed data...


In [76]:
# Row counts of the training and validation call-record frames.
len(train_data), len(validation_data)

(4132071, 232250)

In [77]:
def _add_start_time_diff(frame):
    """Add a `start_time_diff` column to `frame`, in place.

    The value is the number of seconds between a row's start_time and the
    start_time of the previous row of the same msisdn; rows without a
    predecessor get 0.
    """
    # NOTE(review): diff() follows the frame's current row order; this assumes
    # rows are already chronological within each msisdn - TODO confirm.
    seconds_since_previous = (
        frame.groupby('msisdn')['start_time'].diff().dt.total_seconds().fillna(0)
    )
    # groupby().diff() keeps the frame's own index, so the result is assigned
    # as-is; renumbering it first would misalign values on any frame whose
    # index is not 0..n-1.
    frame['start_time_diff'] = seconds_since_previous


_add_start_time_diff(train_data)
_add_start_time_diff(validation_data)

In [78]:
# Display the label table (one is_sa value per msisdn).
train_labels

Unnamed: 0,msisdn,is_sa
0,1109993,1
1,1017493,1
2,1114848,1
3,1243192,1
4,1270247,1
...,...,...
26687,1303962_4,0
26688,1304080_1,0
26689,1304080_2,0
26690,1304080_3,0


数据特征处理

In [79]:
# Per-msisdn feature aggregation
def _quantile(q):
    """Return an aggregator that computes the q-th quantile of a group."""
    def _agg(values):
        return values.quantile(q)
    return _agg


def _count_value_changes(values):
    """Count non-null entries that differ from the entry on the previous row.

    The first row of a group has no predecessor, so it counts whenever it is
    non-null.
    """
    return values[values != values.shift()].count()


def _normalized_diff(first_count, second_count):
    """Return (first - second) / (first + second), or NaN when both are zero."""
    total = first_count + second_count
    if total == 0:
        return np.nan
    return (first_count - second_count) / total


def _first_mode(values):
    """Return the first mode of a group, or NaN when it has no non-null value."""
    modes = values.mode()
    if len(modes) == 0:
        # Every value in the group is missing; indexing [0] would raise here.
        return np.nan
    return modes.iloc[0]


def _weekday_weekend_balance(days):
    """Normalized difference between rows on days 1-5 and rows on days 6-7."""
    # NOTE(review): assumes dayofweek is coded 1..7; with a 0..6 coding, day 0
    # would fall in neither bucket - TODO confirm.
    weekday_rows = days[days.isin([1, 2, 3, 4, 5])].count()
    weekend_rows = days[days.isin([6, 7])].count()
    return _normalized_diff(weekday_rows, weekend_rows)


def _call_called_balance(serv_types):
    """Normalized difference between rows of type 1 or 3 and rows of type 2."""
    type_1_or_3_rows = serv_types[serv_types.isin([1, 3])].count()
    type_2_rows = serv_types[serv_types == 2].count()
    return _normalized_diff(type_1_or_3_rows, type_2_rows)


def aggregate_features(data):
    """Aggregate call records into one feature row per msisdn.

    Parameters
    ----------
    data : pandas.DataFrame
        Call records containing `msisdn` plus every column named as a key of
        the specification below.

    Returns
    -------
    pandas.DataFrame
        Indexed by msisdn, with two-level columns (source column, feature
        name), in the order listed below.
    """
    return data.groupby('msisdn').agg({
        'call_duration': [
            ('call_duration_mean', 'mean'),
            ('call_duration_max', 'max'),
            ('call_duration_std', 'std'),
            ('call_duration_quantile_25', _quantile(0.25)),
            ('call_duration_quantile_50', _quantile(0.50)),
            ('call_duration_quantile_75', _quantile(0.75)),
        ],
        'cfee': [
            ('cfee_std', 'std'),
            ('cfee_mean', 'mean'),
        ],
        'lfee': [
            ('lfee_mean', 'mean'),
            ('lfee_std', 'std'),
        ],
        'hour': [
            ('hour_mean', 'mean'),
            ('hour_std', 'std'),
            ('hour_min', 'min'),
        ],
        'dayofweek': [
            ('dayofweek_std', 'std'),
            # Mean number of rows per distinct dayofweek value.
            ('magic_dayofweek', lambda days: days.value_counts().mean()),
            ('dayofweek_mode', _first_mode),
            ('work_day_weekend_diff', _weekday_weekend_balance),
        ],
        'visit_area_code': [
            ('visit_area_code_nunique', 'nunique'),
            # Despite its name, this counts changes of visit_area_code between
            # consecutive rows; the name is kept so the output columns do not change.
            ('times_not_at_home_area', _count_value_changes),
        ],
        'called_home_code': [
            ('called_home_code_nunique', 'nunique'),
            ('called_diff_home_code', _count_value_changes),
        ],
        'called_code': [
            ('called_code_diff', _count_value_changes),
        ],
        'open_datetime': [
            ('open_count', 'nunique'),
        ],
        'other_party': [
            ('account_person_num', 'nunique'),
            # The feature name repeats the one used for called_home_code; it is
            # kept as-is so the output columns do not change.
            ('called_diff_home_code', _count_value_changes),
        ],
        'a_serv_type': [
            ('call_called_normalized_diff', _call_called_balance),
        ],
        'start_time_diff': [
            ('start_time_diff_mean', 'mean'),
            ('start_time_diff_std', 'std'),
            ('start_time_diff_max', 'max'),
        ],
        'distance': [
            ('distance_std', 'std'),
            ('distance_quantile_50', _quantile(0.50)),
            ('distance_quantile_75', _quantile(0.75)),
        ],
    })

def _flatten_feature_columns(features):
    """Replace the (source column, feature name) column pairs by flat names.

    Each pair is joined with '_'; the characters [ ] < > ( ) , are then removed
    and spaces become '_'. All replacements are literal, so the result does not
    depend on the pandas default for regex handling in `str.replace`.
    The frame is modified in place and returned.
    """
    flat_names = ['_'.join(pair).strip() for pair in features.columns.values]
    for unwanted in '[]<>(),':
        flat_names = [name.replace(unwanted, '') for name in flat_names]
    features.columns = [name.replace(' ', '_') for name in flat_names]
    return features


# Aggregate, flatten the column names, and turn msisdn back into a column.
train_features = _flatten_feature_columns(aggregate_features(train_data)).reset_index()
validation_features = _flatten_feature_columns(aggregate_features(validation_data)).reset_index()

# Attach the labels to the training features.
train_features = train_features.merge(train_labels, on='msisdn', how='left')
# A left merge leaves is_sa missing for any msisdn absent from train_labels;
# fail here rather than pass missing targets on to the split and the model.
assert train_features['is_sa'].notna().all(), "some msisdn in train_data have no label in train_labels"

In [80]:
# Separate the target from the feature columns; msisdn is dropped from both
# feature matrices.
y = train_features['is_sa']
X = train_features.drop(columns=['msisdn', 'is_sa'])
X_validation = validation_features.drop(columns=['msisdn'])

# Class balance of the labelled set.
n_sample = len(y)
n_pos_sample = int((y == 1).sum())
n_neg_sample = int((y == 0).sum())
class_summary = '样本个数：{}; 正样本占{:.2%}; 负样本占{:.2%}'.format(
    n_sample, n_pos_sample / n_sample, n_neg_sample / n_sample
)
print(class_summary)
print('特征维数：', X.shape[1])

样本个数：26692; 正样本占43.59%; 负样本占56.41%
特征维数： 32


In [81]:
# TODO use all_X to impute
# Fill missing feature values with each column's most frequent value.
# X is rebound to the imputer's output here.
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(X)
X = imputer.transform(X)

In [82]:
# Impute the validation features with the statistics already learned from the
# labelled data. Fitting a second imputer on the validation set would fill the
# rows the model is applied to with different values than the rows it was
# trained on, and would drop any column that is entirely missing there.
# `imputer2` is kept as a name for the imputer used on the validation set.
imputer2 = imputer
X_validation = imputer2.transform(X_validation)

In [83]:
# NOTE(review): the split is made per feature row; if augmented copies of one
# msisdn (ids such as '1303962_4') can land on both sides, the test score is
# optimistic - TODO confirm and consider a group-aware split.
apply_smote = not NO_SMOTE

if not ALL:
    # Hold out 30% first, then oversample the training part only.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, stratify=y, random_state=42
    )

    if apply_smote:
        smote = SMOTE(random_state=42)  # oversampling of the minority class
        X_train, y_train = smote.fit_resample(X_train, y_train)
        print('通过SMOTE方法平衡正负样本后')
        n_sample = y_train.shape[0]
        n_pos_sample = y_train[y_train == 1].shape[0]
        n_neg_sample = y_train[y_train == 0].shape[0]
        balance_summary = '样本个数：{}; 正样本占{:.2%}; 负样本占{:.2%}'.format(
            n_sample, n_pos_sample / n_sample, n_neg_sample / n_sample
        )
        print(balance_summary)
        print('特征维数：', X.shape[1])
    train_len, test_len = len(y_train), len(y_test)
else:
    # Full-data mode: oversample everything, then split only so that the later
    # cells find the same variables; every labelled row counts as training data.
    if apply_smote:
        smote = SMOTE(random_state=42)  # oversampling of the minority class
        X, y = smote.fit_resample(X, y)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, stratify=y, random_state=42
    )
    train_len, test_len = len(y_train) + len(y_test), 0

通过SMOTE方法平衡正负样本后
样本个数：21078; 正样本占50.00%; 负样本占50.00%
特征维数： 32


In [84]:
# Shape (rows, features) of the held-out test matrix.
X_test.shape

(8008, 32)

In [85]:
# Class counts in the held-out test labels.
print(y_test.value_counts())

is_sa
0    4517
1    3491
Name: count, dtype: int64


In [86]:
# Shape (rows, features) of the imputed labelled feature matrix.
X.shape

(26692, 32)

In [87]:
# Column names for the rebuilt frames: every train_features column except the
# msisdn identifier (the is_sa label stays in the list).
columns = train_features.columns.drop('msisdn').tolist()

In [88]:
# Sanity check: the validation and training matrices must have the same number
# of feature columns.
assert X_validation.shape[1] == X_train.shape[1]

In [89]:
# Join each feature array with its labels and wrap the result in a DataFrame
# that carries the feature names plus is_sa.
train_set = pd.DataFrame(np.column_stack([X_train, y_train]), columns=columns)
test_set = pd.DataFrame(np.column_stack([X_test, y_test]), columns=columns)

# The validation rows have no labels: a zero column fills the slot first and
# is then overwritten with the placeholder -1.
placeholder_labels = np.zeros(X_validation.shape[0])
valid_set = pd.DataFrame(np.column_stack([X_validation, placeholder_labels]), columns=columns)
valid_set['is_sa'] = -1

In [90]:
# Summary statistics of every column of the test frame.
test_set.describe()

Unnamed: 0,call_duration_call_duration_mean,call_duration_call_duration_max,call_duration_call_duration_std,call_duration_call_duration_quantile_25,call_duration_call_duration_quantile_50,call_duration_call_duration_quantile_75,cfee_cfee_std,cfee_cfee_mean,lfee_lfee_mean,lfee_lfee_std,...,other_party_account_person_num,other_party_called_diff_home_code,a_serv_type_call_called_normalized_diff,start_time_diff_start_time_diff_mean,start_time_diff_start_time_diff_std,start_time_diff_start_time_diff_max,distance_distance_std,distance_distance_quantile_50,distance_distance_quantile_75,is_sa
count,8008.0,8008.0,8008.0,8008.0,8008.0,8008.0,8008.0,8008.0,8008.0,8008.0,...,8008.0,8008.0,8008.0,8008.0,8008.0,8008.0,8008.0,8008.0,8008.0,8008.0
mean,99.734032,1009.675824,158.926595,23.037306,46.598714,103.565591,6.55217,4.167897,0.069283,0.214347,...,62.609266,123.779595,0.224647,18449.546445,36121.672446,178055.7,279328.0,296213.5,429325.6,0.435939
std,80.031072,997.854488,147.999653,19.113342,41.072904,96.389185,30.289178,28.399881,1.24908,2.470435,...,134.434021,189.545303,0.492746,21692.426073,39438.513481,168950.1,296268.4,559416.3,676666.4,0.49591
min,7.333333,10.0,3.05505,2.0,6.0,8.0,0.0,0.0,-17.439024,0.0,...,1.0,1.0,-1.0,0.0,63.376672,0.0,0.0,0.0,0.0,0.0
25%,53.507979,365.0,64.83171,16.0,29.0,57.0,0.0,0.0,0.0,0.0,...,14.0,24.0,-0.109328,4516.782609,11707.978301,68995.75,14808.24,0.0,0.0,0.0
50%,79.0,735.0,113.945655,20.5,39.0,80.5,0.0,0.0,0.0,0.0,...,28.0,56.0,0.145731,11287.552381,23134.406152,134382.5,191298.1,0.0,0.0,0.0
75%,122.925796,1366.25,206.352759,26.0,54.0,121.0,0.0,0.0,0.0,0.0,...,60.0,146.0,0.636364,24010.476433,46038.546162,238766.0,464546.7,350600.5,653174.0,1.0
max,1586.888889,20580.0,2141.844441,1044.0,1707.0,2464.0,723.62589,878.878788,36.5,70.506646,...,2294.0,2303.0,1.0,470356.5,665184.54145,1136373.0,1929370.0,3879296.0,4182341.0,1.0


In [91]:
# Stack train, test and validation rows (in that order) under a fresh 0..n-1
# index, and remember how many leading rows carry real labels.
all_set = pd.concat([train_set, test_set, valid_set], ignore_index=True)
labeled_data_len = len(train_set) + len(test_set)

In [92]:
# Shapes of the test, train, validation and combined frames.
test_set.shape, train_set.shape, valid_set.shape, all_set.shape

((8008, 33), (21078, 33), (1278, 33), (30364, 33))

In [93]:
# Cut the combined frame back into its labelled part and its validation part,
# each renumbered from 0.
labeled_set = all_set.iloc[:labeled_data_len].reset_index(drop=True)
valid_set = all_set.iloc[labeled_data_len:].reset_index(drop=True)

# Original note: some values become NaN after the log transform that follows
# SMOTE, so those rows have to be dropped. The null count is printed before
# and after dropping.
print(labeled_set.isnull().sum().sum())
labeled_set = labeled_set.dropna()
print(labeled_set.isnull().sum().sum())

# The validation part must still hold one row per validation msisdn.
assert valid_set.shape[0] == validation_features.shape[0]

0
0


In [94]:
# Re-split the labelled rows into the training set and the test set.
if not ALL:
    # labeled_set still carries its pre-dropna index labels, and labels
    # 0..train_len-1 are the training rows. Selecting by label keeps the split
    # correct even when dropna removed rows; a positional slice would then
    # move test rows into train_set.
    is_train_row = labeled_set.index < train_len
    train_set = labeled_set.loc[is_train_row].reset_index(drop=True)
    test_set = labeled_set.loc[~is_train_row].reset_index(drop=True)
    # Known limitation (from the original note): rows produced by data
    # augmentation could not be removed from test_set / y_test.

In [95]:
# Train with AutoGluon.
from autogluon.tabular import TabularPredictor

# Binary classification of is_sa, scored by F1 in both modes.
predictor = TabularPredictor(label='is_sa', eval_metric='f1', problem_type='binary')
if ALL:
    # Full-data mode: fit on every labelled row, best-quality preset, one-hour budget.
    model = predictor.fit(labeled_set, presets='best_quality', time_limit=3600)
else:
    # Fit on the training split only.
    # Alternative noted in the original: presets='best_quality', time_limit=3600.
    model = predictor.fit(train_set, presets='medium_quality')

No path specified. Models will be saved in: "AutogluonModels\ag-20240719_042147"
Presets specified: ['medium_quality']
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20240719_042147"
AutoGluon Version:  1.0.0
Python Version:     3.10.14
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19041
CPU Count:          12
Memory Avail:       16.17 GB / 31.90 GB (50.7%)
Disk Space Avail:   494.04 GB / 3726.01 GB (13.3%)
Train Data Rows:    21078
Train Data Columns: 32
Label Column:       is_sa
Problem Type:       binary
Preprocessing data ...
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    16525.12 MB
	Train Data (Original)  Memory Usage: 5.15 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtyp

Data preprocessing and feature engineering runtime = 0.39s ...
AutoGluon will gauge predictive performance using evaluation metric: 'f1'
	To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.1, Train Rows: 18970, Val Rows: 2108
User-specified model hyperparameters to be fit:
{
	'NN_TORCH': {},
	'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],
	'CAT': {},
	'XGB': {},
	'FASTAI': {},
	'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
	'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'prob

[1000]	valid_set's binary_logloss: 0.0203782	valid_set's f1: 0.994759


	0.9952	 = Validation score   (f1)
	22.05s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: LightGBM ...
	0.9952	 = Validation score   (f1)
	19.33s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: RandomForestGini ...
	0.9943	 = Validation score   (f1)
	2.33s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: RandomForestEntr ...
	0.9948	 = Validation score   (f1)
	2.95s	 = Training   runtime
	0.13s	 = Validation runtime
Fitting model: CatBoost ...
	0.9948	 = Validation score   (f1)
	294.37s	 = Training   runtime
	0.19s	 = Validation runtime
Fitting model: ExtraTreesGini ...
	0.9933	 = Validation score   (f1)
	2.06s	 = Training   runtime
	0.69s	 = Validation runtime
Fitting model: ExtraTreesEntr ...
	0.9933	 = Validation score   (f1)
	1.99s	 = Training   runtime
	0.46s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	0.9952	 = Validation score   (f1)
	74.54s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: XGBoos

In [96]:
# Score the fitted predictor on the held-out test rows; skipped when ALL is
# set, because the model was then fitted on every labelled row.
if not ALL:
    print(model.evaluate(test_set))

{'f1': 0.9977077363896848, 'accuracy': 0.998001998001998, 'balanced_accuracy': 0.9979361205039278, 'mcc': 0.9959371700914779, 'roc_auc': 0.999865113790501, 'precision': 0.9979936944683291, 'recall': 0.9974219421369235}


In [97]:
# Feature importance, computed on the test rows, or on all labelled rows in
# ALL mode.
importance_rows = labeled_set if ALL else test_set
feature_importance = model.feature_importance(importance_rows)
print(feature_importance)
feature_importance

Computing feature importance via permutation shuffling for 32 features using 5000 rows with 5 shuffle sets...
	75.14s	= Expected runtime (15.03s per shuffle set)
	69.17s	= Actual runtime (Completed 5 of 5 shuffle sets)


                                           importance    stddev       p_value  \
called_home_code_called_home_code_nunique    0.059993  0.003924  2.183000e-06   
other_party_called_diff_home_code            0.057058  0.004983  6.909822e-06   
start_time_diff_start_time_diff_std          0.049684  0.002079  3.670601e-07   
start_time_diff_start_time_diff_mean         0.048098  0.001302  6.440163e-08   
start_time_diff_start_time_diff_max          0.026712  0.001701  1.964704e-06   
a_serv_type_call_called_normalized_diff      0.021138  0.001890  7.583840e-06   
distance_distance_std                        0.019565  0.000536  6.745924e-08   
other_party_account_person_num               0.018307  0.001581  6.613379e-06   
hour_hour_min                                0.016872  0.002029  2.464658e-05   
hour_hour_std                                0.015453  0.001032  2.374793e-06   
dayofweek_dayofweek_std                      0.015196  0.001016  2.383430e-06   
hour_hour_mean              

Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
called_home_code_called_home_code_nunique,0.059993,0.003924,2.183e-06,5,0.068072,0.051914
other_party_called_diff_home_code,0.057058,0.004983,6.909822e-06,5,0.067319,0.046798
start_time_diff_start_time_diff_std,0.049684,0.002079,3.670601e-07,5,0.053965,0.045403
start_time_diff_start_time_diff_mean,0.048098,0.001302,6.440163e-08,5,0.050779,0.045417
start_time_diff_start_time_diff_max,0.026712,0.001701,1.964704e-06,5,0.030215,0.023209
a_serv_type_call_called_normalized_diff,0.021138,0.00189,7.58384e-06,5,0.025029,0.017247
distance_distance_std,0.019565,0.000536,6.745924e-08,5,0.020668,0.018462
other_party_account_person_num,0.018307,0.001581,6.613379e-06,5,0.021563,0.015051
hour_hour_min,0.016872,0.002029,2.464658e-05,5,0.02105,0.012693
hour_hour_std,0.015453,0.001032,2.374793e-06,5,0.017578,0.013328


In [98]:
# Leaderboard of the fitted models, scored on the test rows, or on all labelled
# rows in ALL mode.
leaderboard_rows = labeled_set if ALL else test_set
leaderboard = model.leaderboard(leaderboard_rows, silent=True)
print(leaderboard)
leaderboard

                  model  score_test  score_val eval_metric  pred_time_test  \
0        ExtraTreesEntr    0.997993   0.993321          f1        0.197394   
1        ExtraTreesGini    0.997993   0.993321          f1        0.208419   
2              CatBoost    0.997849   0.994754          f1        0.080335   
3   WeightedEnsemble_L2    0.997708   0.996194          f1        0.649467   
4              LightGBM    0.997706   0.995238          f1        0.154243   
5            LightGBMXT    0.997563   0.995238          f1        0.169005   
6      RandomForestGini    0.997561   0.994275          f1        0.238603   
7         LightGBMLarge    0.997135   0.995238          f1        0.101236   
8      RandomForestEntr    0.996845   0.994754          f1        0.214316   
9               XGBoost    0.996273   0.993801          f1        0.077575   
10       NeuralNetTorch    0.994848   0.991909          f1        0.152390   
11      NeuralNetFastAI    0.994425   0.995243          f1      

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,ExtraTreesEntr,0.997993,0.993321,f1,0.197394,0.460066,1.992791,0.197394,0.460066,1.992791,1,True,9
1,ExtraTreesGini,0.997993,0.993321,f1,0.208419,0.689228,2.058898,0.208419,0.689228,2.058898,1,True,8
2,CatBoost,0.997849,0.994754,f1,0.080335,0.18716,294.369298,0.080335,0.18716,294.369298,1,True,7
3,WeightedEnsemble_L2,0.997708,0.996194,f1,0.649467,0.336609,377.480951,0.004326,0.009414,4.436157,2,True,14
4,LightGBM,0.997706,0.995238,f1,0.154243,0.03345,19.326614,0.154243,0.03345,19.326614,1,True,4
5,LightGBMXT,0.997563,0.995238,f1,0.169005,0.04361,22.053282,0.169005,0.04361,22.053282,1,True,3
6,RandomForestGini,0.997561,0.994275,f1,0.238603,0.09602,2.333634,0.238603,0.09602,2.333634,1,True,5
7,LightGBMLarge,0.997135,0.995238,f1,0.101236,0.025979,5.841022,0.101236,0.025979,5.841022,1,True,13
8,RandomForestEntr,0.996845,0.994754,f1,0.214316,0.131087,2.946624,0.214316,0.131087,2.946624,1,True,6
9,XGBoost,0.996273,0.993801,f1,0.077575,0.022041,4.100667,0.077575,0.022041,4.100667,1,True,11


In [99]:
# Metrics on the test set. classification_report and confusion_matrix are
# already imported in the notebook's first cell.
if not ALL:
    y_true = test_set['is_sa']
    y_pred = model.predict(test_set)
    for metric_output in (classification_report(y_true, y_pred), confusion_matrix(y_true, y_pred)):
        print(metric_output)

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      4517
         1.0       1.00      1.00      1.00      3491

    accuracy                           1.00      8008
   macro avg       1.00      1.00      1.00      8008
weighted avg       1.00      1.00      1.00      8008

[[4510    7]
 [   9 3482]]


In [100]:
# Display the validation frame; is_sa holds the placeholder -1 here.
valid_set

Unnamed: 0,call_duration_call_duration_mean,call_duration_call_duration_max,call_duration_call_duration_std,call_duration_call_duration_quantile_25,call_duration_call_duration_quantile_50,call_duration_call_duration_quantile_75,cfee_cfee_std,cfee_cfee_mean,lfee_lfee_mean,lfee_lfee_std,...,other_party_account_person_num,other_party_called_diff_home_code,a_serv_type_call_called_normalized_diff,start_time_diff_start_time_diff_mean,start_time_diff_start_time_diff_std,start_time_diff_start_time_diff_max,distance_distance_std,distance_distance_quantile_50,distance_distance_quantile_75,is_sa
0,60.308682,1700.0,112.715262,19.50,37.0,62.50,0.000000,0.00,0.0,0.0,...,69.0,249.0,-0.421222,4071.543408,8323.861851,51194.0,2.081493e+05,0.0,180274.0,-1.0
1,54.165854,362.0,56.854232,17.00,37.0,65.00,0.000000,0.00,0.0,0.0,...,72.0,174.0,-0.160976,6229.775610,10913.381674,51724.0,2.102743e+04,0.0,0.0,-1.0
2,89.500000,670.0,175.297814,12.50,37.5,49.00,0.000000,0.00,0.0,0.0,...,8.0,14.0,-0.333333,65299.611111,72201.374485,217652.0,0.000000e+00,0.0,0.0,-1.0
3,46.458515,399.0,58.811953,14.00,27.0,53.00,0.000000,0.00,0.0,0.0,...,27.0,155.0,0.362445,5491.794760,11149.929456,65807.0,0.000000e+00,231682.0,231682.0,-1.0
4,38.430556,246.0,46.076735,13.00,19.0,42.50,6.804617,3.75,0.0,0.0,...,44.0,51.0,0.500000,16717.666667,39100.045311,242220.0,1.696337e+05,29847.5,280421.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1273,156.426230,1804.0,318.203739,23.00,45.0,96.50,0.000000,0.00,0.0,0.0,...,55.0,144.0,0.136612,6796.404372,17418.083154,148767.0,7.154820e+04,0.0,0.0,-1.0
1274,52.897872,621.0,69.502307,19.00,34.0,54.00,0.000000,0.00,0.0,0.0,...,36.0,130.0,0.055319,5298.587234,13151.545900,144098.0,1.749472e+05,0.0,0.0,-1.0
1275,346.552448,4061.0,637.773302,34.00,96.0,389.50,0.000000,0.00,0.0,0.0,...,23.0,114.0,0.230769,8752.293706,15958.753297,72131.0,3.078249e+05,0.0,0.0,-1.0
1276,22.500000,44.0,30.405592,11.75,22.5,33.25,0.000000,0.00,0.0,0.0,...,2.0,2.0,0.000000,91577.000000,129509.435401,183154.0,1.117818e+06,790417.0,1185625.5,-1.0


In [101]:
# Predict on the validation rows, without the placeholder label column.
validation_inputs = valid_set.drop(columns='is_sa')
y_validation_pred = model.predict(validation_inputs)

# Pair every prediction with its msisdn.
validation_results = validation_features[['msisdn']].copy()
validation_results['is_sa'] = y_validation_pred.astype(int)

print(validation_results.describe())

# Save the result to a CSV file whose name records the run mode.
if ALL:
    file_name = './valid_large_data.csv'
else:
    file_name = './valid_small_data.csv'
validation_results.to_csv(file_name, index=False)
print(file_name)

             is_sa
count  1278.000000
mean      0.187011
std       0.390073
min       0.000000
25%       0.000000
50%       0.000000
75%       0.000000
max       1.000000
./valid_small_data.csv
