In [58]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import confusion_matrix,classification_report
from xgboost import XGBClassifier
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
import time

# Run-mode switches read by the later cells.
# ALL: when True the final predictor is fitted on every labeled row
# (labeled_set) and the predictions go to ./valid_large_data.csv; when False
# only train_set is fitted and test_set is kept for evaluation.
ALL = False
# NO_SMOTE: only referenced from the disabled (commented-out) SMOTE code.
NO_SMOTE = True

# Column -> dtype mapping handed to every pd.read_csv call in this notebook.
# Columns are grouped by dtype; the resulting dict has one entry per column.
dtypes = {
    **dict.fromkeys(
        [
            'msisdn', 'start_time', 'end_time', 'other_party',
            'home_area_code', 'visit_area_code', 'called_home_code',
            'called_code', 'a_product_id', 'open_datetime',
            'phone1_loc_city', 'phone1_loc_province',
            'phone2_loc_city', 'phone2_loc_province',
            'update_time', 'date', 'date_c',
        ],
        'str',
    ),
    **dict.fromkeys(['call_event', 'ismultimedia'], 'category'),
    **dict.fromkeys(
        [
            'a_serv_type', 'long_type1', 'roam_type',
            'dayofweek', 'phone1_type', 'phone2_type',
        ],
        'int',
    ),
    **dict.fromkeys(['cfee', 'lfee'], 'float64'),
    'call_duration': 'int32',
    'hour': 'int8',
}

# Build the processed train / test / validation CSVs once, then reuse them on
# later runs.
import os

# Every file the "read" branch below depends on.
processed_files = [
    '../data/processed/train_data.csv',
    '../data/processed/train_labels.csv',
    '../data/processed/test_data.csv',
    '../data/processed/test_labels.csv',
    '../data/processed/labels_aug.csv',
    '../data/processed/validation_data.csv',
]

# Rebuild unless every processed file is present. Checking only for the folder
# would send a partially written cache straight into pd.read_csv.
if not all(os.path.exists(path) for path in processed_files):
    print("Creating processed data folder...")
    # Load the raw CSV files.
    labeled_data = pd.read_csv('../data/raw/trainSet_res_with_distances.csv', dtype=dtypes)
    labels = pd.read_csv('../data/raw/trainSet_ans.csv', dtype=dtypes)

    validation_data = pd.read_csv('../data/raw/validationSet_res_with_distances.csv', dtype=dtypes)

    # Split by msisdn (not by record) so all records of one number land in the
    # same split; stratified on is_sa.
    train_data_msisdn, test_data_msisdn = train_test_split(
        labels['msisdn'], test_size=0.2, random_state=42, stratify=labels['is_sa'])
    train_data = labeled_data[labeled_data['msisdn'].isin(train_data_msisdn)]
    train_labels = labels[labels['msisdn'].isin(train_data_msisdn)]
    assert len(train_data['msisdn'].unique()) == len(train_data_msisdn)

    test_data = labeled_data[labeled_data['msisdn'].isin(test_data_msisdn)]
    test_labels = labels[labels['msisdn'].isin(test_data_msisdn)]
    assert len(test_data['msisdn'].unique()) == len(test_data_msisdn)

    # Augment the training split, one msisdn group at a time.
    from tqdm import tqdm
    import sys
    sys.path.append(os.path.join(os.path.dirname('./'), '../'))
    from utils.augmentation import Augmentation

    addition_train_data = []
    addition_train_labels = []

    times = 2
    ratio_range = 0.1
    # One label per msisdn, resolved once. The first row wins, matching the
    # previous per-group `.iloc[0]` lookup, which re-filtered train_labels for
    # every group.
    label_by_msisdn = train_labels.drop_duplicates('msisdn').set_index('msisdn')['is_sa']
    pbar = tqdm(train_data.groupby('msisdn'))
    for msisdn, group in pbar:
        # NOTE(review): msisdn is read as str (see dtypes), so this comparison
        # with the integer 0 presumably never matches - confirm whether '0'
        # was intended.
        if msisdn == 0:
            continue
        pbar.set_description(f"Augmenting msisdn {msisdn}")
        label = label_by_msisdn.loc[msisdn]
        aug = Augmentation(group, label, 'msisdn', 'is_sa')
        # Class balancing (the original note states a 1:4 sample ratio):
        # positives get 3 + times * 4 masked copies, negatives get `times`.
        mask_times = 3 + times * 4 if label == 1 else times
        # Both classes get the same two passes, in this order: masking, then
        # one sliding-window pass.
        augmentation_passes = (
            dict(ratio=ratio_range, times=mask_times, method='mask'),
            dict(window_size=50, step_size=40, times=1, method='sliding_window'),
        )
        for aug_kwargs in augmentation_passes:
            res_df, res_labels = aug.times(**aug_kwargs)
            addition_train_data.append(res_df)
            addition_train_labels.append(res_labels)

    addition_train_data = pd.concat(addition_train_data)
    addition_train_labels = pd.concat(addition_train_labels)

    # Append the augmented rows to the training split.
    train_data = pd.concat([train_data, addition_train_data], ignore_index=True)
    train_labels = pd.concat([train_labels, addition_train_labels], ignore_index=True)
    # Sort by msisdn, start_time.
    sort_start_time = time.time()
    train_data = train_data.sort_values(by=['msisdn', 'start_time']).reset_index(drop=True)
    train_labels = train_labels.sort_values(by=['msisdn']).reset_index(drop=True)
    print('sort time:', time.time() - sort_start_time)

    # Labels for the augmented training split followed by the test split.
    labels_aug = pd.concat([train_labels, test_labels], ignore_index=True)

    # Persist everything so later runs can skip the augmentation.
    print("Saving processed data...")
    os.makedirs('../data/processed', exist_ok=True)
    train_data.to_csv('../data/processed/train_data.csv', index=False)
    train_labels.to_csv('../data/processed/train_labels.csv', index=False)
    test_data.to_csv('../data/processed/test_data.csv', index=False)
    test_labels.to_csv('../data/processed/test_labels.csv', index=False)

    labels_aug.to_csv('../data/processed/labels_aug.csv', index=False)

    validation_data.to_csv('../data/processed/validation_data.csv', index=False)
    # TODO: test_data

else:
    print("Reading processed data...")
    train_data = pd.read_csv('../data/processed/train_data.csv', dtype=dtypes)
    train_labels = pd.read_csv('../data/processed/train_labels.csv', dtype=dtypes)
    test_data = pd.read_csv('../data/processed/test_data.csv', dtype=dtypes)
    test_labels = pd.read_csv('../data/processed/test_labels.csv', dtype=dtypes)

    labels_aug = pd.read_csv('../data/processed/labels_aug.csv', dtype=dtypes)

    validation_data = pd.read_csv('../data/processed/validation_data.csv', dtype=dtypes)

# Stack the (augmented) training records and the test records into one
# labeled frame; it must cover exactly the msisdn values that have a label.
labeled_data_aug = pd.concat([train_data, test_data], ignore_index=True)
assert len(labeled_data_aug['msisdn'].unique()) == len(labels_aug['msisdn'].unique())

# Parse the time columns of both frames.
# start_time / end_time / open_datetime use the compact %Y%m%d%H%M%S layout;
# update_time / date are left to pandas' format inference.
# Only the validation frame coerces unparsable open_datetime values to NaT.
compact_format = '%Y%m%d%H%M%S'
for frame, open_datetime_kwargs in (
    (labeled_data_aug, {}),
    (validation_data, {'errors': 'coerce'}),
):
    for column in ('start_time', 'end_time'):
        frame[column] = pd.to_datetime(frame[column], format=compact_format)
    frame['open_datetime'] = pd.to_datetime(
        frame['open_datetime'], format=compact_format, **open_datetime_kwargs)
    for column in ('update_time', 'date'):
        frame[column] = pd.to_datetime(frame[column])

Reading processed data...


In [59]:
# Add start_time_diff to every record: the gap in seconds between its
# start_time and the start_time of the previous row of the same msisdn
# (0 for the first row of each msisdn).
labeled_gaps = labeled_data_aug.groupby('msisdn')['start_time'].diff()
start_time_diff = labeled_gaps.dt.total_seconds().fillna(0).reset_index(drop=True)
labeled_data_aug['start_time_diff'] = start_time_diff.copy()

# Same feature for the validation records.
validation_gaps = validation_data.groupby('msisdn')['start_time'].diff()
start_time_diff = validation_gaps.dt.total_seconds().fillna(0).reset_index(drop=True)
validation_data['start_time_diff'] = start_time_diff.copy()

数据特征处理

In [60]:
# Aggregated per-msisdn features.
def aggregate_features(data):
    """Collapse per-call records into one feature row per ``msisdn``.

    ``data`` must provide the columns ``msisdn``, ``call_duration``, ``cfee``,
    ``lfee``, ``hour``, ``dayofweek``, ``visit_area_code``,
    ``called_home_code``, ``called_code``, ``open_datetime``, ``other_party``,
    ``a_serv_type``, ``start_time_diff`` and ``distance``.

    Returns a DataFrame indexed by ``msisdn`` whose columns are a two-level
    index of ``(source column, feature name)``.

    Notes on the feature names (kept as they are, because later cells and the
    trained model see the flattened names):

    * the ``*_diff`` / ``times_not_at_home_area`` / ``called_diff_home_code``
      features all count the non-null rows whose value differs from the
      previous row of the same group, so the first non-null row always counts;
    * ``called_diff_home_code`` is also used for ``other_party`` - presumably
      a copy-paste name, TODO confirm;
    * the two normalized-difference features divide by the sum of both
      counts and are therefore undefined when both counts are zero.
    """
    weekdays = [1, 2, 3, 4, 5]
    weekend_days = [6, 7]

    def quantile_of(q):
        # Factory for a "q-th quantile of the group" aggregator.
        def _quantile(values):
            return values.quantile(q)
        return _quantile

    def change_count(values):
        # Number of non-null rows that differ from the row before them.
        return values[values != values.shift()].count()

    def normalized_diff(first, second):
        return (first - second) / (first + second)

    def first_mode(values):
        return values.mode().values[0]

    def weekday_weekend_balance(days):
        n_weekday = days[days.isin(weekdays)].count()
        n_weekend = days[days.isin(weekend_days)].count()
        return normalized_diff(n_weekday, n_weekend)

    def call_called_balance(serv_type):
        # Types 1 and 3 versus type 2 (named "call" / "called" by the author).
        n_call = serv_type[serv_type.isin([1, 3])].count()
        n_called = serv_type[serv_type == 2].count()
        return normalized_diff(n_call, n_called)

    def coefficient_of_variation(values):
        return values.std() / values.mean()

    feature_spec = {
        'call_duration': [
            ('call_duration_mean', 'mean'),
            ('call_duration_max', 'max'),
            ('call_duration_quantile_25', quantile_of(0.25)),
            ('call_duration_quantile_50', quantile_of(0.50)),
            ('call_duration_quantile_75', quantile_of(0.75)),
        ],
        'cfee': [
            ('cfee_std', 'std'),
            ('cfee_mean', 'mean'),
        ],
        'lfee': [
            ('lfee_mean', 'mean'),
            ('lfee_std', 'std'),
        ],
        'hour': [
            ('hour_mean', 'mean'),
            ('hour_std', 'std'),
            ('hour_min', 'min'),
        ],
        'dayofweek': [
            ('dayofweek_std', 'std'),
            ('dayofweek_mode', first_mode),
            ('work_day_weekend_diff', weekday_weekend_balance),
        ],
        'visit_area_code': [
            ('visit_area_code_nunique', 'nunique'),
            ('times_not_at_home_area', change_count),
        ],
        'called_home_code': [
            ('called_home_code_nunique', 'nunique'),
            ('called_diff_home_code', change_count),
        ],
        'called_code': [
            ('called_code_diff', change_count),
        ],
        'open_datetime': [
            ('open_count', 'nunique'),
        ],
        'other_party': [
            ('account_person_num', 'nunique'),
            ('called_diff_home_code', change_count),
        ],
        'a_serv_type': [
            ('call_called_normalized_diff', call_called_balance),
        ],
        'start_time_diff': [
            ('start_time_diff_max', 'max'),
            ('start_time_diff_coefficient_of_variation', coefficient_of_variation),
        ],
        'distance': [
            ('distance_std', 'std'),
            ('distance_quantile_50', quantile_of(0.50)),
            ('distance_quantile_75', quantile_of(0.75)),
        ],
    }
    return data.groupby('msisdn').agg(feature_spec)

def _flatten_feature_columns(features):
    """Return flat column names for an ``aggregate_features`` result.

    Each ``(source column, feature name)`` pair is joined with ``_`` and
    stripped. Then, in one ``str.translate`` pass, the characters
    ``[ ] < > ( ) ,`` are removed and spaces become ``_``. This gives the same
    names as the former chain of eight ``.str.replace`` calls, without
    depending on the pandas-version-specific default of its ``regex``
    argument (``[`` and ``(`` are regex metacharacters).
    The characters are presumably stripped for libraries that reject them in
    feature names - TODO confirm.
    """
    cleanup = str.maketrans(' ', '_', '[]<>(),')
    return ['_'.join(col).strip().translate(cleanup) for col in features.columns.values]


labeled_aug_features = aggregate_features(labeled_data_aug)
validation_features = aggregate_features(validation_data)

labeled_aug_features.columns = _flatten_feature_columns(labeled_aug_features)
validation_features.columns = _flatten_feature_columns(validation_features)

# Turn the msisdn group key back into a regular column.
labeled_aug_features = labeled_aug_features.reset_index()
validation_features = validation_features.reset_index()

# Attach the is_sa label to every labeled msisdn. This is a left join, so an
# msisdn missing from labels_aug would get a NaN label.
labeled_aug_features = labeled_aug_features.merge(labels_aug, on='msisdn', how='left')

In [61]:
# Model inputs. Only msisdn is dropped, so X still carries the is_sa column
# (it is used as the label column of train_set / test_set later on), whereas
# X_validation holds the feature columns only.
X = labeled_aug_features.drop(columns='msisdn')
y = labeled_aug_features['is_sa']
X_validation = validation_features.drop(columns='msisdn')

# Class balance of the labeled data.
n_sample = len(y)
n_pos_sample = len(y[y == 1])
n_neg_sample = len(y[y == 0])
summary_template = '样本个数：{}; 正样本占{:.2%}; 负样本占{:.2%}'
print(summary_template.format(n_sample,
                              n_pos_sample / n_sample,
                              n_neg_sample / n_sample))
print('特征维数：', X.shape[1])

样本个数：26611; 正样本占30.16%; 负样本占69.84%
特征维数： 30


In [62]:
# TODO use all_X to impute
# Fill the gaps in the labeled matrix with each column's most frequent value.
# X becomes a NumPy array from here on.
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(X)
X = imputer.transform(X)

In [63]:
# Impute validation gaps with the most frequent values of the LABELED feature
# columns, not with statistics fitted on the validation set itself. Train and
# inference data then get the same fill values, and a column that is entirely
# NaN in the validation set is still filled instead of being dropped, which
# would break the column alignment of the later valid_set construction.
validation_feature_columns = validation_features.columns.drop('msisdn')
imputer2 = SimpleImputer(strategy='most_frequent')
imputer2.fit(labeled_aug_features[validation_feature_columns])
X_validation = imputer2.transform(X_validation)

In [64]:
# Display the dimensions of the imputed labeled matrix (is_sa column included).
X.shape

(26611, 30)

In [65]:
# Display the dimensions of the imputed validation matrix (feature columns only).
X_validation.shape

(1278, 29)

In [66]:
# Display the length of the label vector.
y.shape

(26611,)

In [67]:
# The aggregated frame must have as many rows as there are train + test labels.
n_train_labels, n_test_labels = len(train_labels), len(test_labels)
assert n_train_labels + n_test_labels == len(labeled_aug_features)
n_train_labels, n_test_labels

(25843, 768)

In [68]:
from sklearn.utils import shuffle

# Re-attach msisdn to the imputed matrix so that every row can be routed back
# to the train / test split made when the processed data was built.
train_data_msisdn = train_labels['msisdn']
test_data_msisdn = test_labels['msisdn']
model_columns = labeled_aug_features.columns.drop('msisdn')
X_df = pd.DataFrame(X, columns=model_columns)
X_df = pd.concat([labeled_aug_features[['msisdn']], X_df], axis=1)

# Select rows by msisdn membership; the msisdn column itself (first column)
# is left out of both sets.
is_train_row = X_df['msisdn'].isin(train_data_msisdn)
is_test_row = X_df['msisdn'].isin(test_data_msisdn)
train_set = X_df.loc[is_train_row, X_df.columns[1:]]
test_set = X_df.loc[is_test_row, X_df.columns[1:]]

train_pos = len(train_set[train_set['is_sa'] == 1])
train_neg = len(train_set[train_set['is_sa'] == 0])
test_pos = len(test_set[test_set['is_sa'] == 1])
test_neg = len(test_set[test_set['is_sa'] == 0])
print(f"1 samples / 0 samples in train set: {train_pos} / {train_neg}")
print(f"1 samples / 0 samples in test set: {test_pos} / {test_neg}")

# Row counts used later to cut the stacked frame back into train / test.
# With ALL every labeled row counts as training data and nothing is held out.
# NOTE: a random train_test_split and SMOTE oversampling (see NO_SMOTE) used
# to live here; both are disabled.
train_len, test_len = len(train_set), len(test_set)
if ALL:
    train_len, test_len = train_len + test_len, 0

1 samples / 0 samples in train set: 7870 / 17973
1 samples / 0 samples in test set: 155 / 613


In [69]:
# Give the validation matrix the same column layout as train_set / test_set:
# every labeled column except msisdn, with a placeholder column appended last.
# NOTE(review): this assumes is_sa is the last column of
# labeled_aug_features, so the appended placeholder lines up with it - confirm.
columns = [name for name in labeled_aug_features.columns if name != 'msisdn']
placeholder = np.zeros(X_validation.shape[0])
valid_set = pd.DataFrame(np.column_stack([X_validation, placeholder]), columns=columns)
# -1 marks "label unknown" for the validation rows.
valid_set['is_sa'] = -1

In [70]:
# Preview the first rows of the training set.
train_set.head()

Unnamed: 0,call_duration_call_duration_mean,call_duration_call_duration_max,call_duration_call_duration_quantile_25,call_duration_call_duration_quantile_50,call_duration_call_duration_quantile_75,cfee_cfee_std,cfee_cfee_mean,lfee_lfee_mean,lfee_lfee_std,hour_hour_mean,...,open_datetime_open_count,other_party_account_person_num,other_party_called_diff_home_code,a_serv_type_call_called_normalized_diff,start_time_diff_start_time_diff_max,start_time_diff_start_time_diff_coefficient_of_variation,distance_distance_std,distance_distance_quantile_50,distance_distance_quantile_75,is_sa
0,68.483871,1129.0,20.0,37.0,78.0,0.0,0.0,0.0,0.0,12.483871,...,1.0,44.0,80.0,0.333333,105436.0,1.819554,0.0,103426.0,103426.0,0.0
1,65.202381,1129.0,19.0,37.0,75.75,0.0,0.0,0.0,0.0,12.5,...,1.0,42.0,75.0,0.357143,146225.0,1.888866,0.0,103426.0,103426.0,0.0
2,58.380952,470.0,19.75,37.0,78.0,0.0,0.0,0.0,0.0,12.428571,...,1.0,43.0,73.0,0.380952,105436.0,1.731489,0.0,103426.0,103426.0,0.0
3,86.34,1129.0,19.25,38.0,88.0,0.0,0.0,0.0,0.0,13.28,...,1.0,30.0,46.0,0.36,105436.0,1.568385,0.0,103426.0,103426.0,0.0
4,53.64,219.0,23.75,39.5,58.75,0.0,0.0,0.0,0.0,11.68,...,1.0,26.0,40.0,0.32,80307.0,2.172599,0.0,103426.0,103426.0,0.0


In [71]:
# Summary statistics of every column of the test set.
test_set.describe()

Unnamed: 0,call_duration_call_duration_mean,call_duration_call_duration_max,call_duration_call_duration_quantile_25,call_duration_call_duration_quantile_50,call_duration_call_duration_quantile_75,cfee_cfee_std,cfee_cfee_mean,lfee_lfee_mean,lfee_lfee_std,hour_hour_mean,...,open_datetime_open_count,other_party_account_person_num,other_party_called_diff_home_code,a_serv_type_call_called_normalized_diff,start_time_diff_start_time_diff_max,start_time_diff_start_time_diff_coefficient_of_variation,distance_distance_std,distance_distance_quantile_50,distance_distance_quantile_75,is_sa
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,...,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,91.482206,972.363281,22.85319,44.679036,94.721029,4.024875,2.821239,0.00806,0.067647,13.943249,...,1.458333,57.983073,141.947917,0.104563,159979.5,2.164894,233529.6,191921.9,289215.8,0.201823
std,64.553937,803.897173,21.986951,36.60815,71.165832,23.303907,32.907571,0.262984,0.983664,1.459174,...,2.354132,112.064686,198.42915,0.442265,158842.3,1.133584,271303.6,460314.1,565202.9,0.401622
min,7.842909,30.0,2.0,6.0,8.0,0.0,0.0,-2.192916,0.0,7.0,...,1.0,1.0,1.0,-1.0,0.0,0.689959,0.0,0.0,0.0,0.0
25%,52.154273,360.75,16.0,29.0,55.0,0.0,0.0,0.0,0.0,13.076923,...,1.0,12.0,27.0,-0.156344,66947.0,1.545699,0.0,0.0,0.0,0.0
50%,74.22654,773.5,20.0,38.0,78.5,0.0,0.0,0.0,0.0,13.837143,...,1.0,28.0,71.0,0.064642,94959.5,1.91208,142382.6,0.0,0.0,0.0
75%,114.410418,1385.0,25.0,51.0,115.0,0.0,0.0,0.0,0.0,14.81531,...,1.0,61.0,172.25,0.333333,189053.5,2.405582,354495.4,92132.0,262316.2,0.0
max,765.0,4727.0,557.5,765.0,972.5,567.312612,897.0,6.923077,25.102865,21.0,...,15.0,1483.0,1593.0,1.0,1125536.0,12.241096,1495596.0,3245129.0,3316118.0,1.0


In [72]:
# Stack train, test and validation rows (in that order) under a fresh 0..n-1
# index, and remember how many leading rows are labeled.
all_set = pd.concat([train_set, test_set, valid_set], ignore_index=True)
labeled_data_len = len(train_set) + len(test_set)

In [73]:
# Display the dimensions of the test, train, validation and stacked sets.
test_set.shape, train_set.shape, valid_set.shape, all_set.shape

((768, 30), (25843, 30), (1278, 30), (27889, 30))

In [92]:
# Cut the stacked frame back into its labeled and validation parts.
labeled_set = all_set.iloc[:labeled_data_len].copy()
valid_set = all_set.iloc[labeled_data_len:].copy()
labeled_set.reset_index(drop=True, inplace=True)
valid_set.reset_index(drop=True, inplace=True)
# Original note: some values become NaN after SMOTE and the log transform, so
# those rows have to be removed. The NaN count is printed before and after.
print(labeled_set.isnull().sum().sum())
labeled_set = labeled_set.dropna()
print(labeled_set.isnull().sum().sum())
assert valid_set.shape[0] == validation_features.shape[0]

# Re-split into training and test sets.
if not ALL:
    # dropna() keeps the index labels assigned by the reset above, so labels
    # below train_len are exactly the surviving training rows. A positional
    # iloc[:train_len] cut would move test rows into the training set as soon
    # as any training row had been dropped.
    is_train_row = labeled_set.index < train_len
    train_set = labeled_set[is_train_row].copy()
    test_set = labeled_set[~is_train_row].copy()
    train_set.reset_index(drop=True, inplace=True)
    test_set.reset_index(drop=True, inplace=True)

    # remove_columns = ['distance_distance_std', "start_time_diff_start_time_diff_max", "distance_distance_quantile_75"]

    # Features excluded from the model; the same list is applied to all three sets.
    remove_columns = ["lfee_lfee_std", "lfee_lfee_mean", 'call_duration_call_duration_max', "distance_distance_quantile_50", "call_duration_call_duration_quantile_25"]
    train_set = train_set.drop(remove_columns, axis=1)
    test_set = test_set.drop(remove_columns, axis=1)
    valid_set = valid_set.drop(remove_columns, axis=1)

assert train_set.shape[1] == test_set.shape[1] == valid_set.shape[1]

0
0


In [93]:
# Display the dimensions of the final train, test and validation sets.
train_set.shape, test_set.shape, valid_set.shape

((25843, 25), (768, 25), (1278, 25))

In [94]:
# Train with AutoGluon; the label column is_sa is part of the training frame.
from autogluon.tabular import TabularPredictor

predictor = TabularPredictor(label='is_sa', eval_metric='f1', problem_type='binary')
if ALL:
    # Fit on every labeled row with the slower preset, capped at one hour.
    model = predictor.fit(labeled_set, presets='best_quality', time_limit=3600)
else:
    # Quick preset on the training split only, with KNN models excluded.
    # (Alternative previously tried here: presets='best_quality', time_limit=3600.)
    model = predictor.fit(train_set, presets='medium_quality', excluded_model_types=['KNN'])

No path specified. Models will be saved in: "AutogluonModels\ag-20240725_153752"
Presets specified: ['medium_quality']
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20240725_153752"
AutoGluon Version:  1.0.0
Python Version:     3.10.14
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19041
CPU Count:          12
Memory Avail:       13.30 GB / 31.90 GB (41.7%)
Disk Space Avail:   402.69 GB / 3726.01 GB (10.8%)
Train Data Rows:    25843
Train Data Columns: 24
Label Column:       is_sa
Problem Type:       binary
Preprocessing data ...
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    13611.74 MB
	Train Data (Original)  Memory Usage: 4.73 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtyp

[1000]	valid_set's binary_logloss: 0.058453	valid_set's f1: 0.974189


	0.9775	 = Validation score   (f1)
	8.98s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's binary_logloss: 0.0476043	valid_set's f1: 0.97878


	0.9795	 = Validation score   (f1)
	8.29s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: RandomForestGini ...
	0.9734	 = Validation score   (f1)
	2.53s	 = Training   runtime
	0.11s	 = Validation runtime
Fitting model: RandomForestEntr ...
	0.9727	 = Validation score   (f1)
	3.15s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: CatBoost ...
	0.9768	 = Validation score   (f1)
	48.94s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: ExtraTreesGini ...
	0.9707	 = Validation score   (f1)
	1.1s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: ExtraTreesEntr ...
	0.9693	 = Validation score   (f1)
	1.05s	 = Training   runtime
	0.09s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	0.9607	 = Validation score   (f1)
	35.86s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: XGBoost ...
	0.9768	 = Validation score   (f1)
	7.37s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: NeuralNetTor

In [95]:
# Print the predictor's evaluation metrics on the test split. Skipped in ALL
# mode, where the predictor was fitted on every labeled row.
if not ALL:
    print(model.evaluate(test_set))

{'f1': 0.75, 'accuracy': 0.9088541666666666, 'balanced_accuracy': 0.8223964637162553, 'mcc': 0.7010610815855318, 'roc_auc': 0.9137083618376045, 'precision': 0.84, 'recall': 0.6774193548387096}


In [96]:
# Feature importance, measured on the test split, or on all labeled rows in
# ALL mode. Printed and also shown as the cell result.
importance_data = labeled_set if ALL else test_set
feature_importance = model.feature_importance(importance_data)
print(feature_importance)
feature_importance

Computing feature importance via permutation shuffling for 24 features using 768 rows with 5 shuffle sets...
	11.6s	= Expected runtime (2.32s per shuffle set)
	2.69s	= Actual runtime (Completed 5 of 5 shuffle sets)


                                                          importance  \
called_home_code_called_home_code_nunique                   0.139972   
other_party_called_diff_home_code                           0.119351   
a_serv_type_call_called_normalized_diff                     0.044415   
start_time_diff_start_time_diff_max                         0.042872   
other_party_account_person_num                              0.037590   
start_time_diff_start_time_diff_coefficient_of_variation    0.033094   
called_code_called_code_diff                                0.023347   
distance_distance_quantile_75                               0.016627   
hour_hour_std                                               0.012752   
hour_hour_min                                               0.010002   
call_duration_call_duration_mean                            0.009857   
call_duration_call_duration_quantile_75                     0.008667   
dayofweek_work_day_weekend_diff                             0.00

Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
called_home_code_called_home_code_nunique,0.139972,0.020507,5.4e-05,5,0.182196,0.097748
other_party_called_diff_home_code,0.119351,0.022146,0.000136,5,0.16495,0.073753
a_serv_type_call_called_normalized_diff,0.044415,0.023515,0.00672,5,0.092833,-0.004003
start_time_diff_start_time_diff_max,0.042872,0.02569,0.010135,5,0.095769,-0.010025
other_party_account_person_num,0.03759,0.015746,0.002967,5,0.070011,0.005168
start_time_diff_start_time_diff_coefficient_of_variation,0.033094,0.008868,0.000564,5,0.051354,0.014835
called_code_called_code_diff,0.023347,0.010585,0.003931,5,0.045142,0.001552
distance_distance_quantile_75,0.016627,0.010745,0.012907,5,0.038751,-0.005497
hour_hour_std,0.012752,0.010999,0.030267,5,0.035399,-0.009896
hour_hour_min,0.010002,0.004518,0.003881,5,0.019305,0.000698


In [97]:
# Leaderboard of all fitted models, scored on the test split, or on all
# labeled rows in ALL mode. Printed and also shown as the cell result.
leaderboard = model.leaderboard(labeled_set if ALL else test_set, silent=True)
print(leaderboard)
leaderboard

                  model  score_test  score_val eval_metric  pred_time_test  \
0            LightGBMXT    0.753623   0.977543          f1        0.038334   
1               XGBoost    0.751773   0.976775          f1        0.042796   
2   WeightedEnsemble_L2    0.750000   0.981457          f1        0.082970   
3              LightGBM    0.742857   0.979457          f1        0.032638   
4         LightGBMLarge    0.742049   0.978189          f1        0.026921   
5              CatBoost    0.737589   0.976806          f1        0.019566   
6      RandomForestEntr    0.732824   0.972721          f1        0.126085   
7      RandomForestGini    0.730038   0.973404          f1        0.148195   
8        ExtraTreesGini    0.706349   0.970745          f1        0.180344   
9        ExtraTreesEntr    0.706349   0.969333          f1        0.232959   
10       NeuralNetTorch    0.693069   0.965744          f1        0.036304   
11      NeuralNetFastAI    0.671378   0.960682          f1      

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,LightGBMXT,0.753623,0.977543,f1,0.038334,0.051133,8.980382,0.038334,0.051133,8.980382,1,True,1
1,XGBoost,0.751773,0.976775,f1,0.042796,0.030929,7.369399,0.042796,0.030929,7.369399,1,True,9
2,WeightedEnsemble_L2,0.75,0.981457,f1,0.08297,0.100329,64.902138,0.003845,0.006239,3.329878,2,True,12
3,LightGBM,0.742857,0.979457,f1,0.032638,0.050281,8.290201,0.032638,0.050281,8.290201,1,True,2
4,LightGBMLarge,0.742049,0.978189,f1,0.026921,0.029757,4.343315,0.026921,0.029757,4.343315,1,True,11
5,CatBoost,0.737589,0.976806,f1,0.019566,0.014051,48.938744,0.019566,0.014051,48.938744,1,True,5
6,RandomForestEntr,0.732824,0.972721,f1,0.126085,0.103707,3.145403,0.126085,0.103707,3.145403,1,True,4
7,RandomForestGini,0.730038,0.973404,f1,0.148195,0.111663,2.53072,0.148195,0.111663,2.53072,1,True,3
8,ExtraTreesGini,0.706349,0.970745,f1,0.180344,0.099512,1.095929,0.180344,0.099512,1.095929,1,True,6
9,ExtraTreesEntr,0.706349,0.969333,f1,0.232959,0.08743,1.053688,0.232959,0.08743,1.053688,1,True,7


In [98]:
# Compute the metrics on the test set (only when one was held out).
from sklearn.metrics import classification_report, confusion_matrix

if not ALL:
    y_true = test_set['is_sa']
    y_pred = model.predict(test_set)
    for metric in (classification_report, confusion_matrix):
        print(metric(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.92      0.97      0.94       613
         1.0       0.84      0.68      0.75       155

    accuracy                           0.91       768
   macro avg       0.88      0.82      0.85       768
weighted avg       0.91      0.91      0.91       768

[[593  20]
 [ 50 105]]


In [99]:
# Display the validation set that is about to be scored (is_sa holds the -1 placeholder).
valid_set

Unnamed: 0,call_duration_call_duration_mean,call_duration_call_duration_quantile_50,call_duration_call_duration_quantile_75,cfee_cfee_std,cfee_cfee_mean,hour_hour_mean,hour_hour_std,hour_hour_min,dayofweek_dayofweek_std,dayofweek_dayofweek_mode,...,called_code_called_code_diff,open_datetime_open_count,other_party_account_person_num,other_party_called_diff_home_code,a_serv_type_call_called_normalized_diff,start_time_diff_start_time_diff_max,start_time_diff_start_time_diff_coefficient_of_variation,distance_distance_std,distance_distance_quantile_75,is_sa
0,60.308682,37.0,62.50,0.000000,0.00,14.408360,4.069215,0.0,2.003756,7.0,...,113.0,1.0,69.0,249.0,-0.421222,51194.0,2.044400,2.081493e+05,180274.0,-1.0
1,54.165854,37.0,65.00,0.000000,0.00,12.887805,4.674933,0.0,1.928827,3.0,...,5.0,1.0,72.0,174.0,-0.160976,51724.0,1.751810,2.102743e+04,0.0,-1.0
2,89.500000,37.5,49.00,0.000000,0.00,13.500000,4.409215,7.0,1.495090,3.0,...,1.0,1.0,8.0,14.0,-0.333333,217652.0,1.105694,0.000000e+00,0.0,-1.0
3,46.458515,27.0,53.00,0.000000,0.00,14.021834,4.541664,1.0,2.059754,1.0,...,1.0,1.0,27.0,155.0,0.362445,65807.0,2.030289,0.000000e+00,231682.0,-1.0
4,38.430556,19.0,42.50,6.804617,3.75,14.402778,1.741456,10.0,1.149862,4.0,...,32.0,1.0,44.0,51.0,0.500000,242220.0,2.338846,1.696337e+05,280421.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1273,156.426230,45.0,96.50,0.000000,0.00,13.863388,3.962451,8.0,1.573592,5.0,...,21.0,1.0,55.0,144.0,0.136612,148767.0,2.562838,7.154820e+04,0.0,-1.0
1274,52.897872,34.0,54.00,0.000000,0.00,15.131915,4.746078,0.0,1.877277,5.0,...,31.0,1.0,36.0,130.0,0.055319,144098.0,2.482085,1.749472e+05,0.0,-1.0
1275,346.552448,96.0,389.50,0.000000,0.00,14.335664,3.928846,7.0,1.754228,3.0,...,40.0,1.0,23.0,114.0,0.230769,72131.0,1.823380,3.078249e+05,0.0,-1.0
1276,22.500000,22.5,33.25,0.000000,0.00,12.500000,2.121320,11.0,1.414214,1.0,...,2.0,1.0,2.0,2.0,0.000000,183154.0,1.414214,1.117818e+06,1185625.5,-1.0


In [100]:
# Predict on the validation features (placeholder label column removed).
validation_inputs = valid_set.drop(columns='is_sa')
y_validation_pred = model.predict(validation_inputs)

# Pair every prediction with its msisdn.
validation_results = validation_features[['msisdn']].assign(
    is_sa=y_validation_pred.astype(int))

print(validation_results.describe())

# Save the result to a CSV file; the file name records which mode produced it.
if ALL:
    file_name = './valid_large_data.csv'
else:
    file_name = './valid_small_data.csv'
validation_results.to_csv(file_name, index=False)
print(file_name)

             is_sa
count  1278.000000
mean      0.165884
std       0.372122
min       0.000000
25%       0.000000
50%       0.000000
75%       0.000000
max       1.000000
./valid_small_data.csv
