In [69]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import confusion_matrix,classification_report
from xgboost import XGBClassifier
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer



# 读取CSV文件
train_data = pd.read_csv('../data/raw/trainSet_res_with_distances.csv', dtype={'msisdn': 'str'})
train_labels = pd.read_csv('../data/raw/trainSet_ans.csv', dtype={'msisdn': 'str'})

# 读取验证集
validation_data = pd.read_csv('../data/raw/validationSet_res_with_distances.csv', dtype={'msisdn': 'str'})


# 转换时间格式
train_data['start_time'] = pd.to_datetime(train_data['start_time'], format='%Y%m%d%H%M%S')
train_data['end_time'] = pd.to_datetime(train_data['end_time'], format='%Y%m%d%H%M%S')
train_data['open_datetime'] = pd.to_datetime(train_data['open_datetime'], format='%Y%m%d%H%M%S')
train_data['update_time'] = pd.to_datetime(train_data['update_time'])
train_data['date'] = pd.to_datetime(train_data['date'])

validation_data['start_time'] = pd.to_datetime(validation_data['start_time'], format='%Y%m%d%H%M%S')
validation_data['end_time'] = pd.to_datetime(validation_data['end_time'], format='%Y%m%d%H%M%S')
validation_data['open_datetime'] = pd.to_datetime(validation_data['open_datetime'], format='%Y%m%d%H%M%S',errors='coerce')
validation_data['update_time'] = pd.to_datetime(validation_data['update_time'])
validation_data['date'] = pd.to_datetime(validation_data['date'])

ALL = False

In [70]:
# 数据增强根据 msisdn 分组，例如，对 msisdn 为 A 的经过随机遮盖一部分数据，然后将其加入到训练集中
class Augmentation:
    def __init__(self, df: pd.DataFrame, label: int, id_column: str):
        self.df = df.drop(columns=[id_column])
        self.label = label
        self.call_count = 0
        self.id = df[id_column].iloc[0]
        self.id_column = id_column
        # self.numeric_columns = [
        #     'call_duration',
        #     'cfee',
        #     'lfee',
        #     'hour',
        # ]
    
    def count_calls(func):
        def wrapper(self, *args, **kwargs):
            self.call_count += 1
            return func(self, *args, **kwargs)
        return wrapper
    
    def times(self, ratio, times=1, method='mask'):
        """
        随机删除比例为 ratio 的行
        """
        if int(ratio * self.df.shape[0]) < 1:
            # print(f"Masking ratio is too small, ratio: {ratio}, not enough data to mask")
            return None, None
        res_dfs = []
        for i in range(times):
            method_func = getattr(self, method)
            assert method_func is not None, f"Method {method} not found"
            res_df, label = method_func(ratio)
            res_dfs.append(res_df)
        res_dfs = pd.concat(res_dfs)
        unique_ids = res_dfs[self.id_column].unique()
        res_labels = pd.DataFrame([self.label] * len(unique_ids), columns=['is_sa'])
        res_labels = pd.concat([pd.DataFrame(unique_ids, columns=[self.id_column]), res_labels], axis=1)
        return res_dfs, res_labels

    @count_calls
    def mask(self, ratio):
        # num_rows_to_reserve = self.df.shape[0] - int(ratio * self.df.shape[0])
        # reserve = np.random.choice(self.df.index, num_rows_to_reserve, replace=False)
        # reserve = np.sort(reserve)
        # new_id = self.id + f"_{self.call_count}"
        # new_df = self.df[reserve]
        # ids = pd.DataFrame([new_id] * new_df.shape[0], columns=[self.id_column])
        # res_df = pd.concat([ids, new_df], axis=1, ignore_index=True)
        # return res_df, self.label
        num_rows_to_mask = int(ratio * self.df.shape[0])
        mask_indices = np.random.choice(self.df.index, size=num_rows_to_mask, replace=False)
        mask = self.df.index.isin(mask_indices)
        new_df = self.df[~mask].reset_index(drop=True)
        new_id = self.id + f"_{self.call_count}"
        ids = pd.DataFrame([new_id] * new_df.shape[0], columns=[self.id_column])
        res_df = pd.concat([ids, new_df], axis=1, ignore_index=False)
        return res_df, self.label

    @count_calls
    def interpolation(self, label, ratio):
        """
        随机插入比例为 ratio 的行
        """
        pass
    
    @count_calls
    def noise(self, label, ratio):
        """
        为数值类型变量 原值乘以 [1-ratio, 1+ratio] 的随机因子
        """
        # for col in self.df.columns:
        #     if col in self.numeric_columns:
        #         self.df[col] *= np.random.uniform(1-ratio, 1+ratio)
        # return self.df, self.label, self.id
        pass
    
    @count_calls
    def time_smoothing(self, label, ratio):
        """
        将占比为
        """
        pass

In [71]:
# 遍历 groupby('msisdn') 的结果，对每个 msisdn 进行数据增强
from tqdm import tqdm
addition_train_data = []
addition_train_labels = []

times = 5
pbar = tqdm(train_data.groupby('msisdn'))
for msisdn, group in pbar:
    if msisdn == 0:
        continue
    # print(f"Augmenting msisdn {msisdn}")
    pbar.set_description(f"Augmenting msisdn {msisdn}")
    aug = Augmentation(group, train_labels[train_labels['msisdn'] == msisdn].iloc[0]['is_sa'], 'msisdn')
    res_df, res_labels = aug.times(0.2, times=times, method='mask')
    addition_train_data.append(res_df)
    addition_train_labels.append(res_labels)
addition_train_data = pd.concat(addition_train_data)
addition_train_labels = pd.concat(addition_train_labels)
addition_train_data

Augmenting msisdn 1304080: 100%|██████████| 3836/3836 [01:16<00:00, 50.19it/s]


Unnamed: 0,msisdn,start_time,end_time,call_event,other_party,ismultimedia,home_area_code,visit_area_code,called_home_code,called_code,...,phone1_type,phone2_type,phone1_loc_city,phone1_loc_province,phone2_loc_city,phone2_loc_province,update_time,date,date_c,distance
0,1000176_1,2023-12-30 12:26:27,2023-12-30 12:28:06,call_src,1196442,0,714,714,714,27,...,9,2,黄石,湖北,武汉,湖北,2023-12-30 12:55:36,2023-12-30,20240113,103426
1,1000176_1,2023-12-31 17:47:09,2023-12-31 17:47:34,call_src,1182976,0,714,714,714,27,...,9,2,黄石,湖北,武汉,湖北,2023-12-31 18:11:15,2023-12-31,20240113,103426
2,1000176_1,2023-12-31 17:51:30,2023-12-31 17:52:52,call_dst,1051904,0,714,714,714,27,...,9,2,黄石,湖北,武汉,湖北,2023-12-31 18:33:09,2023-12-31,20240113,103426
3,1000176_1,2023-12-31 19:41:25,2023-12-31 19:43:01,call_src,1051904,0,714,714,714,27,...,9,2,黄石,湖北,武汉,湖北,2023-12-31 20:21:38,2023-12-31,20240113,103426
4,1000176_1,2024-01-01 10:25:11,2024-01-01 10:44:00,call_dst,1080562,0,714,714,714,27,...,9,2,黄石,湖北,武汉,湖北,2024-01-01 11:13:10,2024-01-01,20240113,103426
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6,1304080_5,2024-01-20 18:35:33,2024-01-20 18:37:00,call_dst,1101045,0,519,519,519,519,...,2,2,常州,江苏,常州,江苏,2024-01-20 19:08:40,2024-01-20,20240122,0
7,1304080_5,2024-01-21 04:19:04,2024-01-21 04:19:13,call_dst,1217324,0,519,519,519,519,...,2,2,常州,江苏,常州,江苏,2024-01-21 05:01:58,2024-01-21,20240122,0
8,1304080_5,2024-01-21 19:45:20,2024-01-21 19:45:38,call_dst,1254737,0,519,519,519,519,...,2,2,常州,江苏,常州,江苏,2024-01-22 01:12:09,2024-01-21,20240122,0
9,1304080_5,2024-01-22 10:58:25,2024-01-22 10:58:37,call_dst,1207673,0,519,519,519,519,...,2,2,常州,江苏,常州,江苏,2024-01-22 11:36:01,2024-01-22,20240122,0


In [72]:
addition_train_labels

Unnamed: 0,msisdn,is_sa
0,1000176_1,0
1,1000176_2,0
2,1000176_3,0
3,1000176_4,0
4,1000176_5,0
...,...,...
0,1304080_1,0
1,1304080_2,0
2,1304080_3,0
3,1304080_4,0


In [73]:
len(addition_train_labels), len(addition_train_data)
len(train_labels) * times, len(addition_train_labels)

(19180, 18825)

In [74]:
# 将新数据加入到train_data中
train_data = pd.concat([train_data, addition_train_data], ignore_index=True).reset_index(drop=True)
train_labels = pd.concat([train_labels, addition_train_labels], ignore_index=True).reset_index(drop=True)

In [75]:
# 为每条记录添加start_time_diff，记录 start_time 与上一条记录的 start_time 之差 (单位：秒)
start_time_diff = train_data.groupby('msisdn')['start_time'].diff().dt.total_seconds().fillna(0).reset_index(drop=True)
# 将该列加入到数据集中
train_data['start_time_diff'] = start_time_diff.copy()
# time_diff_start2end = train_data.groupby('msisdn')['end_time'].diff().dt.total_seconds().fillna(0)
start_time_diff = validation_data.groupby('msisdn')['start_time'].diff().dt.total_seconds().fillna(0).reset_index(drop=True)
validation_data['start_time_diff'] = start_time_diff.copy()

In [76]:
train_data['distance']

0           600618
1          1321683
2                0
3                0
4                0
            ...   
3515865          0
3515866          0
3515867          0
3515868          0
3515869     389001
Name: distance, Length: 3515870, dtype: int64

In [77]:
train_labels

Unnamed: 0,msisdn,is_sa
0,1109993,1
1,1017493,1
2,1114848,1
3,1243192,1
4,1270247,1
...,...,...
22656,1304080_1,0
22657,1304080_2,0
22658,1304080_3,0
22659,1304080_4,0


数据特征处理

In [78]:
# 聚合特征
def aggregate_features(data):
    return data.groupby('msisdn').agg({
    'call_duration': [
        ('call_duration_sum', 'sum'), 
        ('call_duration_mean', 'mean'), 
        ('call_duration_max', 'max'), 
        ('call_duration_quantile_25', lambda x: x.quantile(0.25)), 
        ('call_duration_quantile_50', lambda x: x.quantile(0.50)), 
        ('call_duration_quantile_75', lambda x: x.quantile(0.75))
    ],
    'cfee': [
        ('cfee_sum', 'sum'), 
        ('cfee_mean', 'mean')
    ],
    'lfee': [
        ('lfee_sum', 'sum'), 
        ('lfee_mean', 'mean')
    ],
    'hour': [
        ('hour_mean', 'mean'), 
        ('hour_std', 'std'), 
        # ('hour_max', 'max'), 
        ('hour_min', 'min')
    ],
    'dayofweek': [
        ('dayofweek_std', 'std'), 
        ('magic_dayofweek', lambda x: x.value_counts().mean()), 
        ('work_day_num', lambda x: x[x.isin([1,2,3,4,5])].count()), 
        ('weekend_num', lambda x: x[x.isin([6,7])].count()),
        ('dayofweek_mode', lambda x: x.mode().values[0])
    ],
    # 'home_area_code': [
    #     ('home_area_code_nunique', 'nunique')
    # ],
    'visit_area_code': [
        ('visit_area_code_nunique', 'nunique'),
        ('times_not_at_home_area', lambda x: x[x != x.shift()].count())
    ],
    'called_home_code': [
        ('called_home_code_nunique', 'nunique'),
        ('called_diff_home_code', lambda x: x[x != x.shift()].count())
    ],
    'called_code': [
        ('called_code_nunique', 'nunique')
    ],
    'open_datetime': [
        ('open_count', 'nunique')
    ],
    'other_party': [
        ('account_person_num', 'nunique'),
        ('called_diff_home_code', lambda x: x[x != x.shift()].count())
    ],
    'a_serv_type': [
        # ('call_num', lambda x: x[x.isin([1, 3])].count()), 
        # ('called_num', lambda x: x[x == 2].count()),
        ('call_called_normalized_diff', lambda x: (x[x.isin([1, 3])].count() - x[x == 2].count()) / (x.count() + 1))  # 标准化差值，加1避免除以0
    ],
    'start_time_diff': [
        ('start_time_diff_mean', 'mean'), 
        ('start_time_diff_std', 'std'), 
        ('start_time_diff_max', 'max'), 
    ], 
    'distance': [
        # ('distance_sum', 'sum'), 
        ('distance_std', 'std'), 
        # ('distance_max', 'max'), 
        # ('distance_quantile_25', lambda x: x.quantile(0.25)), 
        ('distance_quantile_50', lambda x: x.quantile(0.50)), 
        ('distance_quantile_75', lambda x: x.quantile(0.75)),
    ]
})

train_features = aggregate_features(train_data)
validation_features = aggregate_features(validation_data)


train_features.columns = ['_'.join(col).strip() for col in train_features.columns.values]
validation_features.columns = ['_'.join(col).strip() for col in validation_features.columns.values]
train_features.columns = train_features.columns.str.replace('[', '').str.replace(']', '').str.replace('<', '').str.replace('>', '').str.replace('(', '').str.replace(')', '').str.replace(',', '').str.replace(' ', '_')
validation_features.columns = validation_features.columns.str.replace('[', '').str.replace(']', '').str.replace('<', '').str.replace('>', '').str.replace('(', '').str.replace(')', '').str.replace(',', '').str.replace(' ', '_')
# 重置索引
train_features = train_features.reset_index()
validation_features = validation_features.reset_index()

# 合并标签数据
train_features = train_features.merge(train_labels, on='msisdn', how='left')
# 打印结果
# train_features

In [82]:
y = train_features['is_sa']
X = train_features.drop(['msisdn', 'is_sa'], axis=1)
X_validation = validation_features.drop(['msisdn'], axis=1)

n_sample = y.shape[0]
n_pos_sample = y[y ==1].shape[0]
n_neg_sample = y[y == 0].shape[0]
print('样本个数：{}; 正样本占{:.2%}; 负样本占{:.2%}'.format(n_sample,
                                                   n_pos_sample / n_sample,
                                                   n_neg_sample / n_sample))
print('特征维数：', X.shape[1])

样本个数：22661; 正样本占19.47%; 负样本占80.53%
特征维数： 33


In [83]:
# TODO use all_X to impute
imputer = SimpleImputer(strategy='most_frequent')
X = imputer.fit_transform(X)

In [84]:
imputer2 = SimpleImputer(strategy='most_frequent')
X_validation = imputer2.fit_transform(X_validation)

In [85]:
X_train,X_test,y_train,y_test = train_test_split(X,y,stratify = y,test_size= 0.3,random_state=42)

# X_test_imputed = imputer.transform(X)

smote = SMOTE(random_state=42)    # 处理过采样的方法
X_train, y_train = smote.fit_resample(X_train, y_train)
print('通过SMOTE方法平衡正负样本后')
n_sample = y_train.shape[0]
n_pos_sample = y_train[y_train == 1].shape[0]
n_neg_sample = y_train[y_train == 0].shape[0]
print('样本个数：{}; 正样本占{:.2%}; 负样本占{:.2%}'.format(n_sample,
                                                   n_pos_sample / n_sample,
                                                   n_neg_sample / n_sample))
print('特征维数：', X.shape[1])
if ALL:
    X_test, y_test = smote.fit_resample(X_test, y_test)
    train_len = len(y_train) + len(y_test)
    test_len = 0
else:
    train_len, test_len = len(y_train), len(y_test)

通过SMOTE方法平衡正负样本后
样本个数：25548; 正样本占50.00%; 负样本占50.00%
特征维数： 33


In [86]:
print(y_test.value_counts())

is_sa
0    5475
1    1324
Name: count, dtype: int64


In [87]:
X.shape

(22661, 33)

In [88]:
columns = train_features.columns.tolist()
columns.remove('msisdn')

In [89]:
assert X_validation.shape[1] == X_train.shape[1]

In [90]:
# 拼接 X_train 和 y_train np.array 为 dataframe
train_set = np.c_[X_train, y_train]
train_set = pd.DataFrame(train_set, columns=columns)
test_set = np.c_[X_test, y_test]
test_set = pd.DataFrame(test_set, columns=columns)
valid_set = np.c_[X_validation, np.zeros(X_validation.shape[0])]
valid_set = pd.DataFrame(valid_set, columns=columns)
valid_set['is_sa'] = -1

In [91]:
test_set.describe()

Unnamed: 0,call_duration_call_duration_sum,call_duration_call_duration_mean,call_duration_call_duration_max,call_duration_call_duration_quantile_25,call_duration_call_duration_quantile_50,call_duration_call_duration_quantile_75,cfee_cfee_sum,cfee_cfee_mean,lfee_lfee_sum,lfee_lfee_mean,...,other_party_account_person_num,other_party_called_diff_home_code,a_serv_type_call_called_normalized_diff,start_time_diff_start_time_diff_mean,start_time_diff_start_time_diff_std,start_time_diff_start_time_diff_max,distance_distance_std,distance_distance_quantile_50,distance_distance_quantile_75,is_sa
count,6799.0,6799.0,6799.0,6799.0,6799.0,6799.0,6799.0,6799.0,6799.0,6799.0,...,6799.0,6799.0,6799.0,6799.0,6799.0,6799.0,6799.0,6799.0,6799.0,6799.0
mean,12733.153552,95.58411,1033.817473,23.070525,45.712605,98.813465,267.347992,2.231416,5.940874,0.045842,...,53.146639,125.519929,0.102464,21122.760668,36514.01498,162432.8,234169.0,176197.2,284147.5,0.194735
std,14928.641132,75.186737,1010.486196,17.769599,43.808524,90.096264,1204.456251,16.710453,128.837296,1.15563,...,99.644423,168.401624,0.418617,29238.278752,46703.704926,151798.0,273467.6,433389.5,562507.0,0.396025
min,9.0,7.333333,9.0,2.0,6.0,8.0,0.0,0.0,-2235.0,-18.484848,...,1.0,1.0,-0.999417,0.0,62.352371,0.0,0.0,0.0,0.0,0.0
25%,3036.5,52.515823,361.0,16.0,30.0,57.0,0.0,0.0,0.0,0.0,...,13.0,27.0,-0.156165,5405.3925,12306.642235,69673.0,0.0,0.0,0.0,0.0
50%,7601.0,75.195122,737.0,20.75,39.0,78.0,0.0,0.0,0.0,0.0,...,28.0,66.0,0.054054,11725.326923,21774.364928,104332.0,147999.9,0.0,0.0,0.0
75%,16815.5,116.128247,1406.0,26.0,52.0,114.0,0.0,0.0,0.0,0.0,...,59.0,159.0,0.315789,24623.885217,41325.637492,192535.5,371955.0,92132.0,277292.0,0.0
max,170790.0,1600.25,20580.0,748.75,1633.5,1968.5,29260.0,907.310345,4960.0,38.207547,...,2547.0,2558.0,0.998737,562006.0,794796.507335,1203916.0,1925788.0,3879296.0,4223004.0,1.0


In [92]:
all_set = pd.concat([train_set, test_set, valid_set], axis=0).reset_index(drop=True)
labeled_data_len = train_set.shape[0] + test_set.shape[0]

In [93]:
test_set.shape, train_set.shape, valid_set.shape, all_set.shape

((6799, 34), (25548, 34), (1278, 34), (33625, 34))

In [94]:
labeled_set, valid_set = all_set.iloc[:labeled_data_len].copy(), all_set.iloc[labeled_data_len:].copy()
labeled_set.reset_index(drop=True, inplace=True)
valid_set.reset_index(drop=True, inplace=True)
# 有一些值在SMOTE后对数变换后为 NaN，需要删除这些数据
print(labeled_set.isnull().sum().sum())
labeled_set = labeled_set.dropna()
print(labeled_set.isnull().sum().sum())
assert valid_set.shape[0] == validation_features.shape[0]

0
0


In [95]:
# 重新划分训练集和测试集
if not ALL:
    train_set, test_set = labeled_set.iloc[:train_len].copy(), labeled_set.iloc[train_len:].copy()
    train_set.reset_index(drop=True, inplace=True)
    test_set.reset_index(drop=True, inplace=True)

In [97]:
# 使用 autogluon 训练
from autogluon.tabular import TabularPredictor
# 输入数据X_train, y_train
if not ALL:
    model = TabularPredictor(label='is_sa', eval_metric='f1', problem_type='binary').fit(train_set, presets='medium_quality')
# , excluded_model_types=['KNN']
else:
    model = TabularPredictor(label='is_sa', eval_metric='f1', problem_type='binary').fit(labeled_set, presets='best_quality', time_limit=3600)

  from .autonotebook import tqdm as notebook_tqdm
No path specified. Models will be saved in: "AutogluonModels\ag-20240717_043624"
Presets specified: ['medium_quality']
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20240717_043624"
AutoGluon Version:  1.0.0
Python Version:     3.10.14
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19041
CPU Count:          12
Memory Avail:       15.60 GB / 31.90 GB (48.9%)
Disk Space Avail:   506.02 GB / 3726.01 GB (13.6%)
Train Data Rows:    25548
Train Data Columns: 33
Label Column:       is_sa
Problem Type:       binary
Preprocessing data ...
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    15982.65 MB
	Train Data (Original)  Memory Usage: 6.43 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set fe

[1000]	valid_set's binary_logloss: 0.0220457	valid_set's f1: 0.996003


	0.9976	 = Validation score   (f1)
	12.48s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's binary_logloss: 0.00980448	valid_set's f1: 0.998001


	0.9984	 = Validation score   (f1)
	8.68s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: RandomForestGini ...
	0.9916	 = Validation score   (f1)
	3.17s	 = Training   runtime
	0.12s	 = Validation runtime
Fitting model: RandomForestEntr ...
	0.9936	 = Validation score   (f1)
	3.74s	 = Training   runtime
	0.13s	 = Validation runtime
Fitting model: CatBoost ...
	0.9968	 = Validation score   (f1)
	146.63s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: ExtraTreesGini ...
	0.9964	 = Validation score   (f1)
	1.3s	 = Training   runtime
	0.18s	 = Validation runtime
Fitting model: ExtraTreesEntr ...
	0.9968	 = Validation score   (f1)
	1.3s	 = Training   runtime
	0.12s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	0.9932	 = Validation score   (f1)
	37.81s	 = Training   runtime
	0.07s	 = Validation runtime
Fitting model: XGBoost ...
	0.9976	 = Validation score   (f1)
	4.91s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: NeuralNetT

In [98]:
if not ALL:
    print(model.evaluate(test_set))

{'f1': 0.9821088694328131, 'accuracy': 0.9930872187086336, 'balanced_accuracy': 0.9859729062340493, 'mcc': 0.977873185543758, 'roc_auc': 0.9992837533970672, 'precision': 0.990023023791251, 'recall': 0.974320241691843}


In [99]:
model.feature_importance(test_set if not ALL else labeled_set)

Computing feature importance via permutation shuffling for 33 features using 5000 rows with 5 shuffle sets...
	107.36s	= Expected runtime (21.47s per shuffle set)
	59.89s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
called_home_code_called_home_code_nunique,0.131903,0.009332,2.986494e-06,5,0.151118,0.112689
other_party_called_diff_home_code,0.041675,0.006734,7.903902e-05,5,0.055541,0.027809
start_time_diff_start_time_diff_mean,0.02945,0.003816,3.307588e-05,5,0.037307,0.021593
start_time_diff_start_time_diff_max,0.026834,0.007535,0.0006735435,5,0.042348,0.01132
other_party_account_person_num,0.023118,0.004769,0.0002054756,5,0.032937,0.013299
hour_hour_min,0.023104,0.001038,4.88186e-07,5,0.025242,0.020966
a_serv_type_call_called_normalized_diff,0.022041,0.005013,0.0002999902,5,0.032362,0.01172
distance_distance_std,0.021257,0.001022,6.403227e-07,5,0.023362,0.019152
dayofweek_dayofweek_std,0.019903,0.004361,0.0002597569,5,0.028882,0.010924
start_time_diff_start_time_diff_std,0.018488,0.003549,0.000155192,5,0.025795,0.011181


In [100]:
# leaderboard
if not ALL:
    leaderboard = model.leaderboard(test_set, silent=True)
    print(leaderboard)
else:
    leaderboard = model.leaderboard(labeled_set, silent=True)
    print(leaderboard)
leaderboard

                  model  score_test  score_val eval_metric  pred_time_test  \
0            LightGBMXT    0.984417   0.997602          f1        0.100626   
1        ExtraTreesEntr    0.983681   0.996795          f1        0.203394   
2        ExtraTreesGini    0.982883   0.996393          f1        0.196394   
3   WeightedEnsemble_L2    0.982109   0.999201          f1        0.720328   
4         LightGBMLarge    0.980168   0.998001          f1        0.087950   
5              CatBoost    0.979016   0.996803          f1        0.051039   
6              LightGBM    0.978642   0.998400          f1        0.078830   
7        NeuralNetTorch    0.976848   0.993197          f1        0.100590   
8      RandomForestEntr    0.975927   0.993579          f1        0.217649   
9      RandomForestGini    0.972849   0.991556          f1        0.171769   
10              XGBoost    0.971691   0.997600          f1        0.070462   
11      NeuralNetFastAI    0.965901   0.993235          f1      

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,LightGBMXT,0.984417,0.997602,f1,0.100626,0.043281,12.479377,0.100626,0.043281,12.479377,1,True,3
1,ExtraTreesEntr,0.983681,0.996795,f1,0.203394,0.117555,1.295668,0.203394,0.117555,1.295668,1,True,9
2,ExtraTreesGini,0.982883,0.996393,f1,0.196394,0.180747,1.303833,0.196394,0.180747,1.303833,1,True,8
3,WeightedEnsemble_L2,0.982109,0.999201,f1,0.720328,0.350709,70.246597,0.006716,0.01,6.340659,2,True,14
4,LightGBMLarge,0.980168,0.998001,f1,0.08795,0.033604,5.887428,0.08795,0.033604,5.887428,1,True,13
5,CatBoost,0.979016,0.996803,f1,0.051039,0.01519,146.629222,0.051039,0.01519,146.629222,1,True,7
6,LightGBM,0.978642,0.9984,f1,0.07883,0.034414,8.676849,0.07883,0.034414,8.676849,1,True,4
7,NeuralNetTorch,0.976848,0.993197,f1,0.10059,0.058531,77.280956,0.10059,0.058531,77.280956,1,True,12
8,RandomForestEntr,0.975927,0.993579,f1,0.217649,0.130321,3.739483,0.217649,0.130321,3.739483,1,True,6
9,RandomForestGini,0.972849,0.991556,f1,0.171769,0.115835,3.169242,0.171769,0.115835,3.169242,1,True,5


In [101]:
# 在testset 上计算指标
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

if not ALL:
    y_pred = model.predict(test_set)
    y_true = test_set['is_sa']
    print(classification_report(y_true, y_pred))
    print(confusion_matrix(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00      5475
         1.0       0.99      0.97      0.98      1324

    accuracy                           0.99      6799
   macro avg       0.99      0.99      0.99      6799
weighted avg       0.99      0.99      0.99      6799

[[5462   13]
 [  34 1290]]


In [103]:
# 预测
y_validation_pred = model.predict(valid_set.drop('is_sa', axis=1))

# 将预测结果与 msisdn 对应起来
validation_results = validation_features[['msisdn']].copy()
validation_results['is_sa'] = y_validation_pred.astype(int)

print(validation_results.describe())

# 保存结果到CSV文件
file_name = './valid_large_data.csv' if ALL else './valid_small_data.csv'
validation_results.to_csv(file_name, index=False)
print(file_name)

             is_sa
count  1278.000000
mean      0.179186
std       0.383658
min       0.000000
25%       0.000000
50%       0.000000
75%       0.000000
max       1.000000
./valid_small_data.csv
