In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import confusion_matrix,classification_report
# from xgboost import XGBClassifier
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
import time

# ---------------------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------------------
ALL = False
NO_SMOTE = True
SUB_PROCESSED_DIR = 'processed_undersample'  # sub-folder of ../self_data for generated files
TEST_RATIO = 0.1                             # share of msisdn values held out as the test split

# Column name -> dtype mapping passed to pd.read_csv via its `dtype` argument.
dtypes = dict(
    # record identity and timing
    msisdn='str',
    start_time='str',
    end_time='str',
    call_event='category',
    other_party='str',
    ismultimedia='category',
    # area codes
    home_area_code='str',
    visit_area_code='str',
    called_home_code='str',
    called_code='str',
    # service / roaming descriptors
    a_serv_type='int',
    long_type1='int',
    roam_type='int',
    a_product_id='str',
    open_datetime='str',
    # duration and fees
    call_duration='int32',
    cfee='float64',
    lfee='float64',
    # time-derived columns
    hour='int8',
    dayofweek='int',
    # phone type and location columns
    phone1_type='int',
    phone2_type='int',
    phone1_loc_city='str',
    phone1_loc_province='str',
    phone2_loc_city='str',
    phone2_loc_province='str',
    # remaining date-like columns, kept as raw strings
    update_time='str',
    date='str',
    date_c='str',
)

# `os` is imported here; the cell that creates the processed-data folder uses it.
import os

# Read the three CSV inputs with the shared dtype map:
#   labeled_data    - records from all_trainSet_res.csv
#   labels          - msisdn / is_sa pairs from all_trainSet_ans.csv
#   validation_data - records from the validation set
labeled_data, labels, validation_data = (
    pd.read_csv(csv_path, dtype=dtypes)
    for csv_path in (
        '../self_data/all_trainSet_res.csv',
        '../self_data/all_trainSet_ans.csv',
        '../self_data/sorted_validationSet_res_with_head.csv',
    )
)

# Split at msisdn level (not row level), stratified on is_sa, so that every
# record of a given msisdn lands in exactly one of the two partitions.
train_data_msisdn, test_data_msisdn = train_test_split(
    labels['msisdn'],
    test_size=TEST_RATIO,
    random_state=42,
    stratify=labels['is_sa'],
)

# Select the records and the label rows belonging to each partition.
train_data = labeled_data[labeled_data['msisdn'].isin(train_data_msisdn)]
test_data = labeled_data[labeled_data['msisdn'].isin(test_data_msisdn)]
train_labels = labels[labels['msisdn'].isin(train_data_msisdn)]
test_labels = labels[labels['msisdn'].isin(test_data_msisdn)]

# Sanity check: each partition's records cover exactly as many distinct msisdn
# values as the split assigned to it.
assert len(train_data_msisdn) == len(train_data['msisdn'].unique())
assert len(test_data_msisdn) == len(test_data['msisdn'].unique())

# Class balance per partition. The ratio is positives / negatives; the
# denominator falls back to 1 when there are no negatives, which avoids a
# division by zero.
train_count = train_labels['is_sa'].value_counts()
test_count = test_labels['is_sa'].value_counts()
train_ratio = train_count.get(1, 0) / train_count.get(0, 1)
test_ratio = test_count.get(1, 0) / test_count.get(0, 1)

# Report the counts and ratios.
print(f"Train data - is_sa==1 count: {train_count.get(1, 0)}, is_sa==0 count: {train_count.get(0, 0)}, Ratio: {train_ratio:.2f}")
print(f"Test data - is_sa==1 count: {test_count.get(1, 0)}, is_sa==0 count: {test_count.get(0, 0)}, Ratio: {test_ratio:.2f}")


Train data - is_sa==1 count: 3458, is_sa==0 count: 31272, Ratio: 0.11
Test data - is_sa==1 count: 384, is_sa==0 count: 3475, Ratio: 0.11


In [24]:
# Write the class-balanced ("under-sampled") datasets to disk.
#
# For each of the train and test partitions this cell writes:
#   * one file with every record whose msisdn is labelled is_sa == 1, plus the
#     matching label file;
#   * `num_samples` files of is_sa == 0 records, each drawn with replacement and
#     containing as many rows as the positive file, plus the matching label files.
out_dir = f'../self_data/{SUB_PROCESSED_DIR}'
# exist_ok=True keeps the cell re-runnable; os.mkdir raised FileExistsError on a
# second run.
os.makedirs(out_dir, exist_ok=True)
print("Creating processed data folder...")

# Attach each msisdn's is_sa label to all of its records.
merged_train_data = pd.merge(train_data, train_labels, on='msisdn', how='left')
merged_test_data = pd.merge(test_data, test_labels, on='msisdn', how='left')

# Positive class (is_sa == 1): keep every record, drop the label column.
train_data_issa = merged_train_data[merged_train_data['is_sa'] == 1]
test_data_issa = merged_test_data[merged_test_data['is_sa'] == 1]

train_data_issa = train_data_issa.drop(columns=['is_sa'])
test_data_issa = test_data_issa.drop(columns=['is_sa'])
train_data_issa.to_csv(f'{out_dir}/train_data_issa.csv', index=False)
test_data_issa.to_csv(f'{out_dir}/test_data_issa.csv', index=False)

# Label files for the positive class: one row per distinct msisdn, is_sa = 1.
train_labels_issa = train_data_issa['msisdn'].unique()
train_labels_issa_df = pd.DataFrame({'msisdn': train_labels_issa, 'is_sa': 1})
train_labels_issa_df.to_csv(f'{out_dir}/train_label_issa.csv', index=False)

test_labels_issa = test_data_issa['msisdn'].unique()
test_labels_issa_df = pd.DataFrame({'msisdn': test_labels_issa, 'is_sa': 1})
test_labels_issa_df.to_csv(f'{out_dir}/test_label_issa.csv', index=False)

# Number of negative samples to create
num_samples = 9

# The negative pools do not change between iterations, so filter them once.
train_negatives = merged_train_data[merged_train_data['is_sa'] == 0]
test_negatives = merged_test_data[merged_test_data['is_sa'] == 0]

for i in range(1, num_samples + 1):
    # Draw as many negative rows (with replacement) as there are positive rows.
    # A distinct seed per iteration makes every sample different yet reproducible.
    train_data_notsa = train_negatives.sample(
        n=len(train_data_issa), replace=True, random_state=i*100
    )
    test_data_notsa = test_negatives.sample(
        n=len(test_data_issa), replace=True, random_state=i*100
    )

    train_data_notsa = train_data_notsa.drop(columns=['is_sa'])
    test_data_notsa = test_data_notsa.drop(columns=['is_sa'])
    # Save the sampled data to CSV files
    train_data_notsa.to_csv(f'{out_dir}/train_data_notsa_0{i}.csv', index=False)
    test_data_notsa.to_csv(f'{out_dir}/test_data_notsa_0{i}.csv', index=False)

    # Label files for the sampled negatives: one row per distinct msisdn, is_sa = 0.
    # Fix: the frame is built from the sampled *negative* msisdn values. It was
    # previously built from train_labels_issa, which labelled the positive
    # msisdn values as 0.
    train_labels_notsa = train_data_notsa['msisdn'].unique()
    train_labels_notsa_df = pd.DataFrame({'msisdn': train_labels_notsa, 'is_sa': 0})
    # NOTE(review): "issa" in this file name looks like a typo for "notsa"
    # (compare test_label_notsa_0{i}.csv below). The name is kept unchanged so that
    # any reader of this file outside this cell keeps working - confirm and rename.
    train_labels_notsa_df.to_csv(f'{out_dir}/train_label_issa_0{i}.csv', index=False)

    test_labels_notsa = test_data_notsa['msisdn'].unique()
    test_labels_notsa_df = pd.DataFrame({'msisdn': test_labels_notsa, 'is_sa': 0})
    test_labels_notsa_df.to_csv(f'{out_dir}/test_label_notsa_0{i}.csv', index=False)

Creating processed data folder...


In [23]:
def ensemble_predict(models, test_data):
    """Combine the predictions of several models by averaging and rounding.

    Each model predicts on ``test_data``; the per-sample mean of those
    predictions is rounded to the nearest integer. For 0/1 predictions this is
    a majority vote. ``np.round`` rounds halves to the nearest even value, so
    an exact 0.5 tie (possible with an even number of models) resolves to 0.

    Parameters
    ----------
    models : iterable
        Objects exposing ``predict(test_data)``. The result may be a pandas
        Series, a NumPy array or a list; all predictions must have the same
        length.
    test_data : object
        Passed unchanged to every ``model.predict`` call.

    Returns
    -------
    numpy.ndarray
        Integer array holding one combined prediction per sample.

    Raises
    ------
    ValueError
        If ``models`` is empty.
    """
    models = list(models)
    if not models:
        # The mean over zero models is NaN; casting that to int yields a
        # meaningless value, so fail loudly instead.
        raise ValueError("ensemble_predict requires at least one model")

    # Let every model predict on the test set. np.asarray accepts a Series as
    # well as a plain ndarray, which has no .to_numpy() method.
    predictions = np.array([np.asarray(model.predict(test_data)) for model in models])
    # Vote: average across models (axis 0), then round to the nearest label.
    final_prediction = np.round(predictions.mean(axis=0)).astype(int)
    return final_prediction