### 黄牛检测

In [6]:
import pandas as pd
from tqdm import tqdm
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from collections import defaultdict

In [2]:
# 读取数据
file_path = './data/raw_data.xlsx'
sheet_name = 'Sheet1'
data = pd.read_excel(file_path, sheet_name=sheet_name)

In [3]:
# 时间预处理
data['订单创建时间'] = pd.to_datetime(data['订单创建时间'])
data['就诊日期'] = pd.to_datetime(data['就诊日期'])

In [49]:
print(data.columns)

Index(['ID', '省份', 'APPID', 'IP_ADDRESS', '订单创建时间', '患者ID', '患者创建时间', '就诊日期',
       '就诊科室名称', '医生姓名', '状态', '商户订单号'],
      dtype='object')


#### 1. 基于规则的检测

1. IP重复，来自同一个IP，且为超过3个人挂号
2. 用户重复，来自同一个用户，且挂号了超过3个科室/超过两个app_id
3. 时间过早，每天5:00-5:01进行操作的

In [9]:
# 重复值筛选，找出data的seg_name字段中重复数大于limit的行, sort_add和asc_add是检测完之后添加的排序要求
def duplicate_detect(data, seg_name, limit, up=True, sort_add=[], asc_add=[]):
    assert len(sort_add) == len(asc_add)
    value_counts = data[seg_name].value_counts()
    if up:
        # 向上筛
        dup_row = data[data[seg_name].isin(value_counts[value_counts > limit].index)]
    else:
        # 向下筛
        dup_row = data[data[seg_name].isin(value_counts[value_counts < limit].index)]
    sort_by = [seg_name] + sort_add
    asc = [True] + asc_add
    dup_row = dup_row.sort_values(by=sort_by, ascending=asc)
    return dup_row

In [10]:
# 筛选出同一个dup_seg中有超过limit个unique_seg的索引(从0开始)
def unique_dup_filter(data, dup_seg, unique_seg, limit, up=True, get_row=False):
    # 确定是黄牛的行编号
    selects = []
    # IP重复检测
    dup = duplicate_detect(data, dup_seg, limit, up=up, sort_add=[unique_seg], asc_add=[True])
    print("init dup_num:", len(dup))
    if len(dup) == 0:
        return selects
    pos = 0          # 这个IP的起点
    count = 1      # 涉及多少个用户ID
    last = dup[unique_seg].iloc[0] # 上一个用户的ID
    dup_num = len(dup)
    for i in tqdm(range(1, dup_num)):
        if dup[dup_seg].iloc[i] == dup[dup_seg].iloc[pos]:
            if dup[unique_seg].iloc[i] != last:
                # 相同IP下一个新的患者
                count += 1
                last = dup[unique_seg].iloc[i]
        if dup[dup_seg].iloc[i] != dup[dup_seg].iloc[pos] or i == dup_num - 1:
            # 开始检测下一个IP
            if count > limit:
                # 达到重复人数条件
                for j in range(pos, i):
                    selects.append(dup['ID'].iloc[j])
            # 重置
            pos = i
            count = 1
            last = dup[unique_seg].iloc[i]
    print("filtered dup_num:", len(selects))
    if not get_row:
        selects.sort()
        return selects
    else:
        dup_rows = data[data['ID'].isin(selects)]
        dup_rows = dup_rows.sort_values(by=dup_seg, ascending=True)
        return dup_rows

In [11]:
# 对unique_dup_filter, 在一定范围内遍历limit
# 使用: limits, select_list = grid_traverse(data, '患者ID', '就诊科室名称', 5, 10)
def grid_traverse(data, dup_seg, unique_seg, start, end, gap=1):
    limits = []
    selects_list = []
    for limit in range(start, end, gap):
        print(f"limit: {limit}")
        selects = unique_dup_filter(data, dup_seg, unique_seg, limit)
        limits.append(limit)
        selects_list.append(selects)
    return limits, selects_list

In [12]:
# 将一个list写作答案
def write_list(lis):
    # 打开一个文件进行写入，如果文件不存在则创建
    with open('./data/result.txt', 'w', encoding='utf-8') as file:
        # 遍历列表中的每个元素
        for item in lis:
            # 将每个元素写入文件，每个元素后面加上换行符
            file.write(str(item) + '\n')
        print(f"Total line: {len(lis)}")

In [13]:
# 时间段过滤器，过滤出每天一段时间内的数据
# 形如daily_filter(data, '5:00:00', '5:01:00')
def daily_filter(data, start, end):
    return data[(data['订单创建时间'].dt.time >= pd.to_datetime(start).time()) &
                    (data['订单创建时间'].dt.time <= pd.to_datetime(end).time())]

In [14]:
# 时间分段统计
def hour_count():
    print("\t\t总数\t已挂号  医保换号  已退号  窗口退号  无号退款  超时取消")
    for i in range(5, 23):
        time_filter = data[(data['订单创建时间'].dt.time >= pd.to_datetime(f'{i}:00:00').time()) &
                     (data['订单创建时间'].dt.time <= pd.to_datetime(f'{i+1}:00:00').time())]
        counts = time_filter['状态'].value_counts()
        print(f"{i}:00 - {i+1}:00 \t{len(time_filter)}\t{counts['已挂号']}\t{counts['医保换号']}\t {counts['已退号']}\t"
              f"   {counts['窗口退号']}\t   {counts['无号退款']}\t    {counts['超时取消']}\t")

def minute_line(data, hour):
    count = []
    for i in tqdm(range(0, 59, 5)):
        start = str(i)
        end = str(i+5)
        if len(start) < 2:
            start = '0' + start
        if len(end) < 2:
            end = '0' + end 
        time_filter = data[(data['订单创建时间'].dt.time >= pd.to_datetime(f'{hour}:{start}:00').time()) &
                     (data['订单创建时间'].dt.time <= pd.to_datetime(f'{hour}:{end}:00').time())]
        count.append(len(time_filter))
    plt.plot(count)

In [15]:
# 恰好在16:00进行第二天/下一周操作
def hurry_sixteen(data, get_row=False):
    gap = daily_filter(data, '16:00:00', '16:00:01')
    time_diff = (gap['就诊日期'] - gap['订单创建时间']).dt.days
    hurry_row = gap[(time_diff == 0) | (time_diff == 6)]
    if get_row:
        return hurry_row
    else:
        return hurry_row['ID'].tolist()

In [16]:
# 将之前一次答案中的数据提取成列表
def get_list(res_id):
    file_path = f'./data/result{res_id}.txt'
    with open(file_path, 'r') as file:
        lis = [int(line.strip()) for line in file]
    return lis

In [17]:
# 两个list进行对比
def lis_cmp(lis1, lis2, ret=False):
    set1 = set(lis1)
    set2 = set(lis2)
    new_ele = set2 - set1
    miss_ele = set1 - set2
    same_ele = set1.intersection(set2)
    print(f"lis1: {len(lis1)}, lis2: {len(lis2)}, more: {len(new_ele)}, miss: {len(miss_ele)}, same = {len(same_ele)}")
    if ret:
        return new_ele, miss_ele

# 与之前一次答案进行对比
def res_cmp(res_id, lis, ret=False):
    lis1 = get_list(res_id)
    res = lis_cmp(lis1, lis)
    if ret:
        return res

In [52]:
# 频繁退号
def frequent_drop(data, limit, get_row=False):
    drop = data[data['状态'] == '已退号']
    mass_drop = duplicate_detect(drop, "患者ID", limit)
    if get_row:
        return mass_drop['ID'].tolist()
    else:
        return mass_drop

In [18]:
# 确认不是黄牛(要小心让模型学到的是什么，有些东西可以拿来筛选但是不能输给模型，可能设置不超过)
# 条件：
# 1. IP行
# 2. 用户挂号3次及以下，且是相同的科室和相同APPID
# 3. 北京/河北
# 4. 时间不在两个重点时段5:00 - 6:00, 16:00 - 16:30
def normal_people(data):
    ip_limit = duplicate_detect(data, 'IP_ADDRESS', 2, up=False)
    print(f"limit: {len(ip_limit)}")
    user_limit = duplicate_detect(ip_limit, '患者ID', 4, up=False)
    print(f"limit: {len(user_limit)}")
    depart_limit = duplicate_detect(user_limit, '就诊科室名称', 10, up=False)
    print(f"limit: {len(depart_limit)}")
    app_limit = duplicate_detect(depart_limit, 'APPID', 2, up=False)
    print(f"limit: {len(app_limit)}")
    area_limit = app_limit[(app_limit['省份'] == '北京') | (app_limit['省份'] == '河北')]
    print(len(area_limit))
    # time_limit = area_limit[((area_limit['订单创建时间'].dt.time >= pd.to_datetime(f'6:00:00').time()) &
    #                  (area_limit['订单创建时间'].dt.time <= pd.to_datetime(f'16:00:00').time())) | 
    #                  area_limit['订单创建时间'].dt.time >= pd.to_datetime(f'17:00:00').time()]
    time_limit = area_limit[((area_limit['订单创建时间'].dt.time >= pd.to_datetime(f'6:00:00').time()) &
                     (area_limit['订单创建时间'].dt.time <= pd.to_datetime(f'16:00:00').time()))]
    return time_limit

In [None]:
# 根据权重在多个不同的list中进行合并筛选
def weighted_selection(lists, weights, n, limit=0):
    # 创建一个字典来存储元素的总权重
    total_weights = defaultdict(float)

    # 遍历每个列表及其对应的权重
    for lst, weight in zip(lists, weights):
        for element in lst:
            total_weights[element] += weight  # 累加权重

    # 将字典转换为列表，并筛选出权重大于 limit 的元素
    filtered_elements = {k: v for k, v in total_weights.items() if v >= limit}

    # 按照权重排序
    sorted_elements = sorted(filtered_elements.items(), key=lambda x: x[1], reverse=True)

    # 选择前 n 个元素及其权重
    top_n_elements = sorted_elements[:n]

    return top_n_elements

#### 工作区

In [109]:
# 第四次测试(11.15)
IP_USER = 50
USER_DORM = 6
AREA_COUNT = 40
USER_APPID = 3
DROP_LIMIT = 8
# IP相同，用户数超过IP_USER，且属地不在北京/河北
ip_dup_rows = unique_dup_filter(data, 'IP_ADDRESS', '患者ID', IP_USER, get_row=True)
area_limit_rows = ip_dup_rows[(ip_dup_rows['省份'] != '北京') & (ip_dup_rows['省份'] != '河北')]
select1 = area_limit_rows['ID'].tolist()
# 用户相同，在超过USER_DORM个科室挂号
select2 = unique_dup_filter(data, '患者ID', '就诊科室名称', USER_DORM)
# 在5:00:00-5:00:01之间挂号
select3 = daily_filter(data, '5:00:00', '5:00:01')['ID'].tolist()
# 在16:00:00-16:00:01之间挂号(挂号第二天/下一周当天)
select4 = hurry_sixteen(data)
# 低频地区(少于AREA_COUNT次操作)
area_limit = duplicate_detect(data, '省份', AREA_COUNT, False)
select5 = area_limit['ID'].tolist()
# 同一用户使用超过USER_APPID个APPID操作
select6 = unique_dup_filter(data, '患者ID', 'APPID', USER_APPID)
# 同一用户超过DROP_LIMIT次退号
select7 = frequent_drop(data, DROP_LIMIT)['ID'].tolist()
select = select1 + select2 + select3 + select4 + select5 + select6 + select7
select = list(set(select))
select.sort()
write_list(select)

init dup_num: 46607


100%|██████████| 46606/46606 [00:01<00:00, 23703.81it/s]


filtered dup_num: 45544
init dup_num: 14443


100%|██████████| 14442/14442 [00:00<00:00, 28059.53it/s]


filtered dup_num: 4999
init dup_num: 61175


100%|██████████| 61174/61174 [00:01<00:00, 32617.47it/s]


filtered dup_num: 595
Total line: 8921


In [95]:
selects = [select1, select2, select3, select4, select5, select6, select7]
weights = [0.7, 0.5, 0.8, 0.3, 0.8, 0.6, 0.7]
# 总得分超过LIMIT,且在前N个
LIMIT = 0.5
N = 6500
res = weighted_selection(selects, weights, N, limit=LIMIT)
res = [pair[0] for pair in res]
res.sort()
write_list(res)

Total line: 5824


In [110]:
res_cmp(1, select)
res_cmp(2, select)

lis1: 6339, lis2: 8921, more: 2582, miss: 0, same = 6339
lis1: 8480, lis2: 8921, more: 441, miss: 0, same = 8480
