### 黄牛检测

In [1]:
import pandas as pd
from tqdm import tqdm
import torch
import torch.nn as nn

In [2]:
# 读取数据
file_path = './data/raw_data.xlsx'
sheet_name = 'Sheet1'
data = pd.read_excel(file_path, sheet_name=sheet_name)

In [15]:
print(data.columns)

Index(['ID', '省份', 'APPID', 'IP_ADDRESS', '订单创建时间', '患者ID', '患者创建时间', '就诊日期',
       '就诊科室名称', '医生姓名', '状态', '商户订单号'],
      dtype='object')


#### 1. 基于规则的检测

1. IP重复，来自同一个IP，且为超过3个人挂号
2. 用户重复，来自同一个用户，且挂号了超过3个科室/超过两个app_id
3. 时间过早，每天5:00-5:01进行操作的

In [13]:
# 重复值筛选，找出data的seg_name字段中重复数大于limit的行, sort_add和asc_add是检测完之后添加的排序要求
def duplicate_detect(data, seg_name, limit, sort_add=[], asc_add=[]):
    assert len(sort_add) == len(asc_add)
    value_counts = data[seg_name].value_counts()
    dup_row = data[data[seg_name].isin(value_counts[value_counts > limit].index)]
    sort_by = [seg_name] + sort_add
    asc = [True] + asc_add
    dup_row = dup_row.sort_values(by=sort_by, ascending=asc)
    return dup_row

In [127]:
# 筛选出同一个dup_seg中有超过limit个unique_seg的索引(从0开始)
def unique_dup_filter(data, dup_seg, unique_seg, limit):
    # 确定是黄牛的行编号
    selects = []
    # IP重复检测
    dup = duplicate_detect(data, dup_seg, limit, sort_add=[unique_seg], asc_add=[True])
    print("init dup_num:", len(dup))
    if len(dup) == 0:
        return selects
    pos = 0          # 这个IP的起点
    count = 1      # 涉及多少个用户ID
    last = dup[unique_seg].iloc[0] # 上一个用户的ID
    dup_num = len(dup)
    for i in tqdm(range(1, dup_num)):
        if dup[dup_seg].iloc[i] == dup[dup_seg].iloc[pos]:
            if dup[unique_seg].iloc[i] != last:
                # 相同IP下一个新的患者
                count += 1
                last = dup[unique_seg].iloc[i]
        if dup[dup_seg].iloc[i] != dup[dup_seg].iloc[pos] or i == dup_num - 1:
            # 开始检测下一个IP
            if count > limit:
                # 达到重复人数条件
                for j in range(pos, i):
                    selects.append(dup['ID'].iloc[j])
            # 重置
            pos = i
            count = 1
            last = dup[unique_seg].iloc[i]
    selects.sort()
    print("filtered dup_num:", len(selects))
    return selects

In [134]:
# 对unique_dup_filter, 在一定范围内遍历limit
def grid_traverse(data, dup_seg, unique_seg, start, end, gap=1):
    limits = []
    selects_list = []
    for limit in range(start, end, gap):
        print(f"limit: {limit}")
        selects = unique_dup_filter(data, dup_seg, unique_seg, limit)
        limits.append(limit)
        selects_list.append(selects)
    return limits, selects_list

In [None]:
# 将一个list写作答案
def write_list(lis):
    # 打开一个文件进行写入，如果文件不存在则创建
    with open('./data/result.txt', 'w', encoding='utf-8') as file:
        # 遍历列表中的每个元素
        for item in lis:
            # 将每个元素写入文件，每个元素后面加上换行符
            file.write(str(item) + '\n')

In [155]:
# IP重复检测
ip_dup_rid = unique_dup_filter(data, 'IP_ADDRESS', '患者ID', 50)
ip_dup_rows = data[data['ID'].isin(ip_dup_rid)]
ip_dup_rows = ip_dup_rows.sort_values(by='患者ID', ascending=True)
area_limit_rows = ip_dup_rows[(ip_dup_rows['省份'] != '北京') & (ip_dup_rows['省份'] != '河北')]
print(len(area_limit_rows))

init dup_num: 46607


100%|██████████| 46606/46606 [00:01<00:00, 24883.68it/s]

filtered dup_num: 45544
1883





In [137]:
# 用户重复检测
user_dup_rid = unique_dup_filter(data, '患者ID', '就诊科室名称', 6)
user_dup_rows = data[data['ID'].isin(user_dup_rid)]
user_dup_rows = user_dup_rows.sort_values(by='患者ID', ascending=True)

init dup_num: 14443


100%|██████████| 14442/14442 [00:00<00:00, 25451.89it/s]

filtered dup_num: 4999





In [136]:
limits, select_list = grid_traverse(data, '患者ID', '就诊科室名称', 5, 10)

limit: 5
init dup_num: 22225


100%|██████████| 22224/22224 [00:00<00:00, 29380.38it/s]


filtered dup_num: 7923
limit: 6
init dup_num: 14443


100%|██████████| 14442/14442 [00:00<00:00, 27358.89it/s]


filtered dup_num: 4999
limit: 7
init dup_num: 10103


100%|██████████| 10102/10102 [00:00<00:00, 28085.30it/s]


filtered dup_num: 3406
limit: 8
init dup_num: 6951


100%|██████████| 6950/6950 [00:00<00:00, 27361.54it/s]


filtered dup_num: 2395
limit: 9
init dup_num: 5106


100%|██████████| 5105/5105 [00:00<00:00, 27896.60it/s]

filtered dup_num: 1585





In [145]:
# 时间过滤
data['订单创建时间'] = pd.to_datetime(data['订单创建时间'])
time_filter = data[(data['订单创建时间'].dt.time >= pd.to_datetime('05:00:00').time()) &
                     (data['订单创建时间'].dt.time <= pd.to_datetime('05:01:00').time())]

In [146]:
print(len(time_filter))

1146
