## Import libraries

In [8]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import time
import math
import fractions
import os
import itertools
import re
from matplotlib import pyplot as plt
import networkx as nx
from datetime import datetime

## Load dataset

In [3]:
df = pd.read_csv('order_brush_order.csv')

In [36]:
# Convert event_time to datetime
df.loc[:,'event_time'] = pd.to_datetime(df.event_time)

In [37]:
df.head()

Unnamed: 0,orderid,shopid,userid,event_time
0,31076582227611,93950878,30530270,2019-12-27 00:23:03
1,31118059853484,156423439,46057927,2019-12-27 11:54:20
2,31123355095755,173699291,67341739,2019-12-27 13:22:35
3,31122059872723,63674025,149380322,2019-12-27 13:01:00
4,31117075665123,127249066,149493217,2019-12-27 11:37:55


## Analyze data

Shops are deemed to have conducted order brushing if their concentrate rate is **greater than or equal to 3 at any instance**.

Concentrate rate = number of orders within 1 hour / number of unique buyers within 1 hour.

Suspicious buyers are deemed as the buyer that contributed to highest portion of ordes to a shope that is deemed to have conducted order brushing.

Final submission must have 2 columns: shopid and userid. If there is more than 1 userid identified as suspicious, list all the userids separated by '&' with the smaller numerical userid first. Submission should have 18770 rows (excluding headers).

In [53]:
shopids = df.shopid.unique()

In [None]:
df[df.shopid == i]

In [462]:
window_combos = [('0M','60M'), ('60M','0M'), ('30M','30M'), ('45M','15M')]
for i in window_combos:
    df_forward = sample[(sample.event_time >= j-pd.Timedelta(i[0])) & (sample.event_time <= j+pd.Timedelta(i[1]))]
    display(df_forward)

Unnamed: 0,orderid,shopid,userid,event_time
222725,31173369000956,147118226,47283981,2019-12-28 03:16:09


Unnamed: 0,orderid,shopid,userid,event_time
222725,31173369000956,147118226,47283981,2019-12-28 03:16:09


Unnamed: 0,orderid,shopid,userid,event_time
222725,31173369000956,147118226,47283981,2019-12-28 03:16:09


Unnamed: 0,orderid,shopid,userid,event_time
222725,31173369000956,147118226,47283981,2019-12-28 03:16:09


In [464]:
brush_test = pd.DataFrame(columns = ['orderid','shopid','userid','event_time'])
slice_length = []
for i in tqdm(shopids):
    shop_slice = []
    sample = df[df.shopid == i].sort_values('event_time')
#     brush_period = pd.DataFrame(columns = ['orderid','shopid','userid','event_time'])
    for j in sample.event_time:
        df_slice = sample[(sample.event_time >= j) & (sample.event_time <= j+pd.Timedelta('1H'))]
        shop_slice.append(len(df_slice))
#         conc = df_slice.orderid.count()/df_slice.userid.nunique()
#         if conc >= 3:
#             brush_period = pd.concat((brush_period,df_slice), axis = 0).sort_values('event_time')
#     if len(brush_period) != 0:
#         brush_period = brush_period.drop_duplicates(subset='orderid', keep = 'first')
#         df_brushing_forward = pd.concat((df_brushing_forward, brush_period), axis = 0)
    slice_length.append(shop_slice)

HBox(children=(IntProgress(value=0, max=18770), HTML(value='')))

In [474]:
shopids

array([ 93950878, 156423439, 173699291, ...,  50236030, 203587596,
       147118226], dtype=int64)

In [480]:
len(max(slice_length))

11703

In [501]:
dfslices = pd.DataFrame({'shopid':shopids,
             'slice': slice_length})
dfslices.loc[:,'max_slice'] = dfslices.slice.apply(lambda x: sorted(x, reverse = True)[0:3])
dfslices.loc[:, 'number_slice'] = dfslices.slice.apply(lambda x: len(x))

In [502]:
dfslices.sort_values('max_slice', ascending = False)

Unnamed: 0,shopid,slice,max_slice,number_slice
8,147941492,"[269, 268, 267, 266, 265, 264, 263, 263, 262, ...","[551, 550, 550]",11703
84,61556313,"[24, 23, 26, 27, 26, 26, 27, 26, 26, 25, 25, 2...","[127, 127, 127]",6691
60,54615708,"[43, 42, 41, 40, 40, 40, 40, 40, 39, 38, 37, 3...","[54, 53, 53]",2640
2377,162551899,"[37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 2...","[45, 45, 44]",115
1304,99831506,"[1, 1, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31,...","[40, 39, 38]",43
259,89768730,"[7, 6, 5, 4, 3, 3, 4, 4, 3, 3, 2, 2, 2, 1, 1, ...","[35, 35, 35]",607
583,153849290,"[30, 29, 28, 28, 27, 26, 25, 24, 23, 22, 21, 2...","[31, 30, 30]",136
272,43412276,"[1, 1, 1, 2, 2, 1, 2, 5, 4, 3, 2, 4, 8, 8, 7, ...","[30, 29, 28]",529
1248,149828485,"[4, 3, 2, 1, 1, 1, 4, 4, 3, 2, 2, 2, 1, 2, 1, ...","[27, 26, 25]",207
2906,144969458,"[1, 1, 1, 1, 1, 1, 3, 3, 2, 1, 1, 1, 1, 1, 1, ...","[27, 26, 25]",205


In [490]:
dfslices[dfslices.max_slice >= 3]

Unnamed: 0,shopid,slice,max_slice,number_slice
0,93950878,"[2, 2, 1, 2, 3, 2, 1, 1, 3, 2, 1, 2, 3, 6, 5, ...",8,124
1,156423439,"[1, 2, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 3, 3, ...",4,41
2,173699291,"[2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, ...",5,53
4,127249066,"[1, 2, 1, 2, 1, 2, 3, 3, 2, 2, 2, 2, 1, 2, 1, ...",5,122
5,173811070,"[1, 1, 1, 1, 2, 1, 1, 2, 2, 1, 4, 3, 2, 1, 1, ...",4,35
6,107921853,"[6, 5, 4, 6, 6, 5, 5, 4, 3, 3, 2, 3, 3, 2, 2, ...",20,684
7,178400128,"[4, 3, 2, 1, 1, 1, 1, 3, 2, 1, 3, 2, 3, 2, 2, ...",4,78
8,147941492,"[269, 268, 267, 266, 265, 264, 263, 263, 262, ...",551,11703
10,9374147,"[4, 3, 6, 5, 8, 7, 6, 5, 5, 4, 3, 2, 2, 3, 3, ...",12,385
11,145694343,"[2, 1, 3, 2, 2, 3, 2, 2, 1, 4, 3, 2, 1, 5, 4, ...",6,121


In [348]:
df_brushing_forward = pd.DataFrame(columns = ['orderid','shopid','userid','event_time'])
for i in tqdm(shopids):
    sample = df[df.shopid == i].sort_values('event_time')
    brush_period = pd.DataFrame(columns = ['orderid','shopid','userid','event_time'])
    for j in sample.event_time:
        df_slice = sample[(sample.event_time >= j) & (sample.event_time <= j+pd.Timedelta('1H'))]
        conc = df_slice.orderid.count()/df_slice.userid.nunique()
        if conc >= 3:
            brush_period = pd.concat((brush_period,df_slice), axis = 0).sort_values('event_time')
    if len(brush_period) != 0:
        brush_period = brush_period.drop_duplicates(subset='orderid', keep = 'first')
        df_brushing_forward = pd.concat((df_brushing_forward, brush_period), axis = 0)
        fraud_shop.append(i)
        mode_user = sorted(list(brush_period.userid.value_counts().index[brush_period.userid.value_counts().values == max(brush_period.userid.value_counts().values)]))        
        fraud_buyer.append(mode_user)

HBox(children=(IntProgress(value=0, max=18770), HTML(value='')))

In [355]:
df_brushing_backward = pd.DataFrame(columns = ['orderid','shopid','userid','event_time'])
fraud_shop = []
fraud_buyer = []
for i in tqdm(shopids):
    sample = df[df.shopid == i].sort_values('event_time')
    brush_period = pd.DataFrame(columns = ['orderid','shopid','userid','event_time'])
    for orders in sample.orderid:
        j = sample.loc[sample.orderid == orders,'event_time'].values[0]
        df_slice = sample[(sample.event_time <= j) & (sample.event_time >= j-pd.Timedelta('1H'))]
        conc = df_slice.orderid.count()/df_slice.userid.nunique()
        if conc >= 3:
            brush_period = pd.concat((brush_period,df_slice), axis = 0).sort_values('event_time')
    if len(brush_period) != 0:
        brush_period = brush_period.drop_duplicates(subset='orderid', keep = 'first')
        df_brushing_backward = pd.concat((df_brushing_backward, brush_period), axis = 0)
        fraud_shop.append(i)
        mode_user = sorted(list(brush_period.userid.value_counts().index[brush_period.userid.value_counts().values == max(brush_period.userid.value_counts().values)]))        
        fraud_buyer.append(mode_user)

HBox(children=(IntProgress(value=0, max=18770), HTML(value='')))

In [374]:
def brushdf(df_brushing_forward):
    fraud_shop = df_brushing_forward.shopid.unique()
    fraud_buyer = []
    for i in fraud_shop:
        brush_period = df_brushing_forward[df_brushing_forward.shopid == i]
        mode_user = sorted(list(brush_period.userid.value_counts().index[brush_period.userid.value_counts().values == max(brush_period.userid.value_counts().values)]))
        fraud_buyer.append(mode_user)
        
    # Create new dataframe
    final_csv = pd.DataFrame(
        {'shopid': fraud_shop,
         'userid': fraud_buyer,})
    final_csv.loc[:,'userid'] = final_csv.userid.apply(lambda x: ('&').join([str(i) for i in x]))
    final_csv

    # Concat list of non-fraudulent shops
    notfraud = [x for x in shopids if x not in fraud_shop]
    final_csv = pd.concat((final_csv, pd.DataFrame({'shopid': notfraud,
                 'userid': [str(0)]*len(notfraud)})), axis = 0).set_index('shopid')

    return final_csv

In [392]:
len(union[union.userid != '0']) #- len(intersection[intersection.userid != '0'])

308

In [431]:
# Using +- 30 minutes 
df_30 = pd.DataFrame(columns = ['orderid','shopid','userid','event_time'])
for i in tqdm(shopids):
    sample = df[df.shopid == i].sort_values('event_time')
    brush_period = pd.DataFrame(columns = ['orderid','shopid','userid','event_time'])
    for j in sample.event_time:
        df_slice = sample[(sample.event_time >= j-pd.Timedelta('30M')) & (sample.event_time <= j+pd.Timedelta('30M'))]
        conc = df_slice.orderid.count()/df_slice.userid.nunique()
        if conc >= 3:
            brush_period = pd.concat((brush_period,df_slice), axis = 0).sort_values('event_time')
    if len(brush_period) != 0:
        brush_period = brush_period.drop_duplicates(subset='orderid', keep = 'first')
        df_30 = pd.concat((df_30, brush_period), axis = 0)

HBox(children=(IntProgress(value=0, max=18770), HTML(value='')))

In [442]:
# Using +- 45 minutes 
df_45 = pd.DataFrame(columns = ['orderid','shopid','userid','event_time'])
for i in tqdm(shopids):
    sample = df[df.shopid == i].sort_values('event_time')
    brush_period = pd.DataFrame(columns = ['orderid','shopid','userid','event_time'])
    for j in sample.event_time:
        df_slice = sample[(sample.event_time >= j-pd.Timedelta('45M')) & (sample.event_time <= j+pd.Timedelta('15M'))]
        conc = df_slice.orderid.count()/df_slice.userid.nunique()
        if conc >= 3:
            brush_period = pd.concat((brush_period,df_slice), axis = 0).sort_values('event_time')
    if len(brush_period) != 0:
        brush_period = brush_period.drop_duplicates(subset='orderid', keep = 'first')
        df_45 = pd.concat((df_45, brush_period), axis = 0)

HBox(children=(IntProgress(value=0, max=18770), HTML(value='')))

In [447]:
rolling45 = brushdf(df_45).rename(columns = {'userid':'df45'})

In [457]:
fb30 = brushdf(pd.concat((df_brushing_forward, df_brushing_backward, df_30), axis = 0).drop_duplicates(subset = 'orderid', keep = 'first')).rename(columns ={'userid':'fb30'})
fb3045 = brushdf(pd.concat((df_brushing_forward, df_brushing_backward, df_30, df_45), axis = 0).drop_duplicates(subset = 'orderid', keep = 'first'))#.rename(columns = {'userid':'fb3045'})

# bleh = fb3045.join(fb30)
# bleh[bleh.fb3045 != bleh.fb30]

In [460]:
fb3045.to_csv('31GB_4515.csv')

In [441]:
# with 30
fb30 = brushdf(pd.concat((df_brushing_forward, df_brushing_backward, df_30), axis = 0).drop_duplicates(subset = 'orderid', keep = 'first')).rename({'userid':'fb30'})
bleh = fb30.join(union)
bleh[bleh.userid != bleh.union]

Unnamed: 0_level_0,userid,union
shopid,Unnamed: 1_level_1,Unnamed: 2_level_1
9374147,148176353,0
10159,214988798,0
120358496,210701971,0
9466594,84811421,0


In [391]:
# Get intersection between forward and backward periods and analyze
intersection = brushdf(df_brushing_forward[df_brushing_forward.orderid.isin(df_brushing_backward.orderid)])

# Get union between forward and backward
union = brushdf(pd.concat((df_brushing_forward, df_brushing_backward), axis = 0).drop_duplicates(subset = 'orderid', keep = 'first'))

# Try analysis again on forward and see if you get the same results


In [430]:
df

Unnamed: 0,orderid,shopid,userid,event_time
0,31076582227611,93950878,30530270,2019-12-27 00:23:03
1,31118059853484,156423439,46057927,2019-12-27 11:54:20
2,31123355095755,173699291,67341739,2019-12-27 13:22:35
3,31122059872723,63674025,149380322,2019-12-27 13:01:00
4,31117075665123,127249066,149493217,2019-12-27 11:37:55
5,31119725718155,173811070,116451780,2019-12-27 12:22:05
6,31151322178251,107921853,166741763,2019-12-27 21:08:43
7,31079856153738,178400128,61272835,2019-12-27 01:17:37
8,31118426867571,147941492,10986763,2019-12-27 12:00:27
9,31133458226149,164933170,135957741,2019-12-27 16:10:59


In [428]:
bleh = union.join(forwards_backwards)
bleh[(bleh.userid != bleh.forback)]

Unnamed: 0_level_0,userid,forback
shopid,Unnamed: 1_level_1,Unnamed: 2_level_1


In [425]:
# Check differences between union and 'manual' edit
# union.rename()
forwards_backwards.rename(columns = {'userid':'forback'}, inplace = True)

In [346]:
# Create new dataframe
final_csv = pd.DataFrame(
    {'shopid': fraud_shop,
     'userid': fraud_buyer,})
final_csv.loc[:,'userid'] = final_csv.userid.apply(lambda x: ('&').join([str(i) for i in x]))
final_csv

# Concat list of non-fraudulent shops
notfraud = [x for x in shopids if x not in fraud_shop]
final_csv = pd.concat((final_csv, pd.DataFrame({'shopid': notfraud,
             'userid': [str(0)]*len(notfraud)})), axis = 0).set_index('shopid')

# Export as csv
final_csv.to_csv('31GB_pathy_orderid.csv')

In [208]:
# Cross check with Raven's
raven = pd.read_csv('31GB_raven4.csv').set_index('shopid')
raven.rename(columns ={'userid':'ravenuser'}, inplace = True)

In [237]:
len(raven[raven.ravenuser != '0'])

264

In [240]:
len(final_csv[final_csv.userid != '0']) - len(raven[raven.ravenuser != '0'])

12

In [260]:
# Going backward in timedelta
df_brushing = pd.DataFrame(columns = ['orderid','shopid','userid','event_time'])
fraud_shop = []
fraud_buyer = []
for i in tqdm(shopids):
    sample = df[df.shopid == i].sort_values('event_time')
    brush_period = pd.DataFrame(columns = ['orderid','shopid','userid','event_time'])
    for j in sample.event_time:
        df_slice = sample[(sample.event_time <= j) & (sample.event_time >= j-pd.Timedelta('1H'))]
        conc = df_slice.orderid.count()/df_slice.userid.nunique()
        if conc >= 3:
            brush_period = pd.concat((brush_period,df_slice), axis = 0).sort_values('event_time')
    if len(brush_period) != 0:
        brush_period = brush_period.drop_duplicates(subset='orderid', keep = 'first')
        df_brushing = pd.concat((df_brushing, brush_period), axis = 0)
        fraud_shop.append(i)
        mode_user = sorted(list(brush_period.userid.value_counts().index[brush_period.userid.value_counts().values == max(brush_period.userid.value_counts().values)]))        
        fraud_buyer.append(mode_user)

HBox(children=(IntProgress(value=0, max=18770), HTML(value='')))

In [397]:
# Get forwards and backwards list
backwards = pd.read_csv('31GB_pathy_backward.csv').set_index('shopid')
forwards = pd.read_csv('31GB_pathy.csv').set_index('shopid')
forwards.rename(columns = {'userid':'forwards'}, inplace = True)
backwards.rename(columns = {'userid':'backwards'}, inplace = True)

In [410]:
# check with myself
check = forwards.join(backwards)
concat_diff = check[check.forwards != check.backwards]
concat_diff = concat_diff.replace('0', np.nan)

# Create list of userids
concat_diff.loc[concat_diff.forwards.notna(), 'forwards'] = concat_diff[concat_diff.forwards.notna()].forwards.apply(lambda x: x.split('&'))
concat_diff.loc[concat_diff.backwards.notna(), 'backwards'] = concat_diff[concat_diff.backwards.notna()].backwards.apply(lambda x: x.split('&'))

# Fill forwards and backwards
concat_diff.ffill(axis = 1, inplace = True)
concat_diff.bfill(axis = 1, inplace = True)

# # Join two lists
concat_diff.loc[:,'userid'] = concat_diff['forwards'] + concat_diff['backwards']
concat_diff

# Drop duplicates
unique_list = lambda x: list(dict.fromkeys(x))
concat_diff.loc[:,'userid'] = concat_diff.userid.apply(lambda x: ('&').join([str(i) for i in sorted([int(x) for x in (unique_list(x))])]))

In [413]:
#Get same from forwards and backwards
same_check = pd.DataFrame(check[check.forwards == check.backwards].loc[:,'forwards'])
same_check.rename(columns={'forwards':'userid'}, inplace = True)

#Concat different from forwards and backwards
forwards_backwards = pd.concat((same_check, pd.DataFrame(concat_diff.loc[:,'userid'])), axis = 0)

In [363]:
len(check[(check.forwards != check.backwards)& (check.backwards == '0')])

44

In [259]:
# CHECKING CHUNK
i = 156248768
sample = df[df.shopid == i].sort_values('event_time')
brush_period = pd.DataFrame(columns = ['orderid','shopid','userid','event_time'])
for j in sample.event_time:
    df_slice = sample[(sample.event_time <= j) & (sample.event_time >= j-pd.Timedelta('1H'))]
    conc = df_slice.orderid.count()/df_slice.userid.nunique()
    print(conc)
    if conc >= 3:
        brush_period = pd.concat((brush_period,df_slice), axis = 0).sort_values('event_time')
# brush_period = brush_period.drop_duplicates(subset='orderid', keep = 'first')
# mode_user = sorted(list(brush_period.userid.value_counts().index[brush_period.userid.value_counts().values == max(brush_period.userid.value_counts().values)]))    

1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.5
1.3333333333333333
1.25
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
2.0
3.0
2.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0


In [232]:
mode_user

[213502289]

In [200]:
raven

Unnamed: 0,shopid,userid
0,6042309,0
1,104804492,0
2,8715449,0
3,190969466,0
4,2859407,0
5,94479614,0
6,65551316,0
7,147941492,0
8,54615708,0
9,127236302,0


In [198]:
final_csv.sort_values('usercount', ascending = False)

Unnamed: 0_level_0,userid,usercount
shopid,Unnamed: 1_level_1,Unnamed: 2_level_1
54257623,1974334&107414154,2.0
175531295,187697407&215009429,2.0
156883302,76102350&188025647,2.0
181009364,101832161&214208720,2.0
155143347,156202149&214265994,2.0
143281052,99517130&186080843,2.0
51134277,29857724&212200633,2.0
1175477,122277324,1.0
736620,62618064,1.0
162282525,162389170,1.0


In [188]:
df_brushing

Unnamed: 0,orderid,shopid,userid,event_time
21027,31243838739795,1175477,122277324,2019-12-28 22:50:39
9409,31244142101712,1175477,122277324,2019-12-28 22:55:42
190109,31244747422135,1175477,122277324,2019-12-28 23:05:47
186125,31247365651612,66861410,213502289,2019-12-28 23:49:26
45551,31248087439155,66861410,213502289,2019-12-29 00:01:27
180814,31249068920318,66861410,213502289,2019-12-29 00:17:48
119754,31158495379642,8715449,9753706,2019-12-27 23:08:15
148038,31158633469991,8715449,9753706,2019-12-27 23:10:33
111247,31158746900290,8715449,9753706,2019-12-27 23:12:26
77275,31303367297188,58543771,61893096,2019-12-29 15:22:48


In [92]:
from collections import Counter
from itertools import groupby

l = [1,2,3,3,3,4,4,4,5,5,6,6,6]

# group most_common output by frequency
freqs = groupby(Counter(l).most_common(), lambda x:x[1])
# pick off the first group (highest frequency)
print([val for val,count in next(freqs)[1]])
# prints [3, 4, 6]

[3, 4, 6]


In [103]:
my_unique = lambda x: len(np.unique(x))
concentrate = lambda sample: sample.orderid.rolling('1H', closed='both').count()/sample.userid.rolling('1H', closed='both').apply(my_unique)
 
def get_order_brushing_user(shopid):
    sample = df.loc[df.shopid==shopid].copy(deep=True)
    c_rate = concentrate(sample)
    if sum(concentrate(sample)>=3)>0:        
        sample['c_rate'] = list(concentrate(sample))
#         prop = sample.groupby(by=['userid']).rolling('1H').orderid.count().div(sample.reset_index().groupby(by="event_time").orderid.count().rolling('1H').sum(), level='event_time').reset_index()
        brush_index = sample.loc[sample.c_rate>=3].index    
        sample = pd.concat([sample.loc[(i-pd.Timedelta('1H')):i] for i in brush_index]).sort_index()
#         prop = sample.groupby(by=['userid']).rolling('1H', closed='both').orderid.count().reset_index().groupby(by='userid').orderid.sum()/sample.rolling('1H', closed='both').orderid.count().sum()
        prop= sample.groupby(by=['userid']).orderid.count()
        return list(np.sort(list(prop[prop==prop.max()].index)))
#         return list(prop[prop.event_time.isin(sample[sample.c_rate>=3].index) & (prop.orderid==prop.orderid.max())].userid)
    else:
        return 0

In [None]:
for i in 

In [100]:
# Sample
i = 156883302
sample = df[df.shopid == i].sort_values('event_time')
sample.loc[sample.event_time == sample.event_time + pd.Timedelta('1H')]

Unnamed: 0,orderid,shopid,userid,event_time


In [106]:
sample = df[df.shopid == i].sort_values('event_time')#.set_index('event_time')
sample

Unnamed: 0,orderid,shopid,userid,event_time
215218,31100672373467,156883302,68977756,2019-12-27 07:04:32
68439,31100977597782,156883302,155379241,2019-12-27 07:09:37
162531,31101488337409,156883302,192062938,2019-12-27 07:18:08
72319,31103141463757,156883302,187119675,2019-12-27 07:45:41
152477,31104942645315,156883302,160218790,2019-12-27 08:15:42
87402,31105140135135,156883302,136087303,2019-12-27 08:19:01
43778,31106329531411,156883302,122736767,2019-12-27 08:38:50
22494,31106974728041,156883302,45050041,2019-12-27 08:49:34
7106,31110087220831,156883302,127299122,2019-12-27 09:41:27
7070,31111060412201,156883302,38945250,2019-12-27 09:57:40


In [91]:
# Sample
i = 156883302
sample = df[df.shopid == i].sort_values('event_time').set_index('event_time')
temp = pd.DataFrame(sample.orderid.rolling('1H').count() / sample.userid.rolling('1H').apply(lambda x: len(np.unique(x)), raw = True))
temp.loc[:,'userid_count'] = sample.userid.rolling('1H').apply(lambda x: len(np.unique(x)), raw = True)
# temp.at[:,'userids'] = sample.userid.rolling('1H').apply(lambda x: list(np.unique(x)), raw = True)
# temp[temp >= 3]

sample.userid.rolling('1H').apply(lambda x: ('-').join([str(i) for i in list(np.unique(x))]), raw = True)

TypeError: must be real number, not str

In [59]:
j=0
fraud_shops = []

for i in shopids:
    sample = df[df.shopid == i].sort_values('event_time').set_index('event_time')
    temp = sample.orderid.rolling('1H').count() / sample.userid.rolling('1H').apply(lambda x: len(np.unique(x)), raw = True)
    if len(temp[temp >= 3]) > 0:
        fraud_shops.append(i)
        j+=1
        
print(j)

264


In [63]:
fraud_shops

[1175477,
 156883302,
 1532569,
 27476241,
 28091290,
 80049863,
 210197928,
 104590058,
 188546697,
 43412276,
 68862371,
 86285837,
 156248768,
 192785588,
 52377417,
 131387639,
 87621695,
 130100254,
 50713918,
 169902791,
 42818,
 161160594,
 123548863,
 22800308,
 86368642,
 27015534,
 145777302,
 168046193,
 93358941,
 118949192,
 14184981,
 64394533,
 180676972,
 134968430,
 171407673,
 16001939,
 63001696,
 178273138,
 51134277,
 150526730,
 76668593,
 4888564,
 110641390,
 151556698,
 129460270,
 10206302,
 8996761,
 12078788,
 156246169,
 187570150,
 162043213,
 20522747,
 108170744,
 36641013,
 193424291,
 161277691,
 85648725,
 128300567,
 163968828,
 115179739,
 29650418,
 18190345,
 51487211,
 168388504,
 140626153,
 114919380,
 116337057,
 96460204,
 28164269,
 27987240,
 86822226,
 28812867,
 769445,
 119309887,
 13735592,
 118139770,
 45084184,
 160135616,
 66391375,
 130168935,
 165500538,
 823357,
 130118966,
 108064389,
 195870375,
 201503467,
 104245736,
 11612863