In [98]:
import pandas as pd
import datetime
# Load the raw event log; the first row of the file holds the column names.
# quoting=1 is the numeric value of csv.QUOTE_ALL.
# NOTE(review): parse_dates=True only asks pandas to try parsing the index; it
# does not convert `created_at` -- confirm that column already has a usable dtype.
df = pd.read_csv(
    "castle-event-data-v4.csv",
    sep=',',
    header=0,
    parse_dates=True,
    quoting=1,
)


In [99]:
import numpy as np

#### Replace every "custom" value in column `name` with the row's `custom_name`

In [100]:
# Blank out the placeholder value 'custom', then back-fill every missing name
# from `custom_name`. Names that were already missing are back-filled as well.
df['name'] = df['name'].replace('custom', np.nan).fillna(df['custom_name'])

####  Calculate the percentile of each time difference between actions

In [122]:
# Order the events by device and, within a device, by `created_at`, so that
# neighbouring rows of one device are consecutive events.
df.sort_values(['device_id', 'created_at'], ascending=True, inplace=True)

# Per-device grouping; later cells reuse it for the lon/lat differences.
device_grouped = df.groupby('device_id')

# Gap to the previous event of the same device. The first event of every
# device has no predecessor and therefore gets NaN.
df['time_diff'] = device_grouped['created_at'].diff()

In [125]:
# Rows without a gap (NaN) receive the mean of all observed gaps.
df['time_diff'] = df['time_diff'].fillna(df['time_diff'].mean())

In [130]:
# Percentile rank (0-1] of each gap among all gaps in the data set.
df['time_diff_pct'] = df['time_diff'].rank(pct=True)

In [181]:
# List the current columns.
# NOTE(review): the saved output already contains lon_diff, lat_diff, speed and
# the *_pct columns that are created in cells further down, so this cell was
# executed after them (see the execution counts).
df.columns

Index([u'created_at', u'user_id', u'session_id', u'device_id', u'ua', u'l',
       u'name', u'custom_name', u'ip', u'continent', u'country', u'city',
       u'lon', u'lat', u'accuracy', u'isp_name', u'datacenter', u'proxy',
       u'mobile_device', u'os', u'agent_name', u'time_diff', u'time_diff_pct',
       u'country_pct', u'name_pct', u'os_pct', u'isp_name_pct',
       u'agent_name_pct', u'lon_diff', u'lat_diff', u'speed'],
      dtype='object')

#### Calculate the speed of user travel

In [175]:
# Change in longitude / latitude relative to the previous event of the same
# device (NaN for a device's first event).
for coord in ('lon', 'lat'):
    df[coord + '_diff'] = device_grouped[coord].diff()

In [177]:
# Straight-line displacement between consecutive events of a device, in the
# units of lon/lat (presumably degrees -- confirm), divided by the elapsed time.
# The displacement is the square root of the summed squared differences;
# without the root the value is a squared distance per unit time, not a speed.
displacement = (df.lon_diff.pow(2) + df.lat_diff.pow(2)).pow(0.5)

# A zero time gap would produce inf (or NaN for 0/0). Mask zero gaps so the
# speed is missing instead of infinite; missing values are handled by the
# fillna in the next cell.
elapsed = df.time_diff.where(df.time_diff != 0)

df['speed'] = displacement / elapsed

In [182]:
# Events without a computable speed (NaN) are treated as stationary.
df['speed'] = df['speed'].fillna(0)

#### Calculate the relative frequency of each categorical value (os, country, continent, isp_name, agent_name, name)

In [165]:
def cat_to_pct(df, col):
    """Append ``<col>_pct``: the relative frequency of each row's ``col`` value.

    The frequency of a value is its number of occurrences divided by the number
    of non-null entries in ``col``. Rows whose ``col`` is null get NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col``. It is not modified.
    col : str
        Name of the categorical column to summarise.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and index as ``df`` plus ``<col>_pct``.
        An existing ``<col>_pct`` column is replaced, so calling the function
        twice for the same column is harmless.
    """
    pct_col = col + '_pct'
    # normalize=True divides the counts by the number of non-null entries.
    # Naming the Series explicitly keeps the output column independent of the
    # name value_counts() assigns, which differs between pandas versions.
    pct = df[col].value_counts(normalize=True).rename(pct_col)
    # Drop a stale result first so re-running a cell neither fails nor creates
    # a duplicate column.
    base = df.drop(pct_col, axis=1, errors='ignore')
    return base.join(pct, on=col, how='left')

In [166]:
# Attach the relative frequency of each row's `os` value.
# NOTE(review): `os` appears to come from the merge with df_agent_os in a cell
# further down; on a top-to-bottom run that merge has to execute first.
df = cat_to_pct(df, col='os')

In [None]:
# Attach the relative frequency of each row's `country` value.
df = cat_to_pct(df, col='country')

In [183]:
# Attach the relative frequency of each row's `continent` value.
df = cat_to_pct(df, col='continent')

In [169]:
# Attach the relative frequency of each row's `isp_name` value.
df = cat_to_pct(df, col='isp_name')

In [170]:
# Attach the relative frequency of each row's `agent_name` value.
# NOTE(review): `agent_name` appears to come from the merge with df_agent_os in
# a cell further down; on a top-to-bottom run that merge has to execute first.
df = cat_to_pct(df, col='agent_name')

In [None]:
# Attach the relative frequency of each row's event `name`.
df = cat_to_pct(df, col='name')

In [172]:
# List the columns to confirm which *_pct features have been added.
df.columns

Index([u'created_at', u'user_id', u'session_id', u'device_id', u'ua', u'l',
       u'name', u'custom_name', u'ip', u'continent', u'country', u'city',
       u'lon', u'lat', u'accuracy', u'isp_name', u'datacenter', u'proxy',
       u'mobile_device', u'os', u'agent_name', u'time_diff', u'time_diff_pct',
       u'country_pct', u'name_pct', u'os_pct', u'isp_name_pct',
       u'agent_name_pct'],
      dtype='object')

In [101]:
# Load the user-agent lookup table (OS and browser name per user-agent string)
# that was generated beforehand from a user-agent API.
df_agent_os = pd.read_csv('agent_name.csv')

In [102]:
# Show the raw column labels of the lookup table before they are renamed.
df_agent_os.columns.values

array(['Unnamed: 0', '0', '1'], dtype=object)

In [103]:
# Replace the placeholder labels with meaningful names: the user-agent string
# and the OS / browser name recorded for it.
df_agent_os.rename(
    columns={
        'Unnamed: 0': 'ua',
        '0': 'os',
        '1': 'agent_name',
    },
    inplace=True,
)

In [104]:
# Preview the first rows of the renamed lookup table.
df_agent_os.head()

Unnamed: 0,ua,os,agent_name
0,Mozilla/5.0 (Linux; Android 5.1.1; Ilium_PAD_i...,Android,Android Webkit Browser
1,Mozilla/5.0 (Linux; Android 4.4.2; SM-G355M Bu...,Android,Android Webkit Browser
2,Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7....,Windows NT,Internet Explorer
3,Mozilla/5.0 (Linux; Android 5.0.1; SM-N910P Bu...,Android,Android Webkit Browser
4,Mozilla/5.0 (Linux; Android 4.4.4; ILIUM L900 ...,Android,Android Webkit Browser


In [105]:
# Enrich every event with the OS and browser name of its user-agent string.
# A left join keeps events whose `ua` has no entry in the lookup table (their
# `os` / `agent_name` become NaN).
# NOTE(review): the cat_to_pct(df, 'os') and cat_to_pct(df, 'agent_name') cells
# sit above this one but need the columns added here; the execution counts show
# this cell was run first. Reorder before a Restart & Run All.
df = df.merge(df_agent_os, on='ua', how='left')

In [106]:
# Build the modelling frame by removing identifiers, raw timestamps, location
# details and free-text columns in a single pass.
df_model = df.drop(
    [
        'ua', 'created_at', 'user_id', 'session_id', 'device_id',
        'ip', 'lon', 'lat', 'accuracy', 'custom_name',
        'l', 'city', 'isp_name',
    ],
    axis=1,
)

In [107]:
# Show which columns remain as model inputs.
df_model.columns

Index([u'name', u'continent', u'country', u'datacenter', u'proxy',
       u'mobile_device', u'os', u'agent_name'],
      dtype='object')

In [108]:
# Number of events per event name over the full data set.
df.name.value_counts()

page_viewed                                   1535114
login_succeeded                                335810
passwordChange_succeeded                        73548
Payment With Balance                            62531
Regular Cellphone Confirmation                  26639
passwordReset_requested                         14829
Create Withdraw                                 13383
Account Create Edc Deposit On Landing Page      12417
registration_succeeded                          11175
Card Checkout                                    7152
Mobile Show Cvv                                  4975
Cellphone Confirmation                           3864
First Timeline Access                            3580
First Card Access                                3576
Mobile Withdraw Check                            1728
New Contact                                       463
Account Login Soft Block                          200
Mobile Account Change Password                      9
Regular Cellphone Confirmati

In [109]:
# One-hot encode the categorical model inputs. dummy_na=True adds a separate
# `<column>_nan` indicator for missing values.
# NOTE(review): the same statement is repeated in a later cell.
df_trans = pd.get_dummies(data=df_model, dummy_na=True)

In [110]:
# List the columns produced by the one-hot encoding.
df_trans.columns

Index([u'datacenter', u'proxy', u'mobile_device',
       u'name_Account Create Edc Deposit On Landing Page',
       u'name_Account Login Mobile Chat#using Token',
       u'name_Account Login Soft Block', u'name_Card Checkout',
       u'name_Cellphone Confirmation', u'name_Create Withdraw',
       u'name_First Card Access',
       ...
       u'agent_name_Opera', u'agent_name_Opera Mini', u'agent_name_PHP',
       u'agent_name_Palemoon', u'agent_name_RockMelt', u'agent_name_Safari',
       u'agent_name_SeaMonkey', u'agent_name_UC Browser',
       u'agent_name_unknown', u'agent_name_nan'],
      dtype='object', length=170)

In [111]:
# (rows, columns) of the encoded feature matrix.
df_trans.shape

(2111004, 170)

In [112]:
# The model needs numeric inputs, so the categorical columns are one-hot
# encoded; missing values get their own `<column>_nan` indicator.
df_trans = pd.get_dummies(data=df_model, dummy_na=True)

In [113]:
# Preview the first rows of the encoded feature matrix.
df_trans.head()

Unnamed: 0,datacenter,proxy,mobile_device,name_Account Create Edc Deposit On Landing Page,name_Account Login Mobile Chat#using Token,name_Account Login Soft Block,name_Card Checkout,name_Cellphone Confirmation,name_Create Withdraw,name_First Card Access,...,agent_name_Opera,agent_name_Opera Mini,agent_name_PHP,agent_name_Palemoon,agent_name_RockMelt,agent_name_Safari,agent_name_SeaMonkey,agent_name_UC Browser,agent_name_unknown,agent_name_nan
0,False,False,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,False,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,False,False,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,False,False,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,False,False,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [114]:
# Set up the isolation forest used for anomaly detection.
from sklearn.ensemble import IsolationForest

# Fixed seed so that repeated runs build the same trees.
seed = 25

sf = IsolationForest(
    contamination=0.001,  # expected share of anomalies in the data
    n_jobs=-1,            # use all available processors
    random_state=seed,
)

In [115]:
# Fit the isolation forest on the one-hot encoded feature matrix.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, i.e. the
# fit did not finish in the recorded session and the cells below were not run.
sf.fit(X=df_trans)


KeyboardInterrupt: 

In [None]:
# Label every event with the fitted model; the anomaly filter further down
# selects the rows labelled -1.
predict_anom = sf.predict(X=df_trans)

In [None]:
# Store the predicted label next to the encoded features.
# NOTE(review): this adds a column to the matrix the model was fitted on, so
# the predict cell cannot simply be re-run on df_trans afterwards.
df_trans['score']=predict_anom

In [None]:
# Store the predicted label on the original events as well. The assignment is
# positional, so it relies on df and df_trans having the same row order.
df['score']=predict_anom

In [None]:
# Preview the encoded features together with the new `score` column.
df_trans.head()

In [None]:
# Keep only the events whose predicted label is -1.
df_anom = df.loc[df['score'] == -1]

In [None]:
# Columns fed to the model (per the df_model.columns output): name, continent,
# country, datacenter, proxy, mobile_device, os, agent_name.
# Count the flagged events per event name.
df_anom.name.value_counts()

In [None]:
# Distribution of the mobile_device flag among the flagged events.
df_anom.mobile_device.value_counts()

In [None]:
# Event-name counts over the full data set, for comparison with the counts of
# the flagged subset.
df.name.value_counts()

In [None]:
# Number of flagged events per country.
df_anom.country.value_counts()

In [None]:
# (rows, columns) of the full event frame.
df.shape