In [1]:
import numpy as np
import pandas as pd

import sqlite3

In [2]:
# Pull every log entry from 2018 into a dataframe indexed by timestamp.
LOGS_QUERY = """
    SELECT * FROM logs WHERE datetime BETWEEN '2018-01-01' AND '2019-01-01';
    """

# sqlite3's connection context manager only commits/rolls back the open
# transaction -- it does not close the connection -- so close it explicitly.
conn = sqlite3.connect('logs/logs.db')
try:
    logs = pd.read_sql(
        LOGS_QUERY, conn, parse_dates=['datetime'], index_col='datetime'
    )
finally:
    conn.close()

logs.head()

Unnamed: 0_level_0,source_ip,username,success,failure_reason
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-01-01 00:06:19.353126,223.178.55.3,djones,1,
2018-01-01 00:09:07.147971,223.178.55.3,djones,1,
2018-01-01 01:08:08.610041,6.252.142.27,asmith,1,
2018-01-01 02:37:50.329298,124.178.25.98,akim,1,
2018-01-01 02:45:20.382080,98.43.141.103,akim,1,


In [3]:
def get_X(log, day):
    """
    Build the per-minute failed-login feature matrix for a slice of the logs.

    Parameters:
        - log: The logs dataframe. It must have a DatetimeIndex and the
               `username` and `success` columns (`success` is treated as
               numeric, with failures computed as `1 - success`).
        - day: A day or single value we can use as a datetime index slice
               (e.g. '2018-01-01' or '2018-01').

    Returns:
        A pandas DataFrame with one row per minute containing
        `usernames_with_failures` (distinct usernames with a failed attempt
        in that minute; 0 for minutes without failures) plus one-hot encoded
        `day_of_week_*` and `hour_*` columns.
    """
    # Select rows with .loc: partial-string row selection through log[day]
    # was deprecated and later removed from pandas (it raises KeyError there).
    failed_attempts = log.loc[day].assign(
        failures=lambda x: 1 - x.success
    ).query('failures > 0')

    # Aggregate to 1-minute bins; `failures` is only needed to drive the
    # filter/aggregation and is dropped before encoding.
    per_minute = failed_attempts.resample('1min').agg(
        {'username': 'nunique', 'failures': 'sum'}
    ).dropna().rename(
        columns={'username': 'usernames_with_failures'}
    )

    features = per_minute.assign(
        day_of_week=lambda x: x.index.dayofweek,
        hour=lambda x: x.index.hour
    ).drop(columns=['failures'])

    return pd.get_dummies(features, columns=['day_of_week', 'hour'])

In [4]:
# Feature matrix for January 2018, used below to fit and score the model.
x = get_X(log=logs, day='2018-01')
x.head()

Unnamed: 0_level_0,usernames_with_failures,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6,hour_0,hour_1,...,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-01 16:01:00,1,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2018-01-01 16:02:00,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2018-01-01 16:03:00,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2018-01-01 16:04:00,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2018-01-01 16:05:00,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [5]:
from sklearn.svm import OneClassSVM
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [6]:
# Standardize the features, then fit a one-class SVM on the January data.
# OneClassSVM is constructed without `random_state`: the argument was ignored,
# deprecated in scikit-learn 0.20 and removed in 0.22, where passing it
# raises a TypeError.
pipeline = Pipeline([
    ('scale', StandardScaler()),
    ('svm', OneClassSVM())
]).fit(x)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [7]:
# Score the same data the pipeline was fit on; the next cell treats a
# prediction of -1 as an outlier and anything else as an inlier.
preds = pipeline.predict(X=x)

  Xt = transform.transform(Xt)


In [11]:
# Tally how many minutes were flagged as outliers versus inliers.
pd.Series(
    np.where(preds != -1, 'inlier', 'outlier')
).value_counts()

outlier    22823
inlier     18794
dtype: int64