# Anomaly Detection using web servers' logs

## Problem
We need to detect itrusioners and crawlers of a web server from its users' foot prints that provided for us as a log file.

It's an unsupervised problem becuase, there isn't any label for the anomalies, So we have to figure out them and find them from the log server.


In [None]:
# Import requirements
import re
import pandas as pd
import numpy as np
!pip install pandas-profiling==2.7.1
from pandas_profiling import ProfileReport

import matplotlib.pyplot as plt

## Prepare data
1. Create a `.csv` from the provided logs.
2. Extract features.
3. Clean data:
  * Fix datetimes.
  * Fix missing values.
  * Figure out categories.
  * Check correlations.
4. Prepare data to fit our model:
  * Check the balences.
  * Split train, test.


In [60]:
# !gzip --decompress drive/MyDrive/Rahnema-College/Tuning/Final-Project/output.log.gz

In [61]:
from google.colab import drive
drive.mount("/content/drive/")

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


### Extract features from log files to build .csv format


In [62]:
# Regex Pattern
pattern = '(?P<Client>\S+) \[(?P<Time>\S+)\] \[(?P<Method>\S+) (?P<Request>\S+)\] (?P<Status>\S+) (?P<Size>\S+) \[\[(?P<UserAgent>[\S\s]+)\]\] (?P<Duration>\S+)'
file_path = 'drive/MyDrive/Rahnema-College/Tuning/Final-Project/output.log'
columns = ["Client", "Datetime", "Method", "Request", "Status", "Length", "UserAgent", "ResponseTime"]

In [63]:
# Find regex in our logs
def parse_data(file_path, pattern):
  """
  Return the part of data that extracted by given pattern.
  file_path -> Your log file.
  pattern -> The pattern that you're looking for in your logs.

  Return parsed_line -> as a list of finded data.
  """
  parsed_lines = []

  with open(file_path) as logs:
    for line in logs:
      try:
        finded_pattern = list(re.findall(pattern, line)[0])
        parsed_lines.append(finded_pattern)
      except Exception as e:
        print("There is an error while parsing data! Try Again :(")
  return parsed_lines

In [64]:
extracted_features = parse_data(file_path, pattern)
extracted_features[0]

['207.213.193.143',
 '2021-5-12T5:6:0.0+0430',
 'Get',
 '/cdn/profiles/1026106239',
 '304',
 '0',
 'Googlebot-Image/1.0',
 '32']

In [65]:
# Create a .csv format
data = pd.DataFrame(extracted_features, columns=columns)
data.head()

Unnamed: 0,Client,Datetime,Method,Request,Status,Length,UserAgent,ResponseTime
0,207.213.193.143,2021-5-12T5:6:0.0+0430,Get,/cdn/profiles/1026106239,304,0,Googlebot-Image/1.0,32
1,207.213.193.143,2021-5-12T5:6:0.0+0430,Get,images/badge.png,304,0,Googlebot-Image/1.0,4
2,35.110.222.153,2021-5-12T5:6:0.0+0430,Get,/pages/630180847,200,52567,Mozilla/5.0 (Linux; Android 6.0.1; SAMSUNG SM-...,32
3,35.108.208.99,2021-5-12T5:6:0.0+0430,Get,images/fav_icon2.ico,200,23531,Mozilla/5.0 (Linux; Android 6.0; CAM-L21) Appl...,20
4,35.110.222.153,2021-5-12T5:6:0.0+0430,Get,images/sanjagh_logo_purpule5.png,200,4680,Mozilla/5.0 (Linux; Android 6.0.1; SAMSUNG SM-...,8


In [66]:
data.shape

(1260035, 8)

In [67]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1260035 entries, 0 to 1260034
Data columns (total 8 columns):
 #   Column        Non-Null Count    Dtype 
---  ------        --------------    ----- 
 0   Client        1260035 non-null  object
 1   Datetime      1260035 non-null  object
 2   Method        1260035 non-null  object
 3   Request       1260035 non-null  object
 4   Status        1260035 non-null  object
 5   Length        1260035 non-null  object
 6   UserAgent     1260035 non-null  object
 7   ResponseTime  1260035 non-null  object
dtypes: object(8)
memory usage: 76.9+ MB


### Fix time series

In [68]:
# Convert datetimes to datetime format
data["Datetime"] = pd.to_datetime(data["Datetime"], format="%Y-%m-%dT%H:%M:%S")

In [69]:
# Split datatimes
data['Year'] = data.Datetime.dt.year
data['Month'] = data.Datetime.dt.month
data['Day'] = data.Datetime.dt.day
data['Hour'] = data.Datetime.dt.hour
data['Minute'] = data.Datetime.dt.minute
data['Second'] = data.Datetime.dt.second
data['dayOfWeek'] = data.Datetime.dt.dayofweek
data['dayOfYear'] = data.Datetime.dt.dayofyear

In [70]:
data.drop("Datetime", axis=1, inplace=True)

In [71]:
data.head()

Unnamed: 0,Client,Method,Request,Status,Length,UserAgent,ResponseTime,Year,Month,Day,Hour,Minute,Second,dayOfWeek,dayOfYear
0,207.213.193.143,Get,/cdn/profiles/1026106239,304,0,Googlebot-Image/1.0,32,2021,5,12,5,6,0,2,132
1,207.213.193.143,Get,images/badge.png,304,0,Googlebot-Image/1.0,4,2021,5,12,5,6,0,2,132
2,35.110.222.153,Get,/pages/630180847,200,52567,Mozilla/5.0 (Linux; Android 6.0.1; SAMSUNG SM-...,32,2021,5,12,5,6,0,2,132
3,35.108.208.99,Get,images/fav_icon2.ico,200,23531,Mozilla/5.0 (Linux; Android 6.0; CAM-L21) Appl...,20,2021,5,12,5,6,0,2,132
4,35.110.222.153,Get,images/sanjagh_logo_purpule5.png,200,4680,Mozilla/5.0 (Linux; Android 6.0.1; SAMSUNG SM-...,8,2021,5,12,5,6,0,2,132


### Split Requests
* Level_1
  * Level_2

In [101]:
# Level Requests' path to make them more categorized
path_l1 = []
path_l2 = []
paths_pattern = re.compile(r'^(\/|\w+)')
for element in data.Request.values:
  finded_part = paths_pattern.findall(element)[0]
  path_l1.append(finded_part)
  path_l2.append(element.replace(finded_part, ''))

data["Req_Path_L1"] = path_l1
data["Req_Path_L2"] = path_l2

In [103]:
data.head()

Unnamed: 0,Client,Method,Request,Status,Length,UserAgent,ResponseTime,Year,Month,Day,Hour,Minute,Second,dayOfWeek,dayOfYear,Req_Path_L1,Req_Path_L2
0,207.213.193.143,Get,/cdn/profiles/1026106239,304,0,Googlebot-Image/1.0,32,2021,5,12,5,6,0,2,132,/,cdnprofiles1026106239
1,207.213.193.143,Get,images/badge.png,304,0,Googlebot-Image/1.0,4,2021,5,12,5,6,0,2,132,images,/badge.png
2,35.110.222.153,Get,/pages/630180847,200,52567,Mozilla/5.0 (Linux; Android 6.0.1; SAMSUNG SM-...,32,2021,5,12,5,6,0,2,132,/,pages630180847
3,35.108.208.99,Get,images/fav_icon2.ico,200,23531,Mozilla/5.0 (Linux; Android 6.0; CAM-L21) Appl...,20,2021,5,12,5,6,0,2,132,images,/fav_icon2.ico
4,35.110.222.153,Get,images/sanjagh_logo_purpule5.png,200,4680,Mozilla/5.0 (Linux; Android 6.0.1; SAMSUNG SM-...,8,2021,5,12,5,6,0,2,132,images,/sanjagh_logo_purpule5.png


In [107]:
data["Req_Path_L1"].value_counts()

/            663910
images       298288
js           125224
fonts         98485
css           56303
templates     17825
Name: Req_Path_L1, dtype: int64

### Check Dataset

In [18]:
# Create a profile report
profile = ProfileReport(data)
profile.to_file("drive/MyDrive/Rahnema-College/Tuning/Final-Project/profile_report.html")

KeyboardInterrupt: ignored

* There are some rows which don't have any values for `ResponseTime` and `Clinet` features.
* Take care: some of rows contains `Clients'` value but no `ResponseTime` value.

In [None]:
data[(data["Client"] == '-') & (data["ResponseTime"] == '-')]

In [None]:
data[(data["Client"] != '-') & (data["ResponseTime"] == '-')]

In [None]:
# Fill the - values with null
data.loc[data["Client"] == '-', "Client"] = np.nan

# Becuase response time will be an integer value we put -1 to be convertable
data.loc[data["ResponseTime"] == '-', "ResponseTime"] = np.nan

In [None]:
# Convert integers
data["Status"] = data["Status"].astype("int64")
data["Length"] = data["Length"].astype("int64")
data["ResponseTime"] = data["ResponseTime"].astype("int64")

In [None]:
# Then set -1s to null
data.loc[data["ResponseTime"] == -1, "ResponseTime"] = None

In [None]:
# Check count of status
data["Status"].hist();

### Fix Missing values

In [None]:
data.info()

In [None]:
data["Client"].isna().sum()

In [None]:
data["ResponseTime"].isna().sum()

In [None]:
pd.DataFrame.fillna()

In [None]:
data["ResponseTime"].mean()

In [None]:
data.describe()

### Fix Categorical

* Categorical features: Method, Status

In [None]:
data["Method"].value_counts()

In [None]:
data["Status"].value_counts()

In [None]:
# Fix Method feature
data["Method"] = data["Method"].astype("category")
data = pd.get_dummies(data, columns=["Method"], drop_first=True)

In [None]:
# Fix Status feature
data["Status"] = data["Status"].astype("category")
data["Status_cat"] = data["Status"].cat.codes+1
data.drop("Status", axis=1, inplace=True)

In [None]:
data.head()