In [23]:
import elasticsearch

In [22]:
!pip install elasticsearch

Collecting elasticsearch
  Downloading elasticsearch-7.6.0-py2.py3-none-any.whl (88 kB)
[K     |████████████████████████████████| 88 kB 4.2 MB/s eta 0:00:011
Installing collected packages: elasticsearch
Successfully installed elasticsearch-7.6.0


In [16]:
#!/usr/bin/env python
import gzip
import os
import sys
import re
import pandas as pd

# Directory scanned for access-log files (plain text or gzip-compressed).
INPUT_DIR = "log_store/"

# Access-log line patterns, all compiled case-insensitively.
# NOTE: the group name "refferer" is misspelled on purpose-of-compatibility;
# log_reader looks it up under exactly that key.
#
# lineformat1: literal "- -" ident/user fields, GET or POST only, HTTP/1.1 only.
#              The referrer is either a bare "-" or a quoted string (the quotes
#              are part of the captured value); the user agent may not contain
#              a double quote.
# lineformat2: like lineformat1, but the referrer must be quoted (quotes are not
#              captured) and referrer/user agent are matched greedily with ".+".
lineformat1 = re.compile(r"""(?P<ipaddress>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(?P<dateandtime>\d{2}\/[a-z]{3}\/\d{4}:\d{2}:\d{2}:\d{2} (\+|\-)\d{4})\] ((\"(GET|POST) )(?P<url>.+)(http\/1\.1")) (?P<statuscode>\d{3}) (?P<bytessent>\d+) (?P<refferer>-|"([^"]+)") (["](?P<useragent>[^"]+)["])""", re.IGNORECASE)
lineformat2 = re.compile(r"""(?P<ipaddress>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(?P<dateandtime>\d{2}\/[a-z]{3}\/\d{4}:\d{2}:\d{2}:\d{2} (\+|\-)\d{4})\] ((\"(GET|POST) )(?P<url>.+)(http\/1\.1")) (?P<statuscode>\d{3}) (?P<bytessent>\d+) (["](?P<refferer>(\-)|(.+))["]) (["](?P<useragent>.+)["])""", re.IGNORECASE)

# lineformat3: lineformat1 with the protocol relaxed to HTTP/1.x - HTTP/2.x.
# lineformat4: any remote user, any request method (captured in the named group
#              "method"), HTTP/1.x - HTTP/2.x, greedy quoted referrer/user agent.
lineformat3 = re.compile(r"""(?P<ipaddress>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(?P<dateandtime>\d{2}\/[a-z]{3}\/\d{4}:\d{2}:\d{2}:\d{2} (\+|\-)\d{4})\] ((\"(GET|POST) )(?P<url>.+)(http\/[1-2]\.[0-9]")) (?P<statuscode>\d{3}) (?P<bytessent>\d+) (?P<refferer>-|"([^"]+)") (["](?P<useragent>[^"]+)["])""", re.IGNORECASE)
lineformat4 = re.compile( r"""(?P<ipaddress>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - (?P<remoteuser>.+) \[(?P<dateandtime>\d{2}\/[a-z]{3}\/\d{4}:\d{2}:\d{2}:\d{2} (\+|\-)\d{4})\] ((\"(?P<method>.+) )(?P<url>.+)(http\/[1-2]\.[0-9]")) (?P<statuscode>\d{3}) (?P<bytessent>\d+) (["](?P<refferer>(\-)|(.+))["]) (["](?P<useragent>.+)["])""", re.IGNORECASE)


def _open_log(path):
    """Open a plain or gzip-compressed (``.gz``) log file for reading as text."""
    # gzip.open defaults to binary mode; "rt" is required so the lines are str
    # and can be searched with the str patterns above.
    if path.endswith(".gz"):
        return gzip.open(path, "rt", encoding="utf-8", errors="replace")
    return open(path, encoding="utf-8", errors="replace")


def log_reader(INPUT_DIR=INPUT_DIR, lineformat=lineformat1, output='print'):
    """Parse every access log in a directory with a line pattern.

    Parameters
    ----------
    INPUT_DIR : str
        Directory whose regular files are read (files ending in ``.gz`` are
        decompressed on the fly). Defaults to the module-level ``INPUT_DIR``;
        note that the parameter shadows that constant inside the function.
    lineformat : compiled pattern or str
        Pattern applied to each line with ``re.search``. It must define the
        named groups ``ipaddress``, ``dateandtime``, ``url``, ``bytessent``,
        ``refferer``, ``useragent`` and ``statuscode``. The request method is
        taken from the named group ``method`` when the pattern defines one,
        otherwise from positional group 6 (where lineformat1-3 capture it).
    output : str
        ``'df'`` collects the rows and returns them as a DataFrame;
        ``'print'`` prints each row as a list. Any other value parses the
        files but produces no output.

    Returns
    -------
    pandas.DataFrame or None
        A frame with columns ``ip, datetime, url, bytessent, referrer,
        useragent, statuscode, method`` (all values are strings) when
        ``output == 'df'``; otherwise ``None``.

    Notes
    -----
    Lines that do not match the pattern are skipped silently. With the
    patterns defined above the ``url`` value keeps the space that precedes
    the protocol token, because that space is matched by the ``url`` group.
    """
    out = []
    # Sort for a reproducible row order; os.listdir order is arbitrary.
    for f in sorted(os.listdir(INPUT_DIR)):
        path = os.path.join(INPUT_DIR, f)
        # Skip sub-directories and other non-file entries instead of crashing.
        if not os.path.isfile(path):
            continue

        # The context manager closes the handle even if parsing raises.
        with _open_log(path) as logfile:
            # Stream line by line rather than loading the whole file.
            for line in logfile:
                data = re.search(lineformat, line)
                if not data:
                    continue

                datadict = data.groupdict()
                # Positional group 6 is only the method for lineformat1-3;
                # in lineformat4 it would be '"GET ' (quote and space included).
                if "method" in datadict:
                    method = datadict["method"]
                else:
                    method = data.group(6)

                res = [
                    datadict["ipaddress"],
                    datadict["dateandtime"],
                    datadict["url"],
                    datadict["bytessent"],
                    datadict["refferer"],
                    datadict["useragent"],
                    datadict["statuscode"],
                    method,
                ]
                if output == 'df':
                    out.append(res)
                elif output == 'print':
                    print(res)

    if output == 'df':
        cols = ['ip', 'datetime', 'url', 'bytessent', 'referrer', 'useragent', 'statuscode', 'method']
        return pd.DataFrame(out, columns=cols)

In [14]:
# Parse the logs with the HTTP/1.1-only pattern and show at most the last 50 rows.
log_reader(INPUT_DIR, lineformat=lineformat1, output='df').tail(50)

Unnamed: 0,ip,datetime,url,bytessent,referrer,useragent,statuscode,method
19,194.35.233.182,17/Apr/2020:16:45:52 +0000,/_dash-update-component,194,"""https://countingchickens.co.uk/""",Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3...,200,POST
20,194.35.233.182,17/Apr/2020:16:46:09 +0000,/,572,"""-""",WhatsApp/0.4.930 N,200,GET
21,194.35.233.182,17/Apr/2020:16:46:09 +0000,/_favicon.ico?v=1.8.0,9662,"""-""",-,200,GET
22,194.35.233.182,17/Apr/2020:16:46:26 +0000,/_dash-update-component,1137,"""https://countingchickens.co.uk/COFFEE""",Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3...,200,POST
23,194.35.233.182,17/Apr/2020:16:46:26 +0000,/_dash-update-component,394,"""https://countingchickens.co.uk/COFFEE""",Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3...,200,POST
24,194.35.233.182,17/Apr/2020:16:46:26 +0000,/_dash-update-component,251,"""https://countingchickens.co.uk/COFFEE""",Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3...,200,POST
25,194.35.233.182,17/Apr/2020:16:46:26 +0000,/_dash-update-component,298,"""https://countingchickens.co.uk/COFFEE""",Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3...,200,POST
26,194.35.233.182,17/Apr/2020:16:46:27 +0000,/_dash-update-component,294,"""https://countingchickens.co.uk/COFFEE""",Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3...,200,POST
27,194.35.233.182,17/Apr/2020:16:46:27 +0000,/_dash-update-component,501,"""https://countingchickens.co.uk/COFFEE""",Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3...,200,POST
28,194.35.233.182,17/Apr/2020:16:46:27 +0000,/_dash-update-component,30275,"""https://countingchickens.co.uk/COFFEE""",Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3...,200,POST


In [19]:
# Same preview with the pattern that also accepts HTTP/1.x - HTTP/2.x requests.
log_reader(INPUT_DIR, lineformat=lineformat3, output='df').tail(50)

Unnamed: 0,ip,datetime,url,bytessent,referrer,useragent,statuscode,method
19,194.35.233.182,17/Apr/2020:16:45:52 +0000,/_dash-update-component,194,"""https://countingchickens.co.uk/""",Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3...,200,POST
20,194.35.233.182,17/Apr/2020:16:46:09 +0000,/,572,"""-""",WhatsApp/0.4.930 N,200,GET
21,194.35.233.182,17/Apr/2020:16:46:09 +0000,/_favicon.ico?v=1.8.0,9662,"""-""",-,200,GET
22,194.35.233.182,17/Apr/2020:16:46:26 +0000,/_dash-update-component,1137,"""https://countingchickens.co.uk/COFFEE""",Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3...,200,POST
23,194.35.233.182,17/Apr/2020:16:46:26 +0000,/_dash-update-component,394,"""https://countingchickens.co.uk/COFFEE""",Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3...,200,POST
24,194.35.233.182,17/Apr/2020:16:46:26 +0000,/_dash-update-component,251,"""https://countingchickens.co.uk/COFFEE""",Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3...,200,POST
25,194.35.233.182,17/Apr/2020:16:46:26 +0000,/_dash-update-component,298,"""https://countingchickens.co.uk/COFFEE""",Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3...,200,POST
26,194.35.233.182,17/Apr/2020:16:46:27 +0000,/_dash-update-component,294,"""https://countingchickens.co.uk/COFFEE""",Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3...,200,POST
27,194.35.233.182,17/Apr/2020:16:46:27 +0000,/_dash-update-component,501,"""https://countingchickens.co.uk/COFFEE""",Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3...,200,POST
28,194.35.233.182,17/Apr/2020:16:46:27 +0000,/_dash-update-component,30275,"""https://countingchickens.co.uk/COFFEE""",Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3...,200,POST
