# By Mehdi Touil - Data Scientist / engineer
# 1. Importing libraries


In [2]:
import pandas
import re
from h2o_wave import site, ui

try:
    from urllib.parse import unquote # Python 3
except ImportError:
    from urllib import unquote       # Python 2


# Get Data : We Use Apache web server access logs 

In [3]:
# Gzip-compressed Apache access log that parseApacheLogs reads below.
filename = 'access_log.gz'

##### host: The IP address of the client making the request.
##### user: The user associated with the request (if available).
##### http_code: The HTTP response status code returned by the server (e.g., 200 for success, 404 for not found, 304 for not modified).
##### response_bytes: The size of the response in bytes.
##### referer: The referring page or URL that led to the request.
##### user_agent: Information about the client's user agent, typically the web browser or user agent string.
##### unknown: Unidentified or irrelevant column.
##### time: The timestamp of the request.
##### command: The HTTP request method (e.g., GET, POST, etc.).
##### path: The path or endpoint requested.
##### protocol: The protocol used (e.g., HTTP/1.1).


# 2 Data cleaning (pandas)
1. Import the Apache access log file
2. Parse the logs into a Pandas dataframe

In [4]:
def parseApacheLogs(filename):
    """Parse a gzip-compressed Apache access log into a pandas DataFrame.

    Parameters
    ----------
    filename : str
        Path to the gzip-compressed, space-delimited access log.

    Returns
    -------
    pandas.DataFrame
        One row per log line with the raw fields plus the derived columns
        ``time``, ``command``, ``path`` (URL-decoded) and ``protocol``.
        The helper columns ``time_part1``, ``time_part2`` and
        ``cmd_path_proto`` are removed, as is every column that is
        entirely empty. Request lines that cannot be split into
        command / path / protocol yield missing values in those columns
        instead of aborting the parse.
    """
    fields = ['host', 'identity', 'user', 'time_part1', 'time_part2', 'cmd_path_proto',
              'http_code', 'response_bytes', 'referer', 'user_agent', 'unknown']
    data = pandas.read_csv(filename, compression='gzip', sep=' ', header=None,
                           names=fields, na_values=['-'])

    # The space separator splits "[date:time offset]" into two columns:
    # time_part1 is "[dd/Mon/YYYY:HH:MM:SS" and time_part2 is the UTC offset.
    # The offset is dropped for simplicity, so only time_part1 is needed.
    # Reading it directly (instead of splitting the joined string on '-')
    # also works for logs written with a positive offset such as "+0100".
    time_trimmed = data['time_part1'].str.lstrip('[')
    data['time'] = pandas.to_datetime(time_trimmed, format='%d/%b/%Y:%H:%M:%S')

    # Split `cmd_path_proto` into command, path and protocol. The first and
    # last whitespace-delimited tokens are the command and the protocol;
    # everything in between is the path. Rows that do not match (missing or
    # malformed request lines) become NaN in all three columns.
    request = data['cmd_path_proto'].str.extract(r'^\s*(\S+)\s+(\S.*?)\s+(\S+)\s*$')
    data['command'] = request[0]
    # Decode the URL (ex: '%20' => ' '), leaving missing paths untouched.
    data['path'] = request[1].map(unquote, na_action='ignore')
    data['protocol'] = request[2]

    # Drop the helper columns and any columns that are completely empty
    data1 = data.drop(['time_part1', 'time_part2', 'cmd_path_proto'], axis=1)
    return data1.dropna(axis=1, how='all')

# Parse the access log once and preview its first three rows.
logs = parseApacheLogs(filename)
logs.head(3)

Unnamed: 0,host,user,http_code,response_bytes,referer,user_agent,unknown,time,command,path,protocol
0,201.21.152.44,,404,89.0,,Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKi...,random-site.com,2013-04-25 14:05:35,GET,/favicon.ico,HTTP/1.1
1,70.194.129.34,,200,2553.0,http://www.random-site.com/,Mozilla/5.0 (Linux; U; Android 4.1.2; en-us; S...,www.random-site.com,2013-04-25 14:10:48,GET,/include/jquery.jshowoff.min.js,HTTP/1.1
2,70.194.129.34,,304,,http://www.random-site.com/,Mozilla/5.0 (Linux; U; Android 4.1.2; en-us; S...,www.random-site.com,2013-04-25 14:10:48,GET,/include/main.css,HTTP/1.1


# 3. Spilling the beans

3.1 HTTP status code and associated IP addresses

In [5]:
# Count the non-null values of each remaining column for every (http_code, host) pair.
http_code_host = logs.groupby([logs.http_code, logs.host]).count()
# The slice bound equals the number of rows, so the full table is displayed.
http_code_host.iloc[:len(http_code_host.index)]

Unnamed: 0_level_0,Unnamed: 1_level_0,user,response_bytes,referer,user_agent,unknown,time,command,path,protocol
http_code,host,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
200,208.115.113.91,0,47,0,47,47,47,47,47,47
200,208.54.80.244,0,21,21,21,21,21,21,21,21
200,208.68.234.99,1,1,0,1,0,1,1,1,1
200,70.194.129.34,0,5,4,5,5,5,5,5,5
200,72.133.47.242,0,3,3,3,3,3,3,3,3
200,88.112.192.2,0,7,7,7,7,7,7,7,7
200,98.238.13.253,0,6,6,6,6,6,6,6,6
200,99.127.177.95,0,18,18,18,18,18,18,18,18
301,208.115.113.91,0,2,0,2,2,2,2,2,2
302,208.115.113.91,0,2,0,2,2,2,2,2,2


3.2 Finding an outlier by mapping IP addresses with URL path

In [6]:
# Group the log rows that share the same request attributes (host, path, user agent, ...)
grouped_logs = logs.groupby(
    by=['host', 'path', 'user_agent', 'command', 'protocol', 'http_code'],
)

# Build one-column frames, each indexed by the groupby keys:
# the number of rows per group and the earliest / latest request time.
count = grouped_logs.size().to_frame(name='count')
min_time = grouped_logs['time'].min().to_frame(name='time (min)')
max_time = grouped_logs['time'].max().to_frame(name='time (max)')

def mostFrequent(x):
    """Return the most frequent non-null value of ``x``.

    Parameters
    ----------
    x : pandas.Series
        Values to inspect (for example the referers of one group).

    Returns
    -------
    object or None
        The value with the highest count, or ``None`` when ``x`` is empty
        or contains only missing values.
    """
    # value_counts() drops missing values and sorts by descending frequency,
    # so the first index label is the most common value.
    counts = x.value_counts()
    return counts.index[0] if len(counts.index) > 0 else None
# Aggregate each group's referers with mostFrequent (indexed by the groupby keys)
referer = grouped_logs['referer'].aggregate(mostFrequent)

# Combine the per-group frames on their shared groupby index, then turn the
# index levels back into ordinary columns since the index is no longer needed.
summary = count.join(other=[min_time, max_time, referer]).reset_index(drop=False)

def path2pathGraph(summary):
    """Build path-to-path edges for every pair of paths requested by the same host.

    Takes the ``host`` and ``path`` columns of ``summary``, removes the query
    string from each path, self-joins on ``host`` and drops duplicate rows.
    Each host gets an integer ``ecolor`` (265000, 265001, ... in order of
    first appearance). ``summary`` itself is not modified.
    """
    visits = summary[['host', 'path']].copy()
    # Keep only the part of the URL before the first '?'
    visits['path'] = visits['path'].map(lambda url: url.partition('?')[0])

    # Self-join: every (path_x, path_y) combination seen for a host
    edges = visits.merge(visits, on='host').drop_duplicates()

    # One color code per distinct host, numbered in order of appearance
    color_of = {
        host: code
        for code, host in enumerate(edges.host.unique(), start=265000)
    }
    edges['ecolor'] = edges['host'].map(color_of.__getitem__)
    return edges

# Build the per-host path-to-path edge table from the grouped summary.
sessionEdges = path2pathGraph(summary)


In [22]:
# Display the edge table (host, path_x, path_y, ecolor).
sessionEdges

Unnamed: 0,host,path_x,path_y,ecolor
0,201.21.152.44,/favicon.ico,/favicon.ico,265000
1,208.115.113.91,/New_York/calendar/,/New_York/calendar/,265001
2,208.115.113.91,/New_York/calendar/,/New_York/calendar/calendar.php,265001
4,208.115.113.91,/New_York/calendar/,/austin/calendar/calendar.php,265001
8,208.115.113.91,/New_York/calendar/,/blog/,265001
...,...,...,...,...
4431,99.127.177.95,/san_francisco/images/mainimages.jpg,/dallas/photos/rcache/bb416414d86c8cc0e08cfdcb...,265008
4432,99.127.177.95,/san_francisco/images/mainimages.jpg,/include/jquery.js,265008
4433,99.127.177.95,/san_francisco/images/mainimages.jpg,/include/main.css,265008
4434,99.127.177.95,/san_francisco/images/mainimages.jpg,/san_francisco/calendar/calendar.php,265008


# Observations and findings

Flag 1:

From 3.1 we can see that there is a huge number of HTTP 401 responses for requests from one specific IP address. (The HTTP 401 Unauthorized client error status code indicates that the request was not applied because it lacks valid authentication credentials for the target resource.)

Flag 2:

From 3.2 we obtained the mapping of each IP address to the URL paths it requested. From the Graphistry visualization we can see that the IP address linked to the admin page is an outlier.

Flag 3:

Now let us list the requests made by the specific IP address that generated the 401 responses

In [None]:
# All requests from the suspicious host to the admin path
logs[logs['host'].eq("208.68.234.99") & logs['path'].eq("//admin")]

Note that there are 1037 requests with a 401 response made from this specific IP address to the admin page. Note also, from the above, that there is one request with a 200 response, which indicates a successful login after the brute-force attempts

In [None]:
# Requests from the suspicious host to the admin path that returned HTTP 200
logs[
    logs['host'].eq("208.68.234.99")
    & logs['path'].eq("//admin")
    & logs['http_code'].eq(200)
]

The log evidence shows that a brute-force attack succeeded and the account was compromised