In [386]:
import re
from datetime import datetime
from urllib.parse import parse_qs, unquote, urlparse

import pandas as pd
import pytz

In [387]:
# NOTE(review): parse_datetime is not defined in any cell visible here, so this
# call relies on a definition from elsewhere — TODO confirm where it lives.
# The recorded output shows day 3 and second 4 for an input of day 13 and
# second 42; presumably the helper expects the bracketed form used in the log's
# time column, e.g. '[13/Nov/2015:11:45:42 +0000]' — verify against the helper.
parse_datetime('13/Nov/2015:11:45:42 +0000')

datetime.datetime(2015, 11, 3, 11, 45, 4, tzinfo=<UTC>)

In [388]:
import os

# Locate the compressed access log inside the current working directory.
current_file = os.getcwd()  # despite the name, this holds a directory path
myfile = os.path.join(
    current_file,
    'www.pangaea.de-access-anon.log.bz2',
)

In [389]:
# Parsing approach adapted from https://mmas.github.io/read-apache-access-log-pandas
# The separator matches whitespace that lies outside double quotes and outside
# square brackets, so quoted fields and the bracketed timestamp stay intact.
logData = pd.read_csv(
    myfile,
    sep=r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',
    engine='python',
    header=None,
    names=['ip', 'time', 'request', 'status', 'referer', 'user_agent'],
    usecols=[0, 3, 4, 5, 7, 8],
    na_values='-',
    encoding="utf-8",
)

In [390]:
# Display the first parsed row.
logData.head(1)

Unnamed: 0,ip,time,request,status,referer,user_agent
0,68aa873d61a6586244b0892d7bca2573,[30/Sep/2019:06:25:44 +0000],"""GET /search?ie=UTF-8&q=parameter%3A%22Bathysi...",301.0,"""-""","""Mozilla/5.0 (compatible; AhrefsBot/6.1; +http..."


In [391]:
# Missing status codes are replaced by 0 so the column can be cast to integers.
logData['status'] = logData['status'].fillna(0).astype(int)
logData.head(1)

Unnamed: 0,ip,time,request,status,referer,user_agent
0,68aa873d61a6586244b0892d7bca2573,[30/Sep/2019:06:25:44 +0000],"""GET /search?ie=UTF-8&q=parameter%3A%22Bathysi...",301,"""-""","""Mozilla/5.0 (compatible; AhrefsBot/6.1; +http..."


In [392]:
# Remove one leading and one trailing double quote from each quoted log field.
# str() is applied first, so missing values become the text 'nan'.
_edge_quotes = re.compile(r'^"|"$')
for _column in ('request', 'user_agent', 'referer'):
    logData[_column] = logData[_column].apply(
        lambda value: _edge_quotes.sub('', str(value))
    )

In [393]:
# Display the (rows, columns) size of the parsed log.
logData.shape

(3874088, 6)

In [394]:
# Split the request line on whitespace once; the first token is stored as the
# request type and the second as the requested resource.
request_parts = logData['request'].str.split()
logData['request_type'] = request_parts.str[0]
logData['resource'] = request_parts.str[1]
logData.head(1)

Unnamed: 0,ip,time,request,status,referer,user_agent,request_type,resource
0,68aa873d61a6586244b0892d7bca2573,[30/Sep/2019:06:25:44 +0000],GET /search?ie=UTF-8&q=parameter%3A%22Bathysip...,301,-,Mozilla/5.0 (compatible; AhrefsBot/6.1; +http:...,GET,/search?ie=UTF-8&q=parameter%3A%22Bathysiphon+...


In [395]:
# Keep only GET requests that were answered with status 200.
is_get = logData['request_type'] == 'GET'
is_ok = logData['status'] == 200
logData = logData[is_get & is_ok]
logData.shape

(3063078, 8)

In [396]:
# The raw request line is dropped; request_type and resource were derived from it.
logData = logData.drop(columns=['request'])
logData.head(1)

Unnamed: 0,ip,time,status,referer,user_agent,request_type,resource
3,75892c86ec3272b7fa79e0f2c375a9f5,[30/Sep/2019:06:25:56 +0000],200,-,Mozilla/5.0 (compatible; SemrushBot/6~bl; +htt...,GET,/?maxdate=2005-12-31T23%3A59%3A59&mindate=2005...


In [397]:
# Drop requests for media/static/admin paths and for robots.txt / favicon.ico.
# The dots are escaped so the two file names match literally; an unescaped '.'
# would match any character (e.g. '/robots_txt').
logData = logData[~logData['resource'].str.match(
    r'^/media|^/static|^/admin|^/robots\.txt$|^/favicon\.ico$')]
# Drop requests whose user agent contains bot/spider/crawler/slurp
# (case-insensitive). na=False makes missing values count as "no match" inside
# str.match itself, so the mask stays boolean before it is inverted with ~.
logData = logData[~logData['user_agent'].str.match(
    r'.*?bot|.*?spider|.*?crawler|.*?slurp', flags=re.I, na=False)]
logData.shape

(1858386, 7)

In [398]:
# Display the first two rows that survived the resource and user-agent filters.
logData.head(2)

Unnamed: 0,ip,time,status,referer,user_agent,request_type,resource
9,eb6f30241bf7ff9bf1c6ddb6d5ea2d66,[30/Sep/2019:06:26:12 +0000],200,https://doi.pangaea.de/10.1594/PANGAEA.890974,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6...,GET,/?q=SOCATv5
20,b66d6bd12d4aadc04d896d2d086e7572,[30/Sep/2019:06:26:42 +0000],200,https://wiki.pangaea.de/wiki/Main_Page,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,GET,/assets/layout-images/pangaea-logo.png


In [399]:
# Portal hosts: https://doi.pangaea.de, https://www.pangaea.de
# Use the referrer to filter out external requests: only requests that
# originated from the portal are kept.
from_doi = logData['referer'].str.startswith('https://doi.pangaea.de')
from_www = logData['referer'].str.startswith('https://www.pangaea.de')
logData = logData[from_doi | from_www]
logData.shape

(1335062, 7)

In [490]:
# Exclude queries that originated from external search
# (/advanced/search.php, /advanced/warehouse.php).
# NOTE(review): the recorded output below has 8 columns (including `title`),
# but no visible cell creates that column — TODO confirm where it comes from.
is_advanced = logData['resource'].str.startswith('/advanced/')
logData = logData[~is_advanced]
logData.shape

(1234130, 8)

In [491]:
# Display the first five rows after the referrer and /advanced/ filters.
logData.head(5)

Unnamed: 0,ip,time,status,referer,user_agent,request_type,resource,title
9,eb6f30241bf7ff9bf1c6ddb6d5ea2d66,[30/Sep/2019:06:26:12 +0000],200,https://doi.pangaea.de/10.1594/PANGAEA.890974,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6...,GET,/?q=SOCATv5,/?q=SOCATv5
69,f7e9caa0b677920355cc01f122454d80,[30/Sep/2019:06:28:49 +0000],200,https://doi.pangaea.de/10.1594/PANGAEA.856679,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,GET,/shared/pics/licenses/CC-BY-NC-SA-3.0.png,/shared/pics/licenses/CC-BY-NC-SA-3.0.png
84,21346f2062c9244571b00c87599e8a71,[30/Sep/2019:06:29:45 +0000],200,https://doi.pangaea.de/10.1594/PANGAEA.906113,Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:6...,GET,/assets/v.49ad7ed928027a3f2ade180129c6f614/boo...,/assets/v.49ad7ed928027a3f2ade180129c6f614/boo...
85,21346f2062c9244571b00c87599e8a71,[30/Sep/2019:06:29:45 +0000],200,https://doi.pangaea.de/10.1594/PANGAEA.906113,Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:6...,GET,/assets/v.49ad7ed928027a3f2ade180129c6f614/css...,/assets/v.49ad7ed928027a3f2ade180129c6f614/css...
93,c55f188273605a153e5cd0985f1c56c2,[30/Sep/2019:06:30:16 +0000],200,https://doi.pangaea.de/10.1594/PANGAEA.744675,Mozilla/5.0 (Windows NT 6.1; Win64; x64) Apple...,GET,/assets/v.49ad7ed928027a3f2ade180129c6f614/boo...,/assets/v.49ad7ed928027a3f2ade180129c6f614/boo...


In [492]:
# Fallback pattern used when the query string has no usable `q` parameter:
# captures the text between a literal "?q=" and the next "&".
# Written as a raw string so the backslashes are not treated as (invalid)
# string escapes; the resulting pattern is unchanged.
regex = r'(\?q\=)(.*?)\&'


def parse_query(url):
    """Return the lower-cased search term carried by the ``q`` parameter of ``url``.

    The query string is parsed with ``urllib.parse``; when it contains a
    non-empty ``q`` parameter, its first value is returned URL-decoded and
    lower-cased. Otherwise the module-level ``regex`` is tried against the raw
    URL and the captured text (not URL-decoded) is returned lower-cased.

    Parameters
    ----------
    url : str
        Requested resource, e.g. ``'/?q=SOCATv5'``.

    Returns
    -------
    str or None
        The search term, or ``None`` when neither strategy finds one.
    """
    # Parse the query string once instead of once per lookup.
    query_params = parse_qs(urlparse(url).query)
    if 'q' in query_params:
        return query_params['q'][0].lower()

    # parse_qs drops blank values, so e.g. '?q=&offset=10' ends up here.
    match = re.search(regex, url, re.IGNORECASE)
    if match:
        return match.group(2).lower()
    return None

In [493]:
# Work on a deep copy so the search analysis does not modify logData.
searchData = logData.copy(deep=True)
logData.shape

(1234130, 8)

In [494]:
# Keep only the requests whose resource contains the text 'q='.
has_query = searchData['resource'].str.contains('q=')
searchData = searchData[has_query]
searchData.shape

(6091, 8)

In [495]:
# Run parse_query on every remaining resource URL to extract its search term.
keyWords = searchData['resource'].map(parse_query)

In [496]:
# Count how often each search term occurs and keep the 50 most frequent.
keyword_counts = keyWords.value_counts()
top50KeyWords = keyword_counts.head(50)

In [497]:
# Turn the top-50 counts into a two-column table: search term and frequency.
df = pd.DataFrame(
    {
        'word': top50KeyWords.index,
        'count': top50KeyWords.values,
    },
    columns=['word', 'count'],
)
df.head(20)

Unnamed: 0,word,count
0,"parameter:""file content""",116
1,"parameter:""carbon dioxide flux""",98
2,"parameter:""file name""",89
3,"parameter:""uniform resource locator/link to file""",70
4,melles,52
5,"parameter:""file size""",46
6,"parameter:""depth, sediment/rock""",45
7,"parameter:""age""",37
8,"parameter:""event label""",36
9,"parameter:""date/time""",32


In [498]:
# Remove the literal text 'parameter:' from the word column.
# This step was commented out although the recorded output of this cell shows
# the prefix already removed, so a fresh run would not reproduce that output.
# regex=False treats the pattern as plain text rather than a regular expression.
df['word'] = df['word'].str.replace('parameter:', '', regex=False)
df.head(50)

Unnamed: 0,word,count
0,"""file content""",116
1,"""carbon dioxide flux""",98
2,"""file name""",89
3,"""uniform resource locator/link to file""",70
4,melles,52
5,"""file size""",46
6,"""depth, sediment/rock""",45
7,"""age""",37
8,"""event label""",36
9,"""date/time""",32


In [504]:
# Inspect every request whose resource contains the text 'content'.
temp = logData[logData['resource'].str.contains('content')]
print(temp.shape)
for resource in temp['resource']:
    print(resource)

(125, 8)
/?q=parameter:"File+content"
/?q=parameter:"File+content"
/?q=parameter:"File+content"
/?q=parameter:"File+content"
/?q=parameter:"File+content"
/?q=parameter:"File+content"
/?q=parameter:"File+content"
/?q=parameter:"File+content"
/?q=parameter:"File+content"
/?q=parameter:"File+content"
/?q=parameter:"File+content"
/?q=parameter:"File+content"
/?q=parameter:"File+content"
/?q=parameter:"File+content"
/?q=parameter:"File+content"
/?q=parameter:"File+content"
/?q=parameter:"File+content"
/?q=parameter:"File+content"
/?q=parameter:"File+content"
/?q=parameter:"File+content"
/?q=parameter:"File+content"
/?q=parameter:"File+content"
/?q=parameter:"File+content"
/?q=parameter:"File+content"
/?q=parameter:"File+content"
/?q=parameter:"File+content"
/?q=parameter:"File+content"
/?q=parameter:"File+content"
/?q=parameter:"File+content"
/?q=parameter:"File+content"
/?q=parameter:"File+content"
/?q=parameter:"File+content"
/?q=parameter:"File+content"
/?q=parameter:"File+content"
/?q=p