In [46]:
import pandas as pd
import re
from datetime import datetime
import pytz
from urllib.parse import urlparse
from urllib.parse import parse_qs

In [47]:
# datetime.datetime.strptime('13/Nov/2015:11:45:42','%Y-%m-%d:%H:%M:%S +%f')

In [48]:
import os
# Directory the kernel is running in (a directory path, despite the name).
current_file = os.getcwd()
# Path of the bz2-compressed access log, expected next to the notebook.
myfile = os.path.join(
    current_file,
    'www.pangaea.de-access-anon.log.bz2',
)

In [49]:
# ref: https://mmas.github.io/read-apache-access-log-pandas
# Parse the access log into a DataFrame. The separator regex splits on a
# whitespace character only when it is followed by an even number of double
# quotes up to the end of the line and is not followed by a ']' that has no
# '[' before it -- i.e. whitespace inside "..." or [...] does not split.
logData = pd.read_csv(
    myfile,
    header=None,
    names=['ip', 'time', 'request', 'status', 'referer', 'user_agent'],
    usecols=[0, 3, 4, 5, 7, 8],
    sep=r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',
    engine='python',  # the python engine is used together with the regex separator
    na_values='-',  # a bare '-' field is read as missing
    encoding="utf-8",
)

In [50]:
# Show the first parsed row to check how the log line was split into columns.
logData.head(1)

Unnamed: 0,ip,time,request,status,referer,user_agent
0,68aa873d61a6586244b0892d7bca2573,[30/Sep/2019:06:25:44 +0000],"""GET /search?ie=UTF-8&q=parameter%3A%22Bathysi...",301.0,"""-""","""Mozilla/5.0 (compatible; AhrefsBot/6.1; +http..."


In [51]:
# Missing status codes are replaced by 0 so that the column can be cast to a
# plain integer dtype (fillna only covers NA values).
logData['status'] = logData['status'].fillna(0).astype(int)
logData.head(1)

Unnamed: 0,ip,time,request,status,referer,user_agent
0,68aa873d61a6586244b0892d7bca2573,[30/Sep/2019:06:25:44 +0000],"""GET /search?ie=UTF-8&q=parameter%3A%22Bathysi...",301,"""-""","""Mozilla/5.0 (compatible; AhrefsBot/6.1; +http..."


In [52]:
def _strip_outer_quotes(value):
    """Return ``str(value)`` without a leading and/or trailing double quote."""
    # str() is applied to every value, so a missing value becomes the text 'nan'.
    return re.sub(r'^"|"$', '', str(value))


# The quoted log fields still carry their surrounding double quotes; remove them.
for quoted_col in ('request', 'user_agent', 'referer'):
    logData[quoted_col] = logData[quoted_col].map(_strip_outer_quotes)

In [53]:
# (rows, columns) of logData at this point in the notebook.
logData.shape

(3874088, 6)

In [54]:
# Split the request line on whitespace once and reuse the result: the first
# token is stored as the request type and the second as the requested resource.
# (The original tokenised the whole column twice.)
request_parts = logData['request'].str.split()
logData['request_type'] = request_parts.str[0]
logData['resource'] = request_parts.str[1]  # NaN when the request has fewer than two tokens
logData.head(1)

Unnamed: 0,ip,time,request,status,referer,user_agent,request_type,resource
0,68aa873d61a6586244b0892d7bca2573,[30/Sep/2019:06:25:44 +0000],GET /search?ie=UTF-8&q=parameter%3A%22Bathysip...,301,-,Mozilla/5.0 (compatible; AhrefsBot/6.1; +http:...,GET,/search?ie=UTF-8&q=parameter%3A%22Bathysiphon+...


In [55]:
# Keep only rows whose request type is GET and whose status code is 200.
is_get = logData['request_type'].eq('GET')
is_ok = logData['status'].eq(200)
logData = logData[is_get & is_ok]
logData.shape

(3063078, 8)

In [56]:
# The raw request line has been split into request_type/resource; drop it.
logData = logData.drop(columns='request')
logData.head(1)

Unnamed: 0,ip,time,status,referer,user_agent,request_type,resource
3,75892c86ec3272b7fa79e0f2c375a9f5,[30/Sep/2019:06:25:56 +0000],200,-,Mozilla/5.0 (compatible; SemrushBot/6~bl; +htt...,GET,/?maxdate=2005-12-31T23%3A59%3A59&mindate=2005...


In [57]:
# Exclude requests for static/admin paths and for robots.txt / favicon.ico.
# The dots are escaped so that only the literal file names match, and rows
# without a resource (na=True) are excluded too instead of making `~` fail
# on a NaN mask value.
is_static_resource = logData['resource'].str.match(
    r'^/media|^/static|^/admin|^/robots\.txt$|^/favicon\.ico$', na=True)
logData = logData[~is_static_resource]

# Exclude rows whose user agent mentions bot/spider/crawler/slurp anywhere,
# case-insensitively. `contains` replaces the former `match` with a leading
# '.*?' on every alternative; missing user agents are kept (na=False), as the
# previous .fillna(False) did.
is_crawler = logData['user_agent'].str.contains(
    r'bot|spider|crawler|slurp', case=False, na=False)
logData = logData[~is_crawler]
logData.shape

(1858386, 7)

In [58]:
# Show the first row that remains in logData at this point.
logData.head(1)

Unnamed: 0,ip,time,status,referer,user_agent,request_type,resource
9,eb6f30241bf7ff9bf1c6ddb6d5ea2d66,[30/Sep/2019:06:26:12 +0000],200,https://doi.pangaea.de/10.1594/PANGAEA.890974,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6...,GET,/?q=SOCATv5


In [59]:
# Portal hosts: https://doi.pangaea.de, https://www.pangaea.de
# Use the referer to filter out external requests: only rows whose referer
# starts with one of the two portal hosts are kept.
from_doi_host = logData['referer'].str.startswith('https://doi.pangaea.de')
from_www_host = logData['referer'].str.startswith('https://www.pangaea.de')
logData = logData[from_doi_host | from_www_host]
logData.shape

(1335062, 7)

In [60]:
# Exclude queries originating from the external search pages
# (/advanced/search.php, /advanced/warehouse.php): drop every resource that
# starts with '/advanced/'.
is_advanced = logData['resource'].str.startswith('/advanced/')
logData = logData[~is_advanced]
logData.shape

(1234130, 7)

In [61]:
# Fallback pattern: captures the text between '?q=' and the next '&'.
# Written as a raw string so the backslashes are not treated as (invalid)
# string escape sequences; the pattern itself is unchanged.
regex = r'(\?q\=)(.*?)\&'
def parse_query(url):
    """Extract the lower-cased search term (``q`` parameter) from a URL.

    The query string is parsed with ``parse_qs``; if it contains a ``q`` key,
    the first (URL-decoded) value is returned in lower case. Otherwise the
    module-level ``regex`` is searched case-insensitively in the raw URL and
    the text between ``?q=`` and the following ``&`` is returned in lower
    case, without URL-decoding.

    Parameters
    ----------
    url : str
        Request path or URL, e.g. ``'/?q=SOCATv5'``.

    Returns
    -------
    str or None
        The lower-cased query term, or ``None`` when no term is found or when
        ``url`` is not a string (e.g. a missing value in a pandas column).
    """
    if not isinstance(url, str):
        # Missing values (None/NaN) cannot be parsed; report "no query".
        return None

    # Parse the query string once and reuse the result.
    params = parse_qs(urlparse(url).query)
    if 'q' in params:
        return params['q'][0].lower()

    # parse_qs has no 'q' key (e.g. blank value or upper-case 'Q'): fall back
    # to the regex on the raw URL.
    fallback = re.search(regex, url, re.IGNORECASE)
    if fallback:
        return fallback.group(2).lower()
    return None

In [62]:
# Work on an independent (deep) copy so logData itself is left untouched.
searchData = logData.copy(deep=True)
searchData.shape

(1234130, 7)

In [63]:
# Keep only resources that contain the text 'q=' somewhere.
has_q_param = searchData['resource'].str.contains('q=')
searchData = searchData[has_q_param]
searchData.shape

(5952, 7)

In [64]:
# Drop searches for the placeholder parameter "-" (q=parameter%3A%22-%22).
# The former pattern '/?q=parameter%3A%22-%22' was evaluated as a regular
# expression, where '/?' means "an optional slash", so it matched every
# resource containing 'q=parameter%3A%22-%22'. The same match is now spelled
# out as a literal substring test.
is_placeholder_param = searchData['resource'].str.contains(
    'q=parameter%3A%22-%22', regex=False)
searchData = searchData[~is_placeholder_param]
# /?q=parameter%3A%22Event+label%22 & /?q=parameter%3A%22DATE%2FTIME%22

In [65]:
# Raw (still URL-encoded) resource strings of the remaining search requests.
keyWords = searchData.loc[:, 'resource']
keyWords.shape
# Optional export for manual inspection:
# keyWords.to_csv('search1')

(5937,)

In [66]:
# Turn every resource into its lower-cased query term via parse_query.
keyWords = searchData['resource'].map(parse_query)
keyWords.head()

9                        socatv5
775     parameter:"file content"
2159       project:label:sponges
4107        parameter:"salinity"
4244        event:label:lavpicco
Name: resource, dtype: object

In [67]:
# Number of entries in keyWords at this point.
keyWords.shape

(5937,)

In [68]:
# Drop every keyword that contains the literal text 'file'.
# na=False keeps entries for which parse_query returned None instead of
# letting a NaN in the mask break the `~` negation.
# NOTE(review): this removes any keyword containing 'file', not only
# parameter:"file content" -- confirm that is intended.
mentions_file = keyWords.str.contains('file', regex=False, na=False)
keyWords = keyWords[~mentions_file]

In [69]:
# Number of entries in keyWords at this point.
keyWords.shape

(5523,)

In [70]:
# Show the first five keywords currently held in keyWords.
keyWords.head(5)
# keyWords.to_csv('search2')

9                     socatv5
2159    project:label:sponges
4107     parameter:"salinity"
4244     event:label:lavpicco
4711     event:label:ps2122-2
Name: resource, dtype: object

In [71]:
# Frequency of each keyword, most frequent first; keep the first 50 entries.
keyword_counts = keyWords.value_counts()
top50KeyWords = keyword_counts.iloc[:50]

In [72]:
# Two-column table: the keyword ('word') and how often it occurred ('count').
df = pd.DataFrame(
    {
        'word': top50KeyWords.index,
        'count': top50KeyWords.values,
    }
)
df.head(50)

Unnamed: 0,word,count
0,"parameter:""carbon dioxide flux""",98
1,melles,52
2,"parameter:""depth, sediment/rock""",45
3,"parameter:""age""",37
4,"parameter:""event label""",36
5,"parameter:""date/time""",32
6,"parameter:""methane flux""",30
7,"parameter:""temperature, soil""",29
8,δ18o age model,28
9,vos finnpartner 2005,28


In [73]:
# Remove the literal text 'parameter:' and all double quotes from the word column.
df['word'] = (
    df['word']
    .str.replace('parameter:', '', regex=False)
    .str.replace('"', '', regex=False)
)
df.head(50)

Unnamed: 0,word,count
0,carbon dioxide flux,98
1,melles,52
2,"depth, sediment/rock",45
3,age,37
4,event label,36
5,date/time,32
6,methane flux,30
7,"temperature, soil",29
8,δ18o age model,28
9,vos finnpartner 2005,28


In [29]:
# Debug listing: every resource in logData that contains the text 'content'.
temp = logData[logData['resource'].str.contains('content', regex=False)]
print(temp.shape)
# Iterate the column directly; iterrows() builds a full Series per row only
# to read a single field from it.
for resource in temp['resource']:
    print(resource)

(125, 7)
/?q=parameter%3A%22File+content%22
/?q=parameter%3A%22File+content%22
/?q=parameter%3A%22File+content%22
/?q=parameter%3A%22File+content%22
/?q=parameter%3A%22File+content%22
/?q=parameter%3A%22File+content%22
/?q=parameter%3A%22File+content%22
/?q=parameter%3A%22File+content%22
/?q=parameter%3A%22File+content%22
/?q=parameter%3A%22File+content%22
/?q=parameter%3A%22File+content%22
/?q=parameter%3A%22File+content%22
/?q=parameter%3A%22File+content%22
/?q=parameter%3A%22File+content%22
/?q=parameter%3A%22File+content%22
/?q=parameter%3A%22File+content%22
/?q=parameter%3A%22File+content%22
/?q=parameter%3A%22File+content%22
/?q=parameter%3A%22File+content%22
/?q=parameter%3A%22File+content%22
/?q=parameter%3A%22File+content%22
/?q=parameter%3A%22File+content%22
/?q=parameter%3A%22File+content%22
/?q=parameter%3A%22File+content%22
/?q=parameter%3A%22File+content%22
/?q=parameter%3A%22File+content%22
/?q=parameter%3A%22File+content%22
/?q=parameter%3A%22File+content%22
/?q=paramet