In [1]:
import pandas as pd
import re
from datetime import datetime
import pytz
from urllib.parse import urlparse
from urllib.parse import parse_qs
import numpy as np 

In [2]:
# Example of parsing an access-log timestamp: datetime.strptime('13/Nov/2015:11:45:42', '%d/%b/%Y:%H:%M:%S')

In [3]:
import os
# NOTE: despite its name, `current_file` holds the current working *directory*.
current_file = os.getcwd()
# Absolute path of the compressed access log, expected in the working directory.
myfile = os.path.join(current_file, 'www.pangaea.de-access-anon.log.bz2')

In [4]:
# Load the Apache access log into a DataFrame.
# Approach adapted from https://mmas.github.io/read-apache-access-log-pandas
logData = pd.read_csv(
    myfile,
    # Split on whitespace that is neither inside double quotes nor inside [...]
    sep=r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',
    # Regex separators are only supported by the python parsing engine.
    engine='python',
    header=None,
    # Only six of the split fields are kept; `names` labels them in order.
    usecols=[0, 3, 4, 5, 7, 8],
    names=['ip', 'time', 'request', 'status', 'referer', 'user_agent'],
    # A bare '-' field is read as missing.
    na_values='-',
    encoding="utf-8",
)

In [5]:
# Peek at the first parsed row to sanity-check the column split.
logData.head(1)

Unnamed: 0,ip,time,request,status,referer,user_agent
0,68aa873d61a6586244b0892d7bca2573,[30/Sep/2019:06:25:44 +0000],"""GET /search?ie=UTF-8&q=parameter%3A%22Bathysi...",301.0,"""-""","""Mozilla/5.0 (compatible; AhrefsBot/6.1; +http..."


In [6]:
# Missing status codes are mapped to 0 so the column can be cast to int
# (NaN cannot be stored in an integer column).
logData['status'] = logData['status'].fillna(0).astype(int)
logData.head(1)

Unnamed: 0,ip,time,request,status,referer,user_agent
0,68aa873d61a6586244b0892d7bca2573,[30/Sep/2019:06:25:44 +0000],"""GET /search?ie=UTF-8&q=parameter%3A%22Bathysi...",301,"""-""","""Mozilla/5.0 (compatible; AhrefsBot/6.1; +http..."


In [7]:
def _strip_outer_quotes(value):
    """Return ``str(value)`` with one leading and/or trailing double quote removed.

    Because of the ``str()`` conversion, a missing value becomes the text 'nan'.
    """
    return re.sub(r'^"|"$', '', str(value))


# The quoted log fields still carry their surrounding double quotes after parsing.
for _quoted_col in ('request', 'user_agent', 'referer'):
    logData[_quoted_col] = logData[_quoted_col].apply(_strip_outer_quotes)

In [8]:
# Row/column count of the full log before any filtering.
logData.shape

(3874088, 6)

In [9]:
# Tokenise the request line on whitespace once; the first token becomes the
# request type and the second the requested resource.
request_tokens = logData['request'].str.split()
logData['request_type'] = request_tokens.str[0]
logData['resource'] = request_tokens.str[1]
logData.head(1)

Unnamed: 0,ip,time,request,status,referer,user_agent,request_type,resource
0,68aa873d61a6586244b0892d7bca2573,[30/Sep/2019:06:25:44 +0000],GET /search?ie=UTF-8&q=parameter%3A%22Bathysip...,301,-,Mozilla/5.0 (compatible; AhrefsBot/6.1; +http:...,GET,/search?ie=UTF-8&q=parameter%3A%22Bathysiphon+...


In [10]:
# Keep only GET requests that were answered with HTTP status 200.
is_get = logData['request_type'] == 'GET'
is_ok = logData['status'] == 200
logData = logData[is_get & is_ok]
logData.shape

(3063078, 8)

In [11]:
# The raw request line has been split into request_type/resource; drop it.
logData = logData.drop(columns=['request'])
logData.head(1)

Unnamed: 0,ip,time,status,referer,user_agent,request_type,resource
3,75892c86ec3272b7fa79e0f2c375a9f5,[30/Sep/2019:06:25:56 +0000],200,-,Mozilla/5.0 (compatible; SemrushBot/6~bl; +htt...,GET,/?maxdate=2005-12-31T23%3A59%3A59&mindate=2005...


In [12]:
# Drop requests for static assets and housekeeping files.
# - The dots in robots.txt / favicon.ico are escaped so only the literal file
#   names match (an unescaped '.' matches any character).
# - Rows without a resource (request line with fewer than two tokens) are
#   dropped explicitly; otherwise the NaN in the mask breaks the `~` negation.
is_asset = logData['resource'].str.match(
    r'^/media|^/static|^/admin|^/robots\.txt$|^/favicon\.ico$', na=False)
logData = logData[logData['resource'].notna() & ~is_asset]

# Drop traffic whose user agent contains bot/spider/crawler/slurp
# (case-insensitive). na=False treats a missing user agent as "not a bot",
# which is what the former .fillna(False) did.
is_crawler = logData['user_agent'].str.match(
    r'.*?bot|.*?spider|.*?crawler|.*?slurp', flags=re.I, na=False)
logData = logData[~is_crawler]
logData.shape

(1858386, 7)

In [13]:
# First remaining row after the asset and bot filters.
logData.head(1)

Unnamed: 0,ip,time,status,referer,user_agent,request_type,resource
9,eb6f30241bf7ff9bf1c6ddb6d5ea2d66,[30/Sep/2019:06:26:12 +0000],200,https://doi.pangaea.de/10.1594/PANGAEA.890974,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6...,GET,/?q=SOCATv5


In [14]:
# Use the referer to filter out external requests: only requests that
# originated from the portal itself (https://doi.pangaea.de or
# https://www.pangaea.de) are considered.
from_doi = logData['referer'].str.startswith('https://doi.pangaea.de')
from_www = logData['referer'].str.startswith('https://www.pangaea.de')
logData = logData[from_doi | from_www]
logData.shape

(1335062, 7)

In [15]:
# Exclude queries that originated from the external search pages
# (/advanced/search.php, /advanced/warehouse.php).
is_advanced = logData['resource'].str.contains('/advanced/')
logData = logData[~is_advanced]
logData.shape

(1234130, 7)

In [16]:
# Fallback pattern: a literal "?q=<text>&" anywhere in the URL.
# Written as a raw string because "\?", "\=" and "\&" are invalid escape
# sequences in a normal string literal (SyntaxWarning on recent Python).
regex = r'(\?q\=)(.*?)\&'


def parse_query(url):
    """Extract the lower-cased search term (the ``q`` parameter) from a URL.

    The query string is parsed with ``parse_qs`` first; if it contains a ``q``
    parameter, its first value is returned (percent-decoded by ``parse_qs``).
    Otherwise the raw URL is searched for ``?q=<text>&`` and ``<text>`` is
    returned as-is, i.e. without percent-decoding.

    Parameters
    ----------
    url : str
        Request resource or URL to inspect.

    Returns
    -------
    str or None
        The lower-cased search term, or ``None`` when neither lookup finds one.
    """
    # Parse once and reuse the result instead of calling parse_qs twice.
    params = parse_qs(urlparse(url).query)
    if 'q' in params:
        return params['q'][0].lower()

    # parse_qs drops blank values and ignores the fragment, so fall back to a
    # plain pattern search over the whole URL.
    search = re.search(regex, url, re.IGNORECASE)
    if search:
        return search.group(2).lower()
    return None

In [66]:
# Work on an independent copy so the filtered log itself stays untouched.
searchData =logData.copy()
searchData.shape

(1234130, 7)

In [67]:
# Keep only requests whose resource contains the text 'q='.
has_query = searchData['resource'].str.contains('q=')
searchData = searchData[has_query]
searchData.shape
# searchData.to_csv('searchData')

(5952, 7)

In [68]:
# Raw (unparsed) resource strings; `keyWords` is reassigned with the parsed
# search terms in the next cell.
keyWords = searchData['resource']
# keyWords.value_counts().head(50)
# keyWords.to_csv('search1')

In [69]:
# Turn every resource into its lower-cased search term (None when absent).
keyWords = searchData['resource'].map(parse_query)
keyWords.head(5)

9                        socatv5
775     parameter:"file content"
2159       project:label:sponges
4107        parameter:"salinity"
4244        event:label:lavpicco
Name: resource, dtype: object

In [70]:
# Frequency of each search term, most frequent first; keep the first 50.
top50KeyWords = keyWords.value_counts().head(50)
# top50KeyWords

In [71]:
# Two-column frame (search term, number of occurrences) with a fresh 0..n-1 index.
df = pd.DataFrame({'word':top50KeyWords.index, 'count':top50KeyWords.values})

In [72]:
def checkPar(x):
    """Map a flag to a keyword type label.

    Returns 'parameter' when ``x`` compares equal to ``True`` and 'unknown'
    for every other value.
    """
    return 'parameter' if x == True else 'unknown'

In [73]:
# Flag every keyword that contains the text 'parameter', then translate the
# boolean flag into its label ('parameter' / 'unknown').
is_parameter = df['word'].str.contains('parameter')
df['type'] = is_parameter.apply(checkPar)

In [74]:
# Hand-assigned types, keyed by row label in `df`.
# NOTE(review): the labels are positions in the frequency ranking, so this
# table presumably has to be revisited whenever the input log changes.
manual_types = {
    4: 'author',
    13: 'instrument',
    14: 'parameter',
    15: 'observed',
    16: 'method',
    18: 'instrument',
    21: 'instrument',
    22: 'location',
    24: 'sampling',
    27: 'author',
    30: 'observed',
    31: 'sampling',
    33: 'instrument',
    35: 'subject',
    37: 'observed',
    39: 'parameter',
    45: 'project',
    41: 'project',
    42: 'instrument',
    43: 'author',
    46: 'method',
    # 47: 'project',
    48: 'method',
}
for row_label, type_label in manual_types.items():
    df.loc[row_label, 'type'] = type_label

In [79]:
# Remove the 'parameter:' marker and all double quotes from the keywords.
# regex=False makes both replacements literal regardless of the pandas version
# (the default of `regex` changed from True to False in pandas 2.0).
# TODO (from the original notes): send Excel sheet; add url next to type.
df['word'] = (
    df['word']
    .str.replace('parameter:', '', regex=False)
    .str.replace('"', '', regex=False)
)
# df.to_csv('test')

In [76]:
# Number of top keywords per assigned type, as a plain two-column frame.
types = df['type'].value_counts()
barData = pd.DataFrame({'type': types.index, 'count': types.values}).astype({'count': int})
barData

Unnamed: 0,type,count
0,parameter,30
1,instrument,5
2,observed,3
3,author,3
4,method,3
5,sampling,2
6,project,2
7,location,1
8,subject,1


In [77]:
import plotly.express as px

# Bar chart: one bar per keyword type, bar height = number of keywords.
fig = px.bar(data_frame=barData, x='type', y='count')
fig.show()

In [78]:
import plotly.express as px

# Pie chart of the same counts; every slice shows its percentage and type
# label inside the slice.
fig = px.pie(data_frame=barData, names='type', values='count')
fig.update_traces(textinfo='percent+label', textposition='inside')
fig.show()