In [2]:
import pandas as pd
import re
from datetime import datetime
import pytz
from urllib.parse import urlparse,unquote
from urllib.parse import parse_qs

In [3]:
import os

# The access log is looked up in the kernel's current working directory.
log_name = 'www.pangaea.de-access-anon.log.bz2'
current_file = os.getcwd()
myfile = os.path.join(current_file, log_name)

In [4]:
# Parse the access log into a DataFrame. Fields are split on whitespace that
# lies outside double-quoted strings and outside [...] brackets; six of the
# resulting columns are kept and a bare '-' is read as missing.
logData = pd.read_csv(
    myfile,
    header=None,
    names=['ip', 'time', 'request', 'status', 'referer', 'user_agent'],
    usecols=[0, 3, 4, 5, 7, 8],
    sep=r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',
    engine='python',
    na_values='-',
    encoding="utf-8",
)

In [5]:
def url_parse(x):
    """Percent-decode a request string.

    Args:
        x: Raw value of a log field. Text (and bytes) is handed to
            ``urllib.parse.unquote``; any other value, such as the float NaN
            pandas uses for a missing field, is passed through untouched.

    Returns:
        The decoded value, or ``x`` itself when it is not text.
    """
    # unquote() raises TypeError on non-text input (e.g. float NaN), which
    # would abort a column-wise apply because of a single malformed log line.
    if not isinstance(x, (str, bytes)):
        return x
    return unquote(x)

In [6]:
# Percent-decode every request line element by element.
logData['request'] = logData['request'].map(url_parse)

In [7]:
# Show the first row to check the parsed columns after decoding the request.
logData.head(1)

Unnamed: 0,ip,time,request,status,referer,user_agent
0,68aa873d61a6586244b0892d7bca2573,[30/Sep/2019:06:25:44 +0000],"""GET /search?ie=UTF-8&q=parameter:""Bathysiphon...",301.0,"""-""","""Mozilla/5.0 (compatible; AhrefsBot/6.1; +http..."


In [8]:
# Missing status codes are replaced by 0 so the column can be cast to int.
logData['status'] = logData['status'].fillna(0).astype(int)
logData.head(1)

Unnamed: 0,ip,time,request,status,referer,user_agent
0,68aa873d61a6586244b0892d7bca2573,[30/Sep/2019:06:25:44 +0000],"""GET /search?ie=UTF-8&q=parameter:""Bathysiphon...",301,"""-""","""Mozilla/5.0 (compatible; AhrefsBot/6.1; +http..."


In [9]:
# Strip a leading and/or trailing double quote from the quoted log fields.
# Each value goes through str() first, so a missing entry becomes the text 'nan'.
edge_quote = re.compile(r'^"|"$')
for quoted_col in ('request', 'user_agent', 'referer'):
    logData[quoted_col] = logData[quoted_col].apply(
        lambda value: edge_quote.sub('', str(value)))

In [10]:
# (rows, columns) of the parsed log before any filtering.
logData.shape

(3874088, 6)

In [11]:
# Split each request line on whitespace once; the first token is stored as
# the request type and the second as the requested resource.
request_tokens = logData['request'].str.split()
logData['request_type'] = request_tokens.str[0]
logData['resource'] = request_tokens.str[1]
logData.head(1)

Unnamed: 0,ip,time,request,status,referer,user_agent,request_type,resource
0,68aa873d61a6586244b0892d7bca2573,[30/Sep/2019:06:25:44 +0000],"GET /search?ie=UTF-8&q=parameter:""Bathysiphon+...",301,-,Mozilla/5.0 (compatible; AhrefsBot/6.1; +http:...,GET,"/search?ie=UTF-8&q=parameter:""Bathysiphon+sp."""


In [12]:
# Keep only GET requests that were answered with status 200.
is_get = logData['request_type'] == 'GET'
is_ok = logData['status'] == 200
logData = logData[is_get & is_ok]
logData.shape

(3063078, 8)

In [13]:
# The raw request line has been split into request_type/resource, so drop it.
logData = logData.drop(columns=['request'])
logData.head(1)

Unnamed: 0,ip,time,status,referer,user_agent,request_type,resource
3,75892c86ec3272b7fa79e0f2c375a9f5,[30/Sep/2019:06:25:56 +0000],200,-,Mozilla/5.0 (compatible; SemrushBot/6~bl; +htt...,GET,/?maxdate=2005-12-31T23:59:59&mindate=2005-01-...


In [14]:
# First drop requests for media/static/admin paths, robots.txt and favicon.ico,
# then drop rows whose user agent mentions bot, spider, crawler or slurp
# (case-insensitive).
asset_pattern = r'^/media|^/static|^/admin|^/robots.txt$|^/favicon.ico$'
crawler_pattern = r'.*?bot|.*?spider|.*?crawler|.*?slurp'

is_asset = logData['resource'].str.match(asset_pattern)
logData = logData[~is_asset]

is_crawler = logData['user_agent'].str.match(crawler_pattern, flags=re.I).fillna(False)
logData = logData[~is_crawler]
logData.shape

(1858386, 7)

In [15]:
# First remaining row after the asset and crawler filters.
logData.head(1)

Unnamed: 0,ip,time,status,referer,user_agent,request_type,resource
9,eb6f30241bf7ff9bf1c6ddb6d5ea2d66,[30/Sep/2019:06:26:12 +0000],200,https://doi.pangaea.de/10.1594/PANGAEA.890974,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6...,GET,/?q=SOCATv5


In [16]:
# Use the referer to filter out external requests: only requests whose referer
# starts with https://doi.pangaea.de or https://www.pangaea.de (i.e. that
# originated from the portal) are kept.
from_doi_host = logData['referer'].str.startswith('https://doi.pangaea.de')
from_www_host = logData['referer'].str.startswith('https://www.pangaea.de')
logData = logData[from_doi_host | from_www_host]
logData.shape

(1335062, 7)

In [17]:
# Exclude queries that originated from external search, i.e. any resource
# containing /advanced/ (/advanced/search.php, /advanced/warehouse.php).
is_advanced = logData['resource'].str.contains('/advanced/')
logData = logData[~is_advanced]
logData.shape

(1234066, 7)

In [18]:
# Reduce each '[dd/Mon/yyyy:HH:MM:SS +hhmm]' timestamp to its calendar date.
# The UTC offset is parsed with %z (rather than a literal '+' followed by a
# digit directive) so that negative offsets such as '-0500' are accepted too.
# .date() returns the date as written in the log line; it is not shifted to UTC.
logData['time'] = logData['time'].apply(
    lambda stamp: datetime.strptime(stamp, '[%d/%b/%Y:%H:%M:%S %z]').date())

In [19]:
# Work on a copy so the facet analysis below does not modify logData.
searchData =logData.copy()
searchData.shape

(1234066, 7)

In [20]:
# Keep only requests that carry at least one facet parameter (f.author,
# f.basis, f.campaign, f.device, f.location, f.project, f.pubyear, f.topic),
# then drop exact duplicate rows.
# str.contains treats the pattern as a regex, so the dot is escaped to match a
# literal '.'; unescaped it matches any character (e.g. 'of+location' inside a
# free-text query would be counted as a facet).
facet_pattern = r'f\.(?:author|basis|campaign|device|location|project|pubyear|topic)'
has_facet = searchData['resource'].str.contains(facet_pattern)
searchData = searchData[has_facet].drop_duplicates()
searchData.shape

(657, 7)

In [21]:
# For every (time, ip) group, collect the facet names that occur in the
# group's distinct resources, in order of appearance.
# The dot after 'f' is escaped so only a literal 'f.<facet>' is recognised.
facet_name = re.compile(r'f\.(pubyear|location|author|basis|campaign|device|project|topic)')
searchGroup = searchData.groupby(['time','ip'])
xx = searchGroup.resource.unique()
# Scan each URL on its own rather than the text form of the whole array:
# numpy abbreviates str() of long arrays with '...', which would silently
# drop URLs (and their facets) from large groups.
keyFacets = xx.apply(
    lambda urls: [name for url in urls for name in facet_name.findall(url)])

In [22]:
# Display the list of facet names found for each (time, ip) group.
keyFacets

time        ip                              
2019-09-30  70f34bd78d8d0aa0620a64b3be826f42    [pubyear, location, pubyear, location, pubyear...
            a094c47aff4ab6f29450d50321ee069d                                           [location]
            b3d4a9cd0701141eda9934b2c4fae15d    [location, location, location, location, locat...
2019-10-01  0a80b3058f5a8a4a846f3370e42d88fc                                  [location, pubyear]
            70c31dd547a243f7e912265ca8be17d5                                  [campaign, project]
                                                                      ...                        
2019-11-23  41436894c83d5f7802d2adb1931a542f                                           [location]
            b45af8174a0bc4981dd52c143dd9a415                                    [author, pubyear]
2019-11-24  a672baf6cd4cfb14fe591223304894e6                                           [campaign]
            b45af8174a0bc4981dd52c143dd9a415                             

In [23]:
# Count how many (time, ip) groups produced each distinct facet list.
topKeyFacets = keyFacets.value_counts()
topKeyFacets

[location]                                                        24
[pubyear]                                                         23
[author]                                                          21
[project]                                                         20
[topic, topic]                                                    12
                                                                  ..
[topic, pubyear, project]                                          1
[location, author, author]                                         1
[topic, location]                                                  1
[topic, pubyear]                                                   1
[topic, topic, topic, pubyear, topic, pubyear, topic, pubyear]     1
Name: resource, Length: 82, dtype: int64

In [29]:
# First ten entries of the facet-list counts.
topKeyFacets.head(10)

[location]                  24
[pubyear]                   23
[author]                    21
[project]                   20
[topic, topic]              12
[topic]                     10
[basis]                      8
[location, location]         8
[author, author, author]     7
[device]                     6
Name: resource, dtype: int64

In [50]:
# Tabulate each facet list with its count and its length (number of facet
# occurrences in the list).
df = pd.DataFrame({
    'facet': topKeyFacets.index,
    'count': topKeyFacets.values,
})
df['len'] = df['facet'].map(len)

In [55]:
# Display the facet / count / len table.
df

Unnamed: 0,facet,count,len
0,[location],24,1
1,[pubyear],23,1
2,[author],21,1
3,[project],20,1
4,"[topic, topic]",12,2
...,...,...,...
77,"[topic, pubyear, project]",1,3
78,"[location, author, author]",1,3
79,"[topic, location]",1,2
80,"[topic, pubyear]",1,2


In [52]:
# Write the table as CSV to a file named 'facet' (no extension), given as a
# relative path.
df.to_csv('facet')

In [57]:
# Number of distinct facet lists for each list length.
df['len'].value_counts()

2      23
3      13
1       8
4       6
8       5
6       5
7       5
5       4
9       3
11      2
15      2
23      1
10      1
16      1
18      1
20      1
201     1
Name: len, dtype: int64

In [39]:
import plotly.express as px
# NOTE(review): `bf` is not defined in any cell shown in this notebook, so this
# cell raises NameError on a fresh kernel unless `bf` is created elsewhere.
# `df` has the 'count' and 'facet' columns used here — confirm which frame
# was intended.
fig = px.pie(bf, values='count', names='facet')
# Place each slice's percentage and label inside the slice.
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()