In [1]:
import pandas as pd
import re
from datetime import datetime
import pytz
from urllib.parse import urlparse
from urllib.parse import parse_qs

In [2]:
import os
# Locate the compressed access log relative to the notebook's working directory.
log_filename = 'www.pangaea.de-access-anon.log.bz2'
current_file = os.getcwd()  # NOTE: despite the name this is the working directory, not a file
myfile = os.path.join(current_file, log_filename)

In [3]:
# Load the Apache access log into a DataFrame.
# Approach taken from: https://mmas.github.io/read-apache-access-log-pandas
# The separator is a whitespace character that is neither inside a double-quoted
# field nor inside the [bracketed] timestamp; a regex separator needs the python engine.
logData = pd.read_csv(
    myfile,
    engine='python',
    encoding="utf-8",
    sep=r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',
    header=None,
    names=['ip', 'time', 'request', 'status', 'referer', 'user_agent'],
    usecols=[0, 3, 4, 5, 7, 8],
    na_values='-',
)

In [4]:
# Missing status codes are replaced by 0 so the whole column can be cast to int.
logData['status'] = logData['status'].fillna(0).astype(int)
logData.head(1)

Unnamed: 0,ip,time,request,status,referer,user_agent
0,68aa873d61a6586244b0892d7bca2573,[30/Sep/2019:06:25:44 +0000],"""GET /search?ie=UTF-8&q=parameter%3A%22Bathysi...",301,"""-""","""Mozilla/5.0 (compatible; AhrefsBot/6.1; +http..."


In [5]:
# Remove one leading and one trailing double quote from the quoted log fields.
# str(raw) turns every value into text first, so a missing value becomes 'nan'.
for quoted_column in ('request', 'user_agent', 'referer'):
    logData[quoted_column] = logData[quoted_column].apply(
        lambda raw: re.sub(r'^"|"$', '', str(raw)))

In [6]:
logData.shape

(3874088, 6)

In [7]:
# Split the request line on whitespace once; the first token is the HTTP method
# and the second is the requested resource.
request_tokens = logData['request'].str.split()
logData['request_type'] = request_tokens.str.get(0)
logData['resource'] = request_tokens.str.get(1)
logData.head(1)

Unnamed: 0,ip,time,request,status,referer,user_agent,request_type,resource
0,68aa873d61a6586244b0892d7bca2573,[30/Sep/2019:06:25:44 +0000],GET /search?ie=UTF-8&q=parameter%3A%22Bathysip...,301,-,Mozilla/5.0 (compatible; AhrefsBot/6.1; +http:...,GET,/search?ie=UTF-8&q=parameter%3A%22Bathysiphon+...


In [8]:
# Keep only GET requests that were answered with status 200.
is_get = logData['request_type'] == 'GET'
is_ok = logData['status'] == 200
logData = logData[is_get & is_ok]
logData.shape

(3063078, 8)

In [9]:
# The request line is already split into request_type and resource; drop the original column.
logData = logData.drop(columns=['request'])
logData.head(1)

Unnamed: 0,ip,time,status,referer,user_agent,request_type,resource
3,75892c86ec3272b7fa79e0f2c375a9f5,[30/Sep/2019:06:25:56 +0000],200,-,Mozilla/5.0 (compatible; SemrushBot/6~bl; +htt...,GET,/?maxdate=2005-12-31T23%3A59%3A59&mindate=2005...


In [10]:
# Discard requests for media/static/admin paths and for robots.txt / favicon.ico.
# The dots in the two file names are escaped so they only match a literal '.',
# and na=False makes a missing resource count as "no match" so the mask stays
# purely boolean and can be negated with `~`.
logData = logData[~logData['resource'].str.match(
    r'^/media|^/static|^/admin|^/robots\.txt$|^/favicon\.ico$', na=False)]
# Discard user agents that contain bot/spider/crawler/slurp (case-insensitive).
# na=False replaces the trailing .fillna(False) and yields the same boolean mask.
logData = logData[~logData['user_agent'].str.match(
    r'.*?bot|.*?spider|.*?crawler|.*?slurp', flags=re.I, na=False)]
logData.shape

(1858386, 7)

In [11]:
# Keep only rows whose referer starts with one of the two pangaea.de HTTPS origins.
referer = logData['referer']
from_doi_site = referer.str.startswith('https://doi.pangaea.de')
from_www_site = referer.str.startswith('https://www.pangaea.de')
logData = logData[from_doi_site | from_www_site]
logData.shape

(1335062, 7)

In [12]:
# Exclude every resource whose path contains '/advanced/'.
is_advanced = logData['resource'].str.contains('/advanced/')
logData = logData[~is_advanced]
logData.shape

(1234130, 7)

In [13]:
# Fallback pattern: lazily captures whatever sits between a literal "?q=" and the next "&".
# Written as a raw string so the backslashes reach the regex engine unchanged
# instead of triggering Python's invalid-escape-sequence warning.
regex = r'(\?q\=)(.*?)\&'


def parse_query(url):
    """Return the lower-cased search term carried by the ``q`` parameter of ``url``.

    The query string is parsed with ``parse_qs`` first. If it holds no ``q`` key
    (for example because the ``?q=...&`` text is not in the query component),
    the module-level ``regex`` is searched against the whole URL instead.

    Args:
        url: request path or URL as a string.

    Returns:
        The first ``q`` value, URL-decoded and lower-cased, or ``None`` when
        neither strategy finds one.
    """
    from urllib.parse import unquote_plus  # local import: not among the notebook's top-level imports

    # Parse the query string once and reuse the result.
    params = parse_qs(urlparse(url).query)
    if 'q' in params:
        return params['q'][0].lower()

    search = re.search(regex, url, re.IGNORECASE)
    if search:
        # parse_qs decodes percent-escapes and '+'; decode here too so both
        # branches return text in the same form.
        return unquote_plus(search.group(2)).lower()
    return None

In [49]:
searchData =logData.copy()
searchData.shape

(1234130, 7)

In [50]:
# Keep only resources that contain the text 'q='.
# This is a plain substring test, so any parameter whose name ends in 'q' matches as well.
has_query = searchData['resource'].str.contains('q=')
searchData = searchData[has_query]
searchData.shape

(5952, 7)

In [51]:
searchData['resource']

9                                                /?q=SOCATv5
775                       /?q=parameter%3A%22File+content%22
2159                           /?q=project%3Alabel%3ASponGES
4107                          /?q=parameter%3A%22Salinity%22
4244                            /?q=event%3Alabel%3ALAVPICCO
                                 ...                        
3872578    /nojs.php?t=Oceans&q=Glacial+and+internaglacia...
3873692          /?q=PI%3Aemail%3Asteven_clemens%40brown.edu
3873761                        /?q=method%3A%22Calculated%22
3873795    /?q=parameter%3A%22%CE%B418O%2C+adjusted%2Fcor...
3873798          /?q=PI%3Aemail%3Asteven_clemens%40brown.edu
Name: resource, Length: 5952, dtype: object

In [52]:
query = searchData['resource'].apply(parse_query)

In [53]:
query

9                                                    socatv5
775                                 parameter:"file content"
2159                                   project:label:sponges
4107                                    parameter:"salinity"
4244                                    event:label:lavpicco
                                 ...                        
3872578    glacial and internaglacial variablity in diato...
3873692                    pi:email:steven_clemens@brown.edu
3873761                                  method:"calculated"
3873795                 parameter:"δ18o, adjusted/corrected"
3873798                    pi:email:steven_clemens@brown.edu
Name: resource, Length: 5952, dtype: object

In [54]:
import nltk
from bs4 import BeautifulSoup
import string 
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /Users/ahmed/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ahmed/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [55]:
tokenizer = RegexpTokenizer(r'\w+')

In [56]:
# Split every query into word tokens (runs of \w characters).
# parse_query returns None when it finds no `q` value; such entries become an
# empty token list instead of raising AttributeError on None.lower().
query_p = query.apply(
    lambda x: tokenizer.tokenize(x.lower()) if isinstance(x, str) else [])

In [57]:
query_p

9                                                  [socatv5]
775                               [parameter, file, content]
2159                               [project, label, sponges]
4107                                   [parameter, salinity]
4244                                [event, label, lavpicco]
                                 ...                        
3872578    [glacial, and, internaglacial, variablity, in,...
3873692              [pi, email, steven_clemens, brown, edu]
3873761                                 [method, calculated]
3873795               [parameter, δ18o, adjusted, corrected]
3873798              [pi, email, steven_clemens, brown, edu]
Name: resource, Length: 5952, dtype: object

In [58]:
# One row per tokenised query, plus the number of tokens it contains.
dfb = pd.DataFrame({'query': query_p.values})
dfb['length'] = dfb['query'].map(len)
dfb

Unnamed: 0,query,length
0,[socatv5],1
1,"[parameter, file, content]",3
2,"[project, label, sponges]",3
3,"[parameter, salinity]",2
4,"[event, label, lavpicco]",3
...,...,...
5947,"[glacial, and, internaglacial, variablity, in,...",15
5948,"[pi, email, steven_clemens, brown, edu]",5
5949,"[method, calculated]",2
5950,"[parameter, δ18o, adjusted, corrected]",4


In [59]:
# How many tokenised queries have each length, ordered by length.
# NOTE: the 'percentage' column holds raw counts; value_counts() is not normalised here.
datab = dfb['length'].value_counts()
dataFrameb = pd.DataFrame(dict(word_number=datab.index, percentage=datab.values))
dataSortb = dataFrameb.set_index('word_number')
dataPlotb = dataSortb.sort_index().reset_index()
dataPlotb

Unnamed: 0,word_number,percentage
0,0,1
1,1,612
2,2,1313
3,3,1894
4,4,980
5,5,348
6,6,373
7,7,198
8,8,66
9,9,69


In [60]:
import plotly.express as px
# Pie chart of the query-length distribution; each slice shows its share and its length label.
fig = px.pie(dataPlotb, names='word_number', values='percentage')
fig.update_traces(textinfo='percent+label', textposition='inside')
fig.show()

In [61]:
import plotly.express as px
# Bar chart of how many queries have each length.
# The 'percentage' column holds raw counts, so the axes are relabelled to say
# what is actually plotted, and the figure gets a title so it stands alone.
fig = px.bar(
    dataPlotb,
    x='word_number',
    y='percentage',
    labels={'word_number': 'words per query', 'percentage': 'number of queries'},
    title='Query length (tokenised queries)',
)
fig.show()

In [62]:
def remove_stopwords(x):
    """Return the tokens of ``x`` that are not NLTK English stop words.

    Args:
        x: iterable of token strings.

    Returns:
        list of the tokens that were kept, in their original order.
    """
    # Load the stop-word list once per call instead of once per token, and keep
    # it in a set so each membership test is constant time.
    english_stopwords = set(stopwords.words('english'))
    return [w for w in x if w not in english_stopwords]

In [63]:
# Remove English stop words from every tokenised query.
query_s = query_p.apply(remove_stopwords)
query_s

9                                                  [socatv5]
775                               [parameter, file, content]
2159                               [project, label, sponges]
4107                                   [parameter, salinity]
4244                                [event, label, lavpicco]
                                 ...                        
3872578    [glacial, internaglacial, variablity, diatom, ...
3873692              [pi, email, steven_clemens, brown, edu]
3873761                                 [method, calculated]
3873795               [parameter, δ18o, adjusted, corrected]
3873798              [pi, email, steven_clemens, brown, edu]
Name: resource, Length: 5952, dtype: object

In [64]:
df = pd.DataFrame({ 'query':query_s.values})

In [65]:
# Number of tokens left in each query.
df['length'] = df['query'].map(len)
df

Unnamed: 0,query,length
0,[socatv5],1
1,"[parameter, file, content]",3
2,"[project, label, sponges]",3
3,"[parameter, salinity]",2
4,"[event, label, lavpicco]",3
...,...,...
5947,"[glacial, internaglacial, variablity, diatom, ...",11
5948,"[pi, email, steven_clemens, brown, edu]",5
5949,"[method, calculated]",2
5950,"[parameter, δ18o, adjusted, corrected]",4


In [66]:
# How many queries have each length, ordered by length.
# NOTE: the 'percentage' column holds raw counts; value_counts() is not normalised here.
data = df['length'].value_counts()
dataFrame = pd.DataFrame(dict(word_number=data.index, percentage=data.values))
dataSort = dataFrame.set_index('word_number')
dataPlot = dataSort.sort_index().reset_index()
dataPlot

Unnamed: 0,word_number,percentage
0,0,2
1,1,617
2,2,1347
3,3,1946
4,4,952
5,5,329
6,6,464
7,7,118
8,8,73
9,9,61


In [67]:
import plotly.express as px
# Pie chart of the query-length distribution; each slice shows its share and its length label.
fig = px.pie(dataPlot, names='word_number', values='percentage')
fig.update_traces(textinfo='percent+label', textposition='inside')
fig.show()

In [68]:
import plotly.express as px
# Bar chart of how many queries have each length.
# The 'percentage' column holds raw counts, so the axes are relabelled to say
# what is actually plotted, and the figure gets a title so it stands alone.
fig = px.bar(
    dataPlot,
    x='word_number',
    y='percentage',
    labels={'word_number': 'words per query', 'percentage': 'number of queries'},
    title='Query length after stop-word removal',
)
fig.show()

In [69]:
lemmatizer = WordNetLemmatizer()


def word_lemmatizer(text):
    """Lemmatize every token in ``text`` and return the results as a list."""
    return list(map(lemmatizer.lemmatize, text))

In [70]:
# Lemmatize the tokens of every query.
query_l = query_s.apply(word_lemmatizer)
query_l

9                                                  [socatv5]
775                               [parameter, file, content]
2159                                [project, label, sponge]
4107                                   [parameter, salinity]
4244                                [event, label, lavpicco]
                                 ...                        
3872578    [glacial, internaglacial, variablity, diatom, ...
3873692              [pi, email, steven_clemens, brown, edu]
3873761                                 [method, calculated]
3873795               [parameter, δ18o, adjusted, corrected]
3873798              [pi, email, steven_clemens, brown, edu]
Name: resource, Length: 5952, dtype: object

In [71]:
stemmer = PorterStemmer()


def word_stemmer(text):
    """Stem each token in ``text`` and join the stems into one space-separated string."""
    stems = (stemmer.stem(token) for token in text)
    return " ".join(stems)

In [72]:
# Stem every lemmatized query; each result is a single space-separated string.
query_st = query_l.apply(word_stemmer)
query_st

9                                                    socatv5
775                                     paramet file content
2159                                     project label spong
4107                                           paramet salin
4244                                    event label lavpicco
                                 ...                        
3872578    glacial internaglaci variabl diatom abund valv...
3873692                     pi email steven_clemen brown edu
3873761                                        method calcul
3873795                          paramet δ18o adjust correct
3873798                     pi email steven_clemen brown edu
Name: resource, Length: 5952, dtype: object

In [73]:
df = pd.DataFrame({ 'query':query_st.values})

In [74]:
# The queries are strings again at this point, so count whitespace-separated words.
words_per_query = df['query'].str.split()
df['length'] = words_per_query.map(len)

In [75]:
df

Unnamed: 0,query,length
0,socatv5,1
1,paramet file content,3
2,project label spong,3
3,paramet salin,2
4,event label lavpicco,3
...,...,...
5947,glacial internaglaci variabl diatom abund valv...,11
5948,pi email steven_clemen brown edu,5
5949,method calcul,2
5950,paramet δ18o adjust correct,4


In [76]:
# Count how many queries have each word count (raw counts, not normalised).
data = df['length'].value_counts()
data

3     1946
2     1347
4      952
1      617
6      464
5      329
7      118
8       73
9       61
11      12
10      11
12       8
16       4
17       3
15       2
13       2
0        2
14       1
Name: length, dtype: int64

In [77]:
# Two-column frame: each distinct word count and how often it occurs.
# NOTE: the 'percentage' column holds the raw counts taken from `data`.
dataFrame = pd.DataFrame({ 'word_number':data.index ,'percentage':data.values})

In [78]:
# Order the rows by word count and turn the index back into a regular column for plotting.
dataSort = dataFrame.set_index('word_number')
dataPlot = dataSort.sort_index().reset_index()
dataPlot

Unnamed: 0,word_number,percentage
0,0,2
1,1,617
2,2,1347
3,3,1946
4,4,952
5,5,329
6,6,464
7,7,118
8,8,73
9,9,61


In [79]:
import plotly

In [80]:
import plotly.express as px
# Pie chart of the query-length distribution; each slice shows its share and its length label.
fig = px.pie(dataPlot, names='word_number', values='percentage')
fig.update_traces(textinfo='percent+label', textposition='inside')
fig.show()

In [81]:
import plotly.express as px
# Bar chart of how many queries have each length.
# The 'percentage' column holds raw counts, so the axes are relabelled to say
# what is actually plotted, and the figure gets a title so it stands alone.
fig = px.bar(
    dataPlot,
    x='word_number',
    y='percentage',
    labels={'word_number': 'words per query', 'percentage': 'number of queries'},
    title='Query length after stop-word removal, lemmatization and stemming',
)
fig.show()