In [44]:
import pandas as pd
import json
import requests
import re
import numpy as np
from datetime import datetime
import pytz
from urllib.parse import urlparse,unquote
from urllib.parse import parse_qs
from bs4 import BeautifulSoup
from requests.exceptions import HTTPError


In [45]:
import os
current_file = os.getcwd()
myfile = os.path.join(current_file, 'doi.pangaea.de-access-ipanonymized.bz2')

In [46]:
logData = pd.read_csv(myfile,
    sep=r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',
    engine='python',
    na_values='-',
    header=None,
    usecols=[0,1,2, 3, 4, 5, 7,8],encoding="utf-8",
    names=['ip','website','session_id', 'time', 'user_info', 'status', 'referer','user_agent'])

In [47]:
logData['session_id'] = logData['session_id'].str.split('>').str[1]

In [48]:
logData['status'] = logData['status'].fillna(0) #convert non-finite values (NA or inf) to integer
logData['status'] = logData['status'].astype(int)
logData['session_id'] = logData['session_id'].fillna('-') #convert non-finite values (NA or inf) to integer

In [49]:
logData['user_info']= logData['user_info'].apply(lambda x: re.sub(r'^"|"$', '', str(x)) )
logData['user_agent']= logData['user_agent'].apply(lambda x: re.sub(r'^"|"$', '',str(x)) )
logData['referer']= logData['referer'].apply(lambda x: re.sub(r'^"|"$', '',str(x)) )

In [50]:
logData['request_type'] = logData['user_info'].str.split().str[0]
logData['resource'] = logData['user_info'].str.split().str[1]
logData['time']= logData['time'].str.split().str[0]
logData['time']= logData['time'].str.split('[').str[1]

In [51]:
logData.shape

(3527530, 10)

In [52]:
logData = logData[~logData.session_id.str.contains(';C')]
logData.shape

(3509595, 10)

In [53]:
logData = logData[((logData.user_info.str.contains('format=zip')) | (logData.user_info.str.contains('format=html'))| (logData.user_info.str.contains('format=textfile'))) ]
logData.shape

(702717, 10)

In [54]:
def time_format(x):
    return datetime.strptime(x,'%d/%b/%Y:%H:%M:%S')

In [55]:
logData['time'] = logData['time'].apply(time_format)

In [56]:
logData = logData[(logData.request_type == 'GET') & ((logData.status == 200 ) | (logData.status == 304 ))]

In [57]:
logData = logData[~logData['resource'].str.match(
    r'^/media|^/static|^/admin|^/robots.txt$|^/favicon.ico$')]
logData = logData[~logData['user_agent'].str.match(
    r'.*?bot|.*?spider|.*?crawler|.*?slurp', flags=re.I).fillna(False)]
logData.shape

(83762, 10)

In [92]:
Data =logData.copy()

In [93]:
DataWO = Data[Data.session_id.str.contains('-')]
DataWith = Data[~Data.session_id.str.contains('-')]
DataWith.shape

(24583, 10)

In [94]:
# we remove the session with only log because ?
DataWith = DataWith.groupby(['session_id','user_agent']).filter(lambda x: len(x) > 1)
DataWith.shape

(23120, 10)

In [95]:
DataWith['remove'] = np.random.randint(1, 6, DataWith.shape[0])
DataWith['remove'].value_counts().index.size

5

In [96]:
i = DataWith['remove'].value_counts().index.size 
while i != 1 :
    DataWith = DataWith.sort_values(['time','session_id'])
    DataWith['delta_session'] = DataWith.session_id.ne(DataWith.session_id.shift(1))
    DataWith['delta_resource']=DataWith.resource.ne(DataWith.resource.shift(1))
    DataWith['delta_time']=DataWith['time'].diff().dt.total_seconds()
    DataWith['delta_user_agent']=DataWith.user_agent.ne(DataWith.user_agent.shift(1))
    DataWith['remove'] = (DataWith.delta_time > 30) & (DataWith.delta_session == False)  & (DataWith.delta_resource == False)
    i = DataWith['remove'].value_counts().index.size 
    DataWith = DataWith[~DataWith['remove']==True]
DataWith.shape

(22658, 15)

In [63]:
DataWith = DataWith.drop(['delta_user_agent','delta_time','delta_resource','delta_session','remove'], axis=1)

In [64]:
DataWO['remove'] = np.random.randint(1, 6, DataWO.shape[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [65]:
DataWO['remove'].value_counts().index

Int64Index([2, 3, 4, 5, 1], dtype='int64')

In [66]:
i = DataWO['remove'].value_counts().index.size 
while i != 1 :
    DataWO = DataWO.sort_values(['time','ip'])
    DataWO['delta_ip'] = DataWO.ip.ne(DataWO.ip.shift(1))
    DataWO['delta_resource']=DataWO.resource.ne(DataWO.resource.shift(1))
    DataWO['delta_time']=DataWO['time'].diff().dt.total_seconds()
    DataWO['delta_user_agent']=DataWO.user_agent.ne(DataWO.user_agent.shift(1))
    DataWO['remove'] = (DataWO.delta_time > 30) & (DataWO.delta_ip == False)  & (DataWO.delta_resource == False)
    i = DataWO['remove'].value_counts().index.size 
    DataWO = DataWO[~DataWO['remove']==True]
DataWO.shape


(58400, 15)

In [67]:
DataWO = DataWO.drop(['delta_user_agent','delta_time','delta_resource','delta_ip','remove'], axis=1)

In [68]:
DataA = DataWith.append(DataWO)

In [69]:
DataA = DataA.sort_values(['time','ip'])
DataA.dtypes

ip                      object
website                 object
session_id              object
time            datetime64[ns]
user_info               object
status                   int64
referer                 object
user_agent              object
request_type            object
resource                object
dtype: object

In [70]:
top = DataA['resource'].value_counts()
df = pd.DataFrame({'resource':top.index, 'count':top.values})
df = df[df.resource.str.contains('PANGAEA.')]
df['paper_id'] = 't'
df['paper_id'] = df['resource'].apply(lambda x: re.search('\.\d+\?',x).group())
df['paper_id']= df['paper_id'].apply(lambda x: re.sub(r'^.|\W', '', str(x)) )
df
# df['test']
# df.to_csv('fv')
# \.\d+\?

Unnamed: 0,resource,count,paper_id
0,/10.1594/PANGAEA.908578?format=html,80,908578
1,/10.1594/PANGAEA.734969?format=textfile,78,734969
2,/10.1594/PANGAEA.774574?format=textfile&charse...,48,774574
3,/10.1594/PANGAEA.898014?format=html,39,898014
4,/10.1594/PANGAEA.805734?format=html,39,805734
...,...,...,...
58659,/10.1594/PANGAEA.773661?format=html,1,773661
58660,/10.1594/PANGAEA.293983?format=textfile,1,293983
58661,/10.1594/PANGAEA.678153?format=textfile&charse...,1,678153
58662,/10.1594/PANGAEA.753142?format=textfile,1,753142


In [71]:
def fetch_all(x): 
    url = 'http://ws.pangaea.de/es/pangaea/panmd/'+x
    response = requests.get(url) 
    soup = BeautifulSoup(response.text)
    meta = soup.find_all()
    if len(meta) > 0:
        return meta
    else:
        return 'no meta data'

In [None]:
df['all'] = df['paper_id'].apply(fetch_all)

In [None]:
df = df[~df['all'].apply(lambda x: x == 'no meta data')]
df

In [None]:
# df.to_csv('result_all_json')

In [97]:
udf = DataA.copy()
# udf
udfGroup = udf.groupby(['session_id','resource'])
xx = udfGroup.resource.unique()
key = xx.apply(lambda x: str(x))
key
# key

session_id                        resource                                             
-                                 /10.1594/PANGAEA.101325?format=textfile                        ['/10.1594/PANGAEA.101325?format=textfile']
                                  /10.1594/PANGAEA.101326?format=textfile                        ['/10.1594/PANGAEA.101326?format=textfile']
                                  /10.1594/PANGAEA.101327?format=textfile                        ['/10.1594/PANGAEA.101327?format=textfile']
                                  /10.1594/PANGAEA.101328?format=textfile                        ['/10.1594/PANGAEA.101328?format=textfile']
                                  /10.1594/PANGAEA.101329?format=textfile                        ['/10.1594/PANGAEA.101329?format=textfile']
                                                                                                                 ...                        
ff3960fe11a8da1230bb237ba1d552fa  /10.1594/PANGAEA.757561?format=t

In [98]:
to = key.value_counts()
to
# to.to_csv('sendResults')

['/10.1594/PANGAEA.774574?format=textfile&charset=UTF-8']    21
['/10.1594/PANGAEA.56040?format=textfile&charset=UTF-8']     18
['/10.1594/PANGAEA.56040?format=html']                       17
['/10.1594/PANGAEA.816201?format=html']                      17
['/10.1594/PANGAEA.898014?format=html']                      14
                                                             ..
['/10.1594/PANGAEA.321181?format=textfile']                   1
['/10.1594/PANGAEA.861810?format=textfile&charset=UTF-8']     1
['/10.1594/PANGAEA.891575?format=textfile']                   1
['/10.1594/PANGAEA.693920?format=zip']                        1
['/10.1594/PANGAEA.476283?format=textfile']                   1
Name: resource, Length: 58664, dtype: int64

In [100]:
dfu = pd.DataFrame({'resource':to.index, 'count':to.values})

In [101]:
dfu.to_csv('uniqueResults')