In [2]:
import pandas as pd
import json
import requests
import re
import numpy as np
from datetime import datetime
import pytz
from urllib.parse import urlparse,unquote
from urllib.parse import parse_qs
from bs4 import BeautifulSoup

In [3]:
import os
current_file = os.getcwd()
myfile = os.path.join(current_file, 'doi.pangaea.de-access-ipanonymized.bz2')

In [5]:
logData = pd.read_csv(myfile,
    sep=r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',
    engine='python',
    na_values='-',
    header=None,
    usecols=[0,1,2, 3, 4, 5, 7,8],encoding="utf-8",
    names=['ip','website','session_id', 'time', 'user_info', 'status', 'referer','user_agent'])

In [6]:
logData['session_id'] = logData['session_id'].str.split('>').str[1]

In [7]:
# logData.head(50)

In [8]:
logData['status'] = logData['status'].fillna(0) #convert non-finite values (NA or inf) to integer
logData['status'] = logData['status'].astype(int)
logData['session_id'] = logData['session_id'].fillna('-') #convert non-finite values (NA or inf) to integer

In [9]:
logData['user_info']= logData['user_info'].apply(lambda x: re.sub(r'^"|"$', '', str(x)) )
logData['user_agent']= logData['user_agent'].apply(lambda x: re.sub(r'^"|"$', '',str(x)) )
logData['referer']= logData['referer'].apply(lambda x: re.sub(r'^"|"$', '',str(x)) )
# logData.head(10)

In [10]:
logData['request_type'] = logData['user_info'].str.split().str[0]
logData['resource'] = logData['user_info'].str.split().str[1]
logData['time']= logData['time'].str.split().str[0]
logData['time']= logData['time'].str.split('[').str[1]
# logData.head(10)

In [11]:
logData.shape

(3527530, 10)

In [12]:
logData = logData[~logData.session_id.str.contains(';C')]
logData.shape

(3509595, 10)

In [13]:
logData = logData[((logData.user_info.str.contains('format=zip')) | (logData.user_info.str.contains('format=html'))| (logData.user_info.str.contains('format=textfile'))) ]
logData.shape

(702717, 10)

In [14]:
def time_format(x):
    return datetime.strptime(x,'%d/%b/%Y:%H:%M:%S')

In [15]:
logData['time'] = logData['time'].apply(time_format)

In [16]:
logData = logData[(logData.request_type == 'GET') & ((logData.status == 200 ) | (logData.status == 304 ))]

In [17]:
logData = logData[~logData['resource'].str.match(
    r'^/media|^/static|^/admin|^/robots.txt$|^/favicon.ico$')]
logData = logData[~logData['user_agent'].str.match(
    r'.*?bot|.*?spider|.*?crawler|.*?slurp', flags=re.I).fillna(False)]
logData.shape

(83762, 10)

In [18]:
# logData

In [19]:
Data =logData.copy()

In [20]:
# Data = Data.sort_values(['session_id'])
DataWO = Data[Data.session_id.str.contains('-')]
DataWith = Data[~Data.session_id.str.contains('-')]
DataWith.shape

(24583, 10)

In [21]:
DataWith = DataWith.groupby('session_id').filter(lambda x: len(x) > 1)
DataWith.shape

(23272, 10)

In [22]:
DataWith['remove'] = np.random.randint(1, 6, DataWith.shape[0])

In [23]:
i = DataWith['remove'].value_counts().index.size 
while i != 1 :
    DataWith = DataWith.sort_values(['time','session_id'])
    DataWith['delta_session'] = DataWith.session_id.ne(DataWith.session_id.shift(1))
    DataWith['delta_resource']=DataWith.resource.ne(DataWith.resource.shift(1))
    DataWith['delta_time']=DataWith['time'].diff().dt.total_seconds()
    DataWith['delta_user_agent']=DataWith.user_agent.ne(DataWith.user_agent.shift(1))
    DataWith['remove'] = (DataWith.delta_time > 30) & (DataWith.delta_session == False)  & (DataWith.delta_resource == False)
    i = DataWith['remove'].value_counts().index.size 
    DataWith = DataWith[~DataWith['remove']==True]
DataWith.shape

(22808, 15)

In [24]:
# DataWith.dtypes
DataWith = DataWith.drop(['delta_user_agent','delta_time','delta_resource','delta_session','remove'], axis=1)
# DataWith

In [25]:
DataWO['remove'] = np.random.randint(1, 6, DataWO.shape[0])
# Data['remove'][33] = 'no'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [26]:
DataWO['remove'].value_counts().index

Int64Index([2, 5, 1, 3, 4], dtype='int64')

In [27]:
i = DataWO['remove'].value_counts().index.size 
while i != 1 :
    DataWO = DataWO.sort_values(['time','ip'])
    DataWO['delta_ip'] = DataWO.ip.ne(DataWO.ip.shift(1))
    DataWO['delta_resource']=DataWO.resource.ne(DataWO.resource.shift(1))
    DataWO['delta_time']=DataWO['time'].diff().dt.total_seconds()
    DataWO['delta_user_agent']=DataWO.user_agent.ne(DataWO.user_agent.shift(1))
    DataWO['remove'] = (DataWO.delta_time > 30) & (DataWO.delta_ip == False)  & (DataWO.delta_resource == False)
    i = DataWO['remove'].value_counts().index.size 
    DataWO = DataWO[~DataWO['remove']==True]
DataWO.shape


(58400, 15)

In [28]:
DataWO = DataWO.drop(['delta_user_agent','delta_time','delta_resource','delta_ip','remove'], axis=1)
# DataWO

In [29]:
DataA = DataWith.append(DataWO)

In [30]:
DataA = DataA.sort_values(['time','ip'])
DataA.dtypes

ip                      object
website                 object
session_id              object
time            datetime64[ns]
user_info               object
status                   int64
referer                 object
user_agent              object
request_type            object
resource                object
dtype: object

In [31]:
# delete unwanted cloumns in each and make sure the have the same columns and then add them together using append

In [32]:
# x = '/10.1594/PANGAEA.908578?format=html'
# a =  re.search('\d+',x).group()
# a

In [39]:
# d = 'http://ws.pangaea.de/es/pangaea/panmd/'
# dd = '908578' 
# # ddd = d+dd
# ds = fetch_title(dd)
# ds

In [134]:
x = '794398'
# url = 'https://doi.pangaea.de/10.1594/PANGAEA.'+x+'?format=metadata_jsonld'
# response = requests.get(url) 
# response
# # ss = response.status_code == 200
# # ss
# data = response.json()
data = fetch_title(x)
data
# title = data['creator']['familyName'] + ' '+ data['creator']['givenName']
# title
# soup = BeautifulSoup(response.text)
# soup

{'@context': 'http://schema.org/',
 '@id': 'https://doi.org/10.1594/PANGAEA.794398',
 '@type': 'Dataset',
 'identifier': 'https://doi.org/10.1594/PANGAEA.794398',
 'url': 'https://doi.pangaea.de/10.1594/PANGAEA.794398',
 'creator': [{'@type': 'Person',
   'familyName': "d'Hondt",
   'givenName': 'Steven L'},
  {'@type': 'Person',
   'familyName': 'Jørgensen',
   'givenName': 'Bo Barker',
   'email': 'bo.barker@biology.au.dk'},
  {'@type': 'Person',
   'familyName': 'Miller',
   'givenName': 'D Jay',
   'email': 'john_miller@odp.tamu.edu'},
  {'@type': 'Organization', 'name': 'Shipboard Scientific Party'}],
 'name': 'P-wave velocity (PWS split-core system) on ODP Hole 201-1228A',
 'publisher': {'@type': 'Organization',
  'name': 'PANGAEA',
  'disambiguatingDescription': 'Data Publisher for Earth & Environmental Science',
  'url': 'https://www.pangaea.de/'},
 'includedInDataCatalog': {'@type': 'DataCatalog',
  'name': 'PANGAEA',
  'disambiguatingDescription': 'Data Publisher for Earth & 

In [91]:
# x = '908578'
# url = 'http://ws.pangaea.de/es/pangaea/panmd/'+x
# response = requests.get(url) 
# soup = BeautifulSoup(response.text)
# soup
# title = fetch_all(x)
# title
# m = str(title[0]).split('md:datetime')
# m[1]

In [48]:
def fetch_all(x): 
    url = 'http://ws.pangaea.de/es/pangaea/panmd/'+x
    response = requests.get(url) 
    soup = BeautifulSoup(response.text)
    author = soup.find_all()
    if len(author) > 0:
        return author
    else:
        return 'no title'

In [42]:
def fetch_author(x): 
    url = 'http://ws.pangaea.de/es/pangaea/panmd/'+x
    response = requests.get(url) 
    soup = BeautifulSoup(response.text)
    author = soup.find_all('md:author')
    if len(author) > 0:
        return author[0]
    else:
        return 'no title'

In [61]:
def fetch_dateTime(x): 
    url = 'http://ws.pangaea.de/es/pangaea/panmd/'+x
    response = requests.get(url) 
    soup = BeautifulSoup(response.text)
    dateTime = soup.find_all('md:datetime')
    if len(dateTime) > 0:
        return dateTime[0]
    else:
        return 'no title'

In [44]:
def fetch_year(x): 
    url = 'http://ws.pangaea.de/es/pangaea/panmd/'+x
    response = requests.get(url) 
    soup = BeautifulSoup(response.text)
    year = soup.find_all('md:year')
    if len(year) > 0:
        return year[0]
    else:
        return 'no title'

In [129]:
def fetch_title(x): 
    url = 'https://doi.pangaea.de/10.1594/PANGAEA.'+x+'?format=metadata_jsonld'
    response = requests.get(url)
    if response.status_code == 200: 
        data = response.json()
#         title = data['creator']['givenName'] + ' '+ data['creator']['familyName']
        return data
    else:
        return 'no title'
#     data = response.json()
#     title = data['creator']['givenName'] + ' '+ data['creator']['familyName']
#     if len(title) > 0:
#         return title[0]
#     else:
#         return 'no title'

In [131]:
top = DataA['resource'].value_counts()
df = pd.DataFrame({'resource':top.index, 'count':top.values})
df = df[df.resource.str.contains('PANGAEA.')]
df['paper_id'] = 't'
df['paper_id'] = df['resource'].apply(lambda x: re.search('\.\d+\?',x).group())
df['paper_id']= df['paper_id'].apply(lambda x: re.sub(r'^.|\W', '', str(x)) )
df
# df['test']
# df.to_csv('fv')
# \.\d+\?

Unnamed: 0,resource,count,paper_id
0,/10.1594/PANGAEA.908578?format=html,80,908578
1,/10.1594/PANGAEA.734969?format=textfile,78,734969
2,/10.1594/PANGAEA.774574?format=textfile&charse...,48,774574
3,/10.1594/PANGAEA.816201?format=html,39,816201
4,/10.1594/PANGAEA.805734?format=html,39,805734
...,...,...,...
58659,/10.1594/PANGAEA.890870?format=html,1,890870
58660,/10.1594/PANGAEA.138008?format=textfile,1,138008
58661,/10.1594/PANGAEA.909876?format=textfile&charse...,1,909876
58662,/10.1594/PANGAEA.108510?format=textfile,1,108510


In [136]:
# df['title'] = 't'
df['all'] = df['paper_id'].apply(fetch_title)
# df['title'][0] = fetch_title(df['paper_id'][0])

In [137]:
df

Unnamed: 0,resource,count,paper_id,all
0,/10.1594/PANGAEA.908578?format=html,80,908578,"{'@context': 'http://schema.org/', '@id': 'htt..."
1,/10.1594/PANGAEA.734969?format=textfile,78,734969,"{'@context': 'http://schema.org/', '@id': 'htt..."
2,/10.1594/PANGAEA.774574?format=textfile&charse...,48,774574,"{'@context': 'http://schema.org/', '@id': 'htt..."
3,/10.1594/PANGAEA.816201?format=html,39,816201,"{'@context': 'http://schema.org/', '@id': 'htt..."
4,/10.1594/PANGAEA.805734?format=html,39,805734,"{'@context': 'http://schema.org/', '@id': 'htt..."
...,...,...,...,...
58659,/10.1594/PANGAEA.890870?format=html,1,890870,no title
58660,/10.1594/PANGAEA.138008?format=textfile,1,138008,no title
58661,/10.1594/PANGAEA.909876?format=textfile&charse...,1,909876,no title
58662,/10.1594/PANGAEA.108510?format=textfile,1,108510,no title


In [119]:
df.to_csv('result_all_json')

In [127]:
df.head(200)

Unnamed: 0,resource,count,paper_id,title
0,/10.1594/PANGAEA.908578?format=html,80,908578,"{'@context': 'http://schema.org/', '@id': 'htt..."
1,/10.1594/PANGAEA.734969?format=textfile,78,734969,"{'@context': 'http://schema.org/', '@id': 'htt..."
2,/10.1594/PANGAEA.774574?format=textfile&charse...,48,774574,"{'@context': 'http://schema.org/', '@id': 'htt..."
3,/10.1594/PANGAEA.816201?format=html,39,816201,"{'@context': 'http://schema.org/', '@id': 'htt..."
4,/10.1594/PANGAEA.805734?format=html,39,805734,"{'@context': 'http://schema.org/', '@id': 'htt..."
...,...,...,...,...
195,/10.1594/PANGAEA.848946?format=zip,9,848946,no title
196,/10.1594/PANGAEA.794398?format=textfile,9,794398,no title
197,/10.1594/PANGAEA.794395?format=textfile,9,794395,no title
198,/10.1594/PANGAEA.794396?format=textfile,9,794396,no title


In [121]:
xx = df[df['title'].apply(lambda x: x == 'no title')] 
xx

Unnamed: 0,resource,count,paper_id,title
180,/10.1594/PANGAEA.914907?format=textfile&charse...,9,914907,no title
181,/10.1594/PANGAEA.794399?format=textfile,9,794399,no title
182,/10.1594/PANGAEA.794394?format=textfile,9,794394,no title
183,/10.1594/PANGAEA.799902?format=textfile,9,799902,no title
184,/10.1594/PANGAEA.799903?format=textfile,9,799903,no title
...,...,...,...,...
58659,/10.1594/PANGAEA.890870?format=html,1,890870,no title
58660,/10.1594/PANGAEA.138008?format=textfile,1,138008,no title
58661,/10.1594/PANGAEA.909876?format=textfile&charse...,1,909876,no title
58662,/10.1594/PANGAEA.108510?format=textfile,1,108510,no title


In [122]:
xd = df[~df['title'].apply(lambda x: x == 'no title')]
xd
# df.to_csv('result_all')

Unnamed: 0,resource,count,paper_id,title
0,/10.1594/PANGAEA.908578?format=html,80,908578,"{'@context': 'http://schema.org/', '@id': 'htt..."
1,/10.1594/PANGAEA.734969?format=textfile,78,734969,"{'@context': 'http://schema.org/', '@id': 'htt..."
2,/10.1594/PANGAEA.774574?format=textfile&charse...,48,774574,"{'@context': 'http://schema.org/', '@id': 'htt..."
3,/10.1594/PANGAEA.816201?format=html,39,816201,"{'@context': 'http://schema.org/', '@id': 'htt..."
4,/10.1594/PANGAEA.805734?format=html,39,805734,"{'@context': 'http://schema.org/', '@id': 'htt..."
...,...,...,...,...
175,/10.1594/PANGAEA.883611?format=html,10,883611,"{'@context': 'http://schema.org/', '@id': 'htt..."
176,/10.1594/PANGAEA.872931?format=html,10,872931,"{'@context': 'http://schema.org/', '@id': 'htt..."
177,/10.1594/PANGAEA.875582?format=zip&charset=UTF-8,9,875582,"{'@context': 'http://schema.org/', '@id': 'htt..."
178,/10.1594/PANGAEA.902277?format=html,9,902277,"{'@context': 'http://schema.org/', '@id': 'htt..."


In [None]:
df

In [44]:
df['year'] = df['paper_id'].apply(fetch_year)
df['author'] = df['paper_id'].apply(fetch_author)
df['dateTime'] = df['paper_id'].apply(fetch_dateTime)

ConnectionError: HTTPConnectionPool(host='ws.pangaea.de', port=80): Max retries exceeded with url: /es/pangaea/panmd/850898 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f8a513fe7d0>: Failed to establish a new connection: [Errno 60] Operation timed out'))

In [None]:
df.to_csv('result')

In [None]:
print(df)

In [None]:
# df

In [None]:
# df['year'] = df['paper_id'].apply(fetch_year)

In [52]:
# df['all'] = df['paper_id'].apply(fetch_all)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [54]:
# df['all'][0]

<html><body><p>{"_index":"pangaea_v2","_type":"panmd","_id":"908578","_version":1,"found":true,"_source":{"internal-datestamp":"2020-04-24T10:17:45.000Z","sf-authortitle":"Globevnik Lidija$Broad typology for rivers and lakes in Europe for large scale analysis","meanPosition":{"lat":52.45655,"lon":9.24395},"techKeyword":["citable","author30659","author72410","author72411","author72412","author72413","author72414","author72415","author72416","author72417","author72418","author72419","author72420","author72421","author72422","author72423","author72424","author72425","author72426","author72427","author72428","author72429","event2920149","geocode1600","geocode1601","journal14514","journal16983","journal6666","license21","method10667","param156444","param1570","param16143","param16144","param24303","param25541","param54251","param88478","param99583","pi72410","ref102257","ref102258","ref102259","ref102260","ref102261","ref102262","ref102263","ref102270","term48289","term48549","term51469","t

In [314]:
# df = df[~df['Resource'].apply(lambda x: re.search('\.\d+\?',x))]
# df
# df = df[~df.Resource.str.contains('PANGAEA.')]
# df

Unnamed: 0,Resource,count
29990,/?format=zip,1
54765,/?format=html,1
55240,/?format=textfile,1


In [217]:
# Data = Data[~Data['remove']==True]
# Data.to_csv('review')

In [282]:
# Data.shape

(85537, 13)

In [70]:
# Data.head(100).to_csv('t')

In [147]:
# searchData.referer.value_counts()

In [148]:
# searchData.referer.value_counts().to_csv('c')

In [149]:
# searchData.ip.value_counts()

<md:title>Broad typology for rivers and lakes in Europe for large scale analysis</md:title>

In [311]:
# data = {}
# data['people'] = []
# data['people'].append({
#     'name': 'Scott',
#     'website': 'stackabuse.com',
#     'from': 'Nebraska'
# })

In [312]:
# data

{'people': [{'name': 'Scott',
   'website': 'stackabuse.com',
   'from': 'Nebraska'}]}