In [None]:
import pandas as pd
import numpy as np

In [None]:
# Mount Google Drive inside the Colab runtime at '/content/drive'; the access
# log read further down lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from datetime import datetime
import pytz
# Apache "combined" LogFormat of the log lines parsed below:
#   %h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\"
def parse_str(x):
    """
    Strip the enclosing delimiter characters from a field.

    The first and last characters of `x` are dropped (e.g. the surrounding
    double quotes of a quoted log field). `None` yields the placeholder '-'.
    """
    return '-' if x is None else x[1:-1]

In [None]:
def parse_int(x):
    """Return `int(x)`, or 0 when `x` is None or the conversion raises ValueError."""
    if x is not None:
        try:
            return int(x)
        except ValueError:
            # Not a valid integer literal (e.g. '-' or '12.5'): use the default.
            pass
    return 0

In [None]:
def parse_datetime(x):
    '''
    Parses a bracketed datetime with timezone formatted as:
        `[day/month/year:hour:minute:second zone]`

    Example:
        `>>> parse_datetime('[13/Nov/2015:11:45:42 +0000]')`
        `datetime.datetime(2015, 11, 13, 11, 45, 42, tzinfo=datetime.timezone.utc)`

    The first and last characters (the brackets) are discarded and the
    remainder is parsed in a single `strptime` call. The `%z` directive
    handles the signed `+HHMM` / `-HHMM` offset, so offsets that are negative
    and have a minute part (e.g. `-0530`) are computed correctly.

    Returns a timezone-aware `datetime`, or the placeholder `'-'` when `x` is
    not a string or does not match the expected format.
    '''
    # Mirror the sibling parsers: non-string input (e.g. None) yields the
    # placeholder instead of raising.
    if not isinstance(x, str):
        return '-'
    try:
        return datetime.strptime(x[1:-1], '%d/%b/%Y:%H:%M:%S %z')
    except ValueError:
        return '-'

In [None]:
import re
import pandas as pd
# Per-column converters intended for the access-log fields.
# NOTE(review): these keys ('time', 'request', ...) do not match any of the
# column names passed via `names=` below (they match an earlier naming scheme:
# 'ip', 'time', 'request', 'status', 'size', 'referer', 'user_agent'). The
# output of this cell shows raw bracketed timestamps, quoted strings and float
# status codes, i.e. the converters are not applied. Later cells rely on those
# raw values, so re-keying this dict will change their results -- do it
# deliberately, not as a drive-by fix.
conv = {'time': parse_datetime,
        'request': parse_str,
        'status': parse_int,
        'size': parse_int,
        'referer': parse_str,
        'user_agent': parse_str}

# Fields are separated by whitespace that is neither inside double quotes nor
# inside square brackets (see the look-ahead groups of the separator regex).
df = pd.read_csv('/content/drive/My Drive/web log-data/new_log',
                 header=None,
                 # `error_bad_lines=False` was deprecated in pandas 1.3 and
                 # removed in 2.0; 'warn' keeps the old behaviour of skipping
                 # malformed lines while reporting them.
                 on_bad_lines='warn',
                 sep=r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',
                 engine='python',
                 na_values='-',
                 usecols=[0, 3, 4, 5, 6, 7, 8],
                 names=['Client_IP_address', 'Time_of_activity', 'Requested_Page',
                        'Status_Code', 'Size_of_Page', 'Referer_page', 'Client_Device'],
                 converters=conv)
df.head()

Unnamed: 0,Client_IP_address,Time_of_activity,Requested_Page,Status_Code,Size_of_Page,Referer_page,Client_Device
0,109.169.248.247,[12/Dec/2015:18:25:11 +0100],"""GET /administrator/ HTTP/1.1""",200.0,4263.0,"""-""","""Mozilla/5.0 (Windows NT 6.0; rv:34.0) Gecko/2..."
1,109.169.248.247,[12/Dec/2015:18:25:11 +0100],"""POST /administrator/index.php HTTP/1.1""",200.0,4494.0,"""http://almhuette-raith.at/administrator/""","""Mozilla/5.0 (Windows NT 6.0; rv:34.0) Gecko/2..."
2,46.72.177.4,[12/Dec/2015:18:31:08 +0100],"""GET /administrator/ HTTP/1.1""",200.0,4263.0,"""-""","""Mozilla/5.0 (Windows NT 6.0; rv:34.0) Gecko/2..."
3,46.72.177.4,[12/Dec/2015:18:31:08 +0100],"""POST /administrator/index.php HTTP/1.1""",200.0,4494.0,"""http://almhuette-raith.at/administrator/""","""Mozilla/5.0 (Windows NT 6.0; rv:34.0) Gecko/2..."
4,83.167.113.100,[12/Dec/2015:18:31:25 +0100],"""GET /administrator/ HTTP/1.1""",200.0,4263.0,"""-""","""Mozilla/5.0 (Windows NT 6.0; rv:34.0) Gecko/2..."


## Top 10 frequent visitors of the website

In [None]:
# Distinct client IPs and distinct user-agent strings seen in the log.
ip = set(df['Client_IP_address'])
devices = set(df['Client_Device'])

# A visitor is identified by its (IP address, user agent) pair. Count the log
# lines per visitor in a single pass over the two columns; keys are inserted in
# order of first appearance, which later cells rely on via `list(dict_)`.
dict_ = {}
for visitor in zip(df['Client_IP_address'], df['Client_Device']):
    dict_[visitor] = dict_.get(visitor, 0) + 1

# Visitors ordered by descending hit count (ties keep first-appearance order).
frequent = sorted(dict_, key=dict_.get, reverse=True)
frequent_10 = frequent[:10]
for i, j in enumerate(frequent_10, start=1):
    print(i, "", j)


1  ('205.167.170.15', '"Go-http-client/1.1"')
2  ('79.142.95.122', '"Mozilla/5.0 (Windows NT 5.1; rv:29.0) Gecko/20100101 Firefox/29.0"')
3  ('148.251.50.49', '"-"')
4  ('205.167.170.15', '"python-requests/1.2.3 CPython/2.7.5 Linux/3.14.27-100.fc19.x86_64"')
5  ('37.1.206.196', '"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36"')
6  ('91.200.12.22', '"Opera/9.80 (Windows NT 5.1; U; ru) Presto/2.9.168 Version/11.50"')
7  ('213.150.254.81', '"Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)"')
8  ('84.112.161.41', '"Mozilla/5.0 (Linux; Android 5.0.2; HTC_One Build/LRX22G) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/33.0.0.0 Mobile Safari/537.36"')
9  ('205.167.170.15', '"Mozilla/5.0 (X11; Linux x86_64; rv:44.0) Gecko/20100101 Firefox/44.0"')
10  ('84.58.165.21', '"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:45.0) Gecko/20100101 Firefox/45.0"')


In [None]:
# Collect, for the first 500 visitors (in first-appearance order), every log
# line they produced. The three lists are parallel: entry k of `time`, `page`
# and `al` describes the same request.
selected_clients = list(dict_)[:500]

# One pass over the log: bucket (time, page) pairs per selected visitor instead
# of rescanning the whole frame once per visitor.
rows_by_client = {client: [] for client in selected_clients}
for ip_addr, device, when, requested in zip(df['Client_IP_address'],
                                            df['Client_Device'],
                                            df['Time_of_activity'],
                                            df['Requested_Page']):
    bucket = rows_by_client.get((ip_addr, device))
    if bucket is not None:
        bucket.append((when, requested))

# Flatten grouped by visitor, keeping log order within each visitor.
time = []
page = []
al = []
for client in selected_clients:
    for when, requested in rows_by_client[client]:
        time.append(when)
        page.append(requested)
        al.append(client)

In [None]:
# Preview the first five (IP address, user agent) keys collected above.
al[:5]

[('109.169.248.247',
  '"Mozilla/5.0 (Windows NT 6.0; rv:34.0) Gecko/20100101 Firefox/34.0"'),
 ('109.169.248.247',
  '"Mozilla/5.0 (Windows NT 6.0; rv:34.0) Gecko/20100101 Firefox/34.0"'),
 ('109.169.248.247',
  '"Mozilla/5.0 (Windows NT 6.0; rv:34.0) Gecko/20100101 Firefox/34.0"'),
 ('109.169.248.247',
  '"Mozilla/5.0 (Windows NT 6.0; rv:34.0) Gecko/20100101 Firefox/34.0"'),
 ('109.169.248.247',
  '"Mozilla/5.0 (Windows NT 6.0; rv:34.0) Gecko/20100101 Firefox/34.0"')]

In [None]:
# Preview the first five request timestamps (parallel to `al`).
time[:5]

['[12/Dec/2015:18:25:11 +0100]',
 '[12/Dec/2015:18:25:11 +0100]',
 '[13/Dec/2015:12:10:05 +0100]',
 '[13/Dec/2015:12:10:05 +0100]',
 '[13/Dec/2015:12:43:46 +0100]']

In [None]:
# Preview the first five request lines (parallel to `al`).
page[:5]

['"GET /administrator/ HTTP/1.1"',
 '"POST /administrator/index.php HTTP/1.1"',
 '"GET /administrator/ HTTP/1.1"',
 '"POST /administrator/index.php HTTP/1.1"',
 '"GET /administrator/ HTTP/1.1"']

In [None]:
# One row per collected request: when it happened, what was requested, and
# which (IP address, user agent) client issued it.
df1 = pd.DataFrame(dict(Time=time, Pages=page, Client=al))
df1.head(40)

Unnamed: 0,Time,Pages,Client
0,[12/Dec/2015:18:25:11 +0100],"""GET /administrator/ HTTP/1.1""","(109.169.248.247, ""Mozilla/5.0 (Windows NT 6.0..."
1,[12/Dec/2015:18:25:11 +0100],"""POST /administrator/index.php HTTP/1.1""","(109.169.248.247, ""Mozilla/5.0 (Windows NT 6.0..."
2,[13/Dec/2015:12:10:05 +0100],"""GET /administrator/ HTTP/1.1""","(109.169.248.247, ""Mozilla/5.0 (Windows NT 6.0..."
3,[13/Dec/2015:12:10:05 +0100],"""POST /administrator/index.php HTTP/1.1""","(109.169.248.247, ""Mozilla/5.0 (Windows NT 6.0..."
4,[13/Dec/2015:12:43:46 +0100],"""GET /administrator/ HTTP/1.1""","(109.169.248.247, ""Mozilla/5.0 (Windows NT 6.0..."
5,[13/Dec/2015:12:43:46 +0100],"""POST /administrator/index.php HTTP/1.1""","(109.169.248.247, ""Mozilla/5.0 (Windows NT 6.0..."
6,[12/Dec/2015:18:31:08 +0100],"""GET /administrator/ HTTP/1.1""","(46.72.177.4, ""Mozilla/5.0 (Windows NT 6.0; rv..."
7,[12/Dec/2015:18:31:08 +0100],"""POST /administrator/index.php HTTP/1.1""","(46.72.177.4, ""Mozilla/5.0 (Windows NT 6.0; rv..."
8,[14/Dec/2015:16:39:27 +0100],"""GET /administrator/ HTTP/1.1""","(46.72.177.4, ""Mozilla/5.0 (Windows NT 6.0; rv..."
9,[14/Dec/2015:16:39:28 +0100],"""POST /administrator/index.php HTTP/1.1""","(46.72.177.4, ""Mozilla/5.0 (Windows NT 6.0; rv..."


## Top five referrer sites (pages users visited before arriving at this website)

In [None]:
from urllib.parse import urlparse

# Requests that carry a referer value (missing values are dropped).
referers = df['Referer_page'].dropna(how='all')

# Reduce every referer to its site root, "scheme://host/", after removing the
# double quotes that surround the raw field.
ref = []
for i in referers:
    parsed_uri = urlparse(i.replace('"', ''))
    ref.append('{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri))

# Number of requests per referring site.
word_counter = {}
for url in ref:
    word_counter[url] = word_counter.get(url, 0) + 1

# Sites ordered by descending request count.
popular_words = sorted(word_counter, key=word_counter.get, reverse=True)

# ':///' is what a referer without scheme and host (such as "-") collapses to.
# Filter it out *before* slicing so that exactly five real sites are reported
# whether or not the placeholder ranks among the most frequent entries.
top_5 = [url for url in popular_words if url != ':///'][:5]
for i, j in enumerate(top_5, start=1):
    print(i, "", j)

1  http://almhuette-raith.at/
2  http://www.almhuette-raith.at/
3  https://search.yahoo.com/
4  http://www.almenland.at/
5  https://www.bing.com/


In [None]:
# Flat list of every request line in df1.
Pages_visited_per_session = list(df1['Pages'])

# Group request lines by client, preserving row order within each client.
Pages_accessed = {}
for client, requested in zip(df1['Client'], df1['Pages']):
    Pages_accessed.setdefault(client, []).append(requested)

for client, requested_pages in Pages_accessed.items():
    print('\n Client {} accessed these pages \n {} \n'.format(client, requested_pages))

# One page sequence per client, in first-appearance order of the clients.
all_pages = list(Pages_accessed.values())


 Client ('109.169.248.247', '"Mozilla/5.0 (Windows NT 6.0; rv:34.0) Gecko/20100101 Firefox/34.0"') accessed these pages 
 ['"GET /administrator/ HTTP/1.1"', '"POST /administrator/index.php HTTP/1.1"', '"GET /administrator/ HTTP/1.1"', '"POST /administrator/index.php HTTP/1.1"', '"GET /administrator/ HTTP/1.1"', '"POST /administrator/index.php HTTP/1.1"'] 


 Client ('46.72.177.4', '"Mozilla/5.0 (Windows NT 6.0; rv:34.0) Gecko/20100101 Firefox/34.0"') accessed these pages 
 ['"GET /administrator/ HTTP/1.1"', '"POST /administrator/index.php HTTP/1.1"', '"GET /administrator/ HTTP/1.1"', '"POST /administrator/index.php HTTP/1.1"', '"GET /administrator/ HTTP/1.1"', '"POST /administrator/index.php HTTP/1.1"', '"GET /administrator/ HTTP/1.1"', '"POST /administrator/index.php HTTP/1.1"'] 


 Client ('83.167.113.100', '"Mozilla/5.0 (Windows NT 6.0; rv:34.0) Gecko/20100101 Firefox/34.0"') accessed these pages 
 ['"GET /administrator/ HTTP/1.1"', '"POST /administrator/index.php HTTP/1.1"', '"GET

### 10 least frequent sequential patterns

In [None]:
from google.colab import files
# Prompt for a file upload and keep the first value of the returned mapping in
# `src` (the recorded run uploaded GSP.py).
# NOTE(review): presumably the mapping is filename -> file content; indexing
# [0] raises IndexError if the mapping is empty -- confirm the cancel behaviour.
src = list(files.upload().values())[0]


Saving GSP.py to GSP.py


In [None]:
# Write the uploaded content to GSP.py so it can be imported as a module.
# The context manager closes (and flushes) the file before the import runs.
with open('GSP.py', 'wb') as gsp_file:
    bytes_written = gsp_file.write(src)
bytes_written

4619

In [None]:
import argparse
import logging
import random
from GSP import GSP
# Show DEBUG-level log records (the GSP run below reports its progress there).
logging.basicConfig(level=logging.DEBUG)
# Input for the pattern search: the per-client page sequences built earlier.
transactions=all_pages

In [None]:
# Run the GSP search over the page sequences.
# NOTE(review): 0.2 is presumably the minimum-support threshold -- confirm
# against the signature of GSP.search in GSP.py.
result = GSP(transactions).search(0.2)

DEBUG:root:
        Run 1
        There are 89 candidates.
        The candidates have been filtered down to 2.

DEBUG:root:
        Run 2
        There are 4 candidates.
        The candidates have been filtered down to 2.

DEBUG:root:
        Run 3
        There are 8 candidates.
        The candidates have been filtered down to 2.

DEBUG:root:
        Run 4
        There are 16 candidates.
        The candidates have been filtered down to 2.

DEBUG:root:
        Run 5
        There are 32 candidates.
        The candidates have been filtered down to 2.

DEBUG:root:
        Run 6
        There are 64 candidates.
        The candidates have been filtered down to 2.

DEBUG:root:
        Run 7
        There are 128 candidates.
        The candidates have been filtered down to 2.

DEBUG:root:
        Run 8
        There are 256 candidates.
        The candidates have been filtered down to 2.

DEBUG:root:
        Run 9
        There are 512 candidates.
        The candidates have been fil

In [None]:
# Display the search result: a list of dicts, each mapping a tuple of request
# lines to an integer (see the output below).
result

[{('"GET /administrator/ HTTP/1.1"',): 401,
  ('"POST /administrator/index.php HTTP/1.1"',): 400},
 {('"GET /administrator/ HTTP/1.1"',
   '"POST /administrator/index.php HTTP/1.1"'): 399,
  ('"POST /administrator/index.php HTTP/1.1"',
   '"GET /administrator/ HTTP/1.1"'): 229},
 {('"GET /administrator/ HTTP/1.1"',
   '"POST /administrator/index.php HTTP/1.1"',
   '"GET /administrator/ HTTP/1.1"'): 229,
  ('"POST /administrator/index.php HTTP/1.1"',
   '"GET /administrator/ HTTP/1.1"',
   '"POST /administrator/index.php HTTP/1.1"'): 229},
 {('"GET /administrator/ HTTP/1.1"',
   '"POST /administrator/index.php HTTP/1.1"',
   '"GET /administrator/ HTTP/1.1"',
   '"POST /administrator/index.php HTTP/1.1"'): 229,
  ('"POST /administrator/index.php HTTP/1.1"',
   '"GET /administrator/ HTTP/1.1"',
   '"POST /administrator/index.php HTTP/1.1"',
   '"GET /administrator/ HTTP/1.1"'): 155},
 {('"GET /administrator/ HTTP/1.1"',
   '"POST /administrator/index.php HTTP/1.1"',
   '"GET /administrato

In [None]:
# Keep only the page sequences (the dict keys) of every result level,
# discarding their counts.
navigation = [list(level) for level in result]
navigation

[[('"GET /administrator/ HTTP/1.1"',),
  ('"POST /administrator/index.php HTTP/1.1"',)],
 [('"POST /administrator/index.php HTTP/1.1"',
   '"GET /administrator/ HTTP/1.1"'),
  ('"GET /administrator/ HTTP/1.1"',
   '"POST /administrator/index.php HTTP/1.1"')],
 [('"GET /administrator/ HTTP/1.1"',
   '"POST /administrator/index.php HTTP/1.1"',
   '"GET /administrator/ HTTP/1.1"'),
  ('"POST /administrator/index.php HTTP/1.1"',
   '"GET /administrator/ HTTP/1.1"',
   '"POST /administrator/index.php HTTP/1.1"')],
 [('"GET /administrator/ HTTP/1.1"',
   '"POST /administrator/index.php HTTP/1.1"',
   '"GET /administrator/ HTTP/1.1"',
   '"POST /administrator/index.php HTTP/1.1"'),
  ('"POST /administrator/index.php HTTP/1.1"',
   '"GET /administrator/ HTTP/1.1"',
   '"POST /administrator/index.php HTTP/1.1"',
   '"GET /administrator/ HTTP/1.1"')],
 [('"GET /administrator/ HTTP/1.1"',
   '"POST /administrator/index.php HTTP/1.1"',
   '"GET /administrator/ HTTP/1.1"',
   '"POST /administrator/

In [None]:
# Reverse-sorted copy of `navigation`.
# NOTE(review): the elements are lists of string tuples, so this orders them
# lexicographically by their content, not by the pattern counts in `result` --
# confirm this is the intended notion of "least frequent".
l = sorted(navigation, reverse=True)
l

[[('"POST /administrator/index.php HTTP/1.1"',
   '"GET /administrator/ HTTP/1.1"'),
  ('"GET /administrator/ HTTP/1.1"',
   '"POST /administrator/index.php HTTP/1.1"')],
 [('"GET /administrator/ HTTP/1.1"',
   '"POST /administrator/index.php HTTP/1.1"',
   '"GET /administrator/ HTTP/1.1"',
   '"POST /administrator/index.php HTTP/1.1"',
   '"GET /administrator/ HTTP/1.1"',
   '"POST /administrator/index.php HTTP/1.1"',
   '"GET /administrator/ HTTP/1.1"',
   '"POST /administrator/index.php HTTP/1.1"',
   '"GET /administrator/ HTTP/1.1"',
   '"POST /administrator/index.php HTTP/1.1"')],
 [('"GET /administrator/ HTTP/1.1"',
   '"POST /administrator/index.php HTTP/1.1"',
   '"GET /administrator/ HTTP/1.1"',
   '"POST /administrator/index.php HTTP/1.1"',
   '"GET /administrator/ HTTP/1.1"',
   '"POST /administrator/index.php HTTP/1.1"',
   '"GET /administrator/ HTTP/1.1"',
   '"POST /administrator/index.php HTTP/1.1"',
   '"GET /administrator/ HTTP/1.1"'),
  ('"POST /administrator/index.ph

### A graph that shows clusters of users with similar navigational patterns