### Import Libraries

In [2]:
import pandas as pd
import os
import numpy as np
from sklearn.cluster import KMeans

### Define Helper Functions

In [3]:
def check_status(value):
    """Map an HTTP status code to its class number.

    Codes 100-199 map to 1, 200-299 to 2, 300-399 to 3, 400-499 to 4 and
    500-599 to 5.

    Parameters
    ----------
    value : int
        HTTP status code (a plain ``int`` or a NumPy integer).

    Returns
    -------
    int or None
        The class number from 1 to 5, or ``None`` when ``value`` lies
        outside 100-599.
    """
    for status_class in range(1, 6):
        lower_bound = status_class * 100
        # range() excludes its stop value, so the stop has to be the first
        # code of the *next* class; otherwise codes ending in 99 are lost.
        if value in range(lower_bound, lower_bound + 100):
            return status_class
    return None

def check_size(value):
    """Bucket a response size into one of five ordinal classes.

    Buckets (both ends inclusive unless noted):

    * 1 -- 0 to 1000
    * 2 -- 1001 to 5000
    * 3 -- 5001 to 10000
    * 4 -- 10001 to 15000
    * 5 -- greater than 15000

    Parameters
    ----------
    value : int or float
        Size of the response.

    Returns
    -------
    int or None
        The bucket number from 1 to 5, or ``None`` for negative or NaN
        input.
    """
    # "not >=" (instead of "<") also rejects NaN, which compares False
    # against everything.
    if not value >= 0:
        return None
    # Inclusive upper limits, so the boundary sizes 1000, 5000, 10000 and
    # 15000 each land in a bucket instead of falling through to None.
    upper_limits = (1000, 5000, 10000, 15000)
    for bucket, upper_limit in enumerate(upper_limits, start=1):
        if value <= upper_limit:
            return bucket
    return 5

### Load Dataset

In [4]:
# Read the semicolon-separated access log into a DataFrame and display it.
data = pd.read_csv(
    'DATA.csv',
    sep=';',
)
data

Unnamed: 0,Ip Address,User Identifier,User Identity,Date,Time,Page Request,Status,Size Request,Reference,Browser
0,192.168.1.16,0,0,04-Dec-18,21:52:00,https://portal.uad.ac.id/pembayaran/Kuliah,200,621,0,Google chrome
1,192.168.1.16,0,0,05-Dec-18,18:48:30,https://portal.uad.ac.id/transkrip/Transkrip,200,54947,0,Google chrome
2,64.233.173.168,0,0,06-Dec-18,8:30:03,https://portal.uad.ac.id/prestasi/prestasi,302,35754,0,Google Chrome
3,192.168.1.16,0,0,06-Dec-18,9:28:40,https://portal.uad.ac.id/transkrip/Transkrip,200,542,0,Google Chrome
4,64.233.173.170,0,0,06-Dec-18,20:42:00,https://portal.uad.ac.id/presensi/presensi,304,453,0,Google Chrome
5,192.168.1.16,0,0,07-Dec-18,14:21:02,https://portal.uad.ac.id/cdc/Agenda,200,453,0,Google Chrome
6,192.168.1.16,0,0,07-Dec-18,14:31:00,https://portal.uad.ac.id/wisuda/Wisuda,200,345,0,Google Chrome
7,64.233.173.172,0,0,07-Dec-18,19:37:00,https://portal.uad.ac.id/perpus/sirkulasi,200,455,0,Google Chrome
8,192.168.1.16,0,0,07-Dec-18,20:13:00,https://portal.uad.ac.id/skripsi/Skripsi,304,355,0,Google Chrome
9,192.168.1.16,0,0,07-Dec-18,21:38:04,https://portal.uad.ac.id/lsp/Asesmen/index,200,454,0,Google Chrome


### Define Month List

In [5]:
# Month abbreviations in calendar order; a month's number is its position
# in this list plus one.
list_bulan = [
    'Jan', 'Feb', 'Mar',
    'Apr', 'May', 'Jun',
    'Jul', 'Aug', 'Sep',
    'Oct', 'Nov', 'Dec',
]

### Preprocess the Dataset

In [6]:
# Encode every log entry as integers: the numeric parts of the date
# (month abbreviation replaced by its 1-based position in list_bulan),
# the numeric parts of the time, then the status class and size class.
encoded_rows = []
for date_text, time_text, status_code, size_value in zip(
        data['Date'], data['Time'], data['Status'], data['Size Request']):
    date_parts = date_text.split("-")
    date_parts[1] = list_bulan.index(date_parts[1]) + 1
    row = [int(part) for part in date_parts]
    row.extend(int(part) for part in time_text.split(":"))
    row.append(check_status(status_code))
    row.append(check_size(size_value))
    encoded_rows.append(row)

preprocessing = pd.DataFrame(
    encoded_rows,
    columns=['Tanggal', 'Bulan', 'Tahun', 'Pukul', 'Menit', 'Detik', 'Status', 'Size Request'],
)
preprocessing

Unnamed: 0,Tanggal,Bulan,Tahun,Pukul,Menit,Detik,Status,Size Request
0,4,12,18,21,52,0,2,1
1,5,12,18,18,48,30,2,5
2,6,12,18,8,30,3,3,5
3,6,12,18,9,28,40,2,1
4,6,12,18,20,42,0,3,1
5,7,12,18,14,21,2,2,1
6,7,12,18,14,31,0,2,1
7,7,12,18,19,37,0,2,1
8,7,12,18,20,13,0,3,1
9,7,12,18,21,38,4,2,1


### Get Access Frequency Grouped by Date

In [53]:
# Count how many requests fall on each day-of-month value ('Tanggal').
get_column_date = preprocessing['Tanggal'].to_frame(name='Tanggal')
get_frequently = get_column_date.groupby('Tanggal').size()

# Both the de-duplicated days and the group sizes are ordered by ascending
# 'Tanggal', so the counts can be attached positionally.
grouping_by_date = (
    get_column_date
    .drop_duplicates()
    .sort_values(by=['Tanggal'])
    .assign(Frequently=get_frequently.values)
)
grouping_by_date

Unnamed: 0,Tanggal,Frequently
30,1,13
20,2,19
23,3,22
0,4,18
1,5,23
2,6,19
5,7,35


### Timestamp Transformations

In [8]:
# Build a lagged view of the daily frequencies: column T_k holds the
# frequency k rows further down, padded with 0 where the series runs out.
date_list = list(grouping_by_date['Tanggal'])
date_0 = list(grouping_by_date['Frequently'])
date_1 = date_0[1:]
date_2 = date_0[2:]
date_transformation = [date_list, date_0, date_1, date_2]

# The ragged rows are NaN-padded by the constructor; after transposing,
# the row labels become the column names.
date_transformation_df = (
    pd.DataFrame(
        date_transformation,
        index=['Tanggal', 'Data T_0', 'Data T_1', 'Data T_2'],
    )
    .T
    .fillna(0)
    .astype(int)
)
date_transformation_df

Unnamed: 0,Tanggal,Data T_0,Data T_1,Data T_2
0,1,13,19,22
1,2,19,22,18
2,3,22,18,23
3,4,18,23,19
4,5,23,19,35
5,6,19,35,0
6,7,35,0,0


### Cluster with K-Means

In [9]:
# Fit a 3-cluster K-Means model on the lagged frequency table.
# n_init is pinned to 10 (the value shown in the recorded output) because
# the library default changed to 'auto' in newer scikit-learn releases,
# which would otherwise make the clustering depend on the installed version.
# random_state fixes the centroid initialisation so reruns are repeatable.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0).fit(date_transformation_df)
kmeans

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=0, tol=0.0001, verbose=0)

In [10]:
# Predict the cluster index for every row of the table the model was
# fitted on, then display the resulting label array.
labels = kmeans.predict(date_transformation_df)
labels

array([0, 0, 0, 0, 0, 1, 2])

In [11]:
# Fitted cluster centres (one row per cluster) and the matching list of
# cluster ids 0..k-1.
centroids = kmeans.cluster_centers_
list_label = [cluster_id for cluster_id, _ in enumerate(centroids)]
centroids

array([[ 3. , 19. , 20.2, 23.4],
       [ 6. , 19. , 35. ,  0. ],
       [ 7. , 35. ,  0. ,  0. ]])

In [12]:
# Attach the predicted cluster label to a copy of the lagged table,
# leaving date_transformation_df itself untouched.
new_date_transformation_df = date_transformation_df.assign(Cluster=labels)
new_date_transformation_df

Unnamed: 0,Tanggal,Data T_0,Data T_1,Data T_2,Cluster
0,1,13,19,22,0
1,2,19,22,18,0
2,3,22,18,23,0
3,4,18,23,19,0
4,5,23,19,35,0
5,6,19,35,0,1
6,7,35,0,0,2
