## Web Mining

In [1]:
import pandas as pd
from sklearn.cluster import KMeans

### Import cleaned dataset

In [3]:
df = pd.read_csv('CaseStudy2Data/web_log_data.csv')
df.info()
df.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5866 entries, 0 to 5865
Data columns (total 6 columns):
ip           5866 non-null object
date_time    5866 non-null object
request      5866 non-null object
step         5866 non-null int64
session      5866 non-null int64
user_id      5866 non-null int64
dtypes: int64(3), object(3)
memory usage: 275.0+ KB


Unnamed: 0,ip,date_time,request,step,session,user_id
0,c210-49-32-6.rochd2.,18/Apr/2005:21:25:07,/,1,3,3
1,visp.inabox.telstra.,19/Apr/2005:08:24:28,/,1,12,12
2,dsl-61-95-54-84.requ,19/Apr/2005:08:33:01,/,1,13,13
3,d220-236-91-52.dsl.n,19/Apr/2005:09:16:06,/,1,15,15
4,allptrs.eq.edu.au,19/Apr/2005:09:47:54,/,1,22,22
5,cpe-144-136-135-38.q,19/Apr/2005:10:13:37,/,1,23,23
6,225-145-222-203.rev.,19/Apr/2005:11:48:32,/,1,25,25
7,cpe-138-130-198-54.q,19/Apr/2005:12:31:54,/,1,26,26
8,203-219-44-170-qld.t,19/Apr/2005:12:33:49,/,1,29,29
9,cpe-138-130-198-54.q,19/Apr/2005:12:42:51,/,1,30,30


In [6]:
#data time
df['date_time'] =  pd.to_datetime(df['date_time'], format="%d/%b/%Y:%H:%M:%S")
# Create features for year, month, day, hour, and minute 
df['year'] = df['date_time'].dt.year 
df['month'] = df['date_time'].dt.month 
df['day'] = df['date_time'].dt.day 
df['name_of_day'] = df['date_time'].dt.weekday_name 
df['hour'] = df['date_time'].dt.hour 
df['minute'] = df['date_time'].dt.minute

In [7]:
day_map = {'Monday':1, 'Tuesday': 2,'Wednesday': 3, 'Thursday': 4, 'Friday': 5, 'Saturday': 6, 'Sunday': 7, 'Oct': 10, 'Nov': 11, 'Dec': 12}
df['name_of_day'] = df['name_of_day'].map(day_map)
df.head()

Unnamed: 0,ip,date_time,request,step,session,user_id,year,month,day,name_of_day,hour,minute
0,c210-49-32-6.rochd2.,2005-04-18 21:25:07,/,1,3,3,2005,4,18,1,21,25
1,visp.inabox.telstra.,2005-04-19 08:24:28,/,1,12,12,2005,4,19,2,8,24
2,dsl-61-95-54-84.requ,2005-04-19 08:33:01,/,1,13,13,2005,4,19,2,8,33
3,d220-236-91-52.dsl.n,2005-04-19 09:16:06,/,1,15,15,2005,4,19,2,9,16
4,allptrs.eq.edu.au,2005-04-19 09:47:54,/,1,22,22,2005,4,19,2,9,47


In [9]:
from sklearn.preprocessing import StandardScaler

# take 3 variables and drop the rest
df2 = df[['step', 'hour', 'name_of_day']]

# convert df2 to matrix
X = df2.as_matrix()

# scaling
scaler = StandardScaler()
X = scaler.fit_transform(X)

  import sys


## Clustering

In [None]:
# list to save the clusters and cost
clusters = []
inertia_vals = []

# this whole process should take a while
for k in range(1, 10, 1):
    # train clustering with the specified K
    model = KMeans(n_clusters=k, random_state=rs, n_jobs=10)
    model.fit(X)
    
    # append model to cluster list
    clusters.append(model)
    inertia_vals.append(model.inertia_)
    
    # plot the inertia vs K values
plt.plot(range(1,10,1), inertia_vals, marker='*')
plt.show()

In [20]:
from sklearn.metrics import silhouette_score

print(clusters[2])
print("Silhouette score for k=3", silhouette_score(X, clusters[2].predict(X)))

print(clusters[3])
print("Silhouette score for k=4", silhouette_score(X, clusters[3].predict(X)))

print(clusters[3])
print("Silhouette score for k=5", silhouette_score(X, clusters[4].predict(X)))

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=3, n_init=10, n_jobs=10, precompute_distances='auto',
    random_state=42, tol=0.0001, verbose=0)
Silhouette score for k=3 0.2657794040249854
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=4, n_init=10, n_jobs=10, precompute_distances='auto',
    random_state=42, tol=0.0001, verbose=0)
Silhouette score for k=4 0.3010470296512427
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=4, n_init=10, n_jobs=10, precompute_distances='auto',
    random_state=42, tol=0.0001, verbose=0)
Silhouette score for k=5 0.28793722657204324


In [11]:
# random state
rs = 42
model = KMeans(n_clusters=4, random_state=rs)
model.fit(X)

# sum of intra-cluster distances
print("Sum of intra-cluster distance:", model.inertia_)

print("Centroid locations:")
for centroid in model.cluster_centers_:
    print(centroid)

Sum of intra-cluster distance: 7659.434160327952
Centroid locations:
[-0.19198788  0.59042456  0.82524143]
[-0.14590137  0.29656485 -0.97154756]
[2.80431189 0.43669609 0.31241287]
[-0.3175147  -1.19703703  0.21580576]


In [12]:
import seaborn as sns
import matplotlib.pyplot as plt

model = KMeans(n_clusters=4, random_state=rs).fit(X)

# assign cluster ID to each record in X
# Ignore the warning, does not apply to our case here
y = model.predict(X)
df2['Cluster_ID'] = y

# how many records are in each cluster
print("Cluster membership")
print(df2['Cluster_ID'].value_counts())

# pairplot the cluster distribution.
cluster_g = sns.pairplot(df2, hue='Cluster_ID')
plt.show()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Cluster membership
1    2046
0    1844
3    1566
2     410
Name: Cluster_ID, dtype: int64


  binned = fast_linbin(X, a, b, gridsize) / (delta * nobs)
  FAC1 = 2*(np.pi*bw/RANGE)**2


<Figure size 1082.11x1000 with 20 Axes>

## Association mining

In [13]:
# group by account, then list all services
transactions = df.groupby(['ip'])['request'].apply(list)
print(transactions.head(20))

ip
002.b.004.brs.iprimu                                  [/, /services.html]
034.mel0205.mel.ipri                                                  [/]
038.b.004.brs.iprimu                                                  [/]
042.058.dsl.mel.ipri                                                  [/]
058.a.001.cns.iprimu    [/eaglefarm/, /eaglefarm/javascript/menu.js, /...
081.a.004.syd.iprimu                                                  [/]
086.a.002.brs.iprimu         [/eaglefarm/, /eaglefarm/javascript/menu.js]
099.a.001.brs.iprimu                             [/, /, /springwood.html]
1.cust21.qld.dsl.oze                                    [/, /favicon.ico]
103.cust20.qld.dsl.o    [/, /favicon.ico, /favicon.ico, /favicon.ico, ...
107.a.004.brs.iprimu    [/, /, /acacia.html, /direct.html, /eaglefarm,...
11.128-142-203.dart.                    [/, /services.html, /whoare.html]
114.a.001.brs.iprimu    [/, /acacia.html, /direct.html, /eaglefarm, /e...
118.a.002.gct.iprimu    [/, /acacia

In [14]:
def convert_apriori_results_to_pandas_df(results):
    rules = []
    
    for rule_set in results:
        for rule in rule_set.ordered_statistics:
            # items_base = left side of rules, items_add = right side
            # support, confidence and lift for respective rules
            rules.append([','.join(rule.items_base), ','.join(rule.items_add),
                         rule_set.support, rule.confidence, rule.lift]) 
    
    # typecast it to pandas df
    return pd.DataFrame(rules, columns=['Left_side', 'Right_side', 'Support', 'Confidence', 'Lift']) 

In [15]:
from apyori import apriori

# type cast the transactions from pandas into normal list format and run apriori
transaction_list = list(transactions)
results = list(apriori(transaction_list, min_support=0.1, min_confidence = 0.15, min_lift = 3, min_length = 2, max_length = 2))

# print first 5 rules
print(results[:5])

[RelationRecord(items=frozenset({'/eaglefarm/', '/eaglefarm/javascript/menu.js'}), support=0.21597300337457817, ordered_statistics=[OrderedStatistic(items_base=frozenset({'/eaglefarm/'}), items_add=frozenset({'/eaglefarm/javascript/menu.js'}), confidence=0.923076923076923, lift=3.6150457472043374), OrderedStatistic(items_base=frozenset({'/eaglefarm/javascript/menu.js'}), items_add=frozenset({'/eaglefarm/'}), confidence=0.8458149779735682, lift=3.615045747204337)]), RelationRecord(items=frozenset({'/eaglefarm/pdf/Web_Price_List.pdf', '/eaglefarm/'}), support=0.140607424071991, ordered_statistics=[OrderedStatistic(items_base=frozenset({'/eaglefarm/'}), items_add=frozenset({'/eaglefarm/pdf/Web_Price_List.pdf'}), confidence=0.6009615384615384, lift=3.7360476062399135), OrderedStatistic(items_base=frozenset({'/eaglefarm/pdf/Web_Price_List.pdf'}), items_add=frozenset({'/eaglefarm/'}), confidence=0.8741258741258741, lift=3.7360476062399135)]), RelationRecord(items=frozenset({'/eaglefarm/price

In [16]:
result_df = convert_apriori_results_to_pandas_df(results)
#print(result_df.head(20))
result_df

Unnamed: 0,Left_side,Right_side,Support,Confidence,Lift
0,/eaglefarm/,/eaglefarm/javascript/menu.js,0.215973,0.923077,3.615046
1,/eaglefarm/javascript/menu.js,/eaglefarm/,0.215973,0.845815,3.615046
2,/eaglefarm/,/eaglefarm/pdf/Web_Price_List.pdf,0.140607,0.600962,3.736048
3,/eaglefarm/pdf/Web_Price_List.pdf,/eaglefarm/,0.140607,0.874126,3.736048
4,/eaglefarm/,/eaglefarm/pricelist,0.156355,0.668269,3.857736
5,/eaglefarm/pricelist,/eaglefarm/,0.156355,0.902597,3.857736
6,/eaglefarm/,/eaglefarm/pricelist/,0.156355,0.668269,3.832847
7,/eaglefarm/pricelist/,/eaglefarm/,0.156355,0.896774,3.832847
8,/eaglefarm/javascript/menu.js,/eaglefarm/pdf/Web_Price_List.pdf,0.155231,0.60793,3.779366
9,/eaglefarm/pdf/Web_Price_List.pdf,/eaglefarm/javascript/menu.js,0.155231,0.965035,3.779366


In [19]:
# sort all acquired rules descending by lift
result_df = result_df.sort_values(by='Lift', ascending=False)
#print(result_df.head(15))
result_df

Unnamed: 0,Left_side,Right_side,Support,Confidence,Lift
18,/eaglefarm/pricelist,/eaglefarm/pricelist/,0.172103,0.993506,5.69824
19,/eaglefarm/pricelist/,/eaglefarm/pricelist,0.172103,0.987097,5.69824
17,/eaglefarm/pricelist/,/eaglefarm/pdf/Web_Price_List.pdf,0.156355,0.896774,5.575051
16,/eaglefarm/pdf/Web_Price_List.pdf,/eaglefarm/pricelist/,0.156355,0.972028,5.575051
14,/eaglefarm/pdf/Web_Price_List.pdf,/eaglefarm/pricelist,0.154106,0.958042,5.530515
15,/eaglefarm/pricelist,/eaglefarm/pdf/Web_Price_List.pdf,0.154106,0.88961,5.530515
5,/eaglefarm/pricelist,/eaglefarm/,0.156355,0.902597,3.857736
4,/eaglefarm/,/eaglefarm/pricelist,0.156355,0.668269,3.857736
13,/eaglefarm/pricelist/,/eaglefarm/javascript/menu.js,0.170979,0.980645,3.8405
12,/eaglefarm/javascript/menu.js,/eaglefarm/pricelist/,0.170979,0.669604,3.8405


In [18]:
# sort all acquired rules descending by lift
result_df = result_df.sort_values(by='Confidence', ascending=False)
#print(result_df.head(15))
result_df

Unnamed: 0,Left_side,Right_side,Support,Confidence,Lift
18,/eaglefarm/pricelist,/eaglefarm/pricelist/,0.172103,0.993506,5.69824
19,/eaglefarm/pricelist/,/eaglefarm/pricelist,0.172103,0.987097,5.69824
13,/eaglefarm/pricelist/,/eaglefarm/javascript/menu.js,0.170979,0.980645,3.8405
11,/eaglefarm/pricelist,/eaglefarm/javascript/menu.js,0.169854,0.980519,3.840008
16,/eaglefarm/pdf/Web_Price_List.pdf,/eaglefarm/pricelist/,0.156355,0.972028,5.575051
9,/eaglefarm/pdf/Web_Price_List.pdf,/eaglefarm/javascript/menu.js,0.155231,0.965035,3.779366
14,/eaglefarm/pdf/Web_Price_List.pdf,/eaglefarm/pricelist,0.154106,0.958042,5.530515
0,/eaglefarm/,/eaglefarm/javascript/menu.js,0.215973,0.923077,3.615046
5,/eaglefarm/pricelist,/eaglefarm/,0.156355,0.902597,3.857736
17,/eaglefarm/pricelist/,/eaglefarm/pdf/Web_Price_List.pdf,0.156355,0.896774,5.575051
