### Network Analysis Notebook

In [1]:
#import pandas to load dataframe
import pandas as pd

#import matplot lib for charts
import matplotlib.pyplot as plt

import numpy as np

In [6]:
#import data from data pulled from Google Big Query
df1 = pd.read_csv('data/no nested values/bq-results-20190606-233531-zcn39saozy97.csv')
df1.head(20)

#confirmed that for each fullvisitorID, the hit number is asorted ascending

Unnamed: 0,fullVisitorId,visitId,visitNumber,transactions,hitNumber,pagePath
0,2981907656334968431,1495087388,1,,8,/google+redesign/shop+by+brand
1,2981907656334968431,1495087388,1,,10,/google+redesign/shop+by+brand/quickview
2,2981907656334968431,1495087388,1,,12,/google+redesign/bags/backpacks/waterproof+bac...
3,2981907656334968431,1495087388,1,,13,/google+redesign/apparel
4,2981907656334968431,1495087388,1,,14,/signin.html
5,2981907656334968431,1495087388,1,,15,/registersuccess.html
6,2981907656334968431,1495087388,1,,16,/store.html
7,2981907656334968431,1495087388,1,,17,/google+redesign/apparel/mens/mens+t+shirts
8,2981907656334968431,1495087388,1,,19,/google+redesign/apparel/mens/mens+t+shirts/qu...
9,2981907656334968431,1495087388,1,,21,/google+redesign/apparel/men+s+t+shirts/google...


In [3]:
df1.shape

(3478466, 6)

### Checking Features


#### Check for nan values

In [4]:
#check the columns that have nan values - how many nan values are present
df1.isna().sum()

fullVisitorId          0
visitId                0
visitNumber            0
transactions     3150906
hitNumber              0
pagePath               0
dtype: int64

'Transactions' feature has some Nan values that need to be dealt with before running my model. 

#### Transactions
Total number of ecommerce transactions within the session.

In [5]:
df1['transactions'] = df1['transactions'].fillna(0)
df1['transactions'].value_counts().sort_values(ascending=False)

0.0     3150906
1.0      305329
2.0       14308
3.0        2093
4.0        1274
6.0        1215
7.0         830
5.0         789
25.0        466
8.0         401
21.0        358
10.0        182
12.0        176
15.0        139
Name: transactions, dtype: int64

### Look at all features separatetly


#### FullVisitorId

Some visitors are visiting/making a hit on the website up to 4000 times. This could be in one visit (each hit is a count) or it could be during different sessions.

In [9]:
df1['fullVisitorId'].value_counts().sort_values(ascending=False)


1957458976293878100    4160
824839726118485274     2707
1856749147915772585    1715
9894955795481014038    1496
7634897085866546110    1006
4913801338365738862     858
5208937953046059083     856
232377434237234751      851
2194592743396253647     823
7813149961404844386     815
7483600664917507409     785
8197879643797712877     782
7311242886083854158     773
4835082938415020542     771
6760732402251466726     760
7344519175727343086     760
5590743844926892757     733
1956307607572137989     710
743123551680199202      694
9377429831454005466     685
9089132392240687728     670
6010250598436085923     662
1814166460229302850     640
315017261115039181      621
6254908847172458133     619
4309363468347582056     609
2446685875964479851     598
741993131378886687      591
3694234028523165868     587
9609104828919391966     579
                       ... 
1033234853722074138       1
8959752154799553404       1
1930815861872693566       1
852937226459058492        1
2127219676660142393 

#### hitNumber

The sequenced hit number. For the first hit of each session, this is set to 1.

In [12]:
df1['hitNumber'].value_counts().sort_values(ascending=False)

1      896425
2      413818
3      278001
4      202952
5      161798
6      133891
7      118780
8      100728
9       90008
10      78377
11      70794
12      63227
13      57715
14      51961
15      47522
16      42931
17      39577
18      36400
19      33633
20      30829
21      28672
22      26274
23      24576
24      22904
25      21375
26      19734
27      18615
28      17303
29      16336
30      15351
        ...  
464         9
483         9
476         8
450         8
446         8
441         8
438         8
466         8
420         8
472         8
500         8
479         8
480         8
485         8
497         8
496         8
474         7
461         7
458         7
456         7
491         7
499         7
493         7
490         7
487         6
492         6
494         6
484         5
489         5
481         4
Name: hitNumber, Length: 500, dtype: int64

#### pagePath

The URL path of the page.

In [13]:
df1['pagePath'].value_counts()

/home                                                                                                                                                                                                   981285
/basket.html                                                                                                                                                                                            209360
/google+redesign/shop+by+brand/youtube                                                                                                                                                                  145026
/signin.html                                                                                                                                                                                            101299
/store.html                                                                                                                                                                 

# 1.  Network Analysis for all visitors with all page paths

I will be using the `networkx` package in python to do modeling using network analysis. 

In [41]:
from itertools import combinations
import networkx as nx

In [42]:
df1.columns

Index(['fullVisitorId', 'visitId', 'visitNumber', 'transactions', 'hitNumber',
       'pagePath'],
      dtype='object')

In [43]:
### collapse to a series of lists
pagedata_all = df1.groupby('fullVisitorId').apply(lambda df1: list(df1['pagePath']))

### we can ignore 1 length pages because they have no connections
pagedata_all = pagedata_all[pagedata_all.apply(lambda df1:len(df1) > 1)]

In [44]:
pagedata_all[0]

['/home',
 '/home',
 '/google+redesign/fruit-games',
 '/home',
 '/google+redesign/office',
 '/google+redesign/office/quickview',
 '/home',
 '/google+redesign/apparel/men++s/men++s+t+shirts']

## Directional Network

In [45]:
#sequenced graph

mydict={}

# session = pagedata[0]

for session in pagedata_all:
    for i in range(0,len(session)-1):
        mytuple= (session[i], session[i+1])
        mydict.setdefault((mytuple),[]).append(1)
    
    
#make a graph object
V = nx.DiGraph()
#add the edges, if weight is greater than 20 - ie 20 pages
_ = [V.add_edge(key[0], key[1], weight = len(value)) for key,value in mydict.items()];    

In [46]:
len(mydict)

57787

In [47]:
#instantiate a graph object
V = nx.Graph()

In [50]:
# Edge is the connection between two nodes (webpages)
#Nodes: A node is a unit in the network -website page

#### Connectedness

Check if network is completely connected or if there are subgraphs.

In [51]:
nx.is_connected(V)

False

In [52]:
subgraphs = list(nx.connected_component_subgraphs(V))

In [53]:
#there are 4 connected subgraphs
len(subgraphs)

4

In [54]:
pd.Series(subgraphs)

0    (/home, /google+redesign/fruit-games, /google+...
1           (/google+redesign/shop+by+brand/undefined)
2      (/google+redesign/bags/salutlesmasterdico/home)
3             (/google+redesign/electronics/undefined)
dtype: object

### Cliques

Cliques are groups of linked nodes. In a clique, each individual node is connected to every other node. It maps nicely to our idea of cliques: a group of friends, that are all friends with each other. For each node, there is (at least) one maximally sized clique.

In [55]:
cliques = nx.find_cliques(V)
cliques

<generator object find_cliques at 0x0000025D03B208B8>

In [56]:
# Start with nothing
cliques_gt_5 = list()

# Iterating over each clique, stop when we have 5 cliques
while (len(cliques_gt_5) < 5):
    # If the clique is greater than size 5, add it to my list
    my_clique = next(cliques)
    if len(my_clique) >= 10:
        cliques_gt_5.append(my_clique)
        print(len(my_clique))



13
14
15
15
14


In [57]:
cliques_gt_5

[['/google+redesign/apparel/men+s+goog',
  '/google+redesign/electronics/quickview',
  '/home',
  '/google+redesign/apparel/men++s',
  '/google+redesign/electronics',
  '/google+redesign/apparel/men++s/quickview',
  '/google+redesign/apparel/men++s/men++s+outerwear',
  '/google+redesign/drinkware',
  '/store.html',
  '/store.html/quickview',
  '/google+redesign/apparel/women+s/women+s+outerwear/quickview',
  '/google+redesign/apparel/women+s/women+s+outerwear',
  '/google+redesign/apparel/men++s/men++s+outerwear/quickview'],
 ['/google+redesign/limited+supply/quickview',
  '/google+redesign/kids',
  '/google+redesign/lifestyle/fun',
  '/google+redesign/office/quickview',
  '/google+redesign/electronics/accessories',
  '/google+redesign/wearables',
  '/google+redesign/brands',
  '/google+redesign/office',
  '/google+redesign/gift+cards',
  '/google+redesign/bags/backpacks/quickview',
  '/home',
  '/google+redesign/limited+supply',
  '/google+redesign/bags',
  '/google+redesign/lifestyle

In [76]:
# Writing data out to Gephi format (.gexf)
nx.write_gexf(V, 'GoogleMerch_graph.gexf')

# 2.  Take only the positive class scenarios (purchase made)

In [12]:
# Want class 0 or class 1 - change all points that are not 0 to 1. 
df1['transactions'] = np.where(df1['transactions']==1.0, 1.0, 0)

In [13]:
#check transaction is 0 or 1
df1['transactions'].value_counts()

0.0    3173137
1.0     305329
Name: transactions, dtype: int64

In [16]:
df2 = df1.copy()

In [17]:
df2.head()

Unnamed: 0,fullVisitorId,visitId,visitNumber,transactions,hitNumber,pagePath
0,2981907656334968431,1495087388,1,0.0,8,/google+redesign/shop+by+brand
1,2981907656334968431,1495087388,1,0.0,10,/google+redesign/shop+by+brand/quickview
2,2981907656334968431,1495087388,1,0.0,12,/google+redesign/bags/backpacks/waterproof+bac...
3,2981907656334968431,1495087388,1,0.0,13,/google+redesign/apparel
4,2981907656334968431,1495087388,1,0.0,14,/signin.html


In [18]:
#take only the rows that have a transaction in them
df_transactions = df2.loc[df1['transactions'] == 1.0]

In [19]:
df_transactions.head()

Unnamed: 0,fullVisitorId,visitId,visitNumber,transactions,hitNumber,pagePath
447,2983171108603027589,1481857235,1,1.0,1,/home-2
448,2983171108603027589,1481857235,1,1.0,2,/google+redesign/apparel/men++s/men++s+outerwear
449,2983171108603027589,1481857235,1,1.0,3,/google+redesign/apparel/men++s/men++s+outerwe...
450,2983171108603027589,1481857235,1,1.0,5,/google+redesign/drinkware/mugs+and+cups
451,2983171108603027589,1481857235,1,1.0,6,/google+redesign/apparel/men++s/men++s+outerwear


In [20]:
df_transactions.shape

(305329, 6)

In [21]:
### collapse to a series of lists
pagedata_transaction = df_transactions.groupby('fullVisitorId').apply(lambda df_transactions: list(df_transactions['pagePath']))

### we can ignore 1 length pages because they have no connections
pagedata_transaction = pagedata_transaction[pagedata_transaction.apply(lambda df_transactions:len(df_transactions) > 1)]

In [25]:
#view a couple rows of new table
pagedata_transaction[0:2]

fullVisitorId
213131142648941    [/google+redesign/apparel/mens+outerwear/blm+s...
435324061339869    [/home, /google+redesign/apparel/men++s/men++s...
dtype: object

In [40]:
#sequenced graph

mydict_transaction={}

# session = pagedata[0]

for session in pagedata_transaction:
    for i in range(0,len(session)-1):
        mytuple= (session[i], session[i+1])
        mydict_transaction.setdefault((mytuple),[]).append(1)
    
    
#make a graph object
T = nx.DiGraph()
#add the edges, if weight is greater than 20 - ie 20 pages
_ = [T.add_edge(key[0], key[1], weight = len(value)) for key,value in mydict_transaction.items()];  

In [35]:
len(mydict_transaction)

15415

In [50]:
# Edge is the connection between two nodes (webpages)
#Nodes: A node is a unit in the network -website page

In [33]:
n_nodes_T = len(T.nodes)
n_edges_T = len(T.edges)
print(f'Graph has {n_nodes_T} nodes, and {n_edges_T} edges.')

Graph has 957 nodes, and 15415 edges.


In [36]:
# Writing data out to Gephi format (.gexf)
nx.write_gexf(T, 'GoogleMerch_transactions_graph_final.gexf')

# 3.  Network Analysis Using Content Grouping

This is a network analysis for all pages based (for all visitors) on content grouping. However, when I pulled it into a graph, it didn't tell a story as good as the individual pages. 

I did not end up using this because it was undirectional. However, the content grouping may be very useful in other future analysis.

### Content Grouping

In [58]:
# there are 2571 unique page paths
len(df1['pagePath'].unique())

2571

In [59]:
#getting true false to see if column contains that sub string. 
#Bucket pages together - I am making assumptions about what all of these mean

df1.loc[df1['pagePath'].str.contains('/home'),
       'pagePath'] = 'Home Page'

df1.loc[df1['pagePath'].str.contains('/quickview'),
       'pagePath'] = 'Quick View'

df1.loc[df1['pagePath'].str.contains('/myaccount'),
       'pagePath'] = 'Account Settings'

df1.loc[df1['pagePath'].str.contains('/store-policies/shipping-information'),
       'pagePath'] = 'Shipping Info'

df1.loc[df1['pagePath'].str.contains('questions|faqs'),
       'pagePath'] = 'Frequently Asked Questions'

df1.loc[df1['pagePath'].str.contains('/register'),
       'pagePath'] = 'Register'

df1.loc[df1['pagePath'].str.contains('/signin.html'),
       'pagePath'] = 'Sign in'

df1.loc[df1['pagePath'].str.contains('return-policy'),
       'pagePath'] = 'Return Policy'

df1.loc[df1['pagePath'].str.contains('/yourinfo.html'),
       'pagePath'] = 'Your Info'

df1.loc[df1['pagePath'].str.contains('/payment.html'),
       'pagePath'] = 'Payment'

df1.loc[df1['pagePath'].str.contains('/revieworder.html'),
       'pagePath'] = 'Review Order'

df1.loc[df1['pagePath'].str.contains('/basket.html'),
       'pagePath'] = 'Basket'

df1.loc[df1['pagePath'].str.contains('cart'),
       'pagePath'] = 'Add to Cart'

df1.loc[df1['pagePath'].str.contains('/ordercompleted'),
       'pagePath'] = 'Order Completed'

df1.loc[df1['pagePath'].str.contains('search'),
       'pagePath'] = 'Search'

df1.loc[df1['pagePath'].str.contains("brand"),
       'pagePath'] = 'Select Brand'

df1.loc[df1['pagePath'].str.contains('/guestregister.html'),
       'pagePath'] = 'Guest Register'

df1.loc[df1['pagePath'].str.contains('/wishlist.html|/addtowishlist'),
       'pagePath'] = 'Wish List'

df1.loc[df1['pagePath'].str.contains('/resetpassword'),
       'pagePath'] = 'Password reset'

df1.loc[df1['pagePath'].str.contains('logout'),
       'pagePath'] = 'Logout'

df1.loc[df1['pagePath'].str.contains('/store-policies/terms-of-use'),
       'pagePath'] = 'Terms of Use'

df1.loc[df1['pagePath'].str.contains('/store.html'),
       'pagePath'] = 'Store'

df1.loc[df1['pagePath'].str.contains('apparel|bag|drinkware|accessories|electronics|lifestyle|games|wearables|gift|nest|kids|eco|new|fun|top'),
       'pagePath'] = 'General Item Info'

df1.loc[df1['pagePath'].str.contains('/storeitem.html|/limited|spring|madeinusa|specials|office'),
       'pagePath'] = 'General Item Info'

df1.loc[df1['pagePath'].str.contains('/shop.axd'),
       'pagePath'] = 'Other'


In [60]:
#Check how many unique page paths I have
len(df1['pagePath'].unique())

24

In [61]:
#Check values for each content grouping
import collections
collections.Counter(df1['pagePath'])

Counter({'Select Brand': 227263,
         'Quick View': 359873,
         'General Item Info': 1170942,
         'Sign in': 101299,
         'Register': 24954,
         'Store': 93555,
         'Home Page': 1047241,
         'Basket': 209385,
         'Your Info': 37532,
         'Search': 62422,
         'Account Settings': 51040,
         'Frequently Asked Questions': 5161,
         'Payment': 35810,
         'Review Order': 18929,
         'Order Completed': 25291,
         'Shipping Info': 2957,
         'Return Policy': 1327,
         'Guest Register': 776,
         'Wish List': 1447,
         'Terms of Use': 902,
         'Other': 76,
         'Add to Cart': 276,
         'Password reset': 7,
         'Logout': 1})

### Network Analysis with Content Grouping

I will be using the `networkx` package in python to do modeling using network analysis. 

In [25]:
from itertools import combinations
import networkx as nx

In [62]:
df1.columns

Index(['fullVisitorId', 'visitId', 'visitNumber', 'transactions', 'hitNumber',
       'pagePath'],
      dtype='object')

In [63]:
### collapse to a series of lists
pagedata = df1.groupby('fullVisitorId').apply(lambda df1: list(df1['pagePath']))

### we can ignore 1 length pages because they have no connections
pagedata = pagedata[pagedata.apply(lambda df1:len(df1) > 1)]

In [64]:
#Check how some of the page data looks
pagedata[0:3]

fullVisitorId
5103959234087     [Home Page, Home Page, General Item Info, Home...
10278554503158    [Home Page, General Item Info, General Item In...
20424342248747    [Home Page, Home Page, Select Brand, General I...
dtype: object

In [65]:
#holding dict
mydict = {}

for i in range(len(pagedata)):
    #2 combinations of baskets
    for j,k in combinations(pagedata[i],2):
        #setdefault, and append a 1 for each time it is found
        mydict.setdefault((j,k),[]).append(1)

#make a graph
G = nx.Graph()
#add the edges, if weight is greater than 20 - ie 20 transactions
_ = [G.add_edge(i[0], i[1], weight = len(j)) for i,j in mydict.items() if len(j) > 20];


In [68]:
#check the length of mydict
len(mydict)

470

In [50]:
# Edge is the connection between two nodes (webpages)
#Nodes: A node is a unit in the network -website page

In [67]:
n_nodes = len(G.nodes)
n_edges = len(G.edges)
print(f'Graph has {n_nodes} nodes, and {n_edges} edges.')

Graph has 23 nodes, and 222 edges.


#### page rank - a measure of web page connectedness
- low page rank = fewer inbound links
- high page rank - large number of inbound links


In [69]:
# Find pagerank of each pagepath
pagerank_dict = nx.pagerank(G)

pagerank_df = pd.DataFrame.from_dict(pagerank_dict, orient='index')
pagerank_df.columns = ['pagerank']
pagerank_df.head()

Unnamed: 0,pagerank
Home Page,0.117395
General Item Info,0.336205
Quick View,0.125866
Shipping Info,0.007244
Select Brand,0.027903


In [70]:
# Top 10 Page rank 

# Use nlargest to get 10 nodes with greatest pagerank
pagerank_ten = pagerank_df.nlargest(10, 'pagerank')

#sort values in desc order
pagerank_ten['pagerank'].sort_values(ascending=False)

General Item Info    0.336205
Quick View           0.125866
Home Page            0.117395
Basket               0.106858
Store                0.042278
Sign in              0.028331
Select Brand         0.027903
Payment              0.027660
Search               0.026221
Account Settings     0.024868
Name: pagerank, dtype: float64

In [71]:
# See edges
###how to see edges: connections of web pages viewed for one visitor 
print(f'first few edges: {list(G.edges())[:5]}')

first few edges: [('Home Page', 'Home Page'), ('Home Page', 'General Item Info'), ('Home Page', 'Quick View'), ('Home Page', 'Shipping Info'), ('Home Page', 'Select Brand')]


#### Connectedness

Check if network is completely connected or if there are subgraphs.

In [72]:
#the graph is connected - no subgraphs
nx.is_connected(G)

True

In [83]:
# Writing data out to Gephi format (.gexf)
nx.write_gexf(G, 'GoogleMerch_contentgroup_graph.gexf')