In [10]:
import numpy as np
from scipy import sparse
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import precision_score as precision
from sklearn.metrics import recall_score as recall
from module import * #python module containing custom functions
# Seed NumPy's global RNG for reproducibility. The scikit-learn estimators built
# later in the notebook are created without an explicit random_state, so they
# draw their randomness from this global state.
np.random.seed(2)

The [WEBSPAM-UK2006](https://chato.cl/webspam/datasets/uk2006/) dataset contains 11402 hosts in the `.uk` domain, of which 7866 are labeled as *spam* or *normal*. Newer datasets have been released by the same authors, but this 2006 version remains the one with the highest number of manually labeled samples.\
The file `new_hostnames.csv` contains the names of the hosts in the dataset, while `webspam-uk2006-labels.txt` assigns one of the labels *spam*, *normal* or *undecided* to 8045 host names. For the purposes of this project, *undecided*-labeled hosts were treated as unlabeled, leaving only 7866 hosts labeled as *spam* or *normal*.\
Finally, the file `uk-2006-05.hostgraph_weighted.txt` contains the weighted graph of the hosts, each row containing a host index, the indices of outlinked hosts and, for each of them, the number of outlinks.\
The function `read_graph` returns a sparse `csr_matrix` $\mathcal{R}$, with $\mathcal{R}_{i,j}$ equal to the number of outlinks from host $i$ to host $j$ divided by the total number of outlinks of host $i$.\
The PageRank algorithm requires a stochastic matrix, but there is no guarantee that each host has at least one outlink (the dangling node problem). Therefore, as proposed in Andersen et al., an artificial node with a single self-loop was added to the graph, with incoming edges from all dangling nodes.

In [2]:
# Input files: host names, manually assigned labels, weighted host graph.
hostnames_csv='data/new_hostnames.csv'
labels_txt='data/webspam-uk2006/webspam-uk2006-labels.txt'
hostgraph_txt='data/uk-2006-05.hostgraph_weighted.txt'

# Host names first: both the label lookup and the graph reader depend on them.
hostnames=hostnames_list(hostnames_csv)
n_hosts=len(hostnames)

# Labels for the hosts, plus the subset of hosts that carry a usable label
# (presumably host indices, since it is later used to index `labels` and `rank`).
labels_dict=labels_dictionary(labels_txt)
labels,labeled_dataset=make_dataset(labels_dict,hostnames)

# Sparse matrix of normalized outlink counts described in the text above.
R=read_graph(hostgraph_txt,n_hosts)

For the following computations, it is useful to store the columns of the $\mathcal{R}$ matrix in a list.

In [3]:
# Extract the columns of R once, up front, as a list (see `columns_list` in the
# project module); the list is handed to `approximate_contributions` for every
# labeled host further down.
columns=columns_list(R)

In the following cell PageRank is computed iteratively, according to the equation:
$$
{rank}_{k+1}^T=\frac{\alpha}{N}\mathbf{1}^T+(1-\alpha)\mathcal{R}^T\cdot {rank}_k^T
$$
where ${rank}_k$ is the row vector storing the PageRank scores at step $k$, $\mathbf{1}$ is a row vector of ones, $N$ is the number of nodes (in this case 11403) and $\alpha$ is the teleporting factor. The iterative computation is performed up to a fixed precision of $\epsilon$.\
\
Personalized PageRank is an algorithm directly derived from PageRank, whose result is a matrix $\mathcal{PRM}$ such that $\mathcal{PRM}_{i,j}$ is the contribution of node $i$ to the PageRank of node $j$. This implies that summing $\mathcal{PRM}$ over all contributing nodes $i$ (i.e. taking its column sums) yields the PageRank vector: $\sum_i \mathcal{PRM}_{i,j}={rank}_j$.

In [5]:
# Parameters of the computation:
#   alpha - teleporting factor used by PageRank and by the contribution approximation
#   eps   - precision at which the iterative PageRank computation stops
#   delta - relative threshold: each host is approximated with delta * its own rank
alpha=.01
eps=1e-8
delta=1e-3

print("PageRank computation")
rank=compute_PR(alpha,eps,R)

# One row per labeled host, one column per node of the graph.
nl=len(labeled_dataset)
ap=np.zeros((nl,len(rank)))

print("Approximation of Personalized PageRank for labeled hosts")
for v in range(nl):
    # In-place progress counter ("v/nl"), rewritten on the same line via '\r'.
    print(v,nl,sep='/',end='\r')
    host=labeled_dataset[v]
    host_rank=rank[host]
    # presumably the approximate contribution vector of `host` (Andersen et al.);
    # see `approximate_contributions` in the project module for the exact contract.
    ap[v]=approximate_contributions(host,alpha,delta*host_rank,host_rank,columns)

PageRank computation
PageRank computed
Approximation of Personalized PageRank for labeled hosts
7865/7866

In [6]:
# Feature matrix for the labeled hosts, built from the link matrix, the
# approximation threshold, the approximate contributions and the PageRank
# scores (the feature definitions live in `extract_features` in the project module).
x=extract_features(R,delta,ap,labeled_dataset,rank)
# Targets: the labels selected by `labeled_dataset`
# (presumably aligned row-by-row with `x` — both are later indexed with the
# same `labeled_top` positions; confirm against `make_dataset`).
y=labels[labeled_dataset]

Spam detection is particularly relevant for high PageRank hosts, since people tend to click on highly ranked pages, often within the first page of search engine results.\
However, we can see that PageRank alone is not able to filter out spam pages. In fact, if we restrict our view to the highest ranked 25% of the dataset, 161 hosts out of 2095 (total labeled hosts in this 25%) are labeled as spam, as opposed to 773/7866 on the entire dataset. The proportion of spam hosts drops from 9.8% to 7.7%, but that is surely not enough to consider PageRank as a spam detection or spam-robust algorithm.

In [7]:
# Percentage of highest-PageRank hosts to keep.
n=25
# presumably positions, within the labeled set, of the hosts that fall in the
# top n% by PageRank — the result is used below to index both `x` and `y`;
# confirm against `top_n_percent` in the project module.
labeled_top=top_n_percent(n,rank,labeled_dataset)
print(labeled_top)

# Features and targets restricted to the top-ranked labeled hosts.
x_top=x[labeled_top]
y_top=y[labeled_top]

# Prints: spam count and size of the top-ranked subset, then spam count and
# size of the whole labeled set (the sums are the spam counts quoted in the text).
print(sum(y_top), len(y_top), sum(y), len(y))

[  47   94  121 ... 7854 7863 7864]


In [14]:
# Evaluate three class-weighted tree models with 10-fold cross-validated
# predictions, first on every labeled host and then on the top-ranked subset.
# One line is printed per (dataset, model) pair: overall accuracy, followed by
# the per-class precision array and the per-class recall array (average=None).
#
# Models are kept as factories so that every evaluation starts from a fresh,
# unfitted estimator. Evaluation order within each dataset: depth-1 decision
# tree (stump), unrestricted decision tree, random forest.
model_factories=(
    lambda: DecisionTreeClassifier(max_depth=1,class_weight='balanced'),
    lambda: DecisionTreeClassifier(class_weight='balanced'),
    lambda: RandomForestClassifier(class_weight='balanced_subsample'),
)
evaluation_sets=((x,y),(x_top,y_top))

for x_eval,y_eval in evaluation_sets:
    for make_model in model_factories:
        # `clf` and `pred` are deliberately bound at notebook scope: once the
        # cell finishes they hold the last model evaluated and its predictions
        # (the random forest on the top-ranked subset).
        clf=make_model()
        pred=cross_val_predict(clf,x_eval,y_eval,cv=10)
        print(accuracy(y_eval,pred),precision(y_eval,pred,average=None),recall(y_eval,pred,average=None))


161 2095 773 7866
0.7013729977116705 [0.99252492 0.24163934] [0.67390385 0.9534282 ]
0.8960081362827358 [0.9402273  0.46955345] [0.94473425 0.44890039]
0.9210526315789473 [0.94365232 0.63286713] [0.97039335 0.4683053 ]
0.9045346062052506 [0.99884925 0.44537815] [0.89762151 0.98757764]
0.9369928400954654 [0.96732365 0.58682635] [0.96432265 0.60869565]
0.9479713603818616 [0.96484972 0.6969697 ] [0.97931748 0.57142857]
