# Wikipedia similarity graph

The goal of this notebook is to build a graph whose nodes are Wikipedia pages and whose weighted edges encode the similarity between articles.


In [4]:
import numpy as np
import wikipedia_preprocessing as wiki
from sklearn.metrics.pairwise import cosine_similarity
import pickle
from os import path

# Directory holding the pickled intermediates (vocab.pkl, idf.pkl, vec_doc.pkl) that later cells load or write.
DATA_folder = path.join("Data", "Wiki", "graph")

ModuleNotFoundError: No module named 'gensim'

## Computing similarity

### Step 1: Fetching wikipedia pages data

In [None]:
# Fetch and precompute the pages through the project-local
# `wikipedia_preprocessing` helpers.
# NOTE(review): presumably `max_dumps=1` restricts processing to a single dump —
# confirm against the helper module.
wiki.get_all_dumps()
wiki.precompute_pages(max_dumps=1)

# Ncorp (the length reported by the pages object) is read as a global by the
# later cells, for progress output and for sizing the document-vector array.
pages = wiki.Pages()
Ncorp = len(pages)

### Step 2: Compute Word2vec of each page

In [None]:
# Width of the vector part of each stored row; the document-vector array is
# later allocated with Nvec + 1 columns (vector + tf-idf weight).
# NOTE(review): Nvec is not passed to vectorize_pages below — verify that the
# helper produces vectors of this same size.
Nvec = 100

wiki.vectorize_pages(pages)
# `pages` is released here; the following cells iterate over D instead.
del pages

D = wiki.VecPages()

### Step 3: List vocabulary

In [None]:
vocab_filepath = path.join(DATA_folder, "vocab.pkl")

# The cache check must test the same file that is loaded/written below.
if path.isfile(vocab_filepath):
    print("Vocabulary already computed")
    # NOTE: pickle.load can execute arbitrary code; only load cache files that
    # were produced by this notebook.
    with open(vocab_filepath, "rb") as f:
        voc = pickle.load(f)
else:
    # `voc` keeps the terms in order of first appearance; `seen` mirrors it as
    # a set so that the membership test stays O(1) instead of scanning the
    # whole list for every token (assumes tokens are hashable, e.g. strings).
    voc = []
    seen = set()
    for i, d in enumerate(D):
        print("\r%d/%d" % (i, Ncorp), end="")
        for t in d["content"]:
            if t not in seen:
                seen.add(t)
                voc.append(t)
    print("Saving vocabulary.")
    with open(vocab_filepath, "wb") as f:
        pickle.dump(voc, f)
    


### Step 4: Compute inverse document frequency (idf) of each term

In [None]:
def compute_idf(voc, D):
    """Compute the inverse document frequency of every vocabulary term.

    For a term ``t`` the value is ``log(n_docs / df(t))`` (natural logarithm),
    where ``n_docs`` is the number of documents yielded by ``D`` and ``df(t)``
    is the number of documents whose ``"content"`` contains ``t``.

    Parameters
    ----------
    voc : sequence
        Vocabulary terms. The result is aligned with this sequence.
    D : iterable
        Documents, each indexable with ``"content"``. It is iterated once per
        term, so it must be re-iterable (not a one-shot generator).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(voc),)``. Terms that appear in no document get
        an idf of 0.0 instead of triggering a division by zero.
    """
    idf = np.zeros(len(voc))
    for i, t in enumerate(voc):
        print("\r%d/%d" % (i, len(voc)), end="")
        # Count the documents from D itself rather than relying on a global
        # corpus size defined in another cell.
        n_docs = 0
        appears = 0
        for d in D:
            n_docs += 1
            if t in d["content"]:
                appears += 1
        if appears:
            idf[i] = np.log(n_docs / appears)
    return idf

# Location of the pickled idf array inside the data folder.
idf_filepath = path.join(DATA_folder, "idf.pkl")

if not path.isfile(idf_filepath):
    # No cache yet: compute the idf values, then persist them for later runs.
    idf = compute_idf(voc, D)
    print("Saving pages idf.")
    with open(idf_filepath, "wb") as f:
        pickle.dump(idf, f)
else:
    # Cache hit: reuse the stored array.
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    print("Pages idf already computed")
    with open(idf_filepath, "rb") as f:
        idf = pickle.load(f)

### Step 5: For each document, compute each term frequency (tf) and tf-idf. Keep only the N=100 most important terms.

In [None]:
# Number of highest-tf-idf terms kept per document.
N = 100

def compute_tf(d):
    """Compute the augmented term frequency of every distinct term of a document.

    The raw frequency of a term is ``count / len(content)``; it is then
    rescaled as ``0.5 + 0.5 * f / max(f)`` so that the most frequent term of
    the document gets 1.0, which limits the bias towards long documents.

    Parameters
    ----------
    d : mapping
        Document whose ``"content"`` entry is a sequence of hashable tokens.

    Returns
    -------
    terms : numpy.ndarray
        Distinct tokens, in order of first appearance.
    tf : numpy.ndarray
        Augmented term frequency of each entry of ``terms``. Both arrays are
        empty when the document has no content.
    """
    content = d["content"]
    n = len(content)
    if n == 0:
        # Nothing to count; also avoids dividing by zero below.
        return np.array([]), np.array([])
    # dicts preserve insertion order, so the keys are the distinct terms in
    # order of first appearance and a single pass over the content suffices.
    counts = {}
    for t in content:
        counts[t] = counts.get(t, 0) + 1
    terms = np.array(list(counts))
    f = np.array(list(counts.values())) / n
    tf = 0.5 + 0.5 * f / np.max(f)
    return terms, tf

def compute_doc_vectors(voc, D):
    """Build, for every document, the rows describing its N most important terms.

    Each document gets an ``(N, Nvec + 1)`` slab: one row per kept term, with
    the last column holding the term's tf-idf weight. Terms are ranked by
    decreasing tf-idf; documents with fewer than N distinct terms leave the
    remaining rows at zero.

    Reads the notebook globals ``Ncorp``, ``N``, ``Nvec`` and ``idf`` and calls
    ``compute_tf``.

    Parameters
    ----------
    voc : sequence
        Vocabulary; ``idf[i]`` is taken to be the idf of ``voc[i]``.
    D : iterable
        Documents, each indexable with ``"content"``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(Ncorp, N, Nvec + 1)``.

    Raises
    ------
    KeyError
        If a document contains a term that is missing from ``voc``.
    """
    # Built once: position of every term in the vocabulary (and thus in `idf`).
    term_index = {t: i for i, t in enumerate(voc)}
    doc_vectors = np.zeros((Ncorp, N, Nvec + 1))
    for i_d, d in enumerate(D):
        print("\r%d/%d" % (i_d, Ncorp), end="")
        terms, tf = compute_tf(d)
        tfidf = np.zeros(len(terms))
        for i, t in enumerate(terms):
            tfidf[i] = tf[i] * idf[term_index[t]]
        # argsort is ascending; reverse it so the highest tf-idf comes first,
        # then keep at most N terms.
        top = np.argsort(tfidf)[::-1][:N]
        k = len(top)
        if k == 0:
            continue
        # NOTE(review): this assignment needs each kept term to be (or convert
        # to) a numeric vector of length Nvec. The mapping from a token to its
        # vector is not visible in this notebook — confirm what `terms` holds.
        doc_vectors[i_d, :k, :-1] = terms[top]
        doc_vectors[i_d, :k, -1] = tfidf[top]
    # Return only once every document has been processed.
    return doc_vectors
        
# Location of the pickled document vectors inside the data folder.
vec_doc_filepath = path.join(DATA_folder, "vec_doc.pkl")

if not path.isfile(vec_doc_filepath):
    # No cache yet: compute the document vectors, then persist them.
    doc_vectors = compute_doc_vectors(voc, D)
    print("Saving Vec pages.")
    with open(vec_doc_filepath, "wb") as f:
        pickle.dump(doc_vectors, f)
else:
    # Cache hit: reuse the stored array.
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    print("Vec pages already computed")
    with open(vec_doc_filepath, "rb") as f:
        doc_vectors = pickle.load(f)

### Testing: creating a search function using this similarity

## Creating the graph

### Step 1: Compute all document similarities

In [None]:
def doc_similarity(d1, d2):
    """Tf-idf-weighted average cosine similarity between two documents.

    Each document is a 2-D array with one row per kept term: every column but
    the last is the term vector, the last column is the term's tf-idf weight.
    The result is the sum over all term pairs of
    ``cos(t1, t2) * tfidf1 * tfidf2``, divided by the number of pairs.

    Parameters
    ----------
    d1, d2 : array-like of shape (n_terms, vec_size + 1)
        Document slabs as described above.

    Returns
    -------
    float
        The weighted mean similarity; 0.0 if either document has no rows.
    """
    d1 = np.asarray(d1, dtype=float)
    d2 = np.asarray(d2, dtype=float)
    n_pairs = d1.shape[0] * d2.shape[0]
    if n_pairs == 0:
        return 0.0
    # cosine_similarity works on 2-D inputs and returns the full
    # (n_terms1, n_terms2) matrix in one call. Slicing with [:, :-1] takes the
    # whole vector part regardless of its width.
    pairwise = cosine_similarity(d1[:, :-1], d2[:, :-1])
    weighted_sum = d1[:, -1] @ pairwise @ d2[:, -1]
    return float(weighted_sum) / n_pairs

### Step 2: Compute similarity histogram

### Step 3: Choose threshold

### Step 4: Create graph

## Results

### Graph representation

### Clustering

### Comparison to Wikipedia portals