In [3]:
# Sanity check: list the Dataproc clusters in region us-central1 from inside the notebook.
# If the following command generates an error, you probably didn't enable
# the cluster security option "Allow API access to all Google Cloud services"
# under Manage Security → Project Access when setting up the cluster.
!gcloud dataproc clusters list --region us-central1

NAME          PLATFORM  PRIMARY_WORKER_COUNT  SECONDARY_WORKER_COUNT  STATUS   ZONE           SCHEDULED_DELETE
cluster-518a  GCE       3                                             RUNNING  us-central1-a


In [4]:
!pip install -q google-cloud-storage==1.43.0
!pip install -q graphframes

[0m

In [5]:
import pyspark
import sys
from collections import Counter, OrderedDict, defaultdict
import itertools
from itertools import islice, count, groupby
import pandas as pd
import os
import re
from operator import itemgetter
import nltk
from nltk.stem.porter import *
from nltk.corpus import stopwords
from time import time
from pathlib import Path
import pickle
import pandas as pd
from google.cloud import storage

import hashlib
def _hash(s):
    return hashlib.blake2b(bytes(s, encoding='utf8'), digest_size=5).hexdigest()

# Fetch the NLTK stopword corpus; NLTK reports it as up-to-date when it is already present.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# List the GraphFrames jar(s) in Spark's jars directory.
# If nothing prints here you forgot to include the initialization script when
# starting the cluster.
!ls -l /usr/lib/spark/jars/graph*

-rw-r--r-- 1 root root 247882 Mar  5 15:34 /usr/lib/spark/jars/graphframes-0.8.2-spark3.1-s_2.12.jar


In [7]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf, SparkFiles
from pyspark.sql import SQLContext
from graphframes import *

In [8]:
# Display the active Spark session as a quick check that Spark is available.
# NOTE(review): `spark` is not created in the visible cells — presumably it is
# injected by the cluster's PySpark kernel; confirm.
spark

In [9]:
# Put your bucket name below and make sure you can access it without an error
BUCKET_NAME = 'irproj_26051997'
client = storage.Client()

# Root URI of the bucket; blob names are appended to it to form full paths.
full_path = f"gs://{BUCKET_NAME}/"

# Objects under wiki_files/ that must not be passed to the parquet reader:
# the graphframes.sh script and the bare 'wiki_files/' folder entry.
SKIPPED_BLOB_NAMES = ['wiki_files/graphframes.sh', 'wiki_files/']

# Every remaining blob under wiki_files/, and its full gs:// path.
blobs = [
    blob
    for blob in client.list_blobs(BUCKET_NAME, prefix='wiki_files/')
    if blob.name not in SKIPPED_BLOB_NAMES
]
paths = [full_path + blob.name for blob in blobs]


In [10]:
# Put your `generate_graph` function here
def generate_graph(pages):
    ''' Compute the directed graph generated by wiki links.

    Parameters:
    -----------
      pages: RDD
        An RDD where each row consists of one wikipedia article with 'id' and
        'anchor_text' (in that order). 'anchor_text' is iterated as a sequence
        of entries whose first element is the id of the linked page. A missing
        (None) or empty 'anchor_text' is treated as "no outgoing links".
    Returns:
    --------
      edges: RDD
        An RDD where each row represents an edge in the directed graph created
        by the wikipedia links. The first entry is the source page id and the
        second entry is the destination page id. No duplicates are present.
      vertices: RDD
        An RDD where each row is a one-element list holding the id of a vertex
        (node) in the directed graph. Only ids that appear in at least one
        edge (as source or destination) are included. No duplicates are
        present.
    '''
    def outgoing_edges(page):
        # page[0] is the source page id and page[1] its 'anchor_text' entries;
        # the first element of each entry is the id of the linked page.
        # A None 'anchor_text' yields no edges instead of raising a TypeError.
        source_id, anchors = page[0], page[1]
        return [(source_id, anchor[0]) for anchor in (anchors or [])]

    # One (source, destination) pair per link, with repeated links collapsed.
    edges = pages.flatMap(outgoing_edges).distinct()

    # Every id that occurs at either end of an edge, once, wrapped in a
    # one-element list so that each row carries a single field.
    vertex_ids = edges.flatMap(lambda edge: [edge[0], edge[1]]).distinct()
    vertices = vertex_ids.map(lambda vertex_id: [vertex_id])

    return edges, vertices

In [11]:
# Load all wiki parquet files listed in `paths` into a single DataFrame.
parquetFile = spark.read.parquet(*paths)
# RDD view of the "text" and "id" columns (in that order) of every row.
doc_text_pairs = parquetFile.select("text", "id").rdd

                                                                                

In [12]:
# Count the number of wiki pages (rows) loaded from the parquet files.
parquetFile.count()

                                                                                

6348910

In [None]:
# Build the link graph, run PageRank on it and write the ranking to the bucket.
t_start = time()
# Keep only the two columns generate_graph reads: the page id and its anchors.
pages_links = parquetFile.select ("id","anchor_text").rdd
# Construct the graph: distinct (src, dst) edges and distinct vertex ids.
edges, vertices = generate_graph(pages_links)
# Compute PageRank.
# Both frames are split into 124 partitions by source id / vertex id.
edgesDF = edges.toDF(['src', 'dst']).repartition(124, 'src')
verticesDF = vertices.toDF(['id']).repartition(124, 'id')
g = GraphFrame(verticesDF, edgesDF)
# Fixed number of iterations (6) with reset probability 0.15.
pr_results = g.pageRank(resetProbability=0.15, maxIter=6)
pr = pr_results.vertices.select("id", "pagerank")
# Highest-ranked pages first.
pr = pr.sort(col('pagerank').desc())
# Write the ranking as CSV under gs://<BUCKET_NAME>/page_rank from one partition.
# NOTE(review): no save mode is set, so this presumably fails when page_rank
# already exists (e.g. on a re-run) — confirm. Also verify that repartition(1)
# keeps the sorted order in the written file.
pr.repartition(1).write.csv(f'gs://{BUCKET_NAME}/page_rank')
# Elapsed time since t_start; it is stored but not displayed in this cell.
pr_time = time() - t_start
# Preview the first rows of the ranking.
pr.show()

[Stage 5:> (33 + 2) / 124][Stage 7:>  (4 + 2) / 124][Stage 9:>  (0 + 0) / 124]

In [None]:
# Collect the PageRank scores to the driver and map every page id to a
# one-element list holding its score: {id: [pagerank]}.
# The dictionary is built straight from the two columns. This gives the same
# result as set_index('id').T.to_dict('list') without first creating a frame
# that has one column per page.
pr_pandas = pr.toPandas()
Page_Rank_dict = {
    page_id: [score]
    for page_id, score in zip(pr_pandas['id'], pr_pandas['pagerank'])
}

# Pickle the dictionary to a local file named after `name`.
name = "Page_Rank_dict"
with open(f"{name}.pkl", "wb") as f:
    pickle.dump(Page_Rank_dict, f)

# Upload the pickle to the bucket under page_rank/.
# `bucket` is (re)created here and is reused by the page-view upload cell.
client = storage.Client()
bucket = client.bucket(BUCKET_NAME)
blob_posting_locs = bucket.blob(f"page_rank/{name}.pkl")
blob_posting_locs.upload_from_filename(f"{name}.pkl")

### Page View

In [None]:
# Paths
# Using user page views (as opposed to spiders and automated traffic) for the
# month of August 2021
pv_path = 'https://dumps.wikimedia.org/other/pageview_complete/monthly/2021/2021-08/pageviews-202108-user.bz2'
p = Path(pv_path)
pv_name = p.name
pv_temp = f'{p.stem}-4dedup.txt'
pv_clean = f'{p.stem}.pkl'
# Download the file (2.3GB)
!wget -N $pv_path
# Filter for English pages, and keep just two fields: article ID (3) and monthly
# total number of page views (5). Then, remove lines with article id or page
# view values that are not a sequence of digits.
!bzcat $pv_name | grep "^en\.wikipedia" | cut -d' ' -f3,5 | grep -P "^\d+\s\d+$" > $pv_temp
# Create a Counter (dictionary) that sums up the pages views for the same
# article, resulting in a mapping from article id to total page views.
wid2pv = Counter()
with open(pv_temp, 'rt') as f:
  for line in f:
    parts = line.split(' ')
    wid2pv.update({int(parts[0]): int(parts[1])})
# write out the counter as binary file (pickle it)
with open(pv_clean, 'wb') as f:
  pickle.dump(wid2pv, f)
# read in the counter
# with open(pv_clean, 'rb') as f:
#   wid2pv = pickle.loads(f.read())

In [None]:
# Pickle the page-view counter to a local file named after `name`.
name = "Page_View"
local_pickle = f"{name}.pkl"
with open(local_pickle, "wb") as pickle_out:
    pickle.dump(wid2pv, pickle_out)

# Upload the pickle to the bucket under Page_View/.
# This relies on `bucket` having been created by an earlier cell.
blob_posting_locs = bucket.blob(f"Page_View/{local_pickle}")
blob_posting_locs.upload_from_filename(local_pickle)