## Importing the required libraries

In [2]:
from pyspark.sql.functions import udf
from pyspark.sql.types import *
import re
from pyspark.sql.functions import col
from pyspark.sql.functions import regexp_replace
from pyspark.sql.types import Row
from pyspark.sql.functions import desc

## Reading the file using 'spark-xml' library

In [4]:
# Load the Wikipedia XML dump through spark-xml, using <page> as the row tag.
df = (
    spark.read
    .format("com.databricks.spark.xml")
    .options(rowTag="page")
    .load("/FileStore/tables/Wikipedia_20180701215740-19677.xml")
)

## Extracting The required Columns

In [6]:
# Keep the page title and revision text, drop rows containing nulls, skip
# titles that contain ":" and replace spaces in titles with underscores.
new_df = (
    df.select('title', 'revision.text._VALUE')
    .dropna()
    .filter(~col('title').contains(":"))
    .withColumn('title', regexp_replace('title', ' ', '_'))
)

## Creating the regex to extract the links

In [8]:
# Link extractor: after one or more "[", capture text whose first character is
# none of  # , ' - (  and stop (lazily) at the first run of "]", "|" or "#".
my_regex = re.compile(
    r'\[+([^#,\'\-\(].*?)[\]|#]+'
)

# Rejection filter: matches any colon, comma or ampersand. Captured links that
# contain one of these characters are discarded by the code that uses it.
check_regex = re.compile(
    r'[:,&]'
)

## Function to calculate the node scores

In [10]:
def points(text):
  """Return the score a page hands to each of its outgoing links.

  The score is 1 / (number of accepted links). A link is accepted when it is
  captured by ``my_regex`` and contains none of the characters matched by
  ``check_regex``. Repeated links are counted once per occurrence.

  Args:
    text: page text to scan; may be None or empty.

  Returns:
    float: ``1.0 / count`` when at least one link is accepted, otherwise
    ``0.0``. Without this guard a single link-less page raised
    ZeroDivisionError inside the UDF and aborted the whole Spark job.
  """
  if not text:
    return 0.0
  accepted_count = sum(
    1 for link in my_regex.findall(text) if check_regex.search(link) is None
  )
  if accepted_count == 0:
    # A page without usable links has no score to distribute.
    return 0.0
  return 1.0 / accepted_count

# Spark UDF wrapper so `points` can be applied to a DataFrame column.
udf_points = udf(points, DoubleType())

## Applying the transformation to the Dataframe

In [12]:
# Compute each page's per-link score and keep only the title and score columns.
# The alias lets later joins refer to columns as 'df_score.<column>'.
df_score = (
    new_df
    .withColumn('node_score', udf_points('_VALUE'))
    .select('title', 'node_score')
    .alias('df_score')
)

## Flattening the Link Array

In [14]:
def to_link(row):
  """Expand one page row into one ``Row(title, link)`` per accepted link.

  Args:
    row: a row exposing ``title`` and ``_VALUE`` (the page text).

  Returns:
    list of Row: one entry, in order of appearance, for every link captured
    by ``my_regex`` that contains none of the characters matched by
    ``check_regex``. Spaces in the link are replaced with underscores.
    Repeated links yield repeated rows.
  """
  source_title = row.title
  # Links containing ':', ',' or '&' are dropped outright, so the survivors can
  # never contain a comma or '&amp'; only the space replacement has any effect.
  return [
    Row(title=source_title, link=target.replace(" ", "_"))
    for target in my_regex.findall(row._VALUE)
    if check_regex.search(target) is None
  ]

## Applying the Flatten transformation

In [16]:
# Flatten every page into (title, link) rows and build a DataFrame from them.
# The alias lets later joins refer to columns as 'df_link.<column>'.
df_link = spark.createDataFrame(
    new_df.rdd.flatMap(to_link)
).alias('df_link')

## Extracting only those links which match a page title

In [18]:
# Inner join keeps only the links whose target equals some page title; the
# result carries the link and the title of the page the link was found on.
good_df = (
    df_score
    .join(df_link, df_score.title == df_link.link, how="inner")
    .select('df_link.link', 'df_link.title')
    .alias('good_df')
)

## Joining to attach node scores to the good links

In [20]:
# For every good link, look up the score of the page the link was found on.
joinResult = (
    good_df
    .join(df_score, good_df.title == df_score.title, how="inner")
    .select('good_df.link', 'df_score.node_score')
)

## Calculating the rank

In [22]:
# Sum the scores received by each link target and list the highest totals first.
rank = (
    joinResult
    .groupBy('link')
    .sum('node_score')
    .withColumnRenamed('sum(node_score)', 'rank')
    .orderBy(desc('rank'))
)
rank.show()