# spark-basics

Playing around with Spark via the PySpark interface.

## Imports

In [1]:
import findspark
findspark.init()

import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import Row, SparkSession

## Prepare Data

Unzip the archive `data/pagecounts.zip`; it extracts to `data/pagecounts-20160101-000000_parsed.out`.

Each line of the dataset contains the statistics for one Wikimedia page, with the fields delimited by a single space. The schema is as follows:
- Project code: The project identifier for each page.
- Page title: A string containing the title of the page.
- Page hits: Number of requests for the page during the given hour.
- Page size: Size of the page.

In [2]:
# Extract the archive into data/ (-o overwrites existing files without prompting, so the cell can be re-run).
!unzip -o data/pagecounts.zip -d data/

Archive:  data/pagecounts.zip
  inflating: data/pagecounts-20160101-000000_parsed.out  


Start Spark and create an RDD from the text file.

In [3]:
# Start (or reuse) a local Spark session that runs on all available cores.
# Going through the builder's getOrCreate() means re-running this cell returns
# the already-active session instead of wrapping the context in a new
# SparkSession object each time.
spark = SparkSession.builder.master("local[*]").getOrCreate()
sc = spark.sparkContext

# Create an RDD of raw text lines from the unzipped page-count file
rdd_dirty = sc.textFile("data/pagecounts-20160101-000000_parsed.out")
type(rdd_dirty)

21/09/22 22:25:50 WARN Utils: Your hostname, mark-machine resolves to a loopback address: 127.0.1.1; using 192.168.0.102 instead (on interface wlp8s0)
21/09/22 22:25:50 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
21/09/22 22:25:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


pyspark.rdd.RDD

In [28]:
def get_clean_rdd(rdd):
    """Parse raw text lines into an RDD of typed ``Row`` records.

    Each line is split on single spaces and its first four fields become
    ``code`` (str), ``title`` (str), ``hits`` (int) and ``size`` (int).

    Args:
        rdd: RDD whose elements are space-delimited text lines.

    Returns:
        A new RDD (not a DataFrame) holding one ``Row`` per input line.
    """
    def _split_fields(line):
        # columns are separated by a single-space delimiter
        return line.split(" ")

    def _to_row(fields):
        # cast the two numeric columns; code and title stay strings
        return Row(
            code=str(fields[0]),
            title=str(fields[1]),
            hits=int(fields[2]),
            size=int(fields[3]),
        )

    return rdd.map(_split_fields).map(_to_row)

In [29]:
# Parse the raw text lines into an RDD of typed Row records (still an RDD, not a DataFrame)
rdd = get_clean_rdd(rdd_dirty)
print(type(rdd))

<class 'pyspark.rdd.PipelinedRDD'>


In [30]:
def take_n(rdd, n):
    """Return the first ``n`` elements of ``rdd``.

    Args:
        rdd: Object exposing a ``take(n)`` method (e.g. a Spark RDD).
        n: Number of leading elements to fetch.

    Returns:
        Whatever ``rdd.take(n)`` returns.
    """
    first_records = rdd.take(n)
    return first_records

In [31]:
# Show the first 15 parsed records (the cell's last expression is displayed).
take_n(rdd=rdd, n=15)

[Row(code='aa', title='271_a.C', hits=1, size=4675),
 Row(code='aa', title='Category:User_th', hits=1, size=4770),
 Row(code='aa', title='Chiron_Elias_Krase', hits=1, size=4694),
 Row(code='aa', title='Dassault_rafaele', hits=2, size=9372),
 Row(code='aa', title='E.Desv', hits=1, size=4662),
 Row(code='aa', title='File:Wiktionary-logo-en.png', hits=1, size=10752),
 Row(code='aa', title='Indonesian_Wikipedia', hits=1, size=4679),
 Row(code='aa', title='Main_Page', hits=5, size=266946),
 Row(code='aa', title='Requests_for_new_languages/Wikipedia_Banyumasan', hits=1, size=4733),
 Row(code='aa', title='Special:Contributions/203.144.160.245', hits=1, size=5812),
 Row(code='aa', title='Special:Contributions/5.232.61.79', hits=1, size=5805),
 Row(code='aa', title='Special:Contributions/Ayarportugal', hits=1, size=5808),
 Row(code='aa', title='Special:Contributions/Born2bgratis', hits=1, size=5812),
 Row(code='aa', title='Special:ListFiles/Betacommand', hits=1, size=5035),
 Row(code='aa', titl

In [32]:
def count_records(rdd):
    """Return the number of elements in ``rdd``.

    Args:
        rdd: Object exposing a ``count()`` method (e.g. a Spark RDD).

    Returns:
        Whatever ``rdd.count()`` returns.
    """
    total = rdd.count()
    return total

In [33]:
# Count every record in the parsed dataset and report the total.
n_records = count_records(rdd)
print(f"There are {n_records} records in the dataframe.")



There are 3324129 records in the dataframe.


                                                                                

In [34]:
def summary_stats_col(rdd, col_name):
    """Compute the minimum, maximum and mean of one numeric column.

    Args:
        rdd: RDD of records that can be indexed by column name.
        col_name: Column to summarise; must be ``"hits"`` or ``"size"``.

    Returns:
        A dict with the keys ``"min"``, ``"max"`` and ``"mean"``.

    Raises:
        AssertionError: If ``col_name`` is not one of the allowed columns.
    """
    allowed_columns = ["hits", "size"]
    assert col_name in allowed_columns

    # project every record down to the single column of interest
    values = rdd.map(lambda record: record[col_name])

    # each statistic is obtained by calling the same-named method on the
    # column, in the order min, max, mean
    return {stat: getattr(values, stat)() for stat in ("min", "max", "mean")}

In [35]:
# Summarise the page-size column: smallest, largest and mean size.
col_name = "size"
summary_stats = summary_stats_col(rdd, col_name)
message = (
    f"min {col_name} = {summary_stats['min']}, "
    f"max {col_name} = {summary_stats['max']}, "
    f"mean {col_name} = {summary_stats['mean']}"
)
print(message)



min size = 0, max size = 141180155987, mean size = 132239.5695744666


                                                                                

In [41]:
# Find every record whose page size equals the largest size in the dataset
# (ties are all returned).
col_name = "size"
assert col_name in ["hits", "size"]

# first pass: largest value of the chosen column
column_values = rdd.map(lambda record: record[col_name])
mymax = column_values.max()

# second pass: keep the records that reach that maximum
recs = rdd.filter(lambda record: record[col_name] >= mymax)
recs.collect()

                                                                                

[Row(code='en.mw', title='en', hits=5466346, size=141180155987)]