In [1]:
from pyspark.sql import SparkSession

# Start a SparkSession named "learning_spark_sql" (getOrCreate reuses an
# existing session if there is one)
spark = (
    SparkSession.builder
    .appName("learning_spark_sql")
    .getOrCreate()
)

# Load the Wikipedia unique-visitors CSV: the first row is treated as the
# header, fields are comma-delimited, and column types are inferred
wiki_uniq_df = (
    spark.read
    .options(header=True, delimiter=',', inferSchema=True)
    .csv("wiki_uniq_march_2022_w_site_type.csv")
)

1. Filter the DataFrame to the sites whose `language_code` is `"ar"`.

In [2]:
# keep only the rows whose language_code column equals 'ar'
# (where is an alias of filter)
ar_site_visitors = wiki_uniq_df.where(
    wiki_uniq_df['language_code'] == 'ar'
)

# print the filtered rows
ar_site_visitors.show()

+--------------------+-------------------+-----------------+-------------------+-------------+-----------+
|              domain|uniq_human_visitors|uniq_bot_visitors|total_visitor_count|language_code|  site_type|
+--------------------+-------------------+-----------------+-------------------+-------------+-----------+
|  ar.m.wikipedia.org|            1644253|           750620|            2394873|           ar|  wikipedia|
|    ar.wikipedia.org|             212695|            97700|             310395|           ar|  wikipedia|
| ar.m.wikisource.org|              56124|            52885|             109009|           ar| wikisource|
|   ar.wikisource.org|               2134|             4355|               6489|           ar| wikisource|
|  ar.m.wikiquote.org|                776|             3511|               4287|           ar|  wikiquote|
|   ar.wiktionary.org|                262|             2335|               2597|           ar| wiktionary|
| ar.m.wiktionary.org|               

2. Filter the DataFrame to the sites whose `language_code` is `"ar"`, keeping only the columns `domain` and `uniq_human_visitors`.

In [4]:
# restrict to rows with language_code 'ar', then project down to the
# domain and uniq_human_visitors columns
ar_visitors_slim = (
    wiki_uniq_df
    .where(wiki_uniq_df['language_code'] == 'ar')
    .select('domain', 'uniq_human_visitors')
)

# print the slimmed-down DataFrame
ar_visitors_slim.show()

+--------------------+-------------------+
|              domain|uniq_human_visitors|
+--------------------+-------------------+
|  ar.m.wikipedia.org|            1644253|
|    ar.wikipedia.org|             212695|
| ar.m.wikisource.org|              56124|
|   ar.wikisource.org|               2134|
|  ar.m.wikiquote.org|                776|
|   ar.wiktionary.org|                262|
| ar.m.wiktionary.org|                448|
|ar.m.wikiversity.org|                389|
|  ar.m.wikibooks.org|                378|
+--------------------+-------------------+



3. Calculate the sum of `uniq_human_visitors` grouped by `site_type`, ordered from the highest to the lowest visitor total.

In [12]:
# Total uniq_human_visitors per site_type, largest total first.
# Only site_type and uniq_human_visitors are selected before grouping, so the
# argument-less sum() produces the single column 'sum(uniq_human_visitors)',
# which is then used as the sort key.
top_visitors_site_type = (
    wiki_uniq_df
    .select('site_type', 'uniq_human_visitors')
    .groupBy('site_type')
    .sum()
    .orderBy('sum(uniq_human_visitors)', ascending=False)
)

# print the aggregated DataFrame
top_visitors_site_type.show()

+-----------+------------------------+
|  site_type|sum(uniq_human_visitors)|
+-----------+------------------------+
|  wikipedia|               116527479|
| wiktionary|                  892193|
|  wikimedia|                  312995|
| wikisource|                  172179|
|   wikidata|                   69744|
|  wikibooks|                   54680|
|  wikiquote|                   38048|
| wikivoyage|                   14648|
|       wiki|                   13067|
|wikiversity|                   12548|
|   wikinews|                    5578|
|   wikitech|                     751|
+-----------+------------------------+

