In [1]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook: getOrCreate() reuses an
# existing session if there is one, otherwise it builds a new one with
# the application name given here.
spark = (
    SparkSession.builder
    .appName("learning_spark_sql")
    .getOrCreate()
)

# Load the Wikipedia unique visitors dataset from CSV.
# header=True      -> take column names from the first line of the file
# delimiter=','    -> fields are comma-separated
# inferSchema=True -> let Spark infer each column's data type from the data
uniq_views_df = (
    spark.read
    .options(header=True, delimiter=',', inferSchema=True)
    .csv("wiki_uniq_march_2022.csv")
)

1. Print the DataFrame schema for `uniq_views_df`.

In [2]:
# The schema defines the DataFrame's structure: its columns and their data types.
# printSchema() prints it as a tree, one line per column, including nullability.
uniq_views_df.printSchema()


root
 |-- domain: string (nullable = true)
 |-- uniq_human_visitors: integer (nullable = true)
 |-- uniq_bot_visitors: integer (nullable = true)
 |-- total_visitor_count: integer (nullable = true)



2. Show a description of the data for `uniq_views_df`.

In [8]:
# describe() returns a new DataFrame of summary statistics; the output of
# this cell shows count, mean, stddev, min and max for every column
# (mean/stddev are null for the string column `domain`).
uniq_views_df_desc = uniq_views_df.describe()

# show summary
uniq_views_df_desc.show()

+-------+----------------+-------------------+-----------------+-------------------+
|summary|          domain|uniq_human_visitors|uniq_bot_visitors|total_visitor_count|
+-------+----------------+-------------------+-----------------+-------------------+
|  count|             760|                760|              760|                760|
|   mean|            null|  155413.0394736842| 51431.0552631579| 206844.09473684212|
| stddev|            null| 1435327.5409314982| 376318.441663093| 1809320.9789242456|
|    min|aa.wikibooks.org|                  0|              170|               1005|
|    max|zu.wikipedia.org|           33261399|          8400247|           41661646|
+-------+----------------+-------------------+-----------------+-------------------+



3. Drop the columns `total_visitor_count` and `uniq_bot_visitors`.

In [10]:
# DataFrame.drop() takes the column names as separate positional
# arguments, NOT as a single list. It returns a new DataFrame and leaves
# uniq_views_df untouched.
uniq_counts_human_df = uniq_views_df.drop(
    'total_visitor_count',
    'uniq_bot_visitors',
)

# preview the first 5 rows without truncating long cell values
uniq_counts_human_df.show(n=5, truncate=False)

+------------------+-------------------+
|domain            |uniq_human_visitors|
+------------------+-------------------+
|en.m.wikipedia.org|33261399           |
|en.wikipedia.org  |17009339           |
|es.m.wikipedia.org|5668575            |
|ru.m.wikipedia.org|5816762            |
|ja.m.wikipedia.org|5396108            |
+------------------+-------------------+
only showing top 5 rows



4. Rename `uniq_human_visitors` to `unique_site_visitors`.

In [12]:
# Rename the remaining count column; withColumnRenamed() returns a new
# DataFrame, so uniq_counts_human_df keeps its original column name.
uniq_counts_final_df = uniq_counts_human_df.withColumnRenamed(
    existing='uniq_human_visitors',
    new='unique_site_visitors',
)

# preview the first 5 rows (truncate=True shortens long string values)
uniq_counts_final_df.show(n=5, truncate=True)

+------------------+--------------------+
|            domain|unique_site_visitors|
+------------------+--------------------+
|en.m.wikipedia.org|            33261399|
|  en.wikipedia.org|            17009339|
|es.m.wikipedia.org|             5668575|
|ru.m.wikipedia.org|             5816762|
|ja.m.wikipedia.org|             5396108|
+------------------+--------------------+
only showing top 5 rows

