# Creating a Spark session

In [1]:
from pyspark.sql import *
from pyspark.sql import functions as f
from pyspark.sql.types import *

# Obtain the notebook's Spark session: reuse an already-running one if there
# is one, otherwise start a new session named "SparkIntro".
spark = (
    SparkSession.builder
    .appName("SparkIntro")
    .getOrCreate()
)

# The Spark UI
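When you run a cell in Jupyter that triggers a Spark action (for example `show()`), the work is submitted as a job to Spark. You can access the Spark UI at [http://127.0.0.1:4040/](http://127.0.0.1:4040/) (4040 is the default port; if it is already in use, Spark tries 4041, 4042, and so on).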

The Spark UI gives you all the information you need about your Spark job.

# The web traffic data

In [2]:
# Load the web-traffic records. The files are tab-separated and no header
# option is set, so Spark assigns the default column names _c0, _c1, ...
df = (
    spark.read
    .format('csv')
    .option('delimiter', '\t')
    .load('data/traffic_2')
)

In [3]:
# Preview the first 20 rows (the defaults of show(), spelled out explicitly).
df.show(n=20, truncate=True)

+---------------+-----------------+
|            _c0|              _c1|
+---------------+-----------------+
| 186.99.192.116|       python.org|
| 202.152.82.171|    wikipedia.org|
|130.126.231.205|       python.org|
|116.142.112.214|pandas.pydata.org|
|113.124.204.127|       python.org|
|  143.30.183.87|    wikipedia.org|
| 138.74.228.219|       python.org|
|  56.120.106.87|    wikipedia.org|
| 189.119.55.225|    wikipedia.org|
| 180.110.73.101|    wikipedia.org|
|125.147.103.124|       python.org|
|   89.161.15.82|    wikipedia.org|
| 64.108.133.139|pandas.pydata.org|
|   87.91.133.89|pandas.pydata.org|
|111.141.147.118|    wikipedia.org|
|    97.65.99.76|    wikipedia.org|
|   80.99.56.157|    wikipedia.org|
| 122.86.146.117|    wikipedia.org|
| 200.132.86.152|pandas.pydata.org|
|  98.200.179.72|       python.org|
+---------------+-----------------+
only showing top 20 rows



In [4]:
# For every value of _c1 (the site column in the preview above), count how
# many distinct _c0 values (the IP-address column) appear.
distinct_ips = (
    df
    .groupBy('_c1')
    .agg(f.countDistinct(f.col('_c0')))
)

In [5]:
# Running show() is the action that actually executes the aggregation and
# prints up to 20 result rows.
distinct_ips.show(n=20, truncate=True)

+-----------------+-------------------+
|              _c1|count(DISTINCT _c0)|
+-----------------+-------------------+
|pandas.pydata.org|           60714478|
|   databricks.com|            6265888|
|    datarobot.com|            1266847|
|       github.com|            6270433|
|       google.com|           12483960|
|   scala-lang.org|                  1|
|    wikipedia.org|          221880372|
|       python.org|          117667749|
|           dtu.dk|           12485696|
| spark.apache.org|            2522151|
+-----------------+-------------------+



In [18]:
# Print only the physical plan Spark chose for the distinct-count aggregation
# (extended=False is the default, i.e. no logical plans are shown).
distinct_ips.explain(extended=False)

== Physical Plan ==
*(3) HashAggregate(keys=[_c1#105], functions=[count(distinct _c0#104)])
+- Exchange hashpartitioning(_c1#105, 200)
   +- *(2) HashAggregate(keys=[_c1#105], functions=[partial_count(distinct _c0#104)])
      +- *(2) HashAggregate(keys=[_c1#105, _c0#104], functions=[])
         +- Exchange hashpartitioning(_c1#105, _c0#104, 200)
            +- *(1) HashAggregate(keys=[_c1#105, _c0#104], functions=[])
               +- *(1) FileScan csv [_c0#104,_c1#105] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/home/jovyan/work/data/traffic_2], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<_c0:string,_c1:string>
