## How to Run Spark on Windows
1. Download Spark [v3.2.0] and save file contents to `C:/BigData/spark`
  - `spark-submit --version`
  - https://spark.apache.org/downloads.html
2. Download winutils from `hadoop-3.0.0/bin` and add it to `C:/BigData/hadoop/bin`
  - https://github.com/steveloughran/winutils
3. Download Java Version 8 [v1.8.0_311]
  - `java -version` (Java 8 only accepts the single-dash form; `--version` was added in Java 9)
4. Add to environment variables
  - `HADOOP_HOME` = `C:/BigData/hadoop`
  - `SPARK_HOME` = `C:/BigData/spark`
  - `SPARK_LOCAL_DIRS` = `C:/BigData/tmp`
  - `SPARK_LOCAL_HOSTNAME` = `localhost`
  - `SPARK_LOCAL_IP` = `127.0.0.1`
  - `PYSPARK_DRIVER_PYTHON` = `jupyter`
  - `PYSPARK_DRIVER_PYTHON_OPTS` = `notebook`
5. Download pyspark and graphframes [graphframes==0.8.2]
  - Install PySpark: `pip install pyspark`
  - Install GraphFrames: `pip install graphframes`
  - Record the installed versions: `pip freeze > requirements.txt`
6. Download scala [v2.12.14]
  - Download version 2 --> SBT Version
  - Run `sbt console` in current directory to install 
  - Check global scalaVersion: `sbt scalaVersion`
  - Check pyspark scalaVersion: `spark-submit --version`
  - https://www.scala-lang.org/download/scala2.html

- Create new dir `C:/BigData/spark`
- Create new dir `C:/BigData/hadoop`
- Create new dir `C:/BigData/tmp`
- Modify the permissions on `C:/BigData/tmp` so that everyone can write to it

In [2]:
from pyspark.sql.types import *
from graphframes import *
from pyspark import SparkContext
from pyspark.sql import SparkSession

import os

# Environment for the local PySpark driver. These are set before the session
# below is created; PYSPARK_SUBMIT_ARGS requests the GraphFrames package
# (built for Spark 3.2 / Scala 2.12) via --packages.
os.environ.update({
    'SPARK_LOCAL_IP': '127.0.0.1',
    'PYSPARK_DRIVER_PYTHON': 'jupyter',
    'PYSPARK_DRIVER_PYTHON_OPTS': 'notebook',
    "PYSPARK_SUBMIT_ARGS":
        "--packages graphframes:graphframes:0.8.2-spark3.2-s_2.12 pyspark-shell",
})

from graphframes import *
from pyspark import *
from pyspark.sql import *

# Local session using all available cores, with the driver bound to localhost.
spark = (
    SparkSession.builder
    .appName("fun")
    .master("local[*]")
    .config("spark.driver.bindAddress", "localhost")
    .getOrCreate()
)

# GraphFrame from the CSV files
def create_transport_graph():
    """Build the transport GraphFrame from the CSV files under ``data/``.

    Vertices are read from ``data/transport-nodes.csv`` with an explicit
    schema (id, latitude, longitude, population). Edges are read from
    ``data/transport-relationships.csv`` without an explicit schema; every
    row is then added a second time with ``src`` and ``dst`` swapped, so each
    relationship appears in both directions.

    Uses the notebook-level ``spark`` session.

    Returns:
        GraphFrame: graph built from the vertices and the doubled edge set.
    """
    node_schema = StructType([
        StructField("id", StringType(), True),
        StructField("latitude", FloatType(), True),
        StructField("longitude", FloatType(), True),
        StructField("population", IntegerType(), True),
    ])
    nodes = spark.read.csv(
        "data/transport-nodes.csv", header=True, schema=node_schema
    )

    rels = spark.read.csv("data/transport-relationships.csv", header=True)

    # Mirror each edge by swapping its endpoints. The column order matches
    # the positional union below.
    reversed_rels = rels.select(
        rels["dst"].alias("src"),
        rels["src"].alias("dst"),
        "relationship",
        "cost",
    )
    relationships = rels.union(reversed_rels)

    return GraphFrame(nodes, relationships)
# Build the transport graph and print its summary (the vertex and edge schemas).
g = create_transport_graph()
print(g)

GraphFrame(v:[id: string, latitude: float ... 2 more fields], e:[src: string, dst: string ... 2 more fields])


In [1]:
import os

# Environment for the local PySpark driver. These are set before the session
# below is created; PYSPARK_SUBMIT_ARGS requests the GraphFrames package
# (built for Spark 3.2 / Scala 2.12) via --packages.
os.environ.update({
    'SPARK_LOCAL_IP': '127.0.0.1',
    'PYSPARK_DRIVER_PYTHON': 'jupyter',
    'PYSPARK_DRIVER_PYTHON_OPTS': 'notebook',
    "PYSPARK_SUBMIT_ARGS":
        "--packages graphframes:graphframes:0.8.2-spark3.2-s_2.12 pyspark-shell",
})

from graphframes import *
from pyspark import *
from pyspark.sql import *

# Local session using all available cores, with the driver bound to localhost.
spark = (
    SparkSession.builder
    .appName("fun")
    .master("local[*]")
    .config("spark.driver.bindAddress", "localhost")
    .getOrCreate()
)

# Sample people: (id, name, firstname, age).
vertex_rows = [
    ('1', 'Carter', 'Derrick', 50),
    ('2', 'May', 'Derrick', 26),
    ('3', 'Mills', 'Jeff', 80),
    ('4', 'Hood', 'Robert', 65),
    ('5', 'Banks', 'Mike', 93),
    ('98', 'Berg', 'Tim', 28),
    ('99', 'Page', 'Allan', 16),
]
vertices = spark.createDataFrame(vertex_rows, ['id', 'name', 'firstname', 'age'])

# Directed relationships between the people above: (src, dst, type).
edge_rows = [
    ('1', '2', 'friend'),
    ('2', '1', 'friend'),
    ('3', '1', 'friend'),
    ('1', '3', 'friend'),
    ('2', '3', 'follows'),
    ('3', '4', 'friend'),
    ('4', '3', 'friend'),
    ('5', '3', 'friend'),
    ('3', '5', 'friend'),
    ('4', '5', 'follows'),
    ('98', '99', 'friend'),
    ('99', '98', 'friend'),
]
edges = spark.createDataFrame(edge_rows, ['src', 'dst', 'type'])

g = GraphFrame(vertices, edges)

## Display the vertex and edge DataFrames
g.vertices.show()
g.edges.show()

## Display the degree (number of edges) of each vertex
g.degrees.show()

+---+------+---------+---+
| id|  name|firstname|age|
+---+------+---------+---+
|  1|Carter|  Derrick| 50|
|  2|   May|  Derrick| 26|
|  3| Mills|     Jeff| 80|
|  4|  Hood|   Robert| 65|
|  5| Banks|     Mike| 93|
| 98|  Berg|      Tim| 28|
| 99|  Page|    Allan| 16|
+---+------+---------+---+

+---+---+-------+
|src|dst|   type|
+---+---+-------+
|  1|  2| friend|
|  2|  1| friend|
|  3|  1| friend|
|  1|  3| friend|
|  2|  3|follows|
|  3|  4| friend|
|  4|  3| friend|
|  5|  3| friend|
|  3|  5| friend|
|  4|  5|follows|
| 98| 99| friend|
| 99| 98| friend|
+---+---+-------+

+---+------+
| id|degree|
+---+------+
|  1|     4|
|  2|     3|
|  3|     7|
|  4|     3|
|  5|     3|
| 98|     2|
| 99|     2|
+---+------+

