# Running Apache Spark on Google Colab

## Installing

In [0]:
# Spark is written in scala. We need java and jvm for that.
# Install jvm using java 8
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Downloading apache spark 2.4.5
# Choose the package from downloads.apahce.org/spark
!wget -q https://downloads.apache.org/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
# Extract the tar file
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
# Install findspark package
# Allows to findspark and set the system variable
# Pyspark isn't on sys path by default to add that and use as a normal libraray we install findspark.
!pip install -q findspark

In [2]:
! ls /usr/lib/jvm

default-java		   java-11-openjdk-amd64     java-8-openjdk-amd64
java-1.11.0-openjdk-amd64  java-1.8.0-openjdk-amd64


In [3]:
# This is optional.
# To bring data from spark df to pandas df.
# Pyspark uses py4j binding. it sends data through pickle to driver and unpickles it.
# Serialization is expensive. We can use pyarrow to transfer the objects. Pyarrow is compatible with both sparkdf and pandas df
! pip install pyarrow



## Set the paths

In [0]:
# Point PySpark at the JDK 8 installation and at the extracted Spark distribution.
import os

os.environ.update({
    'JAVA_HOME': "/usr/lib/jvm/java-8-openjdk-amd64",
    'SPARK_HOME': "/content/spark-2.4.5-bin-hadoop2.7",
})

In [0]:
import findspark
# findspark uses SPARK_HOME to locate Spark and puts its pyspark on sys.path.
findspark.init()

# A SparkSession (which owns the SparkContext) is the entry point for all work.
from pyspark.sql import SparkSession

# There is no cluster here: "local[*]" runs the driver and the executors inside
# one JVM on this machine, using all available cores.
# Memory settings go through the builder because they are read when the JVM
# starts; calling spark.conf.set(...) on an existing session is too late to
# change them.
# NOTE: getOrCreate() returns an already-running session if there is one, in
# which case these values are not applied; restart the runtime to change them.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .config("spark.memory.fraction", "0.9")
    .getOrCreate()
)

# Submitting JAR or .py files

- This is for submitting spark files in pyspark

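os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars /path/to/extra.jar pyspark-shell'  # example value; set it before the SparkSession is created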

spark.sparkContext.addPyFile('../')

# Testing

In [0]:
import sys, tempfile, urllib

In [0]:
import ssl
# Workaround for SSL certificate-verification errors during the download.
# Skip this cell if the download works without it.
# NOTE(review): this swaps the default HTTPS context factory for the unverified
# one, so certificate checks are off for every later HTTPS request that uses
# the default context in this kernel. Avoid outside throwaway demos.
ssl._create_default_https_context = ssl._create_unverified_context

In [0]:
# Scratch directory for the download, and the full path of the CSV saved there.
BASE_DIR = "/tmp"
OUTPUT_FILE = os.path.join(
    BASE_DIR,
    'credit_data.csv',
)

In [0]:
# `import urllib` alone does not guarantee the `request` submodule is loaded,
# so import it explicitly instead of relying on another library having done so.
import urllib.request

# Download the UCI credit-screening dataset to OUTPUT_FILE.
# urlretrieve returns a (local_filename, headers) tuple, so `credit_data` holds
# that tuple, not the data itself.
credit_data = urllib.request.urlretrieve(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data",
    OUTPUT_FILE,
)

In [12]:
!ls /tmp

blockmgr-3aad7eb6-02a8-431b-a298-f3cf76b60a3e
credit_data.csv
hsperfdata_root
spark-2b5d23b9-b4bf-4b8b-ab3b-00249d299452
spark-9366ec80-6c82-47c0-b884-c2f881042385


In [0]:
# Load the downloaded file into a Spark DataFrame.
# The file has no header row, so Spark assigns column names (_c0, _c1, ...);
# inferSchema makes Spark scan the data to choose column types.
# The path comes from OUTPUT_FILE (the download target) rather than a second
# hard-coded copy of the same string that could drift out of sync.
credit_df = (
    spark.read
    .option("inferSchema", "true")
    .csv(OUTPUT_FILE, header=False)
)

In [14]:
# Preview the first rows (show()'s default arguments spelled out).
credit_df.show(n=20, truncate=True)

+---+-----+------+---+---+---+---+-----+---+---+----+----+----+-----+-----+----+
|_c0|  _c1|   _c2|_c3|_c4|_c5|_c6|  _c7|_c8|_c9|_c10|_c11|_c12| _c13| _c14|_c15|
+---+-----+------+---+---+---+---+-----+---+---+----+----+----+-----+-----+----+
|  b|30.83|   0.0|  u|  g|  w|  v| 1.25|  t|  t|   1|   f|   g|00202|    0|   +|
|  a|58.67|  4.46|  u|  g|  q|  h| 3.04|  t|  t|   6|   f|   g|00043|  560|   +|
|  a|24.50|   0.5|  u|  g|  q|  h|  1.5|  t|  f|   0|   f|   g|00280|  824|   +|
|  b|27.83|  1.54|  u|  g|  w|  v| 3.75|  t|  t|   5|   t|   g|00100|    3|   +|
|  b|20.17| 5.625|  u|  g|  w|  v| 1.71|  t|  f|   0|   f|   s|00120|    0|   +|
|  b|32.08|   4.0|  u|  g|  m|  v|  2.5|  t|  f|   0|   t|   g|00360|    0|   +|
|  b|33.17|  1.04|  u|  g|  r|  h|  6.5|  t|  f|   0|   t|   g|00164|31285|   +|
|  a|22.92|11.585|  u|  g| cc|  v| 0.04|  t|  f|   0|   f|   g|00080| 1349|   +|
|  b|54.42|   0.5|  y|  p|  k|  h| 3.96|  t|  f|   0|   f|   g|00180|  314|   +|
|  b|42.50| 4.915|  y|  p|  

In [15]:
# Number of rows loaded from the CSV.
credit_df.count()

690