# Apache Spark example in Python

## Word count

In [1]:
import os
from pyspark import SparkContext
sc = SparkContext.getOrCreate()
text_file = sc.textFile("raw/example.txt")
counts = text_file.flatMap(lambda line: line.split(" ")).map(lambda word: (word,1)).reduceByKey(lambda a, b: a+b)
if "word_count" in os.listdir("./raw"):
    pass
else:
    counts.saveAsTextFile("raw/word_count")

 - SparkContext():
 Main entry point for Spark functionality. A SparkContext represents the connection to a Spark cluster, and can be used to create RDD and broadcast variables on that cluster.
 - getOrCreate(conf=None):
 Get or instantiate a SparkContext and register it as a singleton object.
 - flatMap():
 Return a new RDD by first applying a function to all elements of this RDD, and then flattening the results.
 -  map(f, preservesPartitioning=False):
 Return a new RDD by applying a function to each element of this RDD.
 -  reduceByKey(func): Merge the values for each key using an associative and commutative reduce function.
 -  saveAsTextFile(path):
 Save this RDD as a text file, using string representations of elements.

## Pi Estimation

In [50]:
import random
NUM_SAMPLES = 1000000
def inside(p):
    x, y = random.random(), random.random()
    return x*x + y*y < 1
count = sc.parallelize(xrange(0, NUM_SAMPLES)).filter(inside).count()
print "Pi is roughly %f" % (4.0 * count / NUM_SAMPLES)

Pi is roughly 3.140828


- parallelize(c, numSlices=None): Distribute a local Python collection to form an RDD. Using xrange is recommended if the input represents a range for performance.
-  filter(f): Return a new RDD containing only the elements that satisfy a predicate.
-  count(): Return the number of elements in this RDD.

## Text Search 

In [73]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col

spark = SparkSession(sc)

text_file = sc.textFile("example.txt")

#Create a dataframe having a single column named "line"
df = text_file.map(lambda r: Row(r)).toDF(["line"])
assigns = df.filter(col("line").like("%Assignment%"))
assigns.count()
assigns.collect()

[Row(line=u'Assignment 1: Train a Logistic Regression model on the notMNIST dataset, which is able to recognize a subset of English letters in different fonts.'),
 Row(line=u'Assignment 2: Train a fully-connected network using Gradient Descent and Stochastic Gradient Descent with TensorFlow')]

-  class pyspark.sql.SparkSession(sparkContext, jsparkSession=None):The entry point to programming Spark with the Dataset and DataFrame API.
-  class pyspark.sql.Row: A row in DataFrame. The fields in it can be accessed:

        like attributes (row.key)
        like dictionary values (row[key])
-  toDF(*cols): Returns a new class:DataFrame that with new specified column names
-  pyspark.sql.functions.col(col): Returns a Column based on the given column name.
-  like(other): binary operator
-  collect(): return a list that contains all of the elements in this RDD.

## Simple Data Operations

In [79]:
# Create a DataFrame based on a table named "people"
# stored in a MySQL database
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
url = \
  "jdbc:mysql://yourIP:yourPort/test?user=yourUsername;password=yourPassword"
df = sqlContext \
  .read \
  .format("jdbc") \
  .option("url", url) \
  .option("dbtable", "people") \
  .load()

# Looks the schema of this DataFrame.
df.printSchema()

# Counts people by age
#countsByAge = df.groupBy("age").count()
#countsByAge.show()

# Saves countsByAge to S3 in the JSON format.
#countsByAge.write.format("json").save("s3a://...")

Py4JJavaError: An error occurred while calling o1128.load.
: java.sql.SQLException: No suitable driver
	at java.sql.DriverManager.getDriver(DriverManager.java:315)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions$$anonfun$7.apply(JDBCOptions.scala:84)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions$$anonfun$7.apply(JDBCOptions.scala:84)
	at scala.Option.getOrElse(Option.scala:121)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.<init>(JDBCOptions.scala:83)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.<init>(JDBCOptions.scala:34)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:32)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:306)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:178)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:146)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)
