In [50]:
#!export JAVA_HOME=$(/usr/libexec/java_home -v 1.8)

# Creating RDDs

In [1]:
from pyspark import SparkConf, SparkContext
import os
os.environ['PYSPARK_PYTHON'] = '/Library/Frameworks/Python.framework/Versions/3.6/bin/python3'

from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local").appName("rdd_demo").getOrCreate()

#sc.stop()
conf = SparkConf().setMaster("local").setAppName("MinTemperatures")
sc = SparkContext.getOrCreate()

ModuleNotFoundError: No module named 'pyspark'

In [217]:
##### read using datasource API

flightData2015 = spark\
  .read\
  .option("inferSchema", "false")\
  .option("header", "true")\
  .csv("/Users/aakash/training/spark/data/flight-data/csv/2015-summary.csv")

# COMMAND ----------

flightData2015=flightData2015.toDF("dest","source","count").rdd
print(type(flightData2015))
print(flightData2015.take(1))

<class 'pyspark.rdd.RDD'>
[Row(dest='United States', source='Romania', count='15')]


In [218]:
##### read using sparkcontext
spth="/Users/aakash/training/spark/data/flight-data/csv/2015-summary.csv"
sc_flightData2015=spark.sparkContext.textFile(spth)
print(type(sc_flightData2015))
print(sc_flightData2015.take(2))

<class 'pyspark.rdd.RDD'>
['DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count', 'United States,Romania,15']


In [219]:
##### convert pandas file to RDD
import pandas as pd

spth="/Users/aakash/training/spark/data/flight-data/csv/2015-summary.csv"
pd_flightData2015=pd.read_csv(spth, header=0)
print(type(pd_flightData2015))
print(pd_flightData2015.head())
pd_flightData2015=spark.createDataFrame(pd_flightData2015).rdd
print(type(pd_flightData2015))
print(pd_flightData2015.take(1))



<class 'pandas.core.frame.DataFrame'>
  DEST_COUNTRY_NAME ORIGIN_COUNTRY_NAME  count
0     United States             Romania     15
1     United States             Croatia      1
2     United States             Ireland    344
3             Egypt       United States     15
4     United States               India     62
<class 'pyspark.rdd.RDD'>
[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15)]


In [220]:
myCollection = "Spark The Definitive Guide : Big Data Processing Made Simple"\
  .split(" ")
words = spark.sparkContext.parallelize(myCollection, 2)
words.take(5)

['Spark', 'The', 'Definitive', 'Guide', ':']

In [221]:
##### from a collection of text
myCollection = "Spark The Definitive Guide : Big Data Processing Made Simple"\
  .split(" ")
words = spark.sparkContext.parallelize(myCollection, 2)

words.setName("myWords")
words.name() # myWords
print(type(words))
print(words.count())

<class 'pyspark.rdd.RDD'>
10


In [222]:
### from a range of numbers
myRange = spark.range(1000).toDF("number").rdd.map(lambda row: row[0])
#myRange = spark.range(1000).toDF("number")
myRange.take(5)

[0, 1, 2, 3, 4]

# a look into some low level issues

In [223]:
#### some low level access issues
pd_flightData2015.count()
pd_flightData2015.take(2)[-1]

Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1)

# filtering

In [228]:
def parseLine(line):
    fields = line.split(',')
    date = fields[0]
    p_open = fields[1]
    p_close = fields[5]
    return (date, p_open, p_close)

In [229]:
spth="/Users/aakash/Sarasvati/NSE/RELIANCE.csv"
sdt=spark.sparkContext.textFile(spth)
sdt=sdt.map(parseLine)
sdt.take(2)

[('Date', 'Open', 'Close'), ('1998-03-23', '178.5', '180.2')]

In [230]:
spth="/Users/aakash/Sarasvati/NSE/RELIANCE.csv"
o_sdt = spark.read.format("CSV").option("header","true").option("inferSchema", "true") \
    .load(spth)
o_sdt=o_sdt.toDF("Date","Open","High","Low","Last","Close","Volume","Turnover").rdd \
    .map(lambda row: (row[0], row[1], row[5]))
print(o_sdt.count())
#print(type(o_sdt))
#print(o_sdt.take(2))
o_sdt=o_sdt.filter(lambda row: row[2] > row[1])
#print(o_sdt.take(5))
#print(type(o_sdt))
print(o_sdt.count())

5366
2525


## filter function

In [236]:
def HighClose(row):
    ## Discuss
    if row[2] > row[1]:
        return(row)

Notice anything in output below?

In [237]:
o_sdt = spark.read.format("CSV").option("header","true").load(spth)
o_sdt=o_sdt.toDF("Date","Open","High","Low","Last","Close","Volume","Turnover").rdd.map(lambda row: (row[0], row[1], row[5]))
print(o_sdt.count())
o_sdt=o_sdt.filter(lambda row: HighClose(row))
print(o_sdt.take(1))
#print(type(o_sdt))
print(o_sdt.count())

5366
[('1998-03-23', '178.5', '180.2')]
2536


How about now?

In [79]:
o_sdt = spark.read.format("CSV").option("header","true").option("inferSchema", "true").load(spth)
o_sdt=o_sdt.toDF("Date","Open","High","Low","Last","Close","Volume","Turnover").rdd.map(lambda row: (row[0], row[1], row[5]))
print(o_sdt.count())
o_sdt=o_sdt.filter(lambda row: HighClose(row))
print(o_sdt.take(5))
print(type(o_sdt))
print(o_sdt.count())


5366
[(datetime.datetime(1998, 3, 23, 0, 0), 178.5, 180.2), (datetime.datetime(1998, 3, 24, 0, 0), 184.0, 178.7), (datetime.datetime(1998, 3, 25, 0, 0), 181.5, 183.85), (datetime.datetime(1998, 3, 26, 0, 0), 183.85, 179.45), (datetime.datetime(1998, 3, 27, 0, 0), 179.1, 180.4)]
<class 'pyspark.rdd.PipelinedRDD'>
5366


In [99]:
o_sdt = spark.read.format("CSV").option("header","true").option("inferSchema", "true").load(spth)
o_sdt=o_sdt.toDF("Date","Open","High","Low","Last","Close","Volume","Turnover").rdd.map(lambda row: (row[0], row[1], row[5]))
print(o_sdt.count())
o_sdt=o_sdt.filter(lambda row: HighClose(row))
print(o_sdt.take(5))
print(type(o_sdt))
print(o_sdt.count())


5366
[(datetime.datetime(1998, 3, 23, 0, 0), 178.5, 180.2), (datetime.datetime(1998, 3, 25, 0, 0), 181.5, 183.85), (datetime.datetime(1998, 3, 27, 0, 0), 179.1, 180.4), (datetime.datetime(1998, 4, 1, 0, 0), 177.5, 182.85), (datetime.datetime(1998, 4, 3, 0, 0), 181.5, 184.95)]
<class 'pyspark.rdd.PipelinedRDD'>
2525


## Map

In [238]:
def to_to_mill(row):
    return (row[0], row[1], row[2], round(row[3],0))

In [239]:
spth="/Users/aakash/Sarasvati/NSE/TATASTEEL.csv"
o_sdt = spark.read.format("CSV").option("header","true").option("inferSchema", "true").load(spth)
o_sdt=o_sdt.toDF("Date","Open","High","Low","Last","Close","Volume","Turnover").rdd.map(lambda row: (row[0], row[1], row[5], row[7]))
print(o_sdt.take(2))
o_sdt=o_sdt.map(to_to_mill)
print(o_sdt.take(2))
print(type(o_sdt))
print(o_sdt.count())



[(datetime.datetime(2005, 10, 17, 0, 0), 380.0, 384.35, 17639.41), (datetime.datetime(2005, 10, 18, 0, 0), 386.85, 374.85, 13024.16)]
[(datetime.datetime(2005, 10, 17, 0, 0), 380.0, 384.35, 17639.0), (datetime.datetime(2005, 10, 18, 0, 0), 386.85, 374.85, 13024.0)]
<class 'pyspark.rdd.PipelinedRDD'>
3461


# flatMap

In [125]:
def Func(lines):
    lines = lines.lower()
    lines = lines.split(" ")
    return lines

#sc.stop()
conf = SparkConf().setMaster("local").setAppName("wordcount")
sc = SparkContext.getOrCreate()

spth="/Users/aakash/training/spark/data/sherlock_holmes.txt"
input_file = sc.textFile("data/sherlock_holmes.txt")
#print(input_file.take(5))
rdd1 = input_file.flatMap(Func)
rdd2=rdd1.map(lambda x: (x,1)).groupByKey().mapValues(sum).map(lambda x: (x[1],x[0])).sortByKey(False)
rdd2.take(5)

['', "Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyle", '', 'This eBook is for the use of anyone anywhere at no cost and with', 'almost no restrictions whatsoever.  You may copy it, give it away or']


[(5703, 'the'), (3137, ''), (2882, 'and'), (2758, 'of'), (2720, 'to')]

## Reduce

In [127]:
spark.sparkContext.parallelize(range(1,200)).reduce(lambda x, y: x+y)

19900

## count

In [135]:
rdd1.count()

110739

In [141]:
rdd1.countApprox(1, 0.95)

110739

In [240]:
rdd1.countByValue()

AttributeError: 'collections.defaultdict' object has no attribute 'countByValue'

In [247]:
spth="/Users/aakash/training/spark/data/s1.txt"
spth_1="/Users/aakash/training/spark/data/sherlock_holmes.txt"
spth_2="/Users/aakash/training/spark/data/little_sherlock_holmes.txt"
input_file = sc.textFile(spth_1,6)


input_file.pipe("wc -l").collect()
### why 2 outputs???

['  408379', '  408212', '  408267', '  408337', '  408257', '  408238']

In [245]:
input_file.count()

1514130

In [243]:
print(type(input_file))
print(input_file.count())
print(input_file.getNumPartitions())
print(sc.defaultParallelism)

<class 'pyspark.rdd.RDD'>
849690
2
4


In [157]:
input_file.saveAsTextFile('/Users/aakash/training/spark/wc.txt')
## check outputs