In [1]:
import shutil

from pyspark import SparkContext, SparkConf
# Describe the application explicitly, then reuse any SparkContext that is
# already running in this kernel. Constructing SparkContext(...) directly
# raises an error ("Cannot run multiple SparkContexts at once") as soon as
# this cell is executed a second time.
conf = SparkConf().setMaster("local").setAppName("First App")
sc = SparkContext.getOrCreate(conf)

# The integers 1..15 as an RDD. parallelize accepts a range directly, so the
# values do not need to be materialised into a list first.
numbers = sc.parallelize(range(1, 16))

In [2]:
# Gather both facts about the RDD first, then report them.
elements = numbers.collect()
partition_count = numbers.getNumPartitions()

print("Elements:", elements)
print("Number of partitions:", partition_count)

Elements: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
Number of partitions: 1


In [3]:
# Fetch only the first element; collect()[0] would ship the whole RDD to the
# driver just to read a single value.
numbers.first()

1

In [4]:
def _is_even(value):
    """Return True when ``value`` is divisible by two."""
    return value % 2 == 0


# Keep only the even members of `numbers`.
evens = numbers.filter(_is_even)

In [5]:
# Bring the filtered values back to the driver and display them.
even_values = evens.collect()
print(even_values)

[2, 4, 6, 8, 10, 12, 14]


In [6]:
# Square each even number, then collect the results to print them.
squares = evens.map(lambda value: value ** 2)
squared_values = squares.collect()
print(squared_values)

[4, 16, 36, 64, 100, 144, 196]


In [7]:
def _add(left, right):
    """Return the sum of two partial results."""
    return left + right


# Fold the squared values into a single total.
agg = squares.reduce(_add)
print(agg)

560


In [9]:
# NOTE(review): absolute, machine-specific path; consider deriving it from a
# configurable output directory.
squares_path = r"\Users\ldwen\Documents\Big_Data_Analytics\DSA-4620\ICP_8\hdd.txt"

# saveAsTextFile writes a directory of part files at this path and refuses to
# overwrite one that already exists, so clear the previous run's output to
# keep the cell re-runnable. The directory is simply absent on the first run,
# and any other removal problem still surfaces from saveAsTextFile itself.
shutil.rmtree(squares_path, ignore_errors=True)
squares.saveAsTextFile(squares_path)

In [10]:
# Two RDDs holding 1..5 and 6..10 respectively.
list1 = sc.parallelize([*range(1, 6)])
list2 = sc.parallelize([*range(6, 11)])

# union concatenates the two RDDs.
combo = list1.union(list2)
combined_values = combo.collect()
print(combined_values)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


In [11]:
# Every (a, b) pairing with a taken from list1 and b taken from list2.
cart = list1.cartesian(list2)
pairs = cart.collect()
print(pairs)

[(1, 6), (1, 7), (1, 8), (1, 9), (1, 10), (2, 6), (2, 7), (2, 8), (2, 9), (2, 10), (3, 6), (3, 7), (3, 8), (3, 9), (3, 10), (4, 6), (4, 7), (4, 8), (4, 9), (4, 10), (5, 6), (5, 7), (5, 8), (5, 9), (5, 10)]


In [12]:
dictionary = {
    'a': 1,
    'b': 2,
    'c': 3,
    'd': 4,
    'e': 5,
}

# Distribute the dictionary's (key, value) pairs as an RDD of tuples.
rdd_dict = sc.parallelize(dictionary.items())

key_value_pairs = rdd_dict.collect()
print(key_value_pairs)


[('a', 1), ('b', 2), ('c', 3), ('d', 4), ('e', 5)]


In [13]:
sample_values = [1, 5, 9, 9, 1, 2, 5, 2, 4, 8, 5, 4]
sample = sc.parallelize(sample_values)

# countByValue returns a mapping from each distinct value to its number of
# occurrences.
count = sample.countByValue()
print(count)

defaultdict(<class 'int'>, {1: 2, 5: 3, 9: 2, 2: 2, 4: 2, 8: 1})


In [None]:
# NOTE(review): absolute, machine-specific paths; consider deriving them from
# a configurable output directory.
squares_path = r"\Users\ldwen\Documents\Big_Data_Analytics\DSA-4620\ICP_8\hdd.txt"
hdd2_path = r"\Users\ldwen\Documents\Big_Data_Analytics\DSA-4620\ICP_8\hdd2.txt"

hdd = sc.parallelize([1, 2, 3, 4, 5])

# saveAsTextFile refuses to overwrite an existing output directory, so clear
# the previous run's output to keep the cell re-runnable (absent on first run).
shutil.rmtree(hdd2_path, ignore_errors=True)
hdd.saveAsTextFile(hdd2_path)

# textFile accepts several inputs as one comma-separated string. The original
# argument named hdd.txt twice and put a space after the comma, which would be
# taken as part of the second path; read hdd.txt and hdd2.txt instead, joined
# with a bare comma.
combination = sc.textFile(",".join([squares_path, hdd2_path]))

In [22]:

lines = [
    "Line 1",
    "Line 2",
    "Line 3",
    "Line 4",
    "Line 5",
    "Line 6",
]
rdd = sc.parallelize(lines)

# take(5) returns only the first five elements rather than the whole RDD.
first_five = rdd.take(5)
print(first_five)

['Line 1', 'Line 2', 'Line 3', 'Line 4', 'Line 5']


In [24]:
from pyspark.sql import SparkSession

# NOTE(review): the application name is spelled "Datafrane", which looks like
# a typo for "DataFrame"; confirm before renaming.
builder = SparkSession.builder.appName("Datafrane")
spark = builder.getOrCreate()

# Three (letter, number) records and the column names to attach to them.
data = [('a', 1), ('b', 2), ('c', 3)]
column_names = ['letter', 'number']

df = spark.createDataFrame(data, column_names)
df.show()

+------+------+
|letter|number|
+------+------+
|     a|     1|
|     b|     2|
|     c|     3|
+------+------+



In [None]:
# An RDD can hold these records, but only as plain tuples without column names:
#   [('a', 1), ('b', 2), ('c', 3)]

# A DataFrame presents the same records as a table with named columns.
# NOTE(review): this rebinds `df`, which named the Spark DataFrame in the
# previous cell, to a pandas DataFrame.
import pandas as pd
column_names = ['letter', 'number']
df = pd.DataFrame(data=data, columns=column_names)

# The Spark DataFrame shown earlier is a tabular layer over an RDD of rows.
