In [1]:
import findspark

In [2]:
# Hand findspark the Spark installation directory explicitly.
# The path is specific to this machine; adjust it if Spark is installed elsewhere.
findspark.init(spark_home='/usr/local/spark')

In [3]:
# Import SparkSession
from pyspark.sql import SparkSession

In [4]:
# Build the SparkSession (getOrCreate reuses an existing session if there is one).
spark = (
    SparkSession.builder
    .master("local")
    .appName("Linear Regression Model")
    .config("spark.executor.memory", "1gb")
    .getOrCreate()
)

In [5]:
# Keep a handle on the session's SparkContext; every RDD example below is built through `sc`.
sc = spark.sparkContext

In [10]:
# map: one output element per input element -- here each value becomes a (value, square) pair.
x = sc.parallelize([1, 2, 3])
y = x.map(lambda n: (n, n ** 2))
print(x.collect())
print(y.collect())

[1, 2, 3]
[(1, 1), (2, 4), (3, 9)]


In [11]:
# flatMap: the tuple returned for each element is unpacked, so the result is one flat
# list of values rather than a list of tuples.
x = sc.parallelize([1, 2, 3])
y = x.flatMap(lambda n: (n, 100 * n, n ** 2))
print(x.collect())
print(y.collect())

[1, 2, 3]
[1, 100, 1, 2, 200, 4, 3, 300, 9]


In [12]:
# Spread the three elements over two partitions.
x = sc.parallelize([1, 2, 3], numSlices=2)
def f(iterator):
    """Reduce one partition to a single value: the sum of its elements.

    iterator -- iterable over the elements of one partition; consumed by sum().
    Yields exactly one item (0 for an empty partition).
    """
    partition_total = sum(iterator)
    yield partition_total
# mapPartitions hands f a whole partition at a time, so y ends up with one sum per partition.
# glom() wraps each partition in a list, which makes the partition layout visible.
y = x.mapPartitions(f, preservesPartitioning=False)
print(x.glom().collect())
print(y.glom().collect())

[[1], [2, 3]]
[[1], [5]]


In [13]:
# Spread the three elements over two partitions.
x = sc.parallelize([1, 2, 3], numSlices=2)
def f(paritionIndex, iterator):
    """Emit a single (index, total) pair for one partition.

    paritionIndex -- value passed as the first argument; returned unchanged in the pair.
    iterator      -- iterable over the elements of one partition; consumed by sum().
    """
    partition_total = sum(iterator)
    yield (paritionIndex, partition_total)
# mapPartitionsWithIndex also passes the partition number to f, so each partition
# contributes one (index, sum) pair. glom() shows the per-partition layout.
y = x.mapPartitionsWithIndex(f, preservesPartitioning=False)
print(x.glom().collect())
print(y.glom().collect())

[[1], [2, 3]]
[[(0, 1)], [(1, 5)]]


In [14]:
# getNumPartitions: y is a plain integer (the partition count), not an RDD.
x = sc.parallelize([1, 2, 3], numSlices=2)
y = x.getNumPartitions()
print(x.glom().collect())
print(y)

[[1], [2, 3]]
2


In [15]:
# filter: keep only the elements for which the predicate is true (here, the odd values).
x = sc.parallelize([1, 2, 3], numSlices=2)
y = x.filter(lambda n: n % 2 == 1)
print(x.collect())
print(y.collect())

[1, 2, 3]
[1, 3]


In [16]:
# distinct: drop repeated elements so each value appears once.
x = sc.parallelize(['A', 'A', 'B'])
y = x.distinct()
print(x.collect())
print(y.collect())

['A', 'A', 'B']
['A', 'B']


In [20]:
# sample: draw a random subset of x without replacement.
# No seed is passed, so the five sampled RDDs differ from one another, and `fraction`
# is not an exact size -- the printed samples below vary in length.
x = sc.parallelize(range(7))
ylist = [x.sample(withReplacement=False, fraction=0.5) for i in range(5)]
print('x = ' + str(x.collect()))
# enumerate supplies the running sample number directly.
for cnt, y in enumerate(ylist):
    print('sample:' + str(cnt) + 'y =' + str(y.collect()))

x = [0, 1, 2, 3, 4, 5, 6]
sample:0y =[0, 1, 2, 4]
sample:1y =[1, 3, 4, 5]
sample:2y =[0, 3, 5]
sample:3y =[1, 5]
sample:4y =[0, 1, 3]


In [26]:
# A single unseeded sample of the same seven-element RDD.
x = sc.parallelize(range(7))
print(x.collect())
print(x.sample(withReplacement=False, fraction=0.5).collect())

[0, 1, 2, 3, 4, 5, 6]
[1, 2, 4]


In [28]:
# Collect the sampled RDDs built in the earlier `ylist` cell a second time.
# The elements printed match the first collection of the same RDDs.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
for y in ylist:
    print(y.collect())

[0, 1, 2, 4]
[1, 3, 4, 5]
[0, 3, 5]
[1, 5]
[0, 1, 3]


In [29]:
# union: concatenate the two RDDs; elements present in both appear twice in the result.
x = sc.parallelize(['A', 'B', 'C'])
y = sc.parallelize(['D', 'C', 'A'])
z = x.union(y)
print(x.collect())
print(y.collect())
print(z.collect())

['A', 'B', 'C']
['D', 'C', 'A']
['A', 'B', 'C', 'D', 'C', 'A']


In [30]:
# intersection: keep only the elements that occur in both RDDs.
x = sc.parallelize(['A', 'B', 'C'])
y = sc.parallelize(['D', 'C', 'A'])
z = x.intersection(y)
print(x.collect())
print(y.collect())
print(z.collect())

['A', 'B', 'C']
['D', 'C', 'A']
['A', 'C']


In [32]:
# sortByKey: order the (key, value) pairs by key, ascending.
x = sc.parallelize([('B', 1), ('A', 2), ('C', 3)])
y = x.sortByKey(ascending=True)
print(x.collect())
print(y.collect())

[('B', 1), ('A', 2), ('C', 3)]
[('A', 2), ('B', 1), ('C', 3)]
