In [1]:
from pyspark import SparkConf, SparkContext

In [2]:
# Configure a Spark application named 'My App' with master 'local', then
# obtain the SparkContext used by every later cell.
conf = SparkConf()
conf.setAppName('My App')
conf.setMaster('local')
sc = SparkContext.getOrCreate(conf=conf)

## map算子

In [3]:
# map: apply a function to every element (here: add one to each number).
rdd1 = (
    sc.parallelize([1, 2, 3, 4, 5])
    .map(lambda value: value + 1)
)
print(rdd1.collect())

[2, 3, 4, 5, 6]


In [4]:
# Pair every animal name with 1, then sum the ones per name to count occurrences.
rdd2 = (
    sc.parallelize(['dog', 'tiger', 'cat', 'tiger', 'tiger', 'cat'])
    .map(lambda animal: (animal, 1))
    .reduceByKey(lambda left, right: left + right)
)
print(rdd2.collect())

[('dog', 1), ('tiger', 3), ('cat', 2)]


## flatMap算子

In [5]:
# flatMap: split each sentence on single spaces and flatten the pieces into
# one RDD of words.
print(
    sc.parallelize(['hello world', 'hello python', 'hello spark'])
    .flatMap(lambda sentence: sentence.split(' '))
    .collect()
)

['hello', 'world', 'hello', 'python', 'hello', 'spark']


In [6]:
# Split sentences into words, then turn every word into a (word, 1) pair.
print(
    sc.parallelize(['hello world', 'hello python', 'hello spark'])
    .flatMap(lambda sentence: sentence.split(' '))
    .map(lambda word: (word, 1))
    .collect()
)

[('hello', 1), ('world', 1), ('hello', 1), ('python', 1), ('hello', 1), ('spark', 1)]


In [7]:
# Word count: words -> (word, 1) pairs -> per-word sum of the ones.
print(
    sc.parallelize(['hello world', 'hello python', 'hello spark'])
    .flatMap(lambda sentence: sentence.split(' '))
    .map(lambda word: (word, 1))
    .reduceByKey(lambda left, right: left + right)
    .collect()
)

[('hello', 3), ('world', 1), ('python', 1), ('spark', 1)]


## filter算子

In [8]:
# filter: keep only the elements for which the predicate holds (values above 4).
rdd = (
    sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 9])
    .filter(lambda value: value > 4)
)
print(rdd.collect())

[5, 6, 7, 8, 9]


## groupByKey算子

In [9]:
# groupByKey: gather all values that share a key. The grouped values are
# materialised into a list so they print readably, then each (word, values)
# pair is shown as a one-entry dict.
print(
    sc.parallelize(['hello world', 'hello python', 'hello spark'])
    .flatMap(lambda sentence: sentence.split(' '))
    .map(lambda word: (word, 1))
    .groupByKey()
    .mapValues(list)
    .map(lambda pair: {pair[0]: pair[1]})
    .collect()
)

[{'hello': [1, 1, 1]}, {'world': [1]}, {'python': [1]}, {'spark': [1]}]


## join算子

In [10]:
# Two keyed RDDs with partly overlapping keys: 'c' exists only in a (twice),
# 'e' exists only in b.
a = sc.parallelize([('a', 1), ('b', 2), ('c', 3), ('c', 4)])
b = sc.parallelize([('a', 2), ('b', 3), ('e', 5)])

# Show the result of each join flavour next to a fixed-width label.
for label, joined in [
    ('a join b:           ', a.join(b)),
    ('a fullOuterJoin b:  ', a.fullOuterJoin(b)),
    ('a leftOuterJoin b:  ', a.leftOuterJoin(b)),
    ('a rightOuterJoin b: ', a.rightOuterJoin(b)),
]:
    print(label, joined.collect())

a join b:            [('b', (2, 3)), ('a', (1, 2))]
a fullOuterJoin b:   [('b', (2, 3)), ('c', (3, None)), ('c', (4, None)), ('a', (1, 2)), ('e', (None, 5))]
a leftOuterJoin b:   [('b', (2, 3)), ('c', (3, None)), ('c', (4, None)), ('a', (1, 2))]
a rightOuterJoin b:  [('b', (2, 3)), ('a', (1, 2)), ('e', (None, 5))]


## sortByKey算子

In [11]:
# Count the words, then swap each pair to (count, word) so that the count
# becomes the key. No sorting happens in this cell.
print(
    sc.parallelize(['hello world', 'hello python', 'hello spark'])
    .flatMap(lambda sentence: sentence.split(' '))
    .map(lambda word: (word, 1))
    .reduceByKey(lambda left, right: left + right)
    .map(lambda pair: (pair[1], pair[0]))
    .collect()
)

[(3, 'hello'), (1, 'world'), (1, 'python'), (1, 'spark')]


In [12]:
# Sort words by frequency, highest first: make the count the key, sort by key
# in descending order, then put the pairs back into (word, count) form.
print(
    sc.parallelize(['hello world', 'hello python', 'hello spark'])
    .flatMap(lambda sentence: sentence.split(' '))
    .map(lambda word: (word, 1))
    .reduceByKey(lambda left, right: left + right)
    .map(lambda pair: (pair[1], pair[0]))
    .sortByKey(ascending=False)
    .map(lambda pair: (pair[1], pair[0]))
    .collect()
)

[('hello', 3), ('world', 1), ('python', 1), ('spark', 1)]


In [13]:
# Same pipeline as a frequency sort, with each swap step annotated.
print(
    sc.parallelize(['hello world', 'hello python', 'hello spark'])
    .flatMap(lambda sentence: sentence.split(' '))
    .map(lambda word: (word, 1))
    .reduceByKey(lambda left, right: left + right)
    # swap positions: (word, count) -> (count, word)
    .map(lambda pair: (pair[1], pair[0]))
    .sortByKey(ascending=False)
    # swap again, i.e. restore (word, count)
    .map(lambda pair: (pair[1], pair[0]))
    .collect()
)

[('hello', 3), ('world', 1), ('python', 1), ('spark', 1)]


## union & distinct算子

In [14]:
# union: concatenate two RDDs (the printed result below still contains duplicates).
print(
    sc.parallelize([1, 1, 2, 3])
    .union(sc.parallelize([2, 3, 3, 4]))
    .collect()
)

[1, 1, 2, 3, 2, 3, 3, 4]


In [15]:
# distinct: drop repeated elements.
print(
    sc.parallelize([1, 1, 2, 3])
    .distinct()
    .collect()
)

[1, 2, 3]


## WordCount

In [16]:
# Word count over the text read from the 'data/' path.
data = sc.textFile('data/')

# str.split() with no argument splits on any run of whitespace and never
# yields empty tokens, so blank lines, tabs and repeated spaces do not add a
# spurious '' "word" to the counts the way split(' ') would.
output = (
    data
    .flatMap(lambda line: line.split())
    .map(lambda word: (word, 1))
    .reduceByKey(lambda left, right: left + right)
    # each element is a (word, count) pair; order by count, largest first
    .sortBy(lambda pair: pair[1], ascending=False)
    .collect()
)

for word, count in output:
    print('{} \t {}'.format(word, count))

5 	 13
1 	 10
4 	 8
2 	 5
3 	 4
2005-04-06 	 3
2005-05-13 	 2
2005-12-26 	 2
2004-05-03 	 2
50272 	 2
2004-05-05 	 2
1488844 	 1
2005-09-06 	 1
822109 	 1
885013 	 1
2005-10-19 	 1
30878 	 1
823519 	 1
1422244 	 1
222104 	 1
225013 	 1
2005-10-14 	 1
30272 	 1
223514 	 1
5422244 	 1
222504 	 1
2005-05-55 	 1
225055 	 1
2005-50-54 	 1
2005-52-26 	 1
225554 	 1
2422244 	 1
222204 	 1
2005-05-25 	 1
225025 	 1
2005-20-24 	 1
2005-22-26 	 1
225524 	 1


In [17]:
# Read one text file as an RDD of lines; the trailing expression displays
# every line as the cell output.
data = sc.textFile(name='data/file.txt')
data.collect()

['1 1488844 3 2005-09-06',
 '1 822109 5 2005-05-13',
 '1 885013 4 2005-10-19',
 '1 30878 4 2005-12-26',
 '1 823519 3 2004-05-03',
 '1 1422244 3 2005-04-06',
 '1 222104 5 2005-05-13',
 '1 225013 4 2005-10-14',
 '1 30272 4 2005-12-26',
 '1 223514 3 2004-05-03',
 '5 5422244 5 2005-04-06',
 '5 222504 5 2005-05-55',
 '5 225055 4 2005-50-54',
 '5 50272 4 2005-52-26',
 '5 225554 5 2004-05-05',
 '2 2422244 5 2005-04-06',
 '2 222204 5 2005-05-25',
 '2 225025 4 2005-20-24',
 '2 50272 4 2005-22-26',
 '2 225524 5 2004-05-05']

In [18]:
# Split every line on single spaces, giving one list of fields per line.
(
    data
    .map(lambda row: row.split(' '))
    .collect()
)

[['1', '1488844', '3', '2005-09-06'],
 ['1', '822109', '5', '2005-05-13'],
 ['1', '885013', '4', '2005-10-19'],
 ['1', '30878', '4', '2005-12-26'],
 ['1', '823519', '3', '2004-05-03'],
 ['1', '1422244', '3', '2005-04-06'],
 ['1', '222104', '5', '2005-05-13'],
 ['1', '225013', '4', '2005-10-14'],
 ['1', '30272', '4', '2005-12-26'],
 ['1', '223514', '3', '2004-05-03'],
 ['5', '5422244', '5', '2005-04-06'],
 ['5', '222504', '5', '2005-05-55'],
 ['5', '225055', '4', '2005-50-54'],
 ['5', '50272', '4', '2005-52-26'],
 ['5', '225554', '5', '2004-05-05'],
 ['2', '2422244', '5', '2005-04-06'],
 ['2', '222204', '5', '2005-05-25'],
 ['2', '225025', '4', '2005-20-24'],
 ['2', '50272', '4', '2005-22-26'],
 ['2', '225524', '5', '2004-05-05']]

In [19]:
data = sc.textFile('data/file.txt')
# Build a (sum, count) accumulator per key from columns 1 and 3 of each line.
print(
    data
    .map(lambda row: row.split(' '))
    # keep only columns 1 and 3: (key, integer value)
    .map(lambda fields: (fields[0], int(fields[2])))
    .combineByKey(
        # first value seen for a key -> (sum, count)
        lambda value: (value, 1),
        # fold one more value into an existing (sum, count)
        lambda acc, value: (acc[0] + value, acc[1] + 1),
        # merge two (sum, count) accumulators
        lambda first, second: (first[0] + second[0], first[1] + second[1]),
    )
    .collect()
)

[('1', (38, 10)), ('5', (23, 5)), ('2', (23, 5))]


In [20]:
data = sc.textFile('data/file.txt')
# Show what a plain map produces here: each (key, value) pair is wrapped as
# ((key, value), 1).
print(
    data
    .map(lambda row: row.split(' '))
    # keep only columns 1 and 3: (key, integer value)
    .map(lambda fields: (fields[0], int(fields[2])))
    .map(lambda pair: (pair, 1))
    .collect()
)

[(('1', 3), 1), (('1', 5), 1), (('1', 4), 1), (('1', 4), 1), (('1', 3), 1), (('1', 3), 1), (('1', 5), 1), (('1', 4), 1), (('1', 4), 1), (('1', 3), 1), (('5', 5), 1), (('5', 5), 1), (('5', 4), 1), (('5', 4), 1), (('5', 5), 1), (('2', 5), 1), (('2', 5), 1), (('2', 4), 1), (('2', 4), 1), (('2', 5), 1)]


In [21]:
data = sc.textFile('data/file.txt')
# Average of column 3 per key in column 1, returned as a dict.
print(
    data
    .map(lambda row: row.split(' '))
    # keep only columns 1 and 3: (key, integer value)
    .map(lambda fields: (fields[0], int(fields[2])))
    .combineByKey(
        # first value seen for a key -> (sum, count)
        lambda value: (value, 1),
        # fold one more value into an existing (sum, count)
        lambda acc, value: (acc[0] + value, acc[1] + 1),
        # merge two (sum, count) accumulators
        lambda first, second: (first[0] + second[0], first[1] + second[1]),
    )
    # (sum, count) -> mean, key left untouched
    .mapValues(lambda acc: acc[0] / acc[1])
    .collectAsMap()
)

{'1': 3.8, '5': 4.6, '2': 4.6}


In [22]:
data = sc.textFile('data/file.txt')
# Average of column 3 per key in column 1, keeping only keys whose average
# is strictly greater than 4.
print(
    data
    .map(lambda row: row.split(' '))
    # keep only columns 1 and 3: (key, integer value)
    .map(lambda fields: (fields[0], int(fields[2])))
    .combineByKey(
        # first value seen for a key -> (sum, count)
        lambda value: (value, 1),
        # fold one more value into an existing (sum, count)
        lambda acc, value: (acc[0] + value, acc[1] + 1),
        # merge two (sum, count) accumulators
        lambda first, second: (first[0] + second[0], first[1] + second[1]),
    )
    # (key, (sum, count)) -> (key, mean)
    .map(lambda entry: (entry[0], entry[1][0] / entry[1][1]))
    .filter(lambda entry: entry[1] > 4)
    .collectAsMap()
)

{'5': 4.6, '2': 4.6}
