# Working with key/value pair RDDs

In [1]:
# Location of the input data set (judging by the file name, a gzipped 10%
# sample of the KDD Cup data). This is an absolute file:// path on the
# author's machine, so adjust it for your own environment.
data_file = "file:///home/lygbug666/workdir/spark-py-notebooks/kddcup.data_10_percent.gz"
# Load the file as an RDD of text lines.
# NOTE(review): `sc` is presumably the SparkContext supplied by the notebook
# session; it is not created anywhere in this notebook.
raw_data = sc.textFile(data_file)

## Creating a pair RDD for interaction types

In [2]:
# Turn every comma-separated text line into a list of string fields.
csv_data = raw_data.map(lambda line: line.split(","))
# Build (tag, record) pairs: field 41 holds the network interaction tag and
# becomes the key, while the complete field list is kept as the value.
key_value_data = csv_data.map(lambda fields: (fields[41], fields))

In [3]:
# Peek at the first (tag, fields) pair to check the shape of the pair RDD.
key_value_data.take(1)

[('normal.',
  ['0',
   'tcp',
   'http',
   'SF',
   '181',
   '5450',
   '0',
   '0',
   '0',
   '0',
   '0',
   '1',
   '0',
   '0',
   '0',
   '0',
   '0',
   '0',
   '0',
   '0',
   '0',
   '0',
   '8',
   '8',
   '0.00',
   '0.00',
   '0.00',
   '0.00',
   '1.00',
   '0.00',
   '0.00',
   '9',
   '9',
   '1.00',
   '0.00',
   '0.11',
   '0.00',
   '0.00',
   '0.00',
   '0.00',
   '0.00',
   'normal.'])]

## Data aggregations with key/value pair RDDs

In [12]:
# Pair each record's tag (field 41) with field 0 parsed as a float; going by
# the variable names, field 0 is the duration of the interaction.
key_value_duration = csv_data.map(lambda fields: (fields[41], float(fields[0])))
# Sum the values of all records that share the same tag.
duration_by_key = key_value_duration.reduceByKey(lambda total, duration: total + duration)
# Fetch the per-tag totals as a list of (tag, total) pairs.
duration_by_key.collect()

[('normal.', 21075991.0),
 ('buffer_overflow.', 2751.0),
 ('loadmodule.', 326.0),
 ('perl.', 124.0),
 ('neptune.', 0.0),
 ('smurf.', 0.0),
 ('guess_passwd.', 144.0),
 ('pod.', 0.0),
 ('teardrop.', 0.0),
 ('portsweep.', 1991911.0),
 ('ipsweep.', 43.0),
 ('land.', 0.0),
 ('ftp_write.', 259.0),
 ('back.', 284.0),
 ('imap.', 72.0),
 ('satan.', 64.0),
 ('phf.', 18.0),
 ('nmap.', 0.0),
 ('multihop.', 1288.0),
 ('warezmaster.', 301.0),
 ('warezclient.', 627563.0),
 ('spy.', 636.0),
 ('rootkit.', 1008.0)]

In [13]:
# Count records per tag: emit (tag, 1) for every record, then add the ones up.
kind_by_key = (
    csv_data
    .map(lambda fields: (fields[41], 1))
    .reduceByKey(lambda left, right: left + right)
)
# Fetch the per-tag counts as a list of (tag, count) pairs.
kind_by_key.collect()

[('normal.', 97278),
 ('buffer_overflow.', 30),
 ('loadmodule.', 9),
 ('perl.', 3),
 ('neptune.', 107201),
 ('smurf.', 280790),
 ('guess_passwd.', 53),
 ('pod.', 264),
 ('teardrop.', 979),
 ('portsweep.', 1040),
 ('ipsweep.', 1247),
 ('land.', 21),
 ('ftp_write.', 8),
 ('back.', 2203),
 ('imap.', 12),
 ('satan.', 1589),
 ('phf.', 4),
 ('nmap.', 231),
 ('multihop.', 7),
 ('warezmaster.', 20),
 ('warezclient.', 1020),
 ('spy.', 2),
 ('rootkit.', 10)]

In [10]:
# countByKey() already returns a local collections.defaultdict mapping each
# tag to its number of records (not an RDD), so no collect() is needed.
counts_by_key = key_value_data.countByKey()
print(counts_by_key)

defaultdict(<class 'int'>, {'normal.': 97278, 'buffer_overflow.': 30, 'loadmodule.': 9, 'perl.': 3, 'neptune.': 107201, 'smurf.': 280790, 'guess_passwd.': 53, 'pod.': 264, 'teardrop.': 979, 'portsweep.': 1040, 'ipsweep.': 1247, 'land.': 21, 'ftp_write.': 8, 'back.': 2203, 'imap.': 12, 'satan.': 1589, 'phf.': 4, 'nmap.': 231, 'multihop.': 7, 'warezmaster.': 20, 'warezclient.': 1020, 'spy.': 2, 'rootkit.': 10})


## Using combineByKey

In [33]:
# Build, for every tag, a (sum of values, number of values) accumulator.
sum_counts = key_value_duration.combineByKey(
    # Create an accumulator from the first value: that value with a count of 1.
    lambda first: (first, 1),
    # Fold one more value into an accumulator: add it and bump the count.
    lambda acc, duration: (acc[0] + duration, acc[1] + 1),
    # Merge two accumulators of the same key component-wise.
    lambda left, right: (left[0] + right[0], left[1] + right[1]),
)
# Print the result as a list of (tag, (sum, count)) pairs ...
print(sum_counts.collect())
# ... and show it again as a dict keyed by tag (the cell's displayed value).
sum_counts.collectAsMap()



[('normal.', (21075991.0, 97278)), ('buffer_overflow.', (2751.0, 30)), ('loadmodule.', (326.0, 9)), ('perl.', (124.0, 3)), ('neptune.', (0.0, 107201)), ('smurf.', (0.0, 280790)), ('guess_passwd.', (144.0, 53)), ('pod.', (0.0, 264)), ('teardrop.', (0.0, 979)), ('portsweep.', (1991911.0, 1040)), ('ipsweep.', (43.0, 1247)), ('land.', (0.0, 21)), ('ftp_write.', (259.0, 8)), ('back.', (284.0, 2203)), ('imap.', (72.0, 12)), ('satan.', (64.0, 1589)), ('phf.', (18.0, 4)), ('nmap.', (0.0, 231)), ('multihop.', (1288.0, 7)), ('warezmaster.', (301.0, 20)), ('warezclient.', (627563.0, 1020)), ('spy.', (636.0, 2)), ('rootkit.', (1008.0, 10))]


{'back.': (284.0, 2203),
 'buffer_overflow.': (2751.0, 30),
 'ftp_write.': (259.0, 8),
 'guess_passwd.': (144.0, 53),
 'imap.': (72.0, 12),
 'ipsweep.': (43.0, 1247),
 'land.': (0.0, 21),
 'loadmodule.': (326.0, 9),
 'multihop.': (1288.0, 7),
 'neptune.': (0.0, 107201),
 'nmap.': (0.0, 231),
 'normal.': (21075991.0, 97278),
 'perl.': (124.0, 3),
 'phf.': (18.0, 4),
 'pod.': (0.0, 264),
 'portsweep.': (1991911.0, 1040),
 'rootkit.': (1008.0, 10),
 'satan.': (64.0, 1589),
 'smurf.': (0.0, 280790),
 'spy.': (636.0, 2),
 'teardrop.': (0.0, 979),
 'warezclient.': (627563.0, 1020),
 'warezmaster.': (301.0, 20)}

In [22]:
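# NOTE: RDD.map calls the function with ONE argument per element (here a (key, value) tuple),
# so the lambda takes a single parameter and indexes into it instead of declaring `key, value`.
# duration_means_by_type = sum_counts.map(lambda kv: (kv[0], round(kv[1][0]/kv[1][1],3))).collectAsMap()
# Print them sorted
# for tag in sorted(duration_means_by_type, key=duration_means_by_type.get, reverse=True):
#     print (tag, duration_means_by_type[tag])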


In [1]:
# Compute the mean value per tag from the (sum, count) accumulators.
# RDD.map hands each element to the function as a single argument -- here a
# (tag, (sum, count)) tuple -- and Python 3 has no tuple-parameter unpacking,
# so the lambda must take ONE parameter and index into it. A two-parameter
# lambda fails with "missing 1 required positional argument: 'value'".
duration_means_by_type = sum_counts.map(
    lambda pair: (pair[0], round(pair[1][0] / pair[1][1], 3))
).collect()

# List of (tag, mean rounded to 3 decimals) pairs.
print(duration_means_by_type)

NameError: name 'sum_counts' is not defined

In [None]:
# mapValues passes only the (sum, count) value to the function and leaves the
# key as it is, so the mean is sum / count rounded to 3 decimals.
duration_means_by_type = sum_counts.mapValues(
    lambda sum_and_count: round(sum_and_count[0] / sum_and_count[1], 3)
).collect()


In [None]:
# TypeError: <lambda>() missing 1 required positional argument: 'value'
# Cause: map() passes each element as a single tuple, e.g. ('normal.', (21075991.0, 97278)),
# not as separate `key` and `value` arguments.