In [1]:
# Echo `sc` to confirm the object exists before it is used to load data.
# NOTE(review): presumably the SparkContext injected by the PySpark kernel — confirm.
sc

# Getting the data and creating the RDD

In [1]:
import os

# URI of the gzipped input file (kddcup.data_10_percent.gz).
# The original machine-specific path stays as the default so existing setups
# keep working; set the KDDCUP_DATA_FILE environment variable to point at the
# file elsewhere instead of editing the notebook.
data_file = os.environ.get(
    "KDDCUP_DATA_FILE",
    "file:///home/lygbug666/workdir/spark-py-notebooks/kddcup.data_10_percent.gz",
)

In [2]:
# Build an RDD from the text file at `data_file`; every later cell derives
# its data from `raw_data`.
raw_data = sc.textFile(data_file)

# The filter transformation

In [6]:
# Total number of elements in `raw_data` (displayed as the cell output).
raw_data.count()

494021

In [7]:
# Keep only the lines that contain the text 'normal.'.
# This is a substring test on the whole raw line, not a comparison
# against one specific field.
normal_raw_data = raw_data.filter(lambda line: 'normal.' in line)

In [8]:
from time import time

# Time the count() action on the filtered RDD and report both the
# number of matching lines and the wall-clock duration.
t0 = time()
normal_count = normal_raw_data.count()
tt = time() - t0

print("There are {} 'normal' interactions".format(normal_count))
print("Count completed in {} seconds".format(round(tt, 3)))

There are 97278 'normal' interactions
Count completed in 1.605 seconds


In [9]:
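# Lazy computation: the filter() above does no work by itself; it is only
# evaluated when an action such as count() asks for a result.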

# The map transformation

In [10]:
from pprint import pprint

# Split every raw line on commas so each element becomes a list of fields.
csv_data = raw_data.map(lambda line: line.split(","))

# Time how long it takes to fetch the first five parsed rows.
t0 = time()
head_rows = csv_data.take(5)
tt = time() - t0

print("Parse completed in {} seconds".format(round(tt, 5)))
pprint(head_rows[0])

Parse completed in 0.09266 seconds
['0',
 'tcp',
 'http',
 'SF',
 '181',
 '5450',
 '0',
 '0',
 '0',
 '0',
 '0',
 '1',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '8',
 '8',
 '0.00',
 '0.00',
 '0.00',
 '0.00',
 '1.00',
 '0.00',
 '0.00',
 '9',
 '9',
 '1.00',
 '0.00',
 '0.11',
 '0.00',
 '0.00',
 '0.00',
 '0.00',
 '0.00',
 'normal.']


In [11]:
# round(tt, 5) rounds the elapsed time to 5 decimal places

In [12]:
# Repeat the timing, this time fetching 100000 parsed rows instead of 5.
t0 = time()
head_rows = csv_data.take(100000)
tt = time() - t0

print("Parse completed in {} seconds".format(round(tt, 3)))

Parse completed in 3.155 seconds


# Using map and predefined functions

In [15]:
def parse_interaction(line):
    """Split a comma-separated record and pair it with its tag field.

    Args:
        line: One comma-separated text record.

    Returns:
        A ``(tag, elems)`` tuple, where ``elems`` is the list of fields
        produced by splitting ``line`` on commas and ``tag`` is the field
        at index 41 (the 42nd field, e.g. ``'normal.'``).

    Raises:
        IndexError: If the record has fewer than 42 fields.
    """
    fields = line.split(",")
    # Index 41 is the 42nd column; it is used as the key of the pair.
    return (fields[41], fields)

# Turn every raw line into a (tag, fields) pair using the predefined function,
# then show the first two pairs out of a sample of five.
key_csv_data = raw_data.map(parse_interaction)
head_rows = key_csv_data.take(5)

print(head_rows[0], head_rows[1])

('normal.', ['0', 'tcp', 'http', 'SF', '181', '5450', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '8', '8', '0.00', '0.00', '0.00', '0.00', '1.00', '0.00', '0.00', '9', '9', '1.00', '0.00', '0.11', '0.00', '0.00', '0.00', '0.00', '0.00', 'normal.']) ('normal.', ['0', 'tcp', 'http', 'SF', '239', '486', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '8', '8', '0.00', '0.00', '0.00', '0.00', '1.00', '0.00', '0.00', '19', '19', '1.00', '0.00', '0.05', '0.00', '0.00', '0.00', '0.00', '0.00', 'normal.'])


# The collect action


In [16]:
# Time how long collect() takes to return every element of `raw_data`
# as a local Python list.
t0 = time()
all_raw_data = raw_data.collect()
tt = time() - t0

print("Data collected in {} seconds".format(round(tt, 3)))

Data collected in 5.639 seconds


In [19]:
# Parse each raw line into a (tag, fields) key-value pair.
key_csv_data = raw_data.map(parse_interaction)

# Keep only the pairs whose key is "normal.".
# parse_interaction sets the key to fields[41], so testing the key is
# the same as testing x[1][41] == "normal." on the value.
normal_key_interactions = key_csv_data.filter(lambda pair: pair[0] == "normal.")

# Collect every matching pair and time the operation.
t0 = time()
all_normal = normal_key_interactions.collect()
tt = time() - t0

normal_count = len(all_normal)
print("Data collected in {} seconds".format(round(tt, 3)))
print("There are {} 'normal' interactions".format(normal_count))

Data collected in 5.369 seconds
There are 97278 'normal' interactions
