# Spark extras

## Generating Big Data

In [1]:
# import libraries
import csv, time
import numpy as np
import pandas as pd
from pyspark import StorageLevel

In [2]:
# Read mock data
mock_data = []
with open("data/MOCK_DATA.csv") as mock_file:
    mock_data = mock_file.readlines()

# randomize index
num_range = len(mock_data) - 1
sample_items = 100
num_iterations = 100000

# generate Big Data
with open("data/big_data.csv", "w") as big_data:
    # write header
    writer = csv.writer(big_data)
    writer.writerow(mock_data[0].strip().split(","))
    
    # write lines by random sampling 100 items over 1000 lines
    next_line = 1
    for i in range(0, num_iterations):
        lines = []
        choices = np.random.choice(num_range, sample_items)
        for idx in choices:
            line = mock_data[idx].strip().split(",")
            line[0] = next_line
            lines.append(line)
            next_line += 1

        writer.writerows(lines)

print "Done generating."        

Done generating.


In [3]:
!ls data/

MOCK_DATA.csv              sample_kmeans_data.txt
big_data.csv               sample_lda_libsvm_data.txt


In [4]:
!du -hs data/big_data.csv

638M	data/big_data.csv


## Store as Parquet format

In [5]:
df = sqlContext.read \
    .format("com.databricks.spark.csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load("data/big_data.csv")

df.write.parquet("data/big_data_parquet", mode="overwrite")
print "Store parquet completed."

Store parquet completed.


In [6]:
!du -hs data/big_data_parquet

 89M	data/big_data_parquet


# Self implement vs Pandas

In [7]:
start = time.time()
with open("data/big_data.csv", "r") as big_data:
    # read csv file
    reader = csv.reader(big_data)
    header = reader.next()
    num_row = 0
    for row in reader:
        num_row += 1

print "Numer of rows:", num_row
done = time.time()
elapsed = done - start
print "Total running time:", elapsed, "seconds"

Numer of rows: 10000000
Total running time: 11.6682560444 seconds


In [8]:
start = time.time()

pd_big_data = pd.read_csv("data/big_data.csv")
pd_big_data["id"].count()

done = time.time()
elapsed = done - start
print "Total running time:", elapsed, "seconds"

Total running time: 9.40140104294 seconds


# RDD vs Dataframe

## Loading

In [9]:
# loading csv to RDD
lines = sc.textFile("data/big_data.csv")
parts = lines.map(lambda l: l.split(","))
rdd_people = parts.map(lambda p: (p[0], p[1], p[2], p[3], p[4], p[5].strip()))
rdd_people.count()

10000001

In [10]:
# loading csv to Dataframe
df_people = sqlContext.read \
    .format("com.databricks.spark.csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load("data/big_data.csv")
df_people.count()

10000000

In [11]:
# loading parquet to Dataframe
df_people = sqlContext.read.parquet("data/big_data_parquet")
df_people.count()

10000000

## Caching
- DISK_ONLY = StorageLevel(True, False, False, False, 1)
- DISK_ONLY_2 = StorageLevel(True, False, False, False, 2)
- MEMORY_AND_DISK = StorageLevel(True, True, False, False, 1)
- MEMORY_AND_DISK_2 = StorageLevel(True, True, False, False, 2)
- MEMORY_AND_DISK_SER = StorageLevel(True, True, False, False, 1)
- MEMORY_AND_DISK_SER_2 = StorageLevel(True, True, False, False, 2)
- MEMORY_ONLY = StorageLevel(False, True, False, False, 1)
- MEMORY_ONLY_2 = StorageLevel(False, True, False, False, 2)
- MEMORY_ONLY_SER = StorageLevel(False, True, False, False, 1)
- MEMORY_ONLY_SER_2 = StorageLevel(False, True, False, False, 2)
- OFF_HEAP = StorageLevel(True, True, True, False, 1)

In [12]:
# RDD cache
rdd_people.cache()
rdd_people.count()

10000001

In [13]:
# disk only
rdd_people.unpersist()
rdd_people.persist(storageLevel=StorageLevel(True, False, False, False, 1))
rdd_people.count()

10000001

In [14]:
rdd_people.count()

10000001

In [15]:
# Dataframe cache
df_people.cache()
df_people.count()

10000000

In [16]:
df_people.count()

10000000

In [17]:
# unpersist all
rdd_people.unpersist()
df_people.unpersist()

DataFrame[id: int, first_name: string, last_name: string, email: string, gender: string, ip_address: string]

## Processing

In [18]:
# RDD cache
rdd_people.cache()
rdd_people.count()

10000001

In [19]:
# Dataframe cache
df_people.cache()
df_people.count()

10000000

In [20]:
rdd_people.distinct().count()

10000001

In [21]:
df_people.distinct().count()

10000000

In [22]:
rdd_people.filter(lambda x: x[4] == "Male").count()

5160545

In [23]:
df_people.filter(df_people["gender"] == "Male").count()

5160545

## SQL vs Built-in functions

In [24]:
df_people.filter(df_people["gender"] == "Male")\
        .select(df_people["first_name"], df_people["email"]\
        .alias("mail")).show(5)

+----------+--------------------+
|first_name|                mail|
+----------+--------------------+
|    Linoel|    lcobden7g@hp.com|
|    Ripley|rchiplenhp@barnes...|
|    Rooney|rchesworth7e@utex...|
|    Lovell|lfellgatep8@umich...|
|   Cordell|cduplain4a@artist...|
+----------+--------------------+
only showing top 5 rows



In [25]:
sqlContext.registerDataFrameAsTable(df_people, "tbl_people")
sqlContext.sql("""
    SELECT first_name, email AS mail
    FROM tbl_people
    WHERE gender = 'Male'
""").show(5)

+----------+--------------------+
|first_name|                mail|
+----------+--------------------+
|    Linoel|    lcobden7g@hp.com|
|    Ripley|rchiplenhp@barnes...|
|    Rooney|rchesworth7e@utex...|
|    Lovell|lfellgatep8@umich...|
|   Cordell|cduplain4a@artist...|
+----------+--------------------+
only showing top 5 rows



# Machine Learning

## k-Means

In [26]:
from pyspark.ml.clustering import KMeans

# Loads data.
dataset = spark.read.format("libsvm").load("data/sample_kmeans_data.txt")
print dataset.take(2)

# Trains a k-means model.
kmeans = KMeans().setK(2).setSeed(1)
model = kmeans.fit(dataset)

# Evaluate clustering by computing Within Set Sum of Squared Errors.
wssse = model.computeCost(dataset)
print("Within Set Sum of Squared Errors = " + str(wssse))

# Shows the result.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

[Row(label=0.0, features=SparseVector(3, {})), Row(label=1.0, features=SparseVector(3, {0: 0.1, 1: 0.1, 2: 0.1}))]
Within Set Sum of Squared Errors = 0.12
Cluster Centers: 
[ 0.1  0.1  0.1]
[ 9.1  9.1  9.1]


## LDA

In [27]:
from pyspark.ml.clustering import LDA

# Loads data.
dataset = spark.read.format("libsvm").load("data/sample_lda_libsvm_data.txt")
print dataset.take(1)

# Trains a LDA model.
lda = LDA(k=10, maxIter=10)
model = lda.fit(dataset)

ll = model.logLikelihood(dataset)
lp = model.logPerplexity(dataset)
print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
print("The upper bound on perplexity: " + str(lp))

# Describe topics.
topics = model.describeTopics(2)
print("The topics described by their top-weighted terms:")
topics.show(truncate=False)

# Shows the result
transformed = model.transform(dataset)
transformed.show(truncate=False)

[Row(label=0.0, features=SparseVector(11, {0: 1.0, 1: 2.0, 2: 6.0, 4: 2.0, 5: 3.0, 6: 1.0, 7: 1.0, 10: 3.0}))]
The lower bound on the log likelihood of the entire corpus: -807.523779034
The upper bound on perplexity: 3.1058607098
The topics described by their top-weighted terms:
+-----+-----------+------------------------------------------+
|topic|termIndices|termWeights                               |
+-----+-----------+------------------------------------------+
|0    |[4, 7]     |[0.10782279117289076, 0.09748059781126188]|
|1    |[1, 6]     |[0.16755680545542245, 0.14746675160950057]|
|2    |[1, 3]     |[0.10064404940528088, 0.10044227953257671]|
|3    |[1, 3]     |[0.10157580719995081, 0.0997449393879735] |
|4    |[9, 10]    |[0.10479880814180582, 0.10207371063193371]|
|5    |[8, 5]     |[0.10843493258130431, 0.09701505371078402]|
|6    |[8, 5]     |[0.09874157104646761, 0.09654281855423051]|
|7    |[9, 4]     |[0.1125248473532763, 0.09755082892584456] |
|8    |[5, 4]     |[0.15487

## TF-IDF

In [28]:
from pyspark.ml.clustering import LDA
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

sentenceData = spark.createDataFrame([
    (0.0, "Hi I heard about Spark"),
    (0.0, "I wish Java could use case classes"),
    (1.0, "Logistic regression models are neat")
], ["label", "sentence"])

tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(sentenceData)

hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)
# alternatively, CountVectorizer can also be used to get term frequency vectors

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
print rescaledData.take(1)

# Trains a LDA model.
lda = LDA(k=2, maxIter=10)
model = lda.fit(rescaledData)

ll = model.logLikelihood(rescaledData)
lp = model.logPerplexity(rescaledData)
print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
print("The upper bound on perplexity: " + str(lp))

# Describe topics.
topics = model.describeTopics(2)
print("The topics described by their top-weighted terms:")
topics.show(truncate=False)

# Shows the result
transformed = model.transform(rescaledData)
transformed.show(1)

[Row(label=0.0, sentence=u'Hi I heard about Spark', words=[u'hi', u'i', u'heard', u'about', u'spark'], rawFeatures=SparseVector(20, {0: 1.0, 5: 1.0, 9: 1.0, 17: 2.0}), features=SparseVector(20, {0: 0.6931, 5: 0.6931, 9: 0.2877, 17: 1.3863}))]
The lower bound on the log likelihood of the entire corpus: -37.6225456281
The upper bound on perplexity: 4.40556309703
The topics described by their top-weighted terms:
+-----+-----------+-------------------------------------------+
|topic|termIndices|termWeights                                |
+-----+-----------+-------------------------------------------+
|0    |[17, 15]   |[0.05823196670532641, 0.05751424724780337] |
|1    |[0, 3]     |[0.062056681029749566, 0.05575172335627538]|
+-----+-----------+-------------------------------------------+

+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|label|            sentence|               words|         rawFeatures|            feature

# Another LDA

In [29]:
from pyspark.sql import Row
from pyspark.ml.clustering import LDA
from pyspark.ml.linalg import SparseVector

ls_row = [Row(label=1.0, features=SparseVector(20, {1: 0.0, 2: 0.0, 3: 0.0, 4: 0.5754, 5: 0.5754, 6: 0.0, 7: 0.2877, 8: 0.2877, 9: 0.0, 10: 0.0, 11: 0.5754, 13: 0.0, 15: 0.0, 16: 0.0, 17: 0.0, 19: 0.0})),
 Row(label=1.0, features=SparseVector(20, {0: 1.1507, 1: 0.0, 2: 0.0, 3: 0.0, 5: 0.5754, 6: 0.0, 7: 0.5754, 8: 0.5754, 9: 0.0, 10: 0.0, 11: 0.2877, 12: 0.5754, 13: 0.0, 14: 0.5754, 15: 0.0, 16: 0.0, 17: 0.0, 18: 0.863, 19: 0.0})),
 Row(label=1.0, features=SparseVector(20, {0: 0.863, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.5754, 6: 0.0, 9: 0.0, 10: 0.0, 12: 0.2877, 13: 0.0, 14: 0.5754, 15: 0.0, 16: 0.0, 17: 0.0, 18: 0.863, 19: 0.0}))]

rescaledData = sc.parallelize(ls_row).toDF()

# Trains a LDA model.
lda = LDA(k=2, maxIter=10)
model = lda.fit(rescaledData)

ll = model.logLikelihood(rescaledData)
lp = model.logPerplexity(rescaledData)
print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
print("The upper bound on perplexity: " + str(lp))

# Describe topics.
topics = model.describeTopics(2)
print("The topics described by their top-weighted terms:")
topics.show(truncate=False)

# Shows the result
transformed = model.transform(rescaledData)
transformed.take(1)

The lower bound on the log likelihood of the entire corpus: -44.9571717581
The upper bound on perplexity: 4.22351179184
The topics described by their top-weighted terms:
+-----+-----------+-------------------------------------------+
|topic|termIndices|termWeights                                |
+-----+-----------+-------------------------------------------+
|0    |[17, 15]   |[0.05823196670532641, 0.05751424724780337] |
|1    |[0, 3]     |[0.062056681029749566, 0.05575172335627538]|
+-----+-----------+-------------------------------------------+



[Row(features=SparseVector(20, {1: 0.0, 2: 0.0, 3: 0.0, 4: 0.5754, 5: 0.5754, 6: 0.0, 7: 0.2877, 8: 0.2877, 9: 0.0, 10: 0.0, 11: 0.5754, 13: 0.0, 15: 0.0, 16: 0.0, 17: 0.0, 19: 0.0}), label=1.0, topicDistribution=DenseVector([0.3716, 0.6284]))]