# Spark DataFrame Assignments

In [1]:
import itertools as it
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

# plotting options
%matplotlib inline
np.set_printoptions(linewidth=250)
plt.rc('font'  , size=18)
plt.rc('figure', figsize=(10, 8))
plt.rc('axes'  , labelsize=22)
plt.rc('legend', fontsize=16)

np.set_printoptions(precision=3)
plt.rc('figure', figsize=(10, 8))

In [2]:
os.chdir('%s/courses/coursera_bigdata/course3/week5' % os.getenv('DST'))
pwd = os.getcwd()
print(pwd)

/Users/rwk7t/Development/dst/courses/coursera_bigdata/course3/week5


## setup PySpark

In [3]:
import os
spark_home = os.environ.get('SPARK_HOME', None)
spark_home

'/usr/local/Cellar/apache-spark/1.5.2/libexec'

In [4]:
from pyspark import SparkContext, SparkConf, SQLContext

from pyspark import SparkContext, SparkConf, SQLContext, HiveContext

myConf = SparkConf().setAppName('TestApp')\
                    .set('spark.executor.memory', '2G')\
                    .set('spark.hadoop.validateOutputSpecs', 'false')

sc      = SparkContext(conf=myConf)
sc._jsc.hadoopConfiguration().set('textinputformat.record.delimiter', '\r\n')
sql_ctx = HiveContext(sc)

In [5]:
sql_ctx.createDataFrame([("somekey", 1)])

DataFrame[_1: string, _2: bigint]

## Slides walkthrough

In [6]:
text_RDD = sc.textFile('file:%s/testfile1.txt'%pwd)
text_RDD.collect()

[u'A long time ago in a galaxy far far away\n']

In [7]:
def split_words(line):
    return line.split()

def create_pair(word):
    return (word, 1)

pairs_RDD = text_RDD.flatMap(split_words).map(create_pair)
pairs_RDD.collect()

[(u'A', 1),
 (u'long', 1),
 (u'time', 1),
 (u'ago', 1),
 (u'in', 1),
 (u'a', 1),
 (u'galaxy', 1),
 (u'far', 1),
 (u'far', 1),
 (u'away', 1)]

In [8]:
students = sc.parallelize([
    [100, 'Ryan', 8.5, 'computer science'],
    [101, 'Bob' , 7.1, 'engineering'     ],
    [101, 'Carl', 6.2, 'engineering'     ]
])
students.collect()

[[100, 'Ryan', 8.5, 'computer science'],
 [101, 'Bob', 7.1, 'engineering'],
 [101, 'Carl', 6.2, 'engineering']]

In [9]:
def extract_grade(row):
    return row[2]

students.map(extract_grade).mean()

7.266666666666667

In [10]:
def extract_degree_grade(row):
    return (row[3], row[2])

students.map(extract_degree_grade).collect()

[('computer science', 8.5), ('engineering', 7.1), ('engineering', 6.2)]

In [11]:
degree_grade_RDD = students.map(extract_degree_grade)
degree_grade_RDD.reduceByKey(max).collect()

[('engineering', 7.1), ('computer science', 8.5)]

In [12]:
students_df = sql_ctx.createDataFrame(students, ['id', 'name', 'grade', 'degree'])
students_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- grade: double (nullable = true)
 |-- degree: string (nullable = true)



In [13]:
students_df.agg({'grade': 'mean'}).collect()

[Row(avg(grade)=7.266666666666667)]

In [14]:
students_df.groupBy('degree').max('grade').collect()

[Row(degree=u'computer science', max(grade)=8.5),
 Row(degree=u'engineering', max(grade)=7.1)]

In [15]:
students_df.groupBy('degree').max('grade').show()

+----------------+----------+
|          degree|max(grade)|
+----------------+----------+
|computer science|       8.5|
|     engineering|       7.1|
+----------------+----------+



#### creating dataframes

In [16]:
from pyspark.sql.types import *

schema = StructType([
    StructField('id'    , LongType()  , True),
    StructField('name'  , StringType(), True),
    StructField('grade' , DoubleType(), True),
    StructField('degree', StringType(), True),
])
schema

StructType(List(StructField(id,LongType,true),StructField(name,StringType,true),StructField(grade,DoubleType,true),StructField(degree,StringType,true)))

In [17]:
students_df = sql_ctx.createDataFrame(students, schema)
students_df.show()

+---+----+-----+----------------+
| id|name|grade|          degree|
+---+----+-----+----------------+
|100|Ryan|  8.5|computer science|
|101| Bob|  7.1|     engineering|
|101|Carl|  6.2|     engineering|
+---+----+-----+----------------+



In [18]:
students_json = """\
{"id":100, "name":"Alice", "grade":8.5, "degree":"Computer Science"}
{"id":101, "name":"Bob", "grade":7.1, "degree":"Engineering"}
"""
students_json

'{"id":100, "name":"Alice", "grade":8.5, "degree":"Computer Science"}\n{"id":101, "name":"Bob", "grade":7.1, "degree":"Engineering"}\n'

In [19]:
import string
with open("students.json", 'w+') as f:
    f.write(students_json)

In [20]:
sql_ctx.read.json('file:%s/students.json'%pwd).show()

+----------------+-----+---+-----+
|          degree|grade| id| name|
+----------------+-----+---+-----+
|Computer Science|  8.5|100|Alice|
+----------------+-----+---+-----+



In [21]:
yelp_df = sql_ctx.load(
    source      ='com.databricks.spark.csv',
    header      = 'true',
    inferSchema = 'true',
    path        = 'file:%s/index_data.csv'%pwd
)



In [22]:
yelp_df.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- cool: string (nullable = true)
 |-- date: string (nullable = true)
 |-- funny: string (nullable = true)
 |-- id: string (nullable = true)
 |-- stars: string (nullable = true)
 |-- text: string (nullable = true)
 |-- type: string (nullable = true)
 |-- useful: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- full_address: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- neighborhoods: string (nullable = true)
 |-- open: string (nullable = true)
 |-- review_count: string (nullable = true)
 |-- state: string (nullable = true)



In [23]:
yelp_df.count()

0

In [24]:
yelp_df.useful

Column<useful>

In [25]:
yelp_df['useful']

Column<useful>

In [26]:
yelp_df.select('useful')

DataFrame[useful: string]

### filtering

In [27]:
yelp_df.filter(yelp_df.useful >= 1).count()

0

In [28]:
yelp_df.filter(yelp_df['useful'] >= 1).count()

0

In [29]:
yelp_df.filter('useful >= 1').count()

0

In [30]:
try:
    yelp_df['useful'].agg({'useful':'max'}).collect()
except Exception, e:
    print type(e), e

<type 'exceptions.TypeError'> 'Column' object is not callable


In [31]:
yelp_df.select('useful')

DataFrame[useful: string]

In [32]:
yelp_df.select('useful').agg({'useful': 'max'}).collect()

[Row(max(useful)=None)]

### manipulation

In [33]:
yelp_df.select('id', 'useful').take(5)

[]

In [34]:
yelp_df.select('id', yelp_df.useful/28*100).show(5)

+---+---------------------+
| id|((useful / 28) * 100)|
+---+---------------------+
+---+---------------------+



In [35]:
yelp_df.select('id', (yelp_df.useful/28*100).cast('int')).show(5)

+---+----------------------------------+
| id|cast(((useful / 28) * 100) as int)|
+---+----------------------------------+
+---+----------------------------------+



In [36]:
yelp_df.select('id', (yelp_df.useful/28*100).cast('int').alias('useful_scaled')).show(5)

+---+-------------+
| id|useful_scaled|
+---+-------------+
+---+-------------+



In [37]:
useful_perc_data = yelp_df.select(
    yelp_df['id'].alias('uid'), 
    (yelp_df.useful/28*100).cast('int').alias('useful_scaled')
)
useful_perc_data.printSchema()

root
 |-- uid: string (nullable = true)
 |-- useful_scaled: integer (nullable = true)



### ordering by column

In [38]:
from pyspark.sql.functions import asc, desc

In [39]:
useful_perc_data = yelp_df.select(
    yelp_df['id'].alias('uid'), 
    (yelp_df.useful/28*100).cast('int').alias('useful_scaled')
).orderBy(desc('useful_scaled'))

useful_perc_data.printSchema()

root
 |-- uid: string (nullable = true)
 |-- useful_scaled: integer (nullable = true)



In [40]:
useful_perc_data.show(5)

+---+-------------+
|uid|useful_scaled|
+---+-------------+
+---+-------------+



### Joins

In [41]:
joined_df = useful_perc_data.join(
    yelp_df,
    yelp_df.id==useful_perc_data.uid,
    'inner'
).select(useful_perc_data.uid, 'useful_scaled', 'review_count')

In [42]:
joined_df.printSchema()

root
 |-- uid: string (nullable = true)
 |-- useful_scaled: integer (nullable = true)
 |-- review_count: string (nullable = true)



In [43]:
joined_df.show(5)

+---+-------------+------------+
|uid|useful_scaled|review_count|
+---+-------------+------------+
+---+-------------+------------+



In [44]:
joined_df = useful_perc_data.join(
    yelp_df,
    yelp_df.id==useful_perc_data.uid,
    'inner'
).cache().select(useful_perc_data.uid, 'useful_scaled', 'review_count').show(5)

+---+-------------+------------+
|uid|useful_scaled|review_count|
+---+-------------+------------+
+---+-------------+------------+



In [45]:
joined_df = useful_perc_data.join(
    yelp_df,
    yelp_df.id==useful_perc_data.uid,
    'inner'
).cache().select(
    useful_perc_data.uid, 
    'useful_scaled', 
    'review_count'
).show(5)

+---+-------------+------------+
|uid|useful_scaled|review_count|
+---+-------------+------------+
+---+-------------+------------+



### server logs

In [50]:
logs_df = sql_ctx.load(
    source      = 'com.databricks.spark.csv',
    header      = 'true',
    inferSchema = 'true',
    path        = 'file:%s/logs.csv'%pwd
)
logs_df.printSchema()

root
 |-- code: integer (nullable = true)
 |-- protocol: string (nullable = true)
 |-- request: string (nullable = true)
 |-- app: string (nullable = true)
 |-- user_agent_major: integer (nullable = true)
 |-- region_code: string (nullable = true)
 |-- country_code: string (nullable = true)
 |-- id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- subapp: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- method: string (nullable = true)
 |-- client_ip: string (nullable = true)
 |-- user_agent_family: string (nullable = true)
 |-- bytes: integer (nullable = true)
 |-- referer: string (nullable = true)
 |-- country_name: string (nullable = true)
 |-- extension: string (nullable = true)
 |-- url: string (nullable = true)
 |-- os_major: integer (nullable = true)
 |-- longitude: double (nullable = true)
 |-- device_family: string (nullable = true)
 |-- record: string (nullable = true)
 |-- user_agent: string (nullable = true)
 |-- time: string (nullable 

In [51]:
logs_df.count()

9410

In [52]:
logs_df.show(5)

+----+--------+--------------------+---------+----------------+-----------+------------+--------------------+---------+------+------------------+------+---------------+-----------------+-----+-------+------------+---------+--------------------+--------+------------------+-------------+--------------------+--------------------+--------------------+---------+-------------+
|code|protocol|             request|      app|user_agent_major|region_code|country_code|                  id|     city|subapp|          latitude|method|      client_ip|user_agent_family|bytes|referer|country_name|extension|                 url|os_major|         longitude|device_family|              record|          user_agent|                time|os_family|country_code3|
+----+--------+--------------------+---------+----------------+-----------+------------+--------------------+---------+------+------------------+------+---------------+-----------------+-----+-------+------------+---------+--------------------+--------

In [53]:
logs_df.groupBy('code').count().show()

+----+-----+
|code|count|
+----+-----+
| 500|    2|
| 301|   71|
| 302| 1943|
| 502|    6|
| 304|  117|
| 200| 7235|
| 400|    1|
| 401|   10|
| 404|   11|
| 408|   14|
+----+-----+



In [55]:
logs_df.groupBy('code').count().orderBy(desc('count')).show()

+----+-----+
|code|count|
+----+-----+
| 200| 7235|
| 302| 1943|
| 304|  117|
| 301|   71|
| 408|   14|
| 404|   11|
| 401|   10|
| 502|    6|
| 500|    2|
| 400|    1|
+----+-----+



In [56]:
logs_df.groupBy('code').avg('bytes').show()

+----+------------------+
|code|        avg(bytes)|
+----+------------------+
| 500|            4684.5|
| 301|424.61971830985914|
| 302| 415.6510550694802|
| 502|             581.0|
| 304|185.26495726495727|
| 200| 41750.03759502419|
| 400|               0.0|
| 401|           12472.8|
| 404|17872.454545454544|
| 408|440.57142857142856|
+----+------------------+



In [58]:
import pyspark.sql.functions as F

logs_df.groupBy('code').agg(
    logs_df.code,
    F.avg(logs_df.bytes),
    F.min(logs_df.bytes),
    F.max(logs_df.bytes)
).show()

+----+----+------------------+----------+----------+
|code|code|        avg(bytes)|min(bytes)|max(bytes)|
+----+----+------------------+----------+----------+
| 500| 500|            4684.5|       422|      8947|
| 301| 301|424.61971830985914|       331|       499|
| 302| 302| 415.6510550694802|       304|      1034|
| 502| 502|             581.0|       581|       581|
| 304| 304|185.26495726495727|       157|       204|
| 200| 200| 41750.03759502419|         0|   9045352|
| 400| 400|               0.0|         0|         0|
| 401| 401|           12472.8|      8318|     28895|
| 404| 404|17872.454545454544|      7197|     23822|
| 408| 408|440.57142857142856|         0|       514|
+----+----+------------------+----------+----------+



## Quiz 1

### question 1

In [60]:
yelp_df.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- cool: string (nullable = true)
 |-- date: string (nullable = true)
 |-- funny: string (nullable = true)
 |-- id: string (nullable = true)
 |-- stars: string (nullable = true)
 |-- text: string (nullable = true)
 |-- type: string (nullable = true)
 |-- useful: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- full_address: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- neighborhoods: string (nullable = true)
 |-- open: string (nullable = true)
 |-- review_count: string (nullable = true)
 |-- state: string (nullable = true)



In [None]:
yelp_df