In [1]:
import itertools as it
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

# plotting options
%matplotlib inline
np.set_printoptions(linewidth=250)
plt.rc('font'  , size=18)
plt.rc('figure', figsize=(10, 8))
plt.rc('axes'  , labelsize=22)
plt.rc('legend', fontsize=16)

np.set_printoptions(precision=3)
plt.rc('figure', figsize=(10, 8))

In [2]:
# Work from $DST/examples/drills so the data files can be addressed via `pwd`.
dst_root = os.getenv('DST')
if dst_root is None:
    # Without this guard the path would be built from the string 'None',
    # and chdir would fail with a misleading "No such file" error.
    raise EnvironmentError(
        "The DST environment variable is not set; "
        "it must point at the root of the dst checkout."
    )
os.chdir(os.path.join(dst_root, 'examples', 'drills'))
pwd = os.getcwd()
print(pwd)

/home/cloudera/Development/dst/examples/drills


In [3]:
import os
# Show which Spark installation the environment points at (None when SPARK_HOME is unset).
spark_home = os.getenv('SPARK_HOME')
spark_home

'/usr/lib/spark'

In [4]:
# Single import line: the earlier, shorter import was a strict subset of this one.
from pyspark import SparkContext, SparkConf, SQLContext, HiveContext

# Spark configuration for this notebook's application.
myConf = (
    SparkConf()
    .setAppName('TestApp')
    .set('spark.executor.memory', '2G')
    .set('spark.hadoop.validateOutputSpecs', 'false')
)

# `sc` is the notebook's SparkContext. `sql_ctx` is the HiveContext that
# run_sql() and the table-loading cells below go through.
# NOTE(review): creating a second SparkContext in the same kernel is expected
# to fail, so re-running this cell likely needs a kernel restart - confirm.
sc      = SparkContext(conf=myConf)
sql_ctx = HiveContext(sc)

## Load Tables

In [5]:
def run_sql(sql):
    """Run a SQL statement through the notebook's `sql_ctx`.

    Parameters
    ----------
    sql : str
        Query text handed unchanged to `sql_ctx.sql`.

    Returns
    -------
    The result of calling `.toPandas()` on the query result, i.e. the
    rows collected into a pandas DataFrame for display.
    """
    spark_result = sql_ctx.sql(sql)
    return spark_result.toPandas()

In [9]:
# Read users.txt from the working directory through the spark-csv data source.
# The remaining keyword arguments are passed to that source as options.
users_df = sql_ctx.load(
    path='file:%s/users.txt' % pwd,
    source='com.databricks.spark.csv',
    header='true',
    inferSchema='true',
)

# Expose the DataFrame to SQL under the name `users`, then display it.
users_df.registerTempTable('users')
run_sql('select * from users')

Unnamed: 0,user_id,user_name,age
0,0,Ryan,38
1,1,Randy,38
2,2,Jen,32
3,3,Scarlett,1
4,4,Mike,35
5,5,Reid,40
6,6,Clark,65
7,7,Dianne,65


In [47]:
# Read programs.txt from the working directory through the spark-csv data source.
# The remaining keyword arguments are passed to that source as options.
programs_df = sql_ctx.load(
    path='file:%s/programs.txt' % pwd,
    source='com.databricks.spark.csv',
    header='true',
    inferSchema='true',
)

# Expose the DataFrame to SQL under the name `programs`, then display it.
programs_df.registerTempTable('programs')
run_sql('select * from programs')

Unnamed: 0,program_id,program_name,program_start_time,program_end_time,duration,is_rerun,series_id,genre
0,0,Sesame Street E1,8:00,8:30,30,True,0,kids
1,1,Sesame Street E2,8:00,8:30,30,True,0,kids
2,2,Hartof Dixie E1,20:00,21:00,60,False,1,drama
3,3,Hartof Dixie E2,20:00,21:00,60,False,1,drama
4,4,Premier League G1,4:00,6:00,60,False,2,sports
5,5,Premier League G2,6:00,8:00,60,False,2,sports


In [31]:
# Read views.txt (the fact table) through the spark-csv data source.
# It gets its own name, `views_df`, so that `programs_df` keeps pointing at
# the programs table instead of being silently rebound to the views data.
views_df = sql_ctx.load(
    source      = 'com.databricks.spark.csv',
    header      = 'true',
    inferSchema = 'true',
    path        = 'file:%s/views.txt'%pwd
)

# Expose the DataFrame to SQL under the name `views`, then display it.
views_df.registerTempTable('views')
run_sql('select * from views')

Unnamed: 0,user_id,program_id,seconds_viewed,viewing_start_time,viewing_end_time
0,0,5,4500,4:05,5:20
1,0,6,4560,6:05,8:21
2,0,3,3600,20:00,21:00
3,1,5,4500,4:05,5:20
4,3,0,1800,8:00,8:30
5,3,1,600,8:00,8:10
6,3,1,600,8:20,8:30
7,2,2,3600,20:00,21:00
8,2,3,3000,20:00,20:50
9,2,3,300,20:55,21:00


## Fact table questions

The fact table is something simple, like `views`:

* user_id, program_id, seconds_viewed, viewing_start_time, viewing_end_time

#### How many programs did each user watch?

In [32]:
# Per-user count of the distinct program_ids that appear in `views`.
# count(distinct ...) means several view rows for the same (user, program)
# pair contribute only once to n_watched.
sql = """\
select
    v.user_id as uid
  , count(distinct v.program_id) as n_watched
from views v
group by user_id
"""
run_sql(sql)

Unnamed: 0,uid,n_watched
0,0,3
1,1,1
2,2,2
3,3,2


Get user info: join the per-user counts back to `users` to show each user's name and age.

In [33]:
# Attach user_name and age to the per-user program counts.
# The subquery `c` is the same aggregation as the previous cell; because this
# is an inner join, users with no rows in `views` are absent from the result.
sql = """\
select 
    u.user_name
  , u.age
  , c.n_watched
from users u
    inner join (    
        select
            v.user_id as uid
          , count(distinct v.program_id) as n_watched
        from views v
        group by user_id
    ) c on c.uid=u.user_id
"""
run_sql(sql)

Unnamed: 0,user_name,age,n_watched
0,Ryan,38,3
1,Randy,38,1
2,Jen,32,2
3,Scarlett,1,2


In [40]:
# Same report as the inner-join version, but driven from `users` with a left
# join so every user is listed.
#   n_watched_raw : the joined count as-is; missing for users with no views
#   n_watched     : the same value with nvl(..., 0) turning missing into 0
sql = """\
select 
    u.user_name
  , u.age
  , c.n_watched as n_watched_raw
  , nvl(c.n_watched, 0) as n_watched
from users u
    left join (    
        select
            v.user_id as uid
          , count(distinct v.program_id) as n_watched
        from views v
        group by user_id
    ) c on c.uid=u.user_id
"""
run_sql(sql)

Unnamed: 0,user_name,age,n_watched_raw,n_watched
0,Ryan,38,3.0,3
1,Randy,38,1.0,1
2,Jen,32,2.0,2
3,Scarlett,1,2.0,2
4,Mike,35,,0
5,Reid,40,,0
6,Clark,65,,0
7,Dianne,65,,0


#### How many users for each program?

In [35]:
# Per-program count of the distinct user_ids that appear in `views`.
# count(distinct ...) means a user with several view rows for one program
# is counted once for that program.
sql = """\
select
    v.program_id as pid
  , count(distinct v.user_id) as n_users
from views v
group by program_id
"""
run_sql(sql)

Unnamed: 0,pid,n_users
0,0,1
1,1,1
2,2,1
3,3,2
4,5,2
5,6,1


In [53]:
# Attach program_name to the per-program user counts.
# Driven from `programs` with a left join, so programs nobody viewed are kept
# and nvl(..., 0) reports them with n_users = 0.
# NOTE(review): program_ids that occur only in `views` (no matching row in
# `programs`) cannot appear in this result - confirm that is intended.
sql = """\
select 
    p.program_name
  , nvl(c.n_users, 0) as n_users
from programs p
    left join (
        select
            v.program_id as pid
          , count(distinct v.user_id) as n_users
        from views v
        group by program_id
    ) c on c.pid=p.program_id
"""
run_sql(sql)

Unnamed: 0,program_name,n_users
0,Sesame Street E1,1
1,Sesame Street E2,1
2,Hartof Dixie E1,1
3,Hartof Dixie E2,2
4,Premier League G1,0
5,Premier League G2,2


#### How many users watched 1+ (one or more) programs? 2+? 3+?

In [59]:
# Cumulative thresholds in a single pass: the subquery `c` yields one row per
# user in `views` with that user's distinct program count, and each
# sum(case ...) counts the rows meeting the >=1, >=2 or >=3 threshold.
# Users with no rows in `views` are not in `c`, so they are not counted.
sql = """\
select
    sum(case when c.n_programs >=1 then 1 else 0 end) as n_1plus
  , sum(case when c.n_programs >=2 then 1 else 0 end) as n_2plus
  , sum(case when c.n_programs >=3 then 1 else 0 end) as n_3plus
from (
    select
        v.user_id as uid
      , count(distinct v.program_id) as n_programs
    from views v
    group by user_id
) c
"""
run_sql(sql)

Unnamed: 0,n_1plus,n_2plus,n_3plus
0,4,3,1
