In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import desc
from pyspark.sql.functions import asc
from pyspark.sql.functions import sum as Fsum

import datetime

import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
# Obtain the Spark session for this notebook (an existing session is reused
# by getOrCreate(); otherwise a new one named "Wrangling Data" is created).
spark = (
    SparkSession.builder
    .appName("Wrangling Data")
    .getOrCreate()
)

In [3]:
# Location of the JSON event log.
# NOTE(review): hard-coded absolute path; looks like a Databricks FileStore
# location — confirm, and consider moving it to a configuration cell.
path = "/FileStore/tables/Youtuber_log_file"
# Read the JSON file into a Spark DataFrame (schema is inferred by the reader).
user_log = spark.read.json(path)

In [4]:
# Print the column names and types of the loaded log DataFrame.
user_log.printSchema()

In [5]:
# Display a single row as a quick look at the record layout.
user_log.show(n=1)

In [6]:
# Display summary statistics computed by describe() over the DataFrame's columns.
user_log.describe().show()

In [7]:
# Display summary statistics for the "Youtuber" column only.
user_log.describe("Youtuber").show()

In [8]:
# Display summary statistics for the "sessionId" column only.
user_log.describe("sessionId").show()

In [9]:
# Total number of rows in the log (triggers a Spark job).
user_log.count()

In [10]:
# List every distinct value of the "page" column, sorted by page name.
(
    user_log
    .select("page")
    .dropDuplicates()
    .sort("page")
    .show()
)

In [11]:
# Collect to the driver the selected columns of every row whose userId
# equals the string "73".
(
    user_log
    .select(["userId", "firstname", "page", "video"])
    .where(user_log.userId == "73")
    .collect()
)

In [12]:
def _hour_from_epoch_ms(ts):
    """Return the hour of day (0-23) for an epoch timestamp given in milliseconds.

    The timestamp is divided by 1000 to obtain seconds and converted with
    ``datetime.datetime.fromtimestamp``, which uses the local time zone of the
    Python process doing the conversion. A null timestamp yields a null hour
    instead of raising.
    """
    if ts is None:
        return None
    return datetime.datetime.fromtimestamp(ts / 1000.0).hour


# Declare the return type explicitly: without it `udf` defaults to StringType,
# which would store the hour as text rather than as a number.
get_hour = udf(_hour_from_epoch_ms, IntegerType())

In [13]:
# Add (or replace) an "hour" column derived from each row's `ts` value
# via the get_hour UDF.
user_log = user_log.withColumn(
    "hour",
    get_hour(user_log["ts"]),
)

In [14]:
# Display two rows, e.g. to check the "hour" column added to user_log.
user_log.show(n=2)

In [15]:
# Count "NextVideo" page events per hour of day.
# Ordering on a float cast sorts the hours numerically regardless of how the
# "hour" column happens to be typed.
videos_in_hour = (
    user_log
    .filter(user_log.page == "NextVideo")
    .groupby(user_log.hour)
    .count()
    .orderBy(user_log.hour.cast("float"))
)

In [16]:
# Display the per-hour "NextVideo" counts.
videos_in_hour.show()

In [17]:
# Bring the small aggregated result to the driver as a pandas DataFrame.
videos_in_hour_pd = videos_in_hour.toPandas()
# Make sure the "hour" column is numeric on the pandas side before plotting.
videos_in_hour_pd["hour"] = pd.to_numeric(videos_in_hour_pd["hour"])

In [18]:
# Scatter plot of the per-hour counts against hour of day.
plt.scatter(x=videos_in_hour_pd["hour"], y=videos_in_hour_pd["count"])
# Pad the x-axis by one unit on each side of the 0-23 hour range.
plt.xlim(left=-1, right=24)
# Leave 20% headroom above the largest count.
plt.ylim(bottom=0, top=1.2 * max(videos_in_hour_pd["count"]))
plt.xlabel("Hour")
# Trailing semicolon suppresses the text repr of the returned label object.
plt.ylabel("Videos played");