# Calculate Daily Clicks and Impressions for Team Mobile App Cards

## Description

This notebook generates the daily (Eastern Time, `America/Toronto`) number of clicks and impressions (total, unique, registered users, non-registered users, etc.) on mobile cards for each team app using the YinzCam Realtime API data.

## Preprocessing

* Load the Actions & Sessions data from the ADL
* Filter Actions data for Cards only
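* Join the Cards and Sessions tables (actions `session_id` = sessions `device_generated_id`) to attach `device_id` to each card action
* Aggregate the types of actions by date, resource, card id and card location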

Note: The `card_id` column takes the first `|`-separated field of the `type_minor` information and returns it as the tag for that particular card. Therefore, there is no need to join with the meta file.


## Outputs

CSV to Data Lake <br>
SQL Table to MS SQL
* `team_cards_agg` table: date, resource_major, card_id, card_location, total_clicks, total_unique_clicks, total_reg_clicks, total_reg_unique_clicks, total_nonreg_clicks, total_nonreg_unique_clicks, total_imps, total_unique_imps, total_reg_imps, total_reg_unique_imps, total_nonreg_imps, total_nonreg_unique_imps. Here `team` is one of {nhl, tfc, nba}.

## QA

* Khaled Elshamouty, Data Scientist, Khaled.Elshamouty@MLSE.com (Primary)
* Nicole Ridout, Data Engineer, Nicole.Ridout@MLSE.com
* Farah Bastien, Manager of Data Science, Farah.Bastien@MLSE.com

##### Load the necessary functions from `PySpark` and `Python`

In [0]:
#datetime from Python
from datetime import datetime,timedelta,date
import os, re, glob
from itertools import chain

import numpy as np
#SQL-like functions from PySpark
from pyspark.sql.functions import col,date_format,from_utc_timestamp, from_unixtime, unix_timestamp, sum, count,\
                                  countDistinct, collect_list, size,month,min,max,when,upper, \
                                  lag,split,size,length, lit, mean, collect_set, concat, upper,create_map, weekofyear,\
                                  year,round,first, explode, isnan, regexp_replace, to_date, floor, when, udf
from pyspark.sql.types import TimestampType, IntegerType, DoubleType, StringType, DateType, StructType
from pyspark.sql.window import Window

#### Helper Functions

In [0]:
# Returns formatted card id
def get_tag_(s):
  """Return the card id, i.e. the text before the first "|" in `s`.

  Args:
    s: The pipe-delimited `type_minor` value, or None.

  Returns:
    The first "|"-separated field of `s` (the whole string when it contains
    no "|"), or None when `s` is None or cannot be split on a str separator.
  """
  try:
    return s.split("|")[0]
  except (AttributeError, TypeError):
    # AttributeError: `s` is None or has no .split; TypeError: `s` is a
    # bytes-like value that rejects a str separator. Anything else (for
    # example KeyboardInterrupt) is deliberately allowed to propagate.
    return None

# Wrap the plain-Python parser as a Spark UDF so it can be applied to a
# DataFrame column. No returnType is passed, so the UDF's default return type
# applies.
get_tag = udf(get_tag_)

# Returns formatted card location
def get_tag_2(s):
  """Return the card location, i.e. the third "|"-separated field of `s`.

  Args:
    s: The pipe-delimited `type_minor` value, or None.

  Returns:
    The field at index 2 after splitting `s` on "|", or None when `s` is
    None, cannot be split on a str separator, or has fewer than three fields.
  """
  try:
    return s.split("|")[2]
  except (AttributeError, TypeError, IndexError):
    # AttributeError/TypeError: `s` is None or not a str-like value;
    # IndexError: fewer than three fields. Anything else (for example
    # KeyboardInterrupt) is deliberately allowed to propagate.
    return None

# Wrap the card-location parser as a Spark UDF so it can be applied to a
# DataFrame column. No returnType is passed, so the UDF's default return type
# applies.
get_tag2 = udf(get_tag_2)

##### Load the ADL configuration

In [0]:
def _adl_secret(key):
  """Fetch one value from the `adl_cred` secret scope."""
  return dbutils.secrets.get(scope="adl_cred", key=key)

# OAuth2 token endpoint for the directory (tenant) id held in the secret scope.
url = "https://login.microsoftonline.com/{0}/oauth2/token".format(_adl_secret("directory_id"))

# Client-credential OAuth2 settings Spark uses when reading adl:// paths.
spark.conf.set("dfs.adls.oauth2.access.token.provider.type", "ClientCredential")
spark.conf.set("dfs.adls.oauth2.client.id", _adl_secret("client_id"))
spark.conf.set("dfs.adls.oauth2.credential", _adl_secret("credential"))
spark.conf.set("dfs.adls.oauth2.refresh.url", url)

##### Define the team

In [0]:
# Expose a "team" text widget (empty default, empty label) so the notebook can
# be parameterised per team.
dbutils.widgets.text("team", "", "")

# Read the widget once through dbutils.widgets.get, the supported accessor
# (getArgument is deprecated).
team = dbutils.widgets.get("team")

# The team code is interpolated into the data-lake path and the SQL table name
# further down; an empty value would silently target "_tor/" and "_cards_agg",
# so stop here with a clear message instead.
if not team:
  raise ValueError("The 'team' widget is empty; set it to a team code (e.g. nhl, tfc, nba) before running.")

print("Working with {0} team".format(team))

#### Load the Actions from the ADL and filter for `cards` only

In [0]:
# Root folder of this team's realtime-API export in the data lake.
adlurl = "adl://mlse1.azuredatalakestore.net/yinz_cam/{0}_tor/realtime_api/".format(team)

# List every file under `actions` and pass the explicit path list to the reader.
files = dbutils.fs.ls(adlurl + 'actions')
file_paths = [entry.path for entry in files]

# Read the Actions CSVs and keep card events only.
#   1. Remove fully identical rows before narrowing the columns.
#   2. Derive `date`: the request timestamp is treated as UTC and shifted to
#      America/Toronto before being truncated to a calendar date.
#   3. Keep rows whose `type_major` contains "CARD".
#   4. Parse the card id and card location out of `type_minor`.
cards = (
  spark.read.csv(file_paths, header=True)
  .dropDuplicates()
  .select('request_date_time', 'session_id', 'resource_major', 'type_major', 'type_minor', 'yinzid')
  .withColumn(
    'date',
    to_date(
      from_utc_timestamp(col('request_date_time').cast(TimestampType()), "America/Toronto")
    ),
  )
  .drop('request_date_time')
  .filter(col('type_major').like("%CARD%"))
  .withColumn("card_id", get_tag(col("type_minor")))
  .withColumn("card_location", get_tag2(col("type_minor")))
)

# cards_2  = (spark.read.csv(file_paths[1596:],header=True)
#           .drop_duplicates()
#           .select('request_date_time', 'session_id', 'resource_major', 'type_major', 'type_minor', 'yinzid')
#           .withColumn('date', to_date(from_utc_timestamp(col('request_date_time').cast(TimestampType()), "America/Toronto")))
#           .drop('request_date_time')
#           .where(col('type_major').like("%CARD%"))
#           .withColumn("card_id", get_tag("type_minor"))
#           .withColumn("card_location", get_tag2("type_minor"))
#          )

In [0]:
# cards = cards.union(cards_2)
# display(cards.select(max ('date')))

#### Load Sessions from ADL and Join Cards and Sessions tables (`session_id` = `device_generated_id`) to attach `device_id`

In [0]:
# Sessions export: only the two identifier columns are needed.
sessions = (spark.read.csv(adlurl + 'sessions',header=True)
           .select('device_generated_id', 'device_id')
           )

# Attach a device_id to every card action; the unique-device counts computed
# downstream are taken over this column. Actions reference the session through
# `session_id`, which corresponds to `device_generated_id` on the sessions side.
#
# The lookup is de-duplicated before joining: repeated (session_id, device_id)
# rows in the sessions export would otherwise multiply the matching card rows
# in the left join and inflate every summed total.
cards_devid =(cards
              .join(sessions
                    .withColumnRenamed("device_generated_id", "session_id")
                    .select("session_id", "device_id")
                    .dropDuplicates(),
                    "session_id", "left"
                   )
             )

#### Aggregate Card Actions by Date, Resource, Card Name and Card Location

In [0]:
def _card_event_metrics(pattern, suffix):
  """Build the six aggregate columns for one event type.

  `pattern` is the LIKE pattern matched against `type_major`; `suffix` is the
  trailing part of each output column name. For each audience (all rows, rows
  with a `yinzid`, rows without one) two columns are produced, in this order:
  the number of matching rows, then the number of distinct `device_id` values
  among the matching rows.
  """
  is_event = col("type_major").like(pattern)
  audiences = [
    ("total_", is_event),
    ("total_reg_", col("yinzid").isNotNull() & is_event),
    ("total_nonreg_", col("yinzid").isNull() & is_event),
  ]
  metrics = []
  for prefix, matches in audiences:
    # Row count: 1 for each matching row, 0 otherwise.
    metrics.append(sum(when(matches, 1).otherwise(0)).alias(prefix + suffix))
    # Distinct devices: non-matching rows yield null and are not counted.
    metrics.append(
      countDistinct(when(matches, col("device_id"))).alias(prefix + "unique_" + suffix)
    )
  return metrics


# One output row per date / resource / card id / card location, with the click
# metrics first and the impression metrics after them.
total_cards = (
  cards_devid
  .groupBy("date", "resource_major", "card_id", "card_location")
  .agg(*(_card_event_metrics("%CLICK%", "clicks") + _card_event_metrics("%IMP%", "imps")))
)

In [0]:
# display(total_cards.orderBy(col('date'), ascending=False))

### Write to SQL

In [0]:
# Host and login come from the `jdbc` secret scope.
sqlserver = dbutils.secrets.get(scope="jdbc", key="sqlserver")
user = dbutils.secrets.get(scope="jdbc", key="username")
pswd = dbutils.secrets.get(scope="jdbc", key="password")

# Fixed connection settings.
port = '1433'
database = 'mlse_freq'

# JDBC connection string: jdbc:sqlserver://<host>:<port>;database=<db>
url = 'jdbc:sqlserver://{0}:{1};database={2}'.format(sqlserver, port, database)

In [0]:
# Write the aggregate to "<team>_cards_agg" (team lower-cased) over JDBC,
# reducing to at most 8 partitions first. Mode "overwrite" replaces whatever
# the table currently holds.
(total_cards
 .coalesce(8)
 .write
 .mode("overwrite")
 .options(user=user, password=pswd)
 .jdbc(url, "{0}_cards_agg".format(team.lower()))
)