# Push notifications - Real Time API

## Description 
Notebook that counts push-notification views, clicks and app opens per user (`mlse_id`)

## Preprocessing

* Load the YinzCam Realtime API & User Profile Data from the ADL
* Join the Realtime data (actions, sessions) into a single table
* Aggregate the actions_sessions data into push actions by user (mlse_id)

## Output

SQL table written to MS SQL Server
* `<team>_push_by_user`: mlse_id, Total_push_view, Total_push_click, Total_push_opened_app, where `<team>` is one of `nhl`, `tfc`, `nba`

## QA 
* Erika Munoz, Data Scientist, Erika.Munoz@MLSE.com (Primary)
* Nicole Ridout, Data Engineer, Nicole.Ridout@MLSE.com
* Farah Bastien, Manager of Data Science, Farah.Bastien@MLSE.com

##### Load the necessary functions from `PySpark`

In [0]:
#SQL-like functions from PySpark
from pyspark.sql.functions import col,date_format,from_utc_timestamp, from_unixtime, unix_timestamp, sum, count,\
                                  countDistinct, collect_list, size,month,min,max,when,upper, \
                                  lag,split,size,length, lit, mean, collect_set, concat, upper,create_map, weekofyear,\
                                  year,round,first, explode, isnan, regexp_replace, to_date, floor, when
from pyspark.sql.types import TimestampType, IntegerType, DoubleType, StringType, DateType, StructType
from pyspark.sql.window import Window

#datetime from Python
from datetime import datetime,timedelta,date
import numpy as np
import os, re, glob
from itertools import chain

##### Load ADL credentials
The following cell sets the `Spark` configuration for accessing the Azure Data Lake (ADL), using *masked* credentials stored as Databricks secrets

In [0]:
# Configure Spark's ADLS OAuth2 client-credential flow. Every secret is read
# from the Databricks secret scope "adl_cred".
_directory_id = dbutils.secrets.get(scope="adl_cred", key="directory_id")
url = "https://login.microsoftonline.com/{0}/oauth2/token".format(_directory_id)

_adl_settings = [
    ("dfs.adls.oauth2.access.token.provider.type", "ClientCredential"),
    ("dfs.adls.oauth2.client.id", dbutils.secrets.get(scope="adl_cred", key="client_id")),
    ("dfs.adls.oauth2.credential", dbutils.secrets.get(scope="adl_cred", key="credential")),
    # Token endpoint built above from the directory id.
    ("dfs.adls.oauth2.refresh.url", url),
]
for _conf_key, _conf_value in _adl_settings:
    spark.conf.set(_conf_key, _conf_value)

## Load the YinzCam Realtime API & User Profile Data from the ADL

##### Define the team and url from ADL

In [0]:
# Notebook parameter selecting which team's data to process. The widget is
# created with an empty default value and an empty label.
dbutils.widgets.text("team", "", "")

# Read the widget value once through dbutils.widgets.get instead of discarding
# that result and re-reading it via the deprecated getArgument helper.
team = dbutils.widgets.get("team")

# Fail fast: an empty team would be concatenated into the ADL path and the SQL
# table names used further down, producing invalid locations.
if not team:
    raise ValueError("The 'team' widget is empty; set it (e.g. nhl, tfc or nba) before running this notebook")

print("Working with {0} team".format(team))

In [0]:
# Base ADL folder of the selected team's realtime API export (real time API).
adlurl = "".join(["adl://mlse1.azuredatalakestore.net/yinz_cam/", team, "_tor/realtime_api/"])

##### Load Realtime data from ADL

In [0]:
# Action events from the realtime API: exact duplicate rows are removed and
# `id` is renamed to `action_id`.
_toronto_tz = "America/Toronto"
actions = (
    spark.read
    .csv(adlurl + 'actions', header=True)
    .dropDuplicates()
    .withColumnRenamed('id', 'action_id')
)
# Cast both timestamp columns and convert them from UTC to Toronto local time.
for _ts_col in ('request_date_time', 'invisible_date_time'):
    actions = actions.withColumn(
        _ts_col,
        from_utc_timestamp(col(_ts_col).cast(TimestampType()), _toronto_tz),
    )
# Day of the (localized) request timestamp, formatted as a yyyy-MM-dd string.
actions = actions.withColumn('action_date', date_format('request_date_time', 'yyyy-MM-dd'))
# Session records from the realtime API, reduced to one row per `ses_id`:
# the row with the most recent `end_date_time`.
#
# Sorting and then calling drop_duplicates(subset=...) does not guarantee which
# row of each group survives, so the latest row is selected explicitly with a
# row_number() window instead.
from pyspark.sql.functions import row_number

# Newest session end first within each ses_id (descending order puts nulls last).
_latest_session_first = Window.partitionBy('ses_id').orderBy(col('end_date_time').desc())

sessions = (spark.read.csv(adlurl+'sessions',header=True)
            .withColumnRenamed('id','ses_id')
            # Rows without a session id cannot be matched to anything; drop them up front.
            .where(col('ses_id').isNotNull())
            # Cast the timestamps and convert them from UTC to Toronto local time.
            .withColumn('start_date_time', from_utc_timestamp(col('start_date_time').cast(TimestampType()), "America/Toronto"))
            .withColumn('end_date_time', from_utc_timestamp(col('end_date_time').cast(TimestampType()), "America/Toronto"))
            .withColumn('session_date',date_format('start_date_time', 'yyyy-MM-dd'))
            # Keep only the top-ranked (latest) row per ses_id, then discard the helper column.
            .withColumn('_session_recency_rank', row_number().over(_latest_session_first))
            .where(col('_session_recency_rank') == 1)
            .drop('_session_recency_rank')
            .withColumn("hardware_device_id",col("hardware_device_id").cast(IntegerType()))
           )

##### Load the YinzCam User Profile data

In [0]:
# JDBC connection settings for the database that holds the user table.
# Host, user name and password come from the Databricks secret scope "jdbc".
sqlserver = dbutils.secrets.get(scope="jdbc", key="sqlserver")
port = '1433'
database = 'mlse_sqldb'  # user table is in this database
user = dbutils.secrets.get(scope="jdbc", key="username")
pswd = dbutils.secrets.get(scope="jdbc", key="password")
url = ''.join(['jdbc:sqlserver://', sqlserver, ':', port, ';database=', database])

# Read "<team>_user_table" and suffix its yinzid / device_id columns so they do
# not share a name with columns in other tables.
_user_table_reader = spark.read.option('user', user).option('password', pswd)
user_table = (
    _user_table_reader
    .jdbc(url, team + "_user_table")
    .withColumnRenamed("yinzid", "yinzid_user_table")
    .withColumnRenamed("device_id", "device_id_user_table")
)

## Join the Realtime data (`actions`, `sessions`) into a single table

(to get information about the `device_id`)

In [0]:
# Join actions and sessions to derive mlse_id, then keep only users that are
# present in the user table.
_actions_with_sessions = actions.join(
    sessions,
    col('session_id') == col('device_generated_id'),  # default join type: inner
)

# Overwrite yinzid: a missing value becomes the placeholder "EMPTY", because
# concat() yields null as soon as one of its inputs is null.
_yinzid_or_placeholder = when(col("yinzid").isNotNull(), col("yinzid")).otherwise("EMPTY")

actions_sessions = (
    _actions_with_sessions
    .withColumn("yinzid", _yinzid_or_placeholder)
    .withColumn("mlse_id", concat("device_id", "yinzid"))
    .join(user_table, 'mlse_id', 'inner')
)

# Cache the joined frame and materialize it by counting its rows.
actions_sessions_count = actions_sessions.cache().count()
print(actions_sessions_count)

## Aggregate the `actions_sessions` data into push actions by user (mlse_id)

In [0]:
# One output column per push action type: (type_major value, output column name).
_push_totals = [
    sum(when(col('type_major') == _action_type, 1).otherwise(0)).alias(_total_name)
    for _action_type, _total_name in (
        ('PUSH_VIEW', 'Total_push_view'),
        ('PUSH_CLICK', 'Total_push_click'),
        ('PUSH_OPENED_APP', 'Total_push_opened_app'),
    )
]

# Per-user totals: each matching action contributes 1, anything else 0.
push_by_user = actions_sessions.groupBy('mlse_id').agg(*_push_totals)

# Cache the aggregate and materialize it by counting its rows (one per mlse_id).
push_by_user_count = push_by_user.cache().count()
print(push_by_user_count)

## Write to SQL

In [0]:
# JDBC connection settings for the output database ('mlse_freq').
# Host, user name and password come from the Databricks secret scope "jdbc".
sqlserver = dbutils.secrets.get(scope="jdbc", key="sqlserver")
port = '1433'
database = 'mlse_freq'
user = dbutils.secrets.get(scope="jdbc", key="username")
pswd = dbutils.secrets.get(scope="jdbc", key="password")
url = ''.join(['jdbc:sqlserver://', sqlserver, ':', port, ';database=', database])

In [0]:
# Write the per-user push totals to "<team>_push_by_user", replacing any
# existing table. The frame is first coalesced to at most 8 partitions.
_push_writer = (
    push_by_user
    .coalesce(8)
    .write
    .option('user', user)
    .option('password', pswd)
)
_push_writer.jdbc(url, team + "_push_by_user", mode='overwrite')