# Notebook Name

- **Author**: your name here
- **Created on**: today's date
- **Service account**: `sa-uber`

**Description**: what does this notebook do?

**Additional Documentation**:
- Link to additional documentation (if any)

## Methodology

1. Pull data.
2. Do stuff.
3. Save table.
4. ...

---

# 1. Configuration

In [None]:
import time
# Recorded in the first cell so that every later elapsed-time report
# (see Stopwatch) also accounts for the setup cells that follow.
script_start_ts = time.time()  # Begin script stopwatch

## User Configurations

Define variables that require user configuration.

In [None]:
# `initials` and `script_name` are lower-cased and joined into the Spark
# application name; `spark_config_size` selects the resource preset.
script_name = 'NOTEBOOK_NAME'
initials = 'ABC'
spark_config_size = 'small'  # One of 'small', 'medium', 'large' or 'extra_large'

In [None]:
import ai.datetools
# NOTE(review): presumably `date()` with no arguments means "today" and
# subtracting an integer moves back that many days (a 90-day window) --
# confirm against the ai.datetools documentation.
end_dt = ai.datetools.date()
start_dt = end_dt - 90

In [None]:
# These timestamps are in epoch-milliseconds format
# NOTE(review): the values come from the `.hql` attribute, whose exact
# representation is not visible here -- confirm it really is epoch
# milliseconds before using these in comparisons.
start_ts = ai.datetools.datetime(start_dt).hql
end_ts = ai.datetools.datetime(end_dt).hql

## Libraries

Import libraries for reference.

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import socket
from datetime import timedelta, datetime

# Pyspark imports
import pyspark.sql.functions as F
import pyspark.sql.types as T

# AI/Athena imports
from athena.pyspark_utils import SparkSessionManager

## Options

Set global options.

In [None]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Raise the pandas display limits so wide/long frames are not truncated.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 2000)
# Show floats with two decimal places.
pd.set_option('display.float_format', lambda x: '%.2f' % x)

## Spark Setup

Instantiate Spark session.

In [None]:
def build_spark_app_name(initials, script_name):
    """
    Compose the Spark application name.

    The name has four underscore-separated parts: the lower-cased initials,
    the lower-cased script name with spaces turned into underscores, the
    current epoch time in whole seconds, and this machine's hostname.
    """
    owner = initials.lower()
    script_slug = script_name.lower().replace(' ', '_')
    launched_at = str(int(time.time()))
    host = socket.gethostname()
    return '{}_{}_{}_{}'.format(owner, script_slug, launched_at, host)


def define_spark_config(app_name, config_size):
    """
    Get a dictionary of Spark configurations.

    A set of baseline settings is overlaid with the preset selected by
    `config_size`; preset values win where both define the same key.

    Parameters:
        app_name: Accepted for interface compatibility; it is not used when
            building the returned dictionary.
        config_size: One of 'small', 'medium', 'large' or 'extra_large'.

    Returns:
        A new dict mapping Spark property names to values.

    Raises:
        KeyError: If `config_size` is not a known preset. The message lists
            the valid preset names.
    """
    config_options = {
        'small': {
            'spark.executor.instances': 10,
            'spark.executor.cores': 2,
            'spark.executor.memory': '5g',
            'spark.executor.memoryOverhead': '1g',
        },
        'medium': {
            'spark.executor.instances': 14,
            'spark.executor.cores': 4,
            'spark.executor.memory': '8g',
            'spark.executor.memoryOverhead': '2g',
        },
        'large': {
            'spark.executor.instances': 20,
            'spark.executor.cores': 4,
            'spark.executor.memory': '12g',
            'spark.executor.memoryOverhead': '2g',
        },
        # Unlike the other presets, this one also overrides driver settings
        # and the shuffle partition count from the baseline below.
        'extra_large': {
            'spark.executor.memory': '24g',
            'spark.executor.instances': 20,
            'spark.executor.cores': 4,
            'spark.driver.memory': '16g',
            'spark.driver.maxResultSize': '8g',
            'spark.sql.shuffle.partitions': 80,
        },
    }

    # Fail with an actionable message instead of a bare KeyError('<size>').
    # KeyError is kept as the exception type so existing handlers still work.
    if config_size not in config_options:
        raise KeyError(
            'Unknown config_size {!r}; expected one of: {}'.format(
                config_size, ', '.join(sorted(config_options))))

    # Baseline settings shared by every preset.
    # NOTE(review): 'spark.sql.shuffle.partitions' is a str here but an int
    # in the 'extra_large' preset; left as-is to keep return values stable.
    config = {
        'spark.driver.memory': '5g',
        'spark.driver.memoryOverhead': '2g',
        'spark.task.cpus': 1,
        'spark.sql.shuffle.partitions': '50',
    }

    config.update(config_options[config_size])
    return config


def is_spark_active(spark):
    """
    Report whether a SparkSession object is still running.

    Returns True when the context reached through `spark._jsc.sc()` says
    it has not been stopped, and False otherwise.
    """
    spark_context = spark._jsc.sc()
    stopped = spark_context.isStopped()
    return not stopped


def initialize_spark(initials, script_name, config_size):
    """
    Return a usable SparkSession, creating one only when necessary.

    If the global environment already holds an active session under the
    name `spark`, that object is returned unchanged. Otherwise a new
    session is started with a freshly built application name and the
    configuration preset selected by `config_size`.
    """
    app_name = build_spark_app_name(initials, script_name)

    # Re-running the cell should reuse a live session, not start a second one.
    has_live_session = 'spark' in globals() and is_spark_active(spark)
    if has_live_session:
        print('Active, global SparkSession object `spark` already initialized')
        return spark

    spark_conf = define_spark_config(app_name, config_size)
    return SparkSessionManager.initialize_session(
        application_name=app_name,
        conf_dict=spark_conf,
    )


# Start a session, or reuse the live one if this cell is re-run.
spark = initialize_spark(initials, script_name, spark_config_size)
spark  # Bare expression so the notebook shows the session object as cell output

## Plotting

Configure default plotting behavior.

In [None]:
def awesome_settings():
    """
    Apply the notebook's default look to seaborn and matplotlib.

    Sets the seaborn style, context and palette, then overrides a handful
    of matplotlib rcParams (figure size, save DPI, line width, legend
    appearance, font family and automatic layout).
    """
    import seaborn as sns
    import matplotlib.pyplot as plt

    sns.set_style('whitegrid')
    sns.set_context('paper', font_scale=2.5)
    sns.set_palette('hls')

    # Assigned one key at a time, in this order, through rcParams item access.
    rc_overrides = (
        ('figure.figsize', (20.0, 8.0)),
        ('savefig.dpi', 300),
        ('lines.linewidth', 2),
        ('legend.fancybox', True),
        ('legend.shadow', True),
        ('font.family', 'sans-serif'),
        ('figure.autolayout', True),
    )
    for rc_key, rc_value in rc_overrides:
        plt.rcParams[rc_key] = rc_value

    
# Apply the plotting defaults now so they are in effect for later cells.
awesome_settings()

## Verbosity

Configure verbose behavior.

In [None]:
class Stopwatch(object):
    """
    Manage script runtime statistics.

    Remembers when the script started and prints progress lines that show
    the current local time and the minutes elapsed since that start.
    """
    def __init__(self, script_start_ts):
        """
        Store the script start time.

        script_start_ts: Epoch seconds, i.e. the result of time.time().
        """
        self.script_start_ts = script_start_ts  # Should be the result of time.time()

    def lap(self, msg=None):
        """
        Print an optional message with a timestamp and time elapsed since script start.

        msg: Text to lead the line with. Anything that is not a str
            (including the default None) is replaced by the word 'Lap'.

        The printed line looks like
        '<msg> at <Day, DD-Mon-YYYY HH:MM:SS>, <TZ>, <minutes> minutes since
        script start time', with runs of whitespace collapsed to one space.
        Returns None.
        """
        import re
        import time
        from datetime import datetime

        # Take one clock reading and derive both the displayed time and the
        # elapsed time from it, so the two can never disagree.
        now_ts = time.time()
        now = datetime.fromtimestamp(now_ts)

        # Local timezone abbreviation from the standard library: index 0 is
        # the standard-time name, index 1 the daylight-saving name.
        tz_name = time.tzname[time.localtime(now_ts).tm_isdst > 0]

        # The timezone name is appended after formatting so it is never
        # interpreted as part of the strftime pattern.
        now_str = now.strftime('%a, %d-%b-%Y %H:%M:%S') + ', ' + tz_name

        elapsed_minutes = round((now_ts - self.script_start_ts) / 60, 2)

        msg_str = '{msg} at '.format(msg=msg) if isinstance(msg, str) else 'Lap at '

        stopwatch_str = '{msg_str}{now_str}, {elapsed_minutes} minutes since script start time'.format(
            msg_str=msg_str,
            now_str=now_str,
            elapsed_minutes=elapsed_minutes,
        )
        # Collapse any repeated or embedded whitespace into single spaces.
        stopwatch_str = re.sub(r'\s+', ' ', stopwatch_str).strip()
        print(stopwatch_str)

In [None]:
# Seed the stopwatch with the timestamp taken in the first cell, then
# report how long the setup cells took.
stopwatch = Stopwatch(script_start_ts)
stopwatch.lap('Setup completed')

---

# 2. Operational Code

---

# X. Cleanup

## End Spark Session

Release our resources.

In [None]:
# Stop the session created (or reused) during setup.
spark.stop()

## End Script Stopwatch

In [None]:
# Final report: total minutes elapsed since the first cell ran.
stopwatch.lap('Completed successfully')