#### Imports

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import env

# Exercise

- The dataset for these exercises lives in the Codeup Data Science MySQL Server. The database name is curriculum_logs.

- Go through the lesson code, adding comments, docstrings, and markdown that explain what is happening at each step.

In [None]:
def acquire_logs(user=env.username, password=env.password, host=env.host):
    '''
    Pull the curriculum access logs from the curriculum_logs MySQL database.

    Inputs: user, password, host - connection credentials; each one
            defaults to the matching value in the env file.
    Output: df with one row per row of the logs table and the columns
            date, endpoint, user_id, cohort_id, source_ip
    '''
    # Build the connection string from the arguments rather than reaching
    # back into env, so credentials passed by the caller are actually used.
    url = f'mysql+pymysql://{user}:{password}@{host}/curriculum_logs'
    # path and ip are renamed in SQL so the dataframe arrives with the
    # column names the rest of the notebook expects.
    query = '''
    SELECT date,
           path as endpoint,
           user_id,
           cohort_id,
           ip as source_ip
    FROM logs;
    '''
    df = pd.read_sql(query, url)
    return df

In [None]:
def one_user_df_prep(df, user):
    '''
    Isolate one user's log rows and count their page requests per day.

    Inputs: df - log data with date, endpoint and user_id columns
            user - the user_id value to isolate
    Outputs: pages_one_user - Series indexed by day holding the number of
             non-null endpoint values for that user on that day; days
             between the first and last request with no rows count as 0
    '''
    # Work on a copy so the caller's dataframe is left untouched.
    one_user = df[df['user_id'] == user].copy()
    # Bracket assignment always targets the column (attribute assignment
    # can silently create an attribute instead).
    one_user['date'] = pd.to_datetime(one_user['date'])
    # resample needs a sorted datetime index.
    one_user = one_user.set_index('date').sort_index()
    # 'D' is the canonical daily alias; lowercase 'd' is deprecated in
    # recent pandas releases.
    pages_one_user = one_user['endpoint'].resample('D').count()
    return pages_one_user

In [None]:
def compute_pct_b(pages_one_user, span, k, user):
    '''
    Build Bollinger-style bands around a series and compute %b.

    Inputs: pages_one_user - series of values to band
            span - span of the exponentially weighted window
            k - number of standard deviations between midband and each band
            user - value written to the user_id column of every row
    Outputs: my_df with columns pages_one_user, midband, ub, lb,
             pct_b, user_id (same index as pages_one_user)
    '''
    # One exponentially weighted window provides both the mean and the std.
    window = pages_one_user.ewm(span=span)
    center = window.mean()
    offset = window.std() * k

    # Upper and lower bands sit k standard deviations either side of center.
    upper = center + offset
    lower = center - offset

    my_df = pd.concat(
        [pages_one_user, center, upper, lower],
        axis=1,
        keys=['pages_one_user', 'midband', 'ub', 'lb'],
    )

    # %b: position of the value inside the band (0 = lower, 1 = upper).
    band_width = my_df['ub'] - my_df['lb']
    my_df['pct_b'] = (my_df['pages_one_user'] - my_df['lb']) / band_width
    my_df['user_id'] = user
    return my_df

In [None]:
def plot_bands(my_df, user):
    '''
    Draw the page counts together with their midband and bands.

    Inputs: my_df - dataframe with pages_one_user, midband, ub, lb columns
            user - id shown in the legend entry for the page counts
    Outputs: nothing returned; displays a single 12x8 figure
    '''
    # (column, legend label) pairs, in the order they are drawn.
    lines_to_draw = [
        ('pages_one_user', 'Number of Pages, User: '+str(user)),
        ('midband', 'EMA/midband'),
        ('ub', 'Upper Band'),
        ('lb', 'Lower Band'),
    ]

    _, ax = plt.subplots(figsize=(12,8))
    for column, legend_label in lines_to_draw:
        ax.plot(my_df.index, my_df[column], label=legend_label)

    ax.legend(loc='best')
    ax.set_ylabel('Number of Pages')
    plt.show()

In [None]:
def find_anomalies(df, user, span, weight, plot=False):
    '''
    Find the days on which one user's page count is above the upper band.

    Inputs: df - log dataframe handed to one_user_df_prep
            user - user_id to analyze
            span - span of the exponentially weighted window
            weight - number of standard deviations for the bands
                     (passed to compute_pct_b as k)
            plot - when True, also draw the bands with plot_bands
    Outputs: rows of the banded dataframe where pct_b is greater than 1
    '''
    # Daily page counts for this user only.
    daily_pages = one_user_df_prep(df, user)

    # Midband, upper/lower bands and %b for those counts.
    banded = compute_pct_b(daily_pages, span, weight, user)

    if plot:
        plot_bands(banded, user)

    # pct_b > 1 means the count is above the upper band.
    above_upper_band = banded['pct_b'] > 1
    return banded[above_upper_band]