# Setup

In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import re
import numpy as np
from datetime import datetime, date, timedelta

In [None]:
import sqlite3

# Open the SQLite database file that sits one directory above the notebook
# and keep a cursor on that connection.
DATABASE_PATH = "../EXOPLANETS.db"
conn = sqlite3.connect(DATABASE_PATH)
c = conn.cursor()

# Freshness Detector

In [None]:
def show_query_plot(filename, x, y):
    """Run the SQL stored in *filename* and return a bar chart of the result.

    The query is executed on the module-level ``conn`` connection and the
    result columns are lower-cased before plotting.

    Args:
        filename: Path to a text file containing the SQL query.
        x: Lower-case name of the result column to put on the x axis.
        y: Lower-case name of the result column to put on the y axis.

    Returns:
        The figure returned by ``px.bar`` for the query result.
    """
    with open(filename, 'r') as sql_file:
        sql = sql_file.read()

    # '.headers on' works for command line execution but causes errors in the python environment.
    # The dot is escaped so that only the literal directive is removed; an
    # unescaped '.' would also match any other character before "headers on".
    data = pd.read_sql_query(re.sub(r'\.headers on', '', sql), conn)
    data = data.rename(columns={clmn: clmn.lower() for clmn in data.columns})
    return px.bar(data, x=x, y=y)

In [None]:
# Display a bar chart of rows_added against date_added, using the query stored
# in rows-added.sql.
show_query_plot('../queries/freshness/rows-added.sql', 'date_added', 'rows_added').show()

In [None]:
def show_plot_with_detections(detections_file, data_file, x, y):
    """Plot the result of *data_file* and mark each detection with a red line.

    Args:
        detections_file: Path to a SQL file whose result has a
            ``date_added`` column (matched case-insensitively, since the
            columns are lower-cased); one vertical line is drawn per row.
        data_file: Path to the SQL file passed to ``show_query_plot``.
        x: Lower-case name of the result column to put on the x axis.
        y: Lower-case name of the result column to put on the y axis.

    Returns:
        The bar chart figure with one red vertical line per detection row.
    """
    fig = show_query_plot(data_file, x, y)

    with open(detections_file, 'r') as sql_file:
        sql = sql_file.read()
    # Strip the sqlite CLI directive before running the query from Python.
    # The dot is escaped so only the literal '.headers on' is removed; an
    # unescaped '.' would match any character in that position.
    detections = pd.read_sql_query(re.sub(r'\.headers on', '', sql), conn)
    detections = detections.rename(columns={clmn: clmn.lower() for clmn in detections.columns})
    for _, row in detections.iterrows():
        fig.add_vline(x=row['date_added'], line_color='red')

    return fig

In [None]:
# SQL template for the freshness detector.
#   UPDATES:          counts the rows in EXOPLANETS for each DATE_ADDED.
#   NUM_DAYS_UPDATES: for each DATE_ADDED, the JULIANDAY difference from the
#                     previous DATE_ADDED (LAG ordered by DATE_ADDED).
# The final SELECT keeps the rows whose DAYS_SINCE_LAST_UPDATE exceeds
# {threshold_days}, a placeholder that must be filled in with str.format
# before the query is executed.
FRESHNESS_DETECTOR_QUERY = """
WITH UPDATES AS(
  SELECT
    DATE_ADDED,
    COUNT(*) AS ROWS_ADDED
  FROM
    EXOPLANETS
  GROUP BY
    DATE_ADDED
),

NUM_DAYS_UPDATES AS (
  SELECT
    DATE_ADDED,
    JULIANDAY(DATE_ADDED) - JULIANDAY(LAG(DATE_ADDED)
      OVER(
        ORDER BY DATE_ADDED
      )
    ) AS DAYS_SINCE_LAST_UPDATE
  FROM
    UPDATES
)

SELECT
  *
FROM
  NUM_DAYS_UPDATES
WHERE
  DAYS_SINCE_LAST_UPDATE > {threshold_days};
"""

In [None]:
def get_freshness_detections(threshold):
    """Run the freshness detector and overlay its hits on the rows-added chart.

    Args:
        threshold: Value substituted for ``{threshold_days}`` in
            ``FRESHNESS_DETECTOR_QUERY``.

    Returns:
        A ``(figure, detections)`` pair: the rows-added bar chart with a red
        vertical line at every detected ``date_added``, and the detector's
        result as a DataFrame with lower-cased column names.
    """
    chart = show_query_plot('../queries/freshness/rows-added.sql', 'date_added', 'rows_added')

    query = FRESHNESS_DETECTOR_QUERY.format(threshold_days=threshold)
    flagged = pd.read_sql_query(query, conn)
    lowered_names = {name: name.lower() for name in flagged.columns}
    flagged = flagged.rename(columns=lowered_names)

    # One red marker per detected date.
    for flagged_date in flagged['date_added']:
        chart.add_vline(x=flagged_date, line_color='red')

    return chart, flagged

In [None]:
# Reference dates that the detector's `date_added` values are scored against
# (presumably the known, genuine outages -- confirm against the data source).
VALID_OUTAGE_DATES = {
    "2020-02-08",
    "2020-05-14",
    "2020-06-17",
    "2020-06-30",
}

# Calculate Accuracy for Given `THRESHOLD_DAYS`

In [None]:
# Threshold passed to get_freshness_detections below; the detector query keeps
# rows whose DAYS_SINCE_LAST_UPDATE is greater than this value.
THRESHOLD_DAYS = 3

In [None]:
# Run the detector at the chosen threshold; keep the detections DataFrame for
# the scoring cells and display the annotated chart.
fig, detections = get_freshness_detections(THRESHOLD_DAYS)
fig.show()

In [None]:
# Score the detections against the reference dates:
#   TP - detected dates that are reference dates
#   FP - detected dates that are not reference dates
#   FN - reference dates that were not detected
detected_dates = set(detections["date_added"])
TP = len(detected_dates & VALID_OUTAGE_DATES)
FP = len(detected_dates - VALID_OUTAGE_DATES)
FN = len(VALID_OUTAGE_DATES - detected_dates)

In [None]:
# Report precision, recall and F_1 for the current TP/FP/FN counts.
# Each ratio falls back to 0 when its denominator is 0 (for example, a
# threshold that yields no detections gives TP + FP == 0), matching the guard
# used in the threshold sweep instead of raising ZeroDivisionError.
print("Precision: {}".format(TP / (TP + FP) if TP + FP else 0))
print("Recall:    {}".format(TP / (TP + FN) if TP + FN else 0))
print("F_1 score: {}".format(TP / (TP + 0.5*(FP + FN)) if TP + FP + FN else 0))
print("\nTry me again with a different threshold setting!")

# Plot Accuracies as a Function of `THRESHOLD_DAYS`

In [None]:
def F_score(beta, precision, recall):
    """Return the F-beta score for the given precision and recall.

    Computes ``(1 + beta**2) * precision * recall / (beta**2 * precision + recall)``.

    Args:
        beta: Weight of recall relative to precision (1 gives the F_1 score).
        precision: Precision value.
        recall: Recall value.

    Returns:
        The F-beta score, or ``0.0`` when the denominator is zero (for
        example when precision and recall are both 0), rather than raising
        ``ZeroDivisionError``.
    """
    beta_sq = beta**2
    denominator = (beta_sq * precision) + recall
    if denominator == 0:
        # No true positives at all: the score is defined as 0 here.
        return 0.0
    return (1 + beta_sq) * ((precision * recall) / denominator)

In [None]:
# Sweep the threshold over 0..7 days and record the detector's precision,
# recall and F-scores (beta = 1, 0.5 and 2) for each setting.
precisions, recalls, f1s, f0pt5s, f2s = [], [], [], [], []
for t_days in range(8):
    _, detections = get_freshness_detections(t_days)

    detected_dates = set(detections["date_added"])
    TP = len(detected_dates & VALID_OUTAGE_DATES)
    FP = len(detected_dates - VALID_OUTAGE_DATES)
    FN = len(VALID_OUTAGE_DATES - detected_dates)

    # With no detections at all, precision is reported as 0.
    if TP + FP == 0:
        precision = 0
    else:
        precision = TP / (TP + FP)
    recall = TP / (TP + FN)

    f1 = F_score(1, precision, recall)
    f0pt5 = F_score(0.5, precision, recall)
    f2 = F_score(2, precision, recall)

    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)
    f0pt5s.append(f0pt5)
    f2s.append(f2)

In [None]:
# Line chart of precision, recall and F_1 against the swept threshold values.
fig = go.Figure()
for series, label in (
    (precisions, "Precision"),
    (recalls, "Recall"),
    (f1s, "F_1-Score"),
):
    fig.add_trace(go.Scatter(x=np.arange(8), y=series, name=label))

fig.update_layout(
    title="Model Accuracy with Different Parameters",
    xaxis_title="THRESHOLD_DAYS",
    yaxis_title="Accuracy",
)

fig.show()

In [None]:
# Line chart comparing the F_0.5, F_1 and F_2 scores across the swept
# threshold values.
fig = go.Figure()
for series, label in (
    (f0pt5s, "F_0.5-Score"),
    (f1s, "F_1-Score"),
    (f2s, "F_2-Score"),
):
    fig.add_trace(go.Scatter(x=np.arange(8), y=series, name=label))

fig.update_layout(
    title="F-Scores with Different Parameters",
    xaxis_title="THRESHOLD_DAYS",
    yaxis_title="Accuracy",
)

fig.show()