# Setup DB

In [1]:
import pandas as pd
from sqlalchemy import create_engine

from app.core.config import settings

In [2]:
# Replace these with your actual credentials
db_username = settings.POSTGRES_USER
db_password = settings.POSTGRES_PASSWORD
db_host = settings.POSTGRES_SERVER
db_port = settings.POSTGRES_PORT
db_name = settings.POSTGRES_DB

# Create an engine that connects to PostgreSQL
engine = create_engine(f'postgresql://{db_username}:{db_password}@{db_host}:{db_port}/{db_name}')


# Analysis

In [19]:
sql_query = """
    SELECT
        ni.issue_id,
        ni.subscription_id,
        ni.timestamp,
        im.time_to_generate,
        tc.article_id,
        tc.action,
        tc.input_tokens,
        tc.output_tokens,
        ns.newsletter_description
    FROM newsletter_issue as ni
    RIGHT OUTER JOIN token_cost as tc ON ni.issue_id = tc.metrics_id
    JOIN issue_metrics as im ON ni.issue_id = im.metrics_id
    JOIN subscription as ns ON ns.id = ni.subscription_id
"""

# Execute the query and load data into a DataFrame
df = pd.read_sql(sql_query, engine)

In [20]:
df["input_tokens_no_s"] = df["input_tokens"] * (df["action"] != "summary")
df["output_tokens_no_s"] = df["output_tokens"] * (df["action"] != "summary")
df["date_time"] = pd.to_datetime(df['timestamp'], unit='s')

In [21]:
gdf = df.groupby("issue_id").agg(
    {
        "subscription_id": "max",
        "date_time": "max",
        "time_to_generate": "max",
        "article_id": "max",
        "input_tokens": "sum",
        "output_tokens": "sum",
        "input_tokens_no_s": "sum",
        "output_tokens_no_s": "sum",
        "newsletter_description": "max",
    }
)
gdf["cost"] = gdf["input_tokens"] / 1_000 * 0.01 + gdf["output_tokens"] / 1_000 * 0.03
gdf["non_summary_cost"] = gdf["input_tokens_no_s"] / 1_000 * 0.01 + gdf["output_tokens_no_s"] / 1_000 * 0.03
gdf["diff"] = gdf["cost"] - gdf["non_summary_cost"]

gdf.loc[:, [
    "date_time",
    "time_to_generate",
    "cost",
    "non_summary_cost",
    "diff",
    "newsletter_description",
]].sort_values(by=["date_time"])

Unnamed: 0_level_0,date_time,time_to_generate,cost,non_summary_cost,diff,newsletter_description
issue_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1-11-1700640278,2023-11-22 08:04:38,190,0.33685,0.20979,0.12706,I want news about the Gaza Strip war. I am int...
1-13-1700641931,2023-11-22 08:32:11,6,0.02733,0.02733,0.0,I want news about the private space sector ind...
1-11-1700642012,2023-11-22 08:33:32,169,0.39343,0.26427,0.12916,I want news about the Gaza Strip war. I am int...
1-11-1700642181,2023-11-22 08:36:21,162,0.44134,0.31398,0.12736,I want news about the Gaza Strip war. I am int...
1-13-1700648793,2023-11-22 10:26:33,5,0.02928,0.02928,0.0,I want news about the private space sector ind...
1-11-1700648799,2023-11-22 10:26:39,173,0.34046,0.21379,0.12667,I want news about the Gaza Strip war. I am int...
1-11-1700649318,2023-11-22 10:35:18,257,0.31521,0.11888,0.19633,I want news about the Gaza Strip war. I am int...
1-13-1700649576,2023-11-22 10:39:36,5,0.02922,0.02922,0.0,I want news about the private space sector ind...
1-13-1700649582,2023-11-22 10:39:42,5,0.02922,0.02922,0.0,I want news about the private space sector ind...
1-14-1700656472,2023-11-22 12:34:32,144,0.16163,0.04738,0.11425,I want news about the electric car industry.


In [77]:
gdf.loc[gdf["time_to_generate"] > 10, [
    "timestamp",
    "time_to_generate",
    "cost",
    "non_summary_cost",
    "diff",
]].mean()

timestamp           1.700645e+09
time_to_generate    1.902000e+02
cost                3.654580e-01
non_summary_cost    2.241420e-01
diff                1.413160e-01
dtype: float64

## Target Issues

**Original Issues** (articles not summarized)



**Recycled Issues** (articles already summarized)

