## Get Data From HuggingFace

In [1]:
from datasets import load_dataset

# Load only the `test` split of the CNN/DailyMail dataset, config "3.0.0".
cnn_ds = load_dataset(
    "cnn_dailymail",
    "3.0.0",
    split="test",
)

Found cached dataset cnn_dailymail (/home/luntaixia/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)


In [2]:
# Keep examples whose article has at least 500 characters and whose highlights
# have at least 20 characters, then shuffle with a fixed seed (42).
samples = cnn_ds.filter(
    lambda example: len(example['article']) >= 500 and len(example['highlights']) >= 20
).shuffle(seed=42)

Loading cached processed dataset at /home/luntaixia/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de/cache-bbb1a9e22bb69e22.arrow
Loading cached shuffled indices for dataset at /home/luntaixia/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de/cache-6f45b05462e7bd9c.arrow


In [3]:
import pandas as pd
import nltk
# The punkt tokenizer models are needed by nltk.sent_tokenize below.
nltk.download('punkt')

# Build the (article, summ) frame in one chain:
#   1. pair each article with its original highlights,
#   2. use the first sentence of the highlights as the target summary,
#   3. keep rows with article < 10000 chars and summary < 1000 chars.
df = (
    pd.DataFrame({'article': samples['article'], 'orig_summ': samples['highlights']})
    .assign(
        summ=lambda frame: frame['orig_summ'].apply(
            lambda text: nltk.sent_tokenize(text.strip())[0]
        )
    )
    .drop(columns=['orig_summ'])
    .loc[
        lambda frame: (frame['article'].str.len() < 10000)
        & (frame['summ'].str.len() < 1000)
    ]
    .reset_index(drop=True)
)
df

[nltk_data] Downloading package punkt to /home/luntaixia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,article,summ
0,When she was 15 weeks pregnant with her fifth ...,"Clare Van Santen, 37, has Stage 4 breast cance..."
1,The 150 people killed in the Germanwings air d...,"1,500 people are attending the touching ceremo..."
2,Sri Lanka's Kumar Sangakkara should reconsider...,Kumar Sangakkara retired from one-day cricket ...
3,The future of Andy Murray’s coaching arrangeme...,Jonas Bjorkman joined Andy Murray's camp on an...
4,Two rare paintings used to plan the filming of...,Two rare paintings by art director Jack Martin...
...,...,...
11390,Fernando Torres has hailed manager Diego Simeo...,Atletico Madrid host Real Madrid in Champions ...
11391,There will be a white-hot atmosphere at White ...,Sherwood is a passionate man but he can’t affo...
11392,Family members and friends have been left hear...,Madison Small woke up feeling ill on Monday ni...
11393,Former Redgum frontman John Schumann has slamm...,Footage has emerged of I Was Only 19 being pla...


## Export sample for UI batch scoring

In [4]:
# Two independent random draws of 10 rows each: one file with both columns,
# one with only the `article` column. Row labels are written as the first CSV column.
df.sample(n=10).to_csv("samples_pairs.csv")
df.loc[:, ['article']].sample(n=10).to_csv("sample_articles_for_batch_prediction.csv")

## Generate Fake Requests

In [4]:
import requests

def post_req(url:str, params = None, data: dict = None, timeout: float = None) -> "requests.Response | None":
    """Send a best-effort JSON POST request.

    Args:
        url: Target endpoint.
        params: Optional query-string parameters forwarded to ``requests.post``.
        data: Optional payload, sent as the JSON request body.
        timeout: Optional timeout in seconds forwarded to ``requests.post``.
            ``None`` (the default) waits indefinitely, as before.

    Returns:
        The response object when the status code is exactly 200. ``None`` if
        the request raised or the status code was anything else; a diagnostic
        is printed in both cases and no exception propagates, so callers must
        check for ``None`` before using the result.
    """
    headers = {
        "Content-type": "application/json",
    }
    try:
        r = requests.post(url, params = params, json=data, headers=headers, timeout=timeout)
    except Exception as e:
        print("error happen here:\n", e)
        return None

    if r.status_code == 200:
        return r

    # Include the status code and the start of the body so failures are diagnosable.
    print(f"request code is not 200 (got {r.status_code} from {url}): {r.text[:200]}")
    return None
        
def get_req(url:str, params = None, timeout: float = None) -> "dict | list | None":
    """Send a best-effort GET request and return the decoded JSON body.

    Args:
        url: Target endpoint.
        params: Optional query-string parameters forwarded to ``requests.get``.
        timeout: Optional timeout in seconds forwarded to ``requests.get``.
            ``None`` (the default) waits indefinitely, as before.

    Returns:
        ``r.json()`` (the parsed body, typically a dict or list — not the
        response object) when the status code is exactly 200. ``None`` if the
        request raised or the status code was anything else; a diagnostic is
        printed in both cases and no exception propagates from the request.
    """
    headers = {"Content-type": "application/json"}
    try:
        r = requests.get(url, params = params, headers=headers, timeout=timeout)
    except Exception as e:
        print("error happen here:\n", e)
        return None

    if r.status_code == 200:
        return r.json()

    # Include the status code and the start of the body so failures are diagnosable.
    print(f"request code is not 200 (got {r.status_code} from {url}): {r.text[:200]}")
    return None

#### The local and MLflow backends are free of charge, so use them as much as you want

In [5]:
def mimic_mlflow_batch(samples: dict, send_arize:bool = False):
    """Summarize a batch through the MLflow endpoint and log the result.

    Args:
        samples: Mapping with an ``'article'`` entry (list of article texts)
            and a ``'summ'`` entry (list of reference summaries). The values
            are placed directly into JSON request bodies, so they must be
            JSON-serialisable (e.g. the output of
            ``DataFrame.to_dict(orient='list')``), not pandas objects.
        send_arize: Forwarded unchanged in the logging payload.

    Raises:
        RuntimeError: If the summarization request did not succeed
            (``post_req`` returned ``None``); nothing is logged in that case.
    """
    articles = samples['article']

    # Get summaries from the model served on port 5000.
    r = post_req(
        url = "http://localhost:5000/article/summarize_batch",
        data = dict(articles=articles)
    )
    if r is None:
        # post_req has already printed the cause; fail loudly instead of
        # hitting an AttributeError on `None.json()`.
        raise RuntimeError("Mlflow summarize_batch request failed; nothing was logged")
    summs = r.json()

    # Send articles, predictions and targets to the logging endpoint
    # (original author's note: "log to mysql").
    post_req(
        url = 'http://localhost:9020/log/batch',
        data = dict(
            articles = articles,
            summs = summs,
            targets = samples['summ'],
            model_source = 'Mlflow',
            send_arize = send_arize
        )
    )

In [6]:
def mimic_local_batch(samples: dict, send_arize:bool = False):
    """Summarize a batch through the local endpoint and log the result.

    Args:
        samples: Mapping with an ``'article'`` entry (list of article texts)
            and a ``'summ'`` entry (list of reference summaries). The values
            are placed directly into JSON request bodies, so they must be
            JSON-serialisable (e.g. the output of
            ``DataFrame.to_dict(orient='list')``), not pandas objects.
        send_arize: Forwarded unchanged in the logging payload.

    Raises:
        RuntimeError: If the summarization request did not succeed
            (``post_req`` returned ``None``); nothing is logged in that case.
    """
    articles = samples['article']

    # Get summaries from the model served on port 8000. The body nests the
    # article list under `articles` and sends generation settings under `config`.
    # NOTE(review): `num_beans` looks like a typo for `num_beams` — left as-is
    # because the server may expect this exact key; confirm against the API.
    r = post_req(
        url = "http://localhost:8000/article/summarize_batch",
        data = dict(
            articles=dict(articles=articles),
            config=dict(num_beans=8, temperature=1.0)
        )
    )
    if r is None:
        # post_req has already printed the cause; fail loudly instead of
        # hitting an AttributeError on `None.json()`.
        raise RuntimeError("Local summarize_batch request failed; nothing was logged")
    summs = r.json()

    # Send articles, predictions and targets to the logging endpoint
    # (original author's note: "log to mysql").
    post_req(
        url = 'http://localhost:9020/log/batch',
        data = dict(
            articles = articles,
            summs = summs,
            targets = samples['summ'],
            model_source = 'Local',
            send_arize = send_arize
        )
    )

#### AWS services (Lambda/SageMaker) are billed per use, so call them sparingly

In [7]:
def mimic_lambda_batch(samples: dict, send_arize:bool = False):
    """Summarize a batch through the Lambda endpoint and log the result.

    Args:
        samples: Mapping with an ``'article'`` entry (list of article texts)
            and a ``'summ'`` entry (list of reference summaries). The values
            are placed directly into JSON request bodies, so they must be
            JSON-serialisable (e.g. the output of
            ``DataFrame.to_dict(orient='list')``), not pandas objects.
        send_arize: Forwarded unchanged in the logging payload.

    Raises:
        RuntimeError: If the summarization request did not succeed
            (``post_req`` returned ``None``); nothing is logged in that case.
    """
    articles = samples['article']

    # Get summaries from the Lambda-backed endpoint; generation settings are
    # sent flat, next to the article list.
    # NOTE(review): `num_beans` looks like a typo for `num_beams` — left as-is
    # because the server may expect this exact key; confirm against the API.
    r = post_req(
        url = "xxxxxx", # lambda api gateway endpoint here
        data = dict(articles=articles, num_beans=8, temperature=1.0)
    )
    if r is None:
        # post_req has already printed the cause; fail loudly instead of
        # hitting an AttributeError on `None.json()`.
        raise RuntimeError("Lambda summarize request failed; nothing was logged")
    summs = r.json()

    # Send articles, predictions and targets to the logging endpoint
    # (original author's note: "log to mysql").
    post_req(
        url = 'http://localhost:9020/log/batch',
        data = dict(
            articles = articles,
            summs = summs,
            targets = samples['summ'],
            model_source = 'Lambda',
            send_arize = send_arize
        )
    )

In [None]:
from tqdm.notebook import tqdm
import time
import random

# Draw 500 backend calls up front, weighted so the paid Lambda backend is rare.
loop = tqdm(random.choices(
    population=[mimic_mlflow_batch, mimic_local_batch, mimic_lambda_batch],
    weights = [0.5, 0.49, 0.01],
    # population=[mimic_mlflow_batch, mimic_local_batch],
    # weights = [0.7, 0.3],
    k = 500
))

failures = []  # (backend name, error) for every batch that raised
for func in loop:
    n = random.randint(2, 5)  # batch size: 2 to 5 articles, inclusive
    loop.set_description(f"{func.__name__}[{n}]")

    df_sample = df.sample(n).to_dict(orient = 'list')
    try:
        func(samples=df_sample)
    except Exception as exc:
        # Best-effort: one failed batch must not stop the run. Catching
        # Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
        failures.append((func.__name__, repr(exc)))

print(f"{len(failures)} batch(es) failed")
for name, err in failures[:10]:
    print(f"  {name}: {err}")

#### change time stamp randomly

We assume that, on average, one query arrives every 6 minutes (360 s), i.e. about 10 queries per hour. The gaps between consecutive queries then follow an exponential distribution with scale parameter (mean) 360 seconds.

In [9]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

start_dt = datetime(2023, 6, 9)
num_samples = 4800 # make sure this number is larger than the actual sample size

# Gaps between consecutive queries: exponential with a 360 s scale, rounded to
# whole seconds; the running sum is each query's offset from start_dt.
rng = np.random.default_rng()
intervals = rng.exponential(scale=360, size=num_samples).round(0)
cum_intervals = intervals.cumsum()

# num_samples + 1 timestamps: start_dt itself followed by one per offset.
dts = [start_dt] + [start_dt + timedelta(seconds=offset) for offset in cum_intervals]

In [10]:
# randomly assign the generated dates to the samples
import mysql.connector

import os
from getpass import getpass

# Connection settings come from the environment so no credential is stored in
# the notebook. Host/user/database fall back to the local-development defaults;
# the password has no default and is prompted for when MYSQL_PASSWORD is unset.
mydb = mysql.connector.connect(
  host=os.environ.get("MYSQL_HOST", "localhost"),
  user=os.environ.get("MYSQL_USER", "root"),
  password=os.environ.get("MYSQL_PASSWORD") or getpass("MySQL password: "),
  database=os.environ.get("MYSQL_DATABASE", "summarizer")
)

In [11]:
# Fetch the id of every logged prediction; `r` holds one tuple per row.
with mydb.cursor() as cursor:
    sql = """
    select prediction_id 
    from summarizer.summarize_log
    """
    cursor.execute(sql)
    r = cursor.fetchall()

In [12]:
from tqdm.notebook import tqdm

# Every fetched row needs its own timestamp; check before touching the table
# rather than failing part-way through the updates.
if len(dts) < len(r):
    raise ValueError(
        f"only {len(dts)} timestamps generated for {len(r)} rows; increase num_samples"
    )

# Parameterised statement: the connector handles quoting of both values.
sql = """
    update summarizer.summarize_log
    set prediction_ts = %s
    where prediction_id = %s
"""
with mydb.cursor() as cursor:
    # Rows are paired with timestamps in order; surplus timestamps are unused.
    for id_, ts in zip(tqdm(r), dts):
        cursor.execute(sql, (ts.strftime("%Y-%m-%d %H:%M:%S"), id_[0]))
mydb.commit()

  0%|          | 0/4661 [00:00<?, ?it/s]

## Generate samples for past observations

In [11]:
import pandas as pd

# Load the exported log and parse `prediction_ts` into datetimes.
df = pd.read_parquet("summarize_log.parquet").assign(
    prediction_ts=lambda frame: pd.to_datetime(frame['prediction_ts'])
)
df

Unnamed: 0,prediction_id,prediction_ts,document,summary,reference_summary,model_source,score_blue,score_rouge1,score_rouge2,score_rougeL
0,000c4204,2023-06-09 00:00:00,"They gave us sanitation, wine, aqueducts and r...",Researchers have isolated bacterial DNA of sev...,More than 200 mummified bodies were found in a...,Local,6.39795,0.500000,0.217391,0.375000
1,000f0054,2023-06-09 00:01:40,Sam Burgess has been selected in the back row ...,Sam Burgess has been selected in the back row ...,Bath face Newcastle Falcons in the Aviva Premi...,Local,3.63293,0.222222,0.058824,0.111111
2,001259c7,2023-06-09 00:05:26,Vincent Nogueira scored in stoppage time and P...,Vincent Nogueira scored in stoppage time and P...,Vincent Nogueira nets stoppage-time winner for...,Local,2.95279,0.324324,0.114286,0.324324
3,001b8f5c,2023-06-09 00:21:22,The number of passenger vehicle teen drivers i...,The number of passenger vehicle drivers aged 1...,"Between 2004 and 2013, number of passenger veh...",Local,82.84210,0.909091,0.905660,0.909091
4,0047dd2d,2023-06-09 00:22:52,Carlos Tevez talks about being as free as a bi...,Carlos Tevez is worshipped by Juventus fans an...,Juventus face Monaco in UEFA Champions League ...,Local,5.43933,0.230769,0.083333,0.230769
...,...,...,...,...,...,...,...,...,...,...
4656,ffbc73a1,2023-06-28 03:46:46,It seemed a somewhat unusual honeymoon destina...,Couple enjoyed a relaxing stay at the 18th-cen...,Five-star Ballyfin has was given the prestigio...,Mlflow,3.03729,0.181818,0.000000,0.181818
4657,ffbc995e,2023-06-28 03:52:14,"Eight years ago, after his AC Milan team had b...",Kaka knelt was wearing a vest on which he had ...,Liverpool are taking action against fan Stephe...,Mlflow,1.05597,0.000000,0.000000,0.000000
4658,ffcd2353,2023-06-28 04:17:48,Jack Grealish will decide his international fu...,Jack Grealish will decide his international fu...,Jack Grealish impressed as Aston Villa beat Li...,Mlflow,6.28560,0.160000,0.086957,0.160000
4659,ffd2779b,2023-06-28 04:18:13,It's the $65 million crime thriller that claim...,"Steven Davis, 57, of Milton, and his relatives...",'Black Mass' is a $65 million crime thriller s...,Local,1.87097,0.062500,0.000000,0.062500


In [17]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Anchor: midnight of the day that is two hours from now.
# NOTE(review): the +2h shift looks like a timezone adjustment — confirm.
start_dt = datetime.now() + timedelta(hours=2)
start_dt = start_dt.replace(hour=0, minute=0, second=0, microsecond=0)
num_samples = len(df)  # one timestamp per row

# Gaps between consecutive queries: exponential with a 360 s scale, rounded to
# whole seconds; the running sum is how far each row lies before start_dt.
rng = np.random.default_rng()
intervals = rng.exponential(scale=360, size=num_samples).round(0)
cum_intervals = intervals.cumsum()

# Vectorised, positional assignment: row k gets start_dt minus the k-th offset,
# so rows run backwards in time and the result does not depend on df's index labels.
df['prediction_ts'] = pd.Timestamp(start_dt) - pd.to_timedelta(cum_intervals, unit='s')
df

Unnamed: 0,prediction_id,prediction_ts,document,summary,reference_summary,model_source,score_blue,score_rouge1,score_rouge2,score_rougeL
0,000c4204,2023-06-24 23:31:26,"They gave us sanitation, wine, aqueducts and r...",Researchers have isolated bacterial DNA of sev...,More than 200 mummified bodies were found in a...,Local,6.39795,0.500000,0.217391,0.375000
1,000f0054,2023-06-24 23:29:31,Sam Burgess has been selected in the back row ...,Sam Burgess has been selected in the back row ...,Bath face Newcastle Falcons in the Aviva Premi...,Local,3.63293,0.222222,0.058824,0.111111
2,001259c7,2023-06-24 23:25:37,Vincent Nogueira scored in stoppage time and P...,Vincent Nogueira scored in stoppage time and P...,Vincent Nogueira nets stoppage-time winner for...,Local,2.95279,0.324324,0.114286,0.324324
3,001b8f5c,2023-06-24 23:15:36,The number of passenger vehicle teen drivers i...,The number of passenger vehicle drivers aged 1...,"Between 2004 and 2013, number of passenger veh...",Local,82.84210,0.909091,0.905660,0.909091
4,0047dd2d,2023-06-24 23:14:43,Carlos Tevez talks about being as free as a bi...,Carlos Tevez is worshipped by Juventus fans an...,Juventus face Monaco in UEFA Champions League ...,Local,5.43933,0.230769,0.083333,0.230769
...,...,...,...,...,...,...,...,...,...,...
4656,ffbc73a1,2023-06-06 03:13:12,It seemed a somewhat unusual honeymoon destina...,Couple enjoyed a relaxing stay at the 18th-cen...,Five-star Ballyfin has was given the prestigio...,Mlflow,3.03729,0.181818,0.000000,0.181818
4657,ffbc995e,2023-06-06 03:06:00,"Eight years ago, after his AC Milan team had b...",Kaka knelt was wearing a vest on which he had ...,Liverpool are taking action against fan Stephe...,Mlflow,1.05597,0.000000,0.000000,0.000000
4658,ffcd2353,2023-06-06 02:59:34,Jack Grealish will decide his international fu...,Jack Grealish will decide his international fu...,Jack Grealish impressed as Aston Villa beat Li...,Mlflow,6.28560,0.160000,0.086957,0.160000
4659,ffd2779b,2023-06-06 02:58:22,It's the $65 million crime thriller that claim...,"Steven Davis, 57, of Milton, and his relatives...",'Black Mass' is a $65 million crime thriller s...,Local,1.87097,0.062500,0.000000,0.062500
