In [1]:
import os
import markdown
from bs4 import BeautifulSoup
import re
import wandb
import pandas as pd
import emoji

In [2]:
PROJECT_NAME = "semplify"

In [3]:
run = wandb.init(project=PROJECT_NAME, name="data_processing")

[34m[1mwandb[0m: Currently logged in as: [33mparambharat[0m. Use [1m`wandb login --relogin`[0m to force relogin


### Cleanup Change Log

In [4]:
USRBY_RE = r"by \@\S+ in https\S+$" # by <user> in <pr>
URL_RE = r"in https\S+" # in <pr>
CH_RE = "^full changelog.*" # Full Changelog: ...
DATE_RE = r"\(\w+ \d+, \d+\)" # (August 10, 2022)


def cleanup_changelog(md_text):
    html = markdown.markdown(md_text)
    soup = BeautifulSoup(html, features="html.parser")
    text = re.sub(USRBY_RE, "", soup.text, 0, re.MULTILINE)
    text = re.sub(URL_RE, "", text, 0, re.MULTILINE)
    text = re.sub(CH_RE, "", text, 0, re.MULTILINE | re.IGNORECASE)
    text = re.sub(DATE_RE, "", text, 0, re.MULTILINE | re.IGNORECASE)
    text = text.strip()
    text = map(lambda x: x.strip(), text.splitlines())
    text = "\n".join(text)
    text = emoji.demojize(emoji.emojize(text, language="alias"))
    text = text.replace('&amp;', '&')
    return text

### Cleanup Tweet

In [5]:

def cleanup_tweet(tweet_text):
    tweet_content = tweet_text.splitlines()[1:-1]
    tweet_content = "\n".join(tweet_content).strip()
    tweet_content = emoji.demojize(tweet_content)
    tweet_content = tweet_content.replace(":bug:", ":beetle:")
    tweet_content = tweet_content.replace(":sparkles:", ":dizzy:")
    tweet_content = tweet_content.replace('&amp;', '&')
    return tweet_content


In [6]:
def cleanup_dataset(df):
    # clean up the changelogs
    df["cleaned_logs"] = df["release_notes"].map(cleanup_changelog)
    cleaned_df = df[df["cleaned_logs"].str.split().map(len) >50]
    
    # clean up the tweets
    cleaned_df["cleaned_tweet"] = cleaned_df["tweet"].map(cleanup_tweet)
    
    cleaned_df = cleaned_df.reset_index(drop=True)
    return cleaned_df

In [7]:
raw_artifacts = wandb.use_artifact("raw_dataset:latest")
raw_artifacts = raw_artifacts.wait()

raw_changelog_tweets = raw_artifacts.get("changelog_tweets")
raw_dataset = pd.DataFrame(
    raw_changelog_tweets.data,
    columns=raw_changelog_tweets.columns) 

cleaned_dataset = cleanup_dataset(raw_dataset)


processed_artifacts = wandb.Artifact("processed_dataset", type="dataset")

processed_artifacts.add(wandb.Table(dataframe=cleaned_dataset), name="cleaned_data")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df["cleaned_tweet"] = cleaned_df["tweet"].map(cleanup_tweet)


<ManifestEntry digest: 4lYqcH0IN9ZYzA43wKGsDA==>

In [8]:
## Prompt Creation

In [9]:
## Zero-shot Prompt

In [10]:
def make_zero_shot_prompt(row, add_start=True):
    start_seq = "summarize as a tweet:\n\n"
    log = row["cleaned_logs"].strip()
    prompt = "[changelog]:\n\n" + log + "\n\n[tweet]:\n\n"
    if add_start:
        prompt = start_seq + prompt
    return prompt

In [11]:
cleaned_dataset["prompt"] = cleaned_dataset.apply(
    make_zero_shot_prompt, axis=1)
processed_artifacts.add(
    wandb.Table(dataframe=cleaned_dataset), name="zeroshot_dataset")

<ManifestEntry digest: ep4Wv0zj2uviyCd3rwdQmQ==>

In [12]:
def make_one_shot_prompt(row):
    start_seq = "summarize as a tweet:\n\n"
    log = row["cleaned_logs"].strip()
    tweet = row["cleaned_tweet"]
    prompt = start_seq + "[changelog]:\n\n" + log + "\n\n[tweet]:\n\n" + tweet + "\n\n###\n\n"
    return prompt

def make_one_shot_dataset(dataset):
    records = []
    prompts = []
    for idx in range(1, len(dataset)):
        current_row = dataset.iloc[idx]
        previous_row = dataset.iloc[idx-1]
        first_half = make_one_shot_prompt(previous_row)
        second_half = make_zero_shot_prompt(current_row, add_start=False)
        prompt = first_half + second_half
        prompts.append(prompt)
        records.append(current_row)
    data = pd.DataFrame(records)
    data["prompt"] = prompts
    return data
    

In [13]:
one_shot_dataset = make_one_shot_dataset(cleaned_dataset)

In [14]:
processed_artifacts.add(
    wandb.Table(dataframe=one_shot_dataset), name="oneshot_dataset")
wandb.log_artifact(processed_artifacts)

<wandb.sdk.wandb_artifacts.Artifact at 0x7fdcbb803100>

In [15]:
wandb.finish()

VBox(children=(Label(value='0.184 MB of 0.184 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…