In [1]:
# uncomment the below line to install dependencies
# !pip install -qqq openai wandb transformers markdown beautifulsoup4 emoji PyGithub

% load_ext dotenv
% dotenv

import os

## set your environment keys below to run inference

# os.environ["GH_TOKEN"]="your github personal access token"
# os.environ["OPENAI_API_KEY"]="your openai api key"

In [2]:
import openai
import wandb
import pandas as pd
import numpy as np
from transformers import GPT2TokenizerFast
from github import Github
import re
import markdown
from bs4 import BeautifulSoup
import emoji
from packaging.version import Version
from functools import partial

In [3]:
# Weights & Biases project that this run is logged to.
PROJECT_NAME = "semplify"

# Generation settings, taken from sweep run fewshot_predictions-fine-sweep-29.
default_config = {
    "model_name": "text-davinci-002",
    "temperature": 0.34,
    "max_tokens": 80,
    "top_p": 0.42,
    "frequency_penalty": 0.60,
    "presence_penalty": 0.82,
    "stop_seq": "###",          # passed as the completion stop sequence
    "num_generations": 3,       # completions requested per prompt
    "prompt_length": 3000,      # token budget used when assembling the prompt
    "shuffle_prompts": False,   # whether few-shot examples are shuffled
}

# Start the W&B run and read the (possibly overridden) settings back from it.
wandb.init(project=PROJECT_NAME, config=default_config)
config = wandb.config

# The API key comes from the environment; the GPT-2 tokenizer is used for
# counting tokens when sizing the prompt.
openai.api_key = os.environ.get("OPENAI_API_KEY")
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

[34m[1mwandb[0m: Currently logged in as: [33mparambharat[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
# get the artifacts to create the prompt
def load_prompt_data():
    """Load the few-shot example table from the W&B dataset artifact.

    Uses the ``parambharat/semplify/processed_dataset:v0`` artifact within the
    active W&B run and reads its ``cleaned_data`` table.

    Returns:
        pandas.DataFrame built from the table's rows, with the table's
        column names.
    """
    artifact = wandb.use_artifact(
        "parambharat/semplify/processed_dataset:v0", type='dataset').wait()
    table = artifact.get("cleaned_data")
    return pd.DataFrame(table.data, columns=table.columns)

In [5]:
# Owner and repository whose releases are looked up.
GH_USER = "wandb"
GH_REPO = "wandb"

# Authenticated GitHub client; a missing GH_TOKEN raises KeyError here.
github_client = Github(os.environ["GH_TOKEN"])


def load_from_tag(tag):
    """Fetch the release notes of release ``v<tag>`` from GH_USER/GH_REPO.

    Args:
        tag: Version string without the leading "v" (e.g. "0.13.3"); the
            "v" prefix is added here when looking up the release.

    Returns:
        The release body as a string, or None when the release has no body
        or the body contains only whitespace.

    Raises:
        Any exception raised by the GitHub client (e.g. unknown release or
        bad credentials) propagates unchanged.
    """
    release_data = (github_client
                    .get_user(GH_USER)
                    .get_repo(GH_REPO)
                    .get_release(f"v{tag}"))
    release_notes = release_data.body
    # Guard against a missing body as well as a blank one, so that a release
    # without a description yields None instead of an AttributeError.
    if release_notes and release_notes.strip():
        return release_notes
    return None


# Patterns removed from the plain-text changelog by cleanup_changelog.
# Flags (MULTILINE / IGNORECASE) are supplied at the call site, not here.

# Trailing attribution at the end of a line: "by @<user> in https...".
USRBY_RE = r"by \@\S+ in https\S+$"  # by <user> in <pr>
# Any remaining "in https..." link reference (up to the next whitespace).
URL_RE = r"in https\S+"  # in <pr>
# A line that starts with "full changelog", through the end of that line.
CH_RE = "^full changelog.*"  # Full Changelog: ...
# A parenthesised "<word> <digits>, <digits>" date.
DATE_RE = r"\(\w+ \d+, \d+\)"  # (August 10, 2022)


def cleanup_changelog(md_text):
    """Turn markdown release notes into compact plain text for the prompt.

    The markdown is rendered to HTML and reduced to its text content, then
    author/PR attributions, PR links, the "Full Changelog" line and
    parenthesised dates are removed. Each remaining line is stripped of
    surrounding whitespace and emoji are normalised to their demojized
    (``:name:``) form.

    Args:
        md_text: Release notes as a markdown string.

    Returns:
        The cleaned changelog as a single newline-joined string.
    """
    rendered_html = markdown.markdown(md_text)
    plain = BeautifulSoup(rendered_html, features="html.parser").text

    # (pattern, flags) pairs applied in order; every match is deleted.
    removals = (
        (USRBY_RE, re.MULTILINE),
        (URL_RE, re.MULTILINE),
        (CH_RE, re.MULTILINE | re.IGNORECASE),
        (DATE_RE, re.MULTILINE | re.IGNORECASE),
    )
    for pattern, pattern_flags in removals:
        plain = re.sub(pattern, "", plain, count=0, flags=pattern_flags)

    stripped_lines = [line.strip() for line in plain.strip().splitlines()]
    joined = "\n".join(stripped_lines)

    # Expand alias shortcodes to emoji first so that every emoji ends up in
    # one consistent demojized form.
    normalised = emoji.demojize(emoji.emojize(joined, language="alias"))
    return normalised.replace('&amp;', '&')

In [6]:
def make_history(df, prompt_length=2000, shuffled=False):
    """Build the few-shot history section of the prompt.

    Rows are visited from the last row to the first. A row is included when
    its ``prompt_length`` keeps the running total strictly below the budget;
    rows that do not fit are skipped and later (earlier-positioned) rows are
    still considered. The selected examples are emitted in the frame's
    original row order.

    Args:
        df: Frame with ``cleaned_logs``, ``cleaned_tweet`` and
            ``prompt_length`` columns.
        prompt_length: Budget that the summed ``prompt_length`` values of the
            selected rows must stay below.
        shuffled: When True, the rows are randomly reordered first.

    Returns:
        The selected examples joined with newlines ("" when none fit).
    """
    if shuffled:
        df = df.sample(frac=1, replace=False)

    used = 0
    examples = []
    for _, row in df.iloc[::-1].iterrows():
        fits = row["prompt_length"] + used < prompt_length
        if not fits:
            continue
        used += row["prompt_length"]
        changelog = row["cleaned_logs"].strip()
        tweet = row["cleaned_tweet"].strip()
        examples.append(
            f"[changelog]:\n\n{changelog}\n\n[tweet]:\n\n{tweet}\n\n###\n\n"
        )

    # Rows were collected back to front; restore the original order.
    examples.reverse()
    return "\n".join(examples)

In [7]:
def get_tokenized_len(texts):
    """Return the token count of each text, using the module-level tokenizer.

    Args:
        texts: List of strings to tokenize.

    Returns:
        List of ints, one per input text, in the same order.
    """
    encoded = tokenizer(texts)
    return [len(token_ids) for token_ids in encoded.input_ids]

In [8]:
def make_prompt(new_log, df, config):
    """Assemble the few-shot prompt for a new changelog.

    The token length of ``new_log`` (plus a 10-token allowance) is subtracted
    from ``config.prompt_length``; the remainder is the budget handed to
    ``make_history`` for the example section. The new changelog is then
    appended with an open ``[tweet]:`` section for the model to complete.

    The caller's frame is not modified: the per-row length columns are added
    to a derived copy.

    Args:
        new_log: Cleaned changelog text to generate a tweet for.
        df: Frame of examples with ``cleaned_logs`` and ``cleaned_tweet``
            columns.
        config: Object exposing ``prompt_length`` and ``shuffle_prompts``.

    Returns:
        The full prompt string.
    """
    new_log_length = get_tokenized_len([new_log])[0]
    # The extra 10 presumably covers the template tokens around the log
    # ("[changelog]:", "[tweet]:") — TODO confirm.
    history_length = config.prompt_length - new_log_length - 10

    tweet_lengths = get_tokenized_len(df["cleaned_tweet"].tolist())
    log_lengths = get_tokenized_len(df["cleaned_logs"].tolist())

    # assign() returns a new frame, so the input frame keeps its columns.
    examples = df.assign(tweet_length=tweet_lengths, log_length=log_lengths)
    examples = examples.assign(
        prompt_length=examples["tweet_length"] + examples["log_length"] + 10
    )

    history_prompt = make_history(
        examples, history_length, shuffled=config.shuffle_prompts)
    prompt = (
            history_prompt
            + "[changelog]:\n\n"
            + new_log.strip()
            + "\n\n[tweet]:\n\n"
    )
    return prompt

In [9]:
def run_inference(prompt, config):
    """Request completions for ``prompt`` from the OpenAI completion endpoint.

    Args:
        prompt: Full prompt text.
        config: Object exposing ``model_name``, ``temperature``,
            ``max_tokens``, ``top_p``, ``frequency_penalty``,
            ``presence_penalty``, ``stop_seq`` and ``num_generations``.

    Returns:
        List of generated texts, one per returned choice, each stripped of
        surrounding whitespace.
    """
    request = dict(
        model=config.model_name,
        prompt=prompt,
        temperature=config.temperature,
        max_tokens=config.max_tokens,
        top_p=config.top_p,
        frequency_penalty=config.frequency_penalty,
        presence_penalty=config.presence_penalty,
        stop=config.stop_seq,
        n=config.num_generations,
    )
    response = openai.Completion.create(**request)

    generated_tweets = []
    for choice in response.to_dict()["choices"]:
        generated_tweets.append(choice.to_dict()["text"].strip())
    return generated_tweets

In [10]:
def format_tweet(tweet, semver):
    """Wrap a generated tweet body in the release headline and install hint.

    Args:
        tweet: Generated tweet text; emoji alias shortcodes are converted to
            emoji characters.
        semver: Version string shown after "v" in the headline.

    Returns:
        Headline, tweet body and upgrade command separated by blank lines.
    """
    headline = f"⭐️wandb v{semver} released⭐️"
    body = emoji.emojize(tweet, language="alias")
    install_hint = "`pip install wandb --upgrade`"
    return "\n\n".join([headline, body, install_hint])

In [11]:
def generate_tweets_from_tag(git_tag, config):
    """Generate formatted release tweets for the given version tag.

    Loads the example dataset, checks that ``git_tag`` is newer than the
    version in the dataset's last row, fetches and cleans the release notes,
    builds the prompt, runs inference and formats each generated tweet.

    Args:
        git_tag: Version string without the leading "v" (e.g. "0.13.3").
        config: Run configuration passed on to ``make_prompt`` and
            ``run_inference``.

    Returns:
        List of formatted tweets. The list is empty when the dataset already
        covers ``git_tag`` (a message is printed in that case).

    Raises:
        ValueError: If the release has no release notes to summarise.
    """
    df = load_prompt_data()

    # Assumes the last dataset row holds the most recent version — TODO confirm.
    latest_known = Version(df["semver"].iloc[-1])
    if latest_known >= Version(git_tag):
        # Stop here: there is no new changelog to build a prompt from.
        print("Tweet already exists for version in the dataset")
        return []

    release_notes = load_from_tag(git_tag)
    if not release_notes:
        raise ValueError(f"No release notes found for tag v{git_tag}")
    new_log = cleanup_changelog(release_notes)

    prompt = make_prompt(new_log, df, config)
    generated_tweets = run_inference(prompt, config)

    tweet_formatter = partial(format_tweet, semver=git_tag)
    formatted_tweets = list(map(tweet_formatter, generated_tweets))
    return formatted_tweets

In [12]:
# Enter the Git tag you want to generate a tweet for (e.g. 0.13.3).
# Surrounding whitespace and one optional leading "v" are dropped, because
# load_from_tag and format_tweet both prepend "v" to this value themselves.
GIT_TAG = input("Enter the version of git tag you wish to tweet about: ").strip()
if GIT_TAG[:1] in ("v", "V"):
    GIT_TAG = GIT_TAG[1:]

Enter the version of git tag you wish to tweet about: 0.13.3


In [13]:
# Generate the candidate tweets and print each one under a numbered banner.
tweets = generate_tweets_from_tag(git_tag=GIT_TAG, config=config)
banner = "#" * 40
for number, tweet in enumerate(tweets, start=1):
    print(f"{banner}\tgenerated tweet: {number}\t{banner}")
    print(tweet)
    print()

########################################	generated tweet: 1	########################################
⭐️wandb v0.13.3 released⭐️

7️⃣ new contributors!

💫 Adds raytune examples / tests
💫 Adds Sweeps on Launch Jobs for Kubernetes
💫 Adds parallelism to functional testing
🐞 Fixes broken Launch apikey error message

`pip install wandb --upgrade`

########################################	generated tweet: 2	########################################
⭐️wandb v0.13.3 released⭐️

7️⃣ new contributors!

💫 Adds support for arguments in Launch Jobs
💫 Adds Sweeps on Launch on Kubernetes
💫 Adds parallelism to functional testing
🐞 Fixes broken artifact test for regression

`pip install wandb --upgrade`

########################################	generated tweet: 3	########################################
⭐️wandb v0.13.3 released⭐️

7️⃣ new contributors!

💫 Adds support for arguments in Launch Jobs
💫 Adds Sweeps on Launch on Kubernetes
💫 Adds parallelism to functional testing
🐞 Fixes broken artifact test f