<a href="https://www.kaggle.com/code/nghtctrl/modeling-revision-classification?scriptVersionId=174675650" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Modeling Revision Classification

Daniel Kim, Jason G. Chew, Jiho Kim

# Introduction

---

## Setup

In [None]:
# Install the package that provides the SentenceTransformer / util imports below.
%pip install sentence_transformers

In [None]:
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from sklearn.metrics import confusion_matrix, roc_curve, ConfusionMatrixDisplay
import json
import plotly.express as px
import pandas as pd
import torch

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    torch_device = "cuda"
else:
    torch_device = "cpu"
print(f"Using device: {torch_device}")

# Fix PyTorch's global RNG seed (the trailing ';' hides the cell output).
torch.manual_seed(0);

## Load Data

In [None]:
# Load the sentence-pair table; later cells read its "revision_type",
# "original_sentence" and "revised_sentence" columns.
data = pd.read_csv("/kaggle/input/argrewrite-v-2-corpus-sentence-pairs/sentence_pairs.csv")

In [None]:
# Gold labels of every pair whose revision type is not "neither",
# kept in the row order of `data`.
actual_rev_types = [
    label for label in data["revision_type"] if label != "neither"
]

## Functions

In [None]:
def get_completion_logprobs(prefix, completion):
    """Score `completion` token by token as a continuation of `prefix`.

    Relies on the notebook-level `tokenizer`, `model` and `torch_device`.

    Args:
        prefix: Prompt text placed in front of the completion.
        completion: Text whose tokens are scored.

    Returns:
        A list with one 0-dim tensor per completion token: the
        log-probability the model assigns to that token given every token
        before it in `prefix + completion`.

    Raises:
        ValueError: If no token precedes the completion in the encoded
            phrase, so its first token has nothing to be conditioned on.
    """
    with torch.no_grad():
        # Count only the completion's own tokens. A tokenizer that adds
        # special tokens by default (e.g. a BOS token) would otherwise
        # inflate the count and pull prefix tokens into the scored window.
        completion_len = len(tokenizer.encode(completion, add_special_tokens=False))

        # NOTE(review): this assumes the completion tokenizes the same way on
        # its own as it does at the end of `prefix + completion` - confirm for
        # prompts that do not end in whitespace.
        whole_phrase_ids = tokenizer.encode(prefix + completion, return_tensors="pt").to(torch_device)
        phrase_len = whole_phrase_ids.shape[1]
        if completion_len >= phrase_len:
            raise ValueError(
                "prefix must contribute at least one token of context before the completion"
            )

        whole_phrase_logits = model(whole_phrase_ids).logits
        whole_phrase_logprobs = torch.log_softmax(whole_phrase_logits[0], dim=1)

        completion_logprobs = []
        for position in range(phrase_len - completion_len, phrase_len):
            token_id = whole_phrase_ids[0][position]
            # The distribution produced at position - 1 is the prediction for
            # the token sitting at `position`.
            completion_logprobs.append(whole_phrase_logprobs[position - 1][token_id])

    return completion_logprobs

In [None]:
def plot_roc(actual_rev_types, scores, metric_label):
    """Show an interactive ROC curve with "content" as the positive class.

    Args:
        actual_rev_types: True labels, one per scored item.
        scores: Numeric scores aligned with `actual_rev_types`; they are
            passed to `roc_curve` as the scores for the "content" class.
        metric_label: Name of the score, used in the title and hover text.

    Returns:
        None. The figure is displayed with `fig.show()`.
    """
    fpr, tpr, thresholds = roc_curve(actual_rev_types, scores, pos_label="content")
    # Plot code generated by ChatGPT:
    # https://chat.openai.com/share/2cb2a8d8-7d8e-46bf-b9b3-560db72f3f49
    roc_df = pd.DataFrame({"fpr": fpr, "tpr": tpr, "threshold": thresholds})
    fig = px.line(
        roc_df,
        x="fpr",
        y="tpr",
        title=f"ROC Curve for {metric_label}",
        labels={
            "fpr": "False Positive Rate",
            "tpr": "True Positive Rate",
            "threshold": f"{metric_label} Threshold",
        },
        # Extra hover columns are named with a list rather than a set literal.
        hover_data=["threshold"],
    )

    # Add a diagonal line (random classifier baseline)
    fig.add_scatter(x=[0, 1], y=[0, 1], mode='lines', line=dict(color='gray', dash='dash'), name='Random Classifier')

    # Show the plot
    fig.show()

---

## Load GPT-2

In [None]:
# Checkpoint identifier handed to both from_pretrained calls.
model_name = "openai-community/gpt2"

tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map=torch_device)

# Reuse the EOS token as PAD when the generation config defines no PAD token.
generation_config = model.generation_config
if generation_config.pad_token_id is None:
    generation_config.pad_token_id = generation_config.eos_token_id

# Decode the EOS id; the trailing ';' hides the result in the notebook.
tokenizer.decode([tokenizer.eos_token_id]);

## GPT-2 Baseline Classification

In [None]:
# Declared here as in the original cell; nothing in this cell fills them.
old_sentences = []
new_sentences = []

# One zero-shot prompt per pair whose revision type is not "neither".
labelled_pairs = data[data["revision_type"] != "neither"]
prompts = [
    f"The following revision from: \n{old_sentence}\nto:\n{new_sentence}\n "
    for old_sentence, new_sentence in zip(
        labelled_pairs["original_sentence"], labelled_pairs["revised_sentence"]
    )
]

In [None]:
# Only the first len(prompts) // shortening_factor prompts are scored.
shortening_factor = 1
# A pair is predicted "content" when its log-prob difference exceeds this.
logprob_threshold = 0

gpt2_preds = []
gpt2_logprob_diffs = []

for prompt in prompts[:len(prompts)//shortening_factor]:
    content_logprobs = torch.stack(
        get_completion_logprobs(prefix=prompt, completion="is substantive")
    ).to(torch_device)
    surface_logprobs = torch.stack(
        get_completion_logprobs(prefix=prompt, completion="is superficial")
    ).to(torch_device)

    # Total log-probability of "is substantive" minus that of "is superficial".
    logprob_diff = (torch.sum(content_logprobs) - torch.sum(surface_logprobs)).item()
    gpt2_logprob_diffs.append(logprob_diff)
    gpt2_preds.append("content" if logprob_diff > logprob_threshold else "surface")

In [None]:
# ROC curve of the GPT-2 baseline log-prob differences against the gold labels.
plot_roc(actual_rev_types[:len(prompts)//shortening_factor], gpt2_logprob_diffs, metric_label="Logprob Diff")

In [None]:
# Confusion matrix of the gold labels against the GPT-2 baseline predictions,
# with the classes ordered content, surface.
class_labels = ["content", "surface"]
cm = confusion_matrix(
    actual_rev_types[:len(prompts)//shortening_factor],
    gpt2_preds,
    labels=class_labels,
)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot();

# Prompt Engineering

## Adding More Keyword Description

### Appending Description
- We do not consider appending the description after the revision. Our approach scores the final words of the prompt as the revision type, and a causal language model cannot attend to text that comes after the words being scored, so a description placed afterwards would have no effect on the score.

### Prepending Description
- We prepended the descriptions for the keywords for content-level revision and surface-level revision.

In [None]:
# For every pair not labelled "neither", build two prompts: one opening with a
# definition of substantive revisions, one with a definition of superficial ones.
descriptive_prompts = []

for revision_type, old_sentence, new_sentence in zip(
    data["revision_type"], data["original_sentence"], data["revised_sentence"]
):
    if revision_type == "neither":
        continue
    descriptive_prompts.append(
        {
            "content_stmt": f"Substantive revisions change the meaning significantly, so the following revision from '{old_sentence}' to '{new_sentence}' ",
            "surface_stmt": f"Superficial revisions only change words without affecting the overall meaning, so the following revision from '{old_sentence}' to '{new_sentence}' ",
        }
    )

In [None]:
# Only the first len(descriptive_prompts) // shortening_factor entries are scored.
shortening_factor = 1
# A pair is predicted "content" when its log-prob difference exceeds this.
logprob_threshold = 0

preds = []
logprob_diffs = []

for entry in descriptive_prompts[:len(descriptive_prompts)//shortening_factor]:
    content_prompt = entry["content_stmt"]
    surface_prompt = entry["surface_stmt"]

    # Each completion is scored after the prompt that defines its own class.
    content_logprobs = torch.stack(
        get_completion_logprobs(prefix=content_prompt, completion="is substantive")
    ).to(torch_device)
    surface_logprobs = torch.stack(
        get_completion_logprobs(prefix=surface_prompt, completion="is superficial")
    ).to(torch_device)

    logprob_diff = (torch.sum(content_logprobs) - torch.sum(surface_logprobs)).item()
    logprob_diffs.append(logprob_diff)
    preds.append("content" if logprob_diff > logprob_threshold else "surface")

In [None]:
# ROC curve of the descriptive-prompt log-prob differences against the gold labels.
plot_roc(actual_rev_types[:len(descriptive_prompts)//shortening_factor], logprob_diffs, metric_label="Logprob Diff")

In [None]:
# Confusion matrix of the gold labels against the descriptive-prompt
# predictions, with the classes ordered content, surface.
class_labels = ["content", "surface"]
cm = confusion_matrix(
    actual_rev_types[:len(descriptive_prompts)//shortening_factor],
    preds,
    labels=class_labels,
)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot();

### Analysis
Prepending the description for the keywords for content-level revision and surface-level revision did improve the performance of the model from the baseline, according to the ROC curve. However, the improvement was not significant.

## Few-Shot Prompting
- For in-context learning, we provide relevant examples of revisions in the prompt
- Example sentence pairs are randomly chosen from 2to3 revisions, which are not seen by the model
- We will try both two separate prompts (one per revision type) as well as one composite prompt that contains both surface and content-level revisions

In [None]:
# Few-shot template: four worked examples followed by the pair to classify.
# The template ends with "is " and no line break, so that an appended
# completion ("substantive" / "superficial") continues the same line, exactly
# as in the worked examples above it.
few_shot_prompt = """
The following revision from: Having these types of vehicles is also not worth taking away people’s jobs and putting their do not have the technology to operate at a high level of safety in certain weather conditions.
to: Having these types of vehicles is also not worth putting people's lives at risk, especially for those who live in areas where it snows and rains a lot, because these vehicles do not have the technology to operate at a high level of safety in those weather conditions.
is substantive.

The following revision from: In light of recent events with the death of an Arizona woman at the hands of a self-driving Uber, many are unsure of what stance to take on the matter.
to: In light of recent events with the death of an Arizona woman at the hands of a self-driving Uber, many are conflicted on what stance to take on the matter.
is superficial.

The following revision from: On the other hand, the car companies, your lawyers and some other groups will love this idea to death.
to: On the other hand, the self- driving car companies, your lawyers and Google (they provide GPS) will love this idea to death."
is substantive.

The following revision from: There are many variables to consider when thinking about individuals using self-driving cars: the weather, other traditional cars and their drivers, and the possibility of inappropriate - or developmentally inappropriate person - like children, mistakenly getting behind the wheel.
to: There are many confounding variables to consider when thinking about individuals using self-driving cars: the weather, other traditional cars and their drivers, and the possibility of inappropriate - or developmentally-inappropriate persons - like children, mistakenly climbing behind the wheel.
is superficial.

The following revision from: {old_sentence}
to: {new_sentence}
is """

In [None]:
# Fill the few-shot template once per pair not labelled "neither".
# Note: this rebinds `prompts`, replacing the prompts built earlier.
prompts = [
    few_shot_prompt.format(old_sentence=old_sentence, new_sentence=new_sentence)
    for revision_type, old_sentence, new_sentence in zip(
        data["revision_type"], data["original_sentence"], data["revised_sentence"]
    )
    if revision_type != "neither"
]

In [None]:
# Only the first len(prompts) // shortening_factor prompts are scored.
shortening_factor = 1
# A pair is predicted "content" when its log-prob difference exceeds this.
logprob_threshold = 0

preds = []
logprob_diffs = []

for prompt in prompts[:len(prompts)//shortening_factor]:
    # The template already supplies "is", so only the label word is scored.
    content_logprobs = torch.stack(
        get_completion_logprobs(prefix=prompt, completion="substantive")
    ).to(torch_device)
    surface_logprobs = torch.stack(
        get_completion_logprobs(prefix=prompt, completion="superficial")
    ).to(torch_device)

    logprob_diff = (torch.sum(content_logprobs) - torch.sum(surface_logprobs)).item()
    logprob_diffs.append(logprob_diff)
    preds.append("content" if logprob_diff > logprob_threshold else "surface")

In [None]:
# ROC curve of the few-shot log-prob differences against the gold labels.
plot_roc(actual_rev_types[:len(prompts)//shortening_factor], logprob_diffs, metric_label="Logprob Diff")

In [None]:
# Confusion matrix of the gold labels against the few-shot predictions,
# with the classes ordered content, surface.
class_labels = ["content", "surface"]
cm = confusion_matrix(
    actual_rev_types[:len(prompts)//shortening_factor],
    preds,
    labels=class_labels,
)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot();

### Analysis
Learning from the in-context examples seems to be effective in improving the model's performance. The model was able to correctly classify the revision types with a higher accuracy when it was given in-context examples in the prompt (no additional training is involved). The composite prompt seems to be more effective in improving the model's performance than the two separate prompts, since it gives the model the chance to learn the difference between the two revision types.

---

# Comparison Between GPT-2 and Gemma

## Recall: GPT-2 Baseline Classification

In [None]:
# Size the label slice from the stored GPT-2 scores themselves: `prompts` and
# `shortening_factor` have been reassigned by later experiments and need not
# match the settings the baseline was run with.
plot_roc(actual_rev_types[:len(gpt2_logprob_diffs)], gpt2_logprob_diffs, metric_label="Logprob Diff")

In [None]:
# Size the label slice from the stored GPT-2 predictions themselves: `prompts`
# and `shortening_factor` have been reassigned by later experiments and need
# not match the settings the baseline was run with.
cm = confusion_matrix(actual_rev_types[:len(gpt2_preds)], gpt2_preds, labels=["content", "surface"])
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm, display_labels=["content", "surface"]
)
disp.plot();

## Gemma Baseline Classification

In [None]:
# Install/upgrade bitsandbytes and install accelerate before the next cell,
# which loads the model with a BitsAndBytesConfig (8-bit) and a device_map.
%pip install -U bitsandbytes
%pip install accelerate

In [None]:
# Checkpoint path handed to both from_pretrained calls.
model_name = "/kaggle/input/gemma/transformers/2b/2"

# Request 8-bit loading through bitsandbytes.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

# These rebind the global `tokenizer` and `model` used by get_completion_logprobs.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map=torch_device,
)

# Decode the EOS id; the trailing ';' hides the result in the notebook.
tokenizer.decode([tokenizer.eos_token_id]);

In [None]:
# Only the first len(gemma_prompts) // shortening_factor prompts are scored.
shortening_factor = 1
# A pair is predicted "content" when its log-prob difference exceeds this.
logprob_threshold = 0

# Rebuild the zero-shot baseline prompts for this run. By this point the
# global `prompts` holds the few-shot prompts (which already end in "is"), so
# reusing it here would score "is substantive" / "is superficial" against the
# wrong prompt format instead of the baseline one this section compares.
gemma_prompts = [
    f"The following revision from: \n{old_sentence}\nto:\n{new_sentence}\n "
    for revision_type, old_sentence, new_sentence in zip(
        data["revision_type"], data["original_sentence"], data["revised_sentence"]
    )
    if revision_type != "neither"
]

preds = []
logprob_diffs = []

for prompt in gemma_prompts[:len(gemma_prompts)//shortening_factor]:
    content_logprobs = torch.stack(
        get_completion_logprobs(prefix=prompt, completion="is substantive")
    ).to(torch_device)
    surface_logprobs = torch.stack(
        get_completion_logprobs(prefix=prompt, completion="is superficial")
    ).to(torch_device)

    # Total log-probability of "is substantive" minus that of "is superficial".
    logprob_diff = (torch.sum(content_logprobs) - torch.sum(surface_logprobs)).item()
    logprob_diffs.append(logprob_diff)

    if logprob_diff > logprob_threshold:
        preds.append("content")
    else:
        preds.append("surface")

In [None]:
# Read the stored Gemma results; later cells use its "logprob_diffs" and
# "predictions" entries.
gemma_data_path = "/kaggle/input/gemma-data/gemma_data.json"
with open(gemma_data_path) as gemma_file:
    gemma_data = json.load(gemma_file)

In [None]:
# ROC curve of the stored Gemma log-prob differences (loaded from JSON) against the gold labels.
plot_roc(actual_rev_types[:len(prompts)], gemma_data["logprob_diffs"], metric_label="Logprob Diff")

In [None]:
# Confusion matrix of the gold labels against the stored Gemma predictions,
# with the classes ordered content, surface.
class_labels = ["content", "surface"]
cm = confusion_matrix(
    actual_rev_types[:len(prompts)],
    gemma_data["predictions"],
    labels=class_labels,
)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot();

# SBERT

In [None]:
# Parallel lists for the SBERT experiment: original text, revised text and
# gold label of every pair not labelled "neither". Sentences are passed
# through str() before being stored.
old_sentences, new_sentences, actual_rev_types = [], [], []

for revision_type, original, revised in zip(
    data["revision_type"], data["original_sentence"], data["revised_sentence"]
):
    if revision_type == "neither":
        continue
    old_sentences.append(str(original))
    new_sentences.append(str(revised))
    actual_rev_types.append(revision_type)

In [None]:
# Load the all-mpnet-base-v2 sentence-embedding model onto the selected device.
# This rebinds the global `model` that earlier cells used for the language models.
model = SentenceTransformer("all-mpnet-base-v2").to(torch_device)

In [None]:
# Only the first len(old_sentences) // shortening_factor pairs are scored.
shortening_factor = 1
# A pair is predicted "content" when its negated cosine similarity exceeds this.
diff_threshold = -0.661

preds = []
diff_scores = []

num_pairs = len(old_sentences)//shortening_factor
for old_sentence, new_sentence in zip(old_sentences[:num_pairs], new_sentences[:num_pairs]):

    # Cosine-similarity code adapted from: https://www.sbert.net/docs/usage/semantic_textual_similarity.html
    with torch.no_grad():
        # Embed both versions of the sentence
        original_embed = model.encode(old_sentence, convert_to_tensor=True, show_progress_bar=False).to(torch_device)
        revision_embed = model.encode(new_sentence, convert_to_tensor=True, show_progress_bar=False).to(torch_device)

        # Cosine similarity between the two embeddings
        cos_similarity = util.cos_sim(original_embed, revision_embed)

    # Negate so that a larger score means the two sentences are less similar.
    diff_score = -cos_similarity[0].item()
    diff_scores.append(diff_score)
    preds.append("content" if diff_score > diff_threshold else "surface")

In [None]:
# ROC curve of the SBERT semantic-difference scores against the gold labels.
plot_roc(actual_rev_types[:len(old_sentences)//shortening_factor], diff_scores, metric_label="Semantic Diff")

In [None]:
# Confusion matrix of the gold labels against the SBERT threshold predictions,
# with the classes ordered content, surface.
class_labels = ["content", "surface"]
cm = confusion_matrix(
    actual_rev_types[:len(old_sentences)//shortening_factor],
    preds,
    labels=class_labels,
)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot();

# Results

# Conclusion

# Appendix