In [1]:
import math
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

In [2]:
def calculate_sentence_log_prob(model, tokenizer, sentence, device):
    """Score a single sentence with a causal language model.

    The sentence is tokenized and passed to ``model`` with its own token ids
    as labels. The returned loss is treated as the mean negative
    log-likelihood over the ``num_tokens - 1`` predicted positions (the first
    token has no preceding context to be predicted from), and is converted
    into a total log-probability and a perplexity.

    Args:
        model: Callable invoked as ``model(**inputs, labels=input_ids)`` whose
            result exposes a scalar ``loss`` tensor.
        tokenizer: Callable invoked as
            ``tokenizer(sentence, return_tensors="pt")``; its result must
            support ``.to(device)``, ``["input_ids"]`` and ``**`` unpacking.
        sentence: The text to score (a single sentence, i.e. batch size 1).
        device: Device the tokenized inputs are moved to.

    Returns:
        A tuple ``(sentence_log_prob, perplexity)`` where
        ``sentence_log_prob = -loss * (num_tokens - 1)`` and
        ``perplexity = exp(loss)``.

    Raises:
        ValueError: If the sentence tokenizes to fewer than two tokens. In
            that case no position is scored, the mean loss is undefined, and
            the result would otherwise be a silent NaN.
    """
    inputs = tokenizer(sentence, return_tensors="pt").to(device)
    num_tokens = inputs["input_ids"].shape[1]

    # Guard before running the model: with fewer than two tokens there is
    # nothing to predict, so any score would be meaningless (NaN).
    if num_tokens < 2:
        raise ValueError(
            f"Cannot score {sentence!r}: it tokenizes to {num_tokens} token(s), "
            "but at least 2 are required to compute a log-probability."
        )

    with torch.no_grad():
        outputs = model(**inputs, labels=inputs["input_ids"])

    loss = outputs.loss.item()  # -mean(log_prob) over the predicted tokens

    # Undo the mean to recover the summed log-probability of the sentence.
    sentence_log_prob = -loss * (num_tokens - 1)
    perplexity = math.exp(loss)

    return sentence_log_prob, perplexity

In [4]:
# Checkpoint to evaluate, and the device that tokenized inputs are moved to.
model_id = "Qwen/Qwen2.5-7B"
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

print(f"Loading {model_id}...")

# Tokenizer and half-precision model weights come from the same checkpoint;
# device_map="auto" leaves weight placement to the library.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map="auto",
)

Loading Qwen/Qwen2.5-7B...


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]



### Testing on Animate Subject Passive


In [5]:
data1 = pd.read_csv("data/animate_subject_passive.csv")

# Score both sentences of every minimal pair. Results are gathered in plain
# lists and attached as whole columns afterwards, so the frame is never
# modified while it is being iterated.
log_probs_good, log_probs_bad, ppls_good, ppls_bad = [], [], [], []
for good, bad in zip(data1["rows__row__sentence_good"], data1["rows__row__sentence_bad"]):
    log_prob_good, ppl_good = calculate_sentence_log_prob(model, tokenizer, good, device)
    log_prob_bad, ppl_bad = calculate_sentence_log_prob(model, tokenizer, bad, device)
    log_probs_good.append(log_prob_good)
    log_probs_bad.append(log_prob_bad)
    ppls_good.append(ppl_good)
    ppls_bad.append(ppl_bad)

# Columns are added in the same order (and with the same float dtype) as the
# previously exported results.
data1["log(P(S1))"] = pd.Series(log_probs_good, index=data1.index, dtype="float64")
data1["log(P(S2))"] = pd.Series(log_probs_bad, index=data1.index, dtype="float64")
# 1.0 when the model assigns the good sentence a higher log-probability, else 0.0.
data1["log(P(S1)) > log(P(S2))"] = (data1["log(P(S1))"] > data1["log(P(S2))"]).astype("float64")
# Absolute log-probability gap between the two sentences.
data1["Confidence"] = (data1["log(P(S1))"] - data1["log(P(S2))"]).abs()
data1["Ppl(S1)"] = pd.Series(ppls_good, index=data1.index, dtype="float64")
data1["Ppl(S2)"] = pd.Series(ppls_bad, index=data1.index, dtype="float64")

In [None]:
# Display the scored pairs: per-sentence log-probabilities, the preference flag, the confidence gap and perplexities.
data1

Unnamed: 0,rows__row__sentence_good,rows__row__sentence_bad,log(P(S1)),log(P(S2)),log(P(S1)) > log(P(S2)),Confidence,Ppl(S1),Ppl(S2)
0,Amanda was respected by some waitresses.,Amanda was respected by some picture.,-48.375786,-49.595264,1.0,1.219478,422.831278,1193.968257
1,Some lake was passed by some cashiers.,Some lake was passed by some phenomena.,-50.271557,-48.479364,0.0,1.792193,535.898409,1018.028728
2,Lisa was kissed by the boys.,Lisa was kissed by the blouses.,-33.266919,-43.129505,1.0,9.862586,255.823185,474.071370
3,Amanda isn't respected by the children.,Amanda isn't respected by the cups.,-43.473186,-54.326721,1.0,10.853535,229.097645,889.660695
4,The glove was noticed by some woman.,The glove was noticed by some mouse.,-42.396571,-48.205002,1.0,5.808431,426.944071,978.899355
...,...,...,...,...,...,...,...,...
95,All public parks were explored by some teenager.,All public parks were explored by some screen.,-46.516441,-52.905529,1.0,6.389088,335.142283,744.856161
96,This cake is seen by the pedestrians.,This cake is seen by the parentheses.,-41.153308,-42.421058,1.0,1.267749,357.467360,428.440169
97,Every person wasn't cared for by the offspring.,Every person wasn't cared for by the fungus.,-48.228079,-46.305618,0.0,1.922462,212.443374,171.583263
98,Edward wasn't insulted by some alumni.,Edward wasn't insulted by some oxen.,-47.677170,-52.561508,1.0,4.884338,387.473022,343.836936


In [7]:
# to_csv does not create missing directories, so make sure the output folder
# exists before writing; otherwise the save fails after the expensive scoring.
Path("outputs").mkdir(parents=True, exist_ok=True)
data1.to_csv("outputs/animate_subject_passive_output.csv", index=False)

### Testing on Animate Subject Trans

In [8]:
data2 = pd.read_csv("data/animate_subject_trans.csv")

# Score both sentences of every minimal pair. Results are gathered in plain
# lists and attached as whole columns afterwards, so the frame is never
# modified while it is being iterated.
log_probs_good, log_probs_bad, ppls_good, ppls_bad = [], [], [], []
for good, bad in zip(data2["rows__row__sentence_good"], data2["rows__row__sentence_bad"]):
    log_prob_good, ppl_good = calculate_sentence_log_prob(model, tokenizer, good, device)
    log_prob_bad, ppl_bad = calculate_sentence_log_prob(model, tokenizer, bad, device)
    log_probs_good.append(log_prob_good)
    log_probs_bad.append(log_prob_bad)
    ppls_good.append(ppl_good)
    ppls_bad.append(ppl_bad)

# Columns are added in the same order (and with the same float dtype) as the
# previously exported results.
data2["log(P(S1))"] = pd.Series(log_probs_good, index=data2.index, dtype="float64")
data2["log(P(S2))"] = pd.Series(log_probs_bad, index=data2.index, dtype="float64")
# 1.0 when the model assigns the good sentence a higher log-probability, else 0.0.
data2["log(P(S1)) > log(P(S2))"] = (data2["log(P(S1))"] > data2["log(P(S2))"]).astype("float64")
# Absolute log-probability gap between the two sentences.
data2["Confidence"] = (data2["log(P(S1))"] - data2["log(P(S2))"]).abs()
data2["Ppl(S1)"] = pd.Series(ppls_good, index=data2.index, dtype="float64")
data2["Ppl(S2)"] = pd.Series(ppls_bad, index=data2.index, dtype="float64")

In [9]:
# Display the scored pairs: per-sentence log-probabilities, the preference flag, the confidence gap and perplexities.
data2

Unnamed: 0,rows__row__sentence_good,rows__row__sentence_bad,log(P(S1)),log(P(S2)),log(P(S1)) > log(P(S2)),Confidence,Ppl(S1),Ppl(S2)
0,Tina revealed Margaret.,The horse revealed Margaret.,-37.472755,-42.783928,1.0,5.311172,11709.887601,44177.989962
1,Danielle visited Irene.,The eye visited Irene.,-29.353811,-37.865490,1.0,8.511679,1538.330192,12917.942171
2,Paul runs around the art galleries.,The river runs around the art galleries.,-35.600347,-39.867378,1.0,4.267031,377.432254,297.477824
3,Most banks have praised Raymond.,The jackets have praised Raymond.,-40.006485,-49.231129,1.0,9.224644,2984.826789,18886.935876
4,Every doctor was selling some restaurants.,A cup was selling some restaurants.,-45.542121,-48.436804,1.0,2.894683,1979.012949,3206.068299
...,...,...,...,...,...,...,...,...
95,Lawrence fled from Brad.,Literature fled from Brad.,-31.858149,-38.442085,1.0,6.583936,585.010453,2182.916177
96,These dancers should investigate Caroline.,Glaciers should investigate Caroline.,-44.382057,-48.820421,1.0,4.438364,7161.046711,3417.747098
97,Mitchell can't complain about Peter.,This window can't complain about Peter.,-36.565186,-44.745667,1.0,8.180481,185.600776,597.191684
98,Every doctor wasn't describing a river.,This story wasn't describing a river.,-48.002614,-38.636765,0.0,9.365849,951.002117,249.519714


In [10]:
# to_csv does not create missing directories, so make sure the output folder
# exists before writing; otherwise the save fails after the expensive scoring.
Path("outputs").mkdir(parents=True, exist_ok=True)
data2.to_csv("outputs/animate_subject_trans_output.csv", index=False)

### Testing on Self Curated Data

In [11]:
data3 = pd.read_csv("data/Self_curated_data.csv")

# Score both sentences of every minimal pair. Results are gathered in plain
# lists and attached as whole columns afterwards, so the frame is never
# modified while it is being iterated.
log_probs_good, log_probs_bad, ppls_good, ppls_bad = [], [], [], []
for good, bad in zip(data3["rows__row__sentence_good"], data3["rows__row__sentence_bad"]):
    log_prob_good, ppl_good = calculate_sentence_log_prob(model, tokenizer, good, device)
    log_prob_bad, ppl_bad = calculate_sentence_log_prob(model, tokenizer, bad, device)
    log_probs_good.append(log_prob_good)
    log_probs_bad.append(log_prob_bad)
    ppls_good.append(ppl_good)
    ppls_bad.append(ppl_bad)

# Columns are added in the same order (and with the same float dtype) as the
# previously exported results.
data3["log(P(S1))"] = pd.Series(log_probs_good, index=data3.index, dtype="float64")
data3["log(P(S2))"] = pd.Series(log_probs_bad, index=data3.index, dtype="float64")
# 1.0 when the model assigns the good sentence a higher log-probability, else 0.0.
data3["log(P(S1)) > log(P(S2))"] = (data3["log(P(S1))"] > data3["log(P(S2))"]).astype("float64")
# Absolute log-probability gap between the two sentences.
data3["Confidence"] = (data3["log(P(S1))"] - data3["log(P(S2))"]).abs()
data3["Ppl(S1)"] = pd.Series(ppls_good, index=data3.index, dtype="float64")
data3["Ppl(S2)"] = pd.Series(ppls_bad, index=data3.index, dtype="float64")

In [None]:
# Display the scored pairs: per-sentence log-probabilities, the preference flag, the confidence gap and perplexities.
data3

Unnamed: 0,rows__row__sentence_good,rows__row__sentence_bad,log(P(S1)),log(P(S2)),log(P(S1)) > log(P(S2)),Confidence,Ppl(S1),Ppl(S2)
0,I eat poha with tea,I eat plate with tea,-39.645988,-34.018814,0.0,5.627173,740.751036,4937.940013
1,I ordered a cycle,I ordered a wall,-24.988827,-19.718,0.0,5.270827,516.567872,715.274686
2,I gave him water,I gave him air,-19.915164,-21.711063,1.0,1.795898,763.862773,1389.947102
3,I eat a tablet,I eat an ipad,-30.308693,-34.257122,1.0,3.948429,1953.100196,5241.067668
4,I and he drank coffee together,I and he drank coffee mug together,-39.351949,-48.937311,1.0,9.585362,705.3246,3484.983291
5,He has 10 lakh ruppes in his bank,He has 10 lakh cars in his bank,-46.007569,-51.244316,1.0,5.236747,46.245498,168.07858
6,He has calculator in his phone,He has car in his phone,-37.958808,-38.596733,1.0,0.637925,559.17808,621.905933


In [17]:
# to_csv does not create missing directories, so make sure the output folder
# exists before writing; otherwise the save fails after the expensive scoring.
Path("outputs").mkdir(parents=True, exist_ok=True)
data3.to_csv("outputs/Self_curated_data_output.csv", index=False)

### Testing on Reference Tracking Data

In [14]:
data4 = pd.read_csv("data/Reference_tracking_data.csv")

# Score both sentences of every minimal pair. Results are gathered in plain
# lists and attached as whole columns afterwards, so the frame is never
# modified while it is being iterated.
log_probs_good, log_probs_bad, ppls_good, ppls_bad = [], [], [], []
for good, bad in zip(data4["rows__row__sentence_good"], data4["rows__row__sentence_bad"]):
    log_prob_good, ppl_good = calculate_sentence_log_prob(model, tokenizer, good, device)
    log_prob_bad, ppl_bad = calculate_sentence_log_prob(model, tokenizer, bad, device)
    log_probs_good.append(log_prob_good)
    log_probs_bad.append(log_prob_bad)
    ppls_good.append(ppl_good)
    ppls_bad.append(ppl_bad)

# Columns are added in the same order (and with the same float dtype) as the
# previously exported results.
data4["log(P(S1))"] = pd.Series(log_probs_good, index=data4.index, dtype="float64")
data4["log(P(S2))"] = pd.Series(log_probs_bad, index=data4.index, dtype="float64")
# 1.0 when the model assigns the good sentence a higher log-probability, else 0.0.
data4["log(P(S1)) > log(P(S2))"] = (data4["log(P(S1))"] > data4["log(P(S2))"]).astype("float64")
# Absolute log-probability gap between the two sentences.
data4["Confidence"] = (data4["log(P(S1))"] - data4["log(P(S2))"]).abs()
data4["Ppl(S1)"] = pd.Series(ppls_good, index=data4.index, dtype="float64")
data4["Ppl(S2)"] = pd.Series(ppls_bad, index=data4.index, dtype="float64")

In [15]:
# Display the scored pairs: per-sentence log-probabilities, the preference flag, the confidence gap and perplexities.
data4

Unnamed: 0,rows__row__sentence_good,rows__row__sentence_bad,log(P(S1)),log(P(S2)),log(P(S1)) > log(P(S2)),Confidence,Ppl(S1),Ppl(S2)
0,Riya gave Anu a gift because she had a birthday.,Riya gave Anu a gift because she had a basket.,-42.745728,-55.667061,1.0,12.921332,35.238669,103.43276
1,Arjun called Kabir because he needed help with...,Arjun called Kabir because the project needed ...,-42.67527,-59.629595,1.0,16.954325,35.032371,143.902056
2,Meera hugged Sanya because she was crying.,Meera hugged Sanya because the hallway was cry...,-43.265847,-65.775862,1.0,22.510015,122.402685,718.80217
3,The dog barked at the stranger because he ente...,The dog barked at the stranger because the gat...,-41.35724,-56.148956,1.0,14.791716,31.388345,75.124779
4,Amit cleaned the table because it had food sta...,Amit cleaned the table because the food stains...,-48.670402,-67.40538,1.0,18.734979,129.935758,275.094595
5,Tina put the book in the bag because it was to...,Tina put the book in the bag because the bag w...,-39.566331,-46.236568,1.0,6.670238,13.981785,17.989451
6,Rohan thanked Neeraj because he fixed the comp...,Rohan thanked Neeraj because the computer fixe...,-41.333451,-55.868119,1.0,14.534667,42.844894,160.600496
7,Priya covered the pot because it was boiling o...,Priya covered the pot because the boiling wate...,-39.455934,-55.551338,1.0,16.095405,51.70701,102.440094
8,The car hit the pole because it slipped on the...,The car hit the pole because the pole slipped ...,-41.626479,-51.660686,1.0,10.034206,32.100553,53.191515
9,Anita washed the spoon because it was dirty.,Anita washed the spoon because she was dirty.,-44.449087,-51.854722,1.0,7.405635,139.600875,317.867866


In [18]:
# to_csv does not create missing directories, so make sure the output folder
# exists before writing; otherwise the save fails after the expensive scoring.
Path("outputs").mkdir(parents=True, exist_ok=True)
# File name follows the "<dataset>_output.csv" pattern of the other result
# files (the previous name carried a doubled ".csv.csv" extension).
data4.to_csv("outputs/Reference_tracking_data_output.csv", index=False)