# **LLM Verdict Feature**

In [None]:
# Quietly install the Hugging Face libraries plus bitsandbytes, which the
# BitsAndBytesConfig-based 4-bit model load further down relies on.
!pip install -q transformers accelerate bitsandbytes

In [None]:
import torch
import gc
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from tqdm.auto import tqdm
from kaggle_secrets import UserSecretsClient
from huggingface_hub import login
# Read the Hugging Face access token from Kaggle's secret store and log in with it.
# NOTE(review): presumably needed because the Llama 3 checkpoint loaded later is
# access-gated — confirm. The token itself is never printed.
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("LLAMA_3_TOKEN")
login(token=secret_value_0)
print("Successfully logged in to Hugging Face!")

In [None]:
import pandas as pd
import numpy as np
# Locations of the competition CSV files on Kaggle.
train_data_path = '/kaggle/input/fake-or-real-the-imposter-x-train/train_dataset.csv'
test_data_path = '/kaggle/input/fake-or-real-the-imposter-x-train/test_data.csv'

# Load the test pairs and replace missing texts with empty strings.
df = pd.read_csv(test_data_path)
for column in ('text_1', 'text_2'):
    df[column] = df[column].fillna('')

# Stack every text_1 on top of every text_2 in a single 'text' column.
# The original row labels are kept, so each label occurs twice.
new_df = pd.DataFrame({'text': pd.concat([df['text_1'], df['text_2']])})

In [None]:
# Checkpoint used as the "judge" model.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# Load the weights in 4-bit and run the computation in float16.
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)

print("Loading quantized model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=quantization_config,
)
print("Model loaded successfully.")

In [None]:
def llm_verdict(new_df, batch_size=1):
    """Ask the loaded chat LLM for a 0-100 "REAL" confidence score for each text.

    Uses the notebook-level `tokenizer` and `model`. As a side effect the
    tokenizer gets a pad token (when it has none) and is switched to left
    padding.

    Args:
        new_df: DataFrame with a 'text' column; one prompt is built per row.
        batch_size: Number of prompts generated together. Defaults to 1.

    Returns:
        list[str]: The raw decoded reply for every row, in row order. Replies
        are not parsed or validated; callers must convert them to numbers.
    """
    SYSTEM_PROMPT = """You are a highly discerning forensic editor. Your task is to analyze the following text and determine how likely it is to be 'REAL' (closer to a well-written original source) versus 'FAKE' (a more distorted, lower-quality modification).
    Consider the text's specificity, factual consistency, tone, and complexity.
    """
    verdicts = []
    torch.cuda.empty_cache()
    gc.collect()

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    # A decoder-only model continues from the last position of each row, so
    # padding has to sit on the left or shorter prompts would be continued
    # from pad tokens whenever batch_size > 1.
    tokenizer.padding_side = "left"

    print(f"Starting inference on {len(new_df)} samples with batch size {batch_size}...")
    for start in tqdm(range(0, len(new_df), batch_size)):
        batch_texts = new_df['text'].iloc[start:start + batch_size]
        prompts = [
            tokenizer.apply_chat_template(
                [
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {
                        "role": "user",
                        "content": f"""
                        ---
                        Text:
                        ---
                        {text}
                        ---
                        Based on your expert analysis, provide a confidence score from 0 to 100 on how likely the text is to be REAL, where 100 is completely confident it is REAL and 0 is completely confident it is FAKE. State ONLY the number.
                        """
                    },
                ],
                tokenize=False,
                add_generation_prompt=True,
            )
            for text in batch_texts
        ]

        # The rendered template already contains the special tokens, so they
        # must not be added a second time (that would duplicate BOS).
        # Inputs are moved to the model's own device instead of assuming "cuda".
        inputs = tokenizer(
            prompts,
            return_tensors="pt",
            padding=True,
            truncation=False,
            add_special_tokens=False,
        ).to(model.device)

        # NOTE(review): `temperature` only matters when sampling is enabled in
        # the model's generation config — confirm for the loaded checkpoint.
        output_sequences = model.generate(
            **inputs,
            temperature=0.01,
            max_new_tokens=12,
            pad_token_id=tokenizer.eos_token_id,
        )

        # Decode only the newly generated tokens. Cutting at the prompt length
        # is exact, whereas splitting the full transcript on "assistant\n\n"
        # breaks whenever that marker also appears inside the analysed text.
        prompt_length = inputs["input_ids"].shape[1]
        replies = tokenizer.batch_decode(
            output_sequences[:, prompt_length:],
            skip_special_tokens=True,
        )
        verdicts.extend(replies)

        # Free cached GPU memory between batches to keep peak usage low.
        torch.cuda.empty_cache()
        gc.collect()
    return verdicts

In [None]:
# new_df['llm_judge_verdict'] = llm_verdict(new_df)
# print('LLm Judging Completed...')

In [None]:
# new_df['llm_judge_verdict']

In [None]:
# new_df[['llm_judge_verdict']].to_csv('llm_judge_feature_test_individual_.csv', index=False)
# print("\nLLM Judge feature generation complete. Results saved.")

# **Perplexity Score**

In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
# GPT-2 is the scoring model for the perplexity feature. From here on the
# notebook-level names `model` and `tokenizer` refer to GPT-2.
model_id = 'gpt2'
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
tokenizer = GPT2Tokenizer.from_pretrained(model_id)
model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
def calculate_perplexity(text, model, tokenizer, max_length=512):
    """
    Calculates the perplexity of a given text using a GPT-2 model.
    Lower perplexity means the text is more predictable and natural.

    Args:
        text: Text to score. Anything that is not a non-blank string yields NaN.
        model: Causal language model whose forward pass accepts `labels` and
            returns an object exposing `.loss` (e.g. GPT2LMHeadModel).
        tokenizer: Tokenizer matching `model`.
        max_length: Maximum number of tokens that are scored; longer texts are
            truncated to this length. Defaults to 512.

    Returns:
        float: exp(loss) for the (possibly truncated) text, or NaN when the
        text is missing, blank, or too short to contain a next-token prediction.
    """
    if not isinstance(text, str) or not text.strip():
        return float("nan")

    # Send the inputs to wherever the given model lives rather than relying on
    # a notebook-level `device` variable that may not match this model.
    target_device = next(model.parameters()).device
    inputs = tokenizer(
        text,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    ).to(target_device)
    input_ids = inputs.input_ids

    # With fewer than two tokens there is no next-token prediction to score.
    if input_ids.shape[1] < 2:
        return float("nan")

    with torch.no_grad():
        outputs = model(**inputs, labels=input_ids)

    return torch.exp(outputs.loss).item()


In [None]:
import numpy as np
import pandas as pd
# Locations of the competition CSV files on Kaggle.
train_data_path = '/kaggle/input/fake-or-real-the-imposter-x-train/train_dataset.csv'
test_data_path = '/kaggle/input/fake-or-real-the-imposter-x-train/test_data.csv'

# Load the training pairs and replace missing texts with empty strings.
df = pd.read_csv(train_data_path)
for column in ('text_1', 'text_2'):
    df[column] = df[column].fillna('')

# Stack every text_1 on top of every text_2 in a single 'text' column.
# The original row labels are kept, so each label occurs twice.
new_df = pd.DataFrame({'text': pd.concat([df['text_1'], df['text_2']])})

In [None]:
# Score every stacked training text with GPT-2, keeping row order.
perplexity = [
    calculate_perplexity(text, model, tokenizer)
    for text in new_df['text']
]

In [None]:
# Attach the scores to the texts, then export only the score column.
new_df['perplexity_score'] = perplexity
score_column = new_df.loc[:, ['perplexity_score']]
score_column.to_csv('perplexity_score_features_train.csv', index=False)

In [None]:
import numpy as np
import pandas as pd
# Load the test pairs and replace missing texts with empty strings.
df = pd.read_csv(test_data_path)
for column in ('text_1', 'text_2'):
    df[column] = df[column].fillna('')

# Stack every text_1 on top of every text_2 in a single 'text' column.
new_df = pd.DataFrame({'text': pd.concat([df['text_1'], df['text_2']])})

# Score every stacked test text with GPT-2, keeping row order.
perplexity = [
    calculate_perplexity(text, model, tokenizer)
    for text in new_df['text']
]

In [None]:
# Attach the scores to the texts, then export only the score column.
new_df['perplexity_score'] = perplexity
score_column = new_df.loc[:, ['perplexity_score']]
score_column.to_csv('perplexity_score_features_test.csv', index=False)

# **Sentence-Coherence Feature**

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import sent_tokenize
import numpy as np
# sent_tokenize needs the Punkt sentence-splitting data; download whatever is
# missing. nltk.data.find reports a missing resource by raising LookupError,
# so that is the exception that has to be caught here.
# NOTE(review): newer NLTK releases load 'punkt_tab' instead of 'punkt' —
# confirm against the installed version; checking both covers either case.
for _resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find(f'tokenizers/{_resource}')
    except LookupError:
        nltk.download(_resource)


# Sentence-embedding model used by get_coherence_scores. This rebinds the
# notebook-level name `model`, which earlier cells used for the language models.
model = SentenceTransformer('all-MiniLM-L6-v2')

def _unit_rows(matrix):
    """Scale every row to unit L2 norm; all-zero rows are left as zeros."""
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    norms[norms == 0.0] = 1.0
    return matrix / norms


def get_coherence_scores(text, max_sentences=50):
    """Measure how coherent a text is from its sentence embeddings.

    Sentences are embedded with the notebook-level `model`, which must expose
    an `encode(list_of_sentences)` method returning one vector per sentence.

    Args:
        text: Text to analyse. Non-string or blank input scores (0.0, 0.0).
        max_sentences: Only the first `max_sentences` sentences are used.

    Returns:
        tuple[float, float]: `(flow_coherence, semantic_focus)` where
        flow_coherence is the mean cosine similarity between consecutive
        sentences and semantic_focus is the mean cosine similarity between
        each sentence and the centroid of all sentence embeddings. Both are
        0.0 when fewer than two sentences are available.
    """
    # Missing or blank text has no sentences; treat it like the "< 2" case.
    if not isinstance(text, str) or not text.strip():
        return 0.0, 0.0

    sentences = sent_tokenize(text)[:max_sentences]
    if len(sentences) < 2:
        return 0.0, 0.0

    embeddings = np.asarray(model.encode(sentences), dtype=np.float64)
    unit = _unit_rows(embeddings)

    # Cosine similarity of unit vectors is their dot product, so all
    # neighbouring pairs are scored in one vectorized step.
    flow_coherence = float(np.mean(np.sum(unit[:-1] * unit[1:], axis=1)))

    # The centroid is taken over the raw embeddings and normalized afterwards.
    centroid = embeddings.mean(axis=0)
    centroid_unit = _unit_rows(centroid[np.newaxis, :])[0]
    semantic_focus = float(np.mean(unit @ centroid_unit))

    return flow_coherence, semantic_focus

In [None]:
import pandas as pd
# Locations of the competition CSV files on Kaggle.
train_data_path = '/kaggle/input/fake-or-real-the-imposter-x-train/train_dataset.csv'
test_data_path = '/kaggle/input/fake-or-real-the-imposter-x-train/test_data.csv'

# Load the training pairs and replace missing texts with empty strings.
df = pd.read_csv(train_data_path)
for column in ('text_1', 'text_2'):
    df[column] = df[column].fillna('')

# Stack every text_1 on top of every text_2 in a single 'text' column.
new_df = pd.DataFrame({'text': pd.concat([df['text_1'], df['text_2']])})

# One (flow, focus) pair per stacked text, split into two parallel lists.
coherence_pairs = [get_coherence_scores(text) for text in new_df['text']]
flow_coherence = [pair[0] for pair in coherence_pairs]
semantic_focus = [pair[1] for pair in coherence_pairs]

print(len(flow_coherence),len(semantic_focus))

# Persist both features as NumPy arrays.
np.save('flow_coherence_train.npy',flow_coherence)
np.save('semantic_focus_train.npy',semantic_focus)

In [None]:
import pandas as pd
# Locations of the competition CSV files on Kaggle.
train_data_path = '/kaggle/input/fake-or-real-the-imposter-x-train/train_dataset.csv'
test_data_path = '/kaggle/input/fake-or-real-the-imposter-x-train/test_data.csv'

# Load the test pairs and replace missing texts with empty strings.
df = pd.read_csv(test_data_path)
for column in ('text_1', 'text_2'):
    df[column] = df[column].fillna('')

# Stack every text_1 on top of every text_2 in a single 'text' column.
new_df = pd.DataFrame({'text': pd.concat([df['text_1'], df['text_2']])})

# One (flow, focus) pair per stacked text, split into two parallel lists.
coherence_pairs = [get_coherence_scores(text) for text in new_df['text']]
flow_coherence = [pair[0] for pair in coherence_pairs]
semantic_focus = [pair[1] for pair in coherence_pairs]

print(len(flow_coherence),len(semantic_focus))

# Persist both features as NumPy arrays.
np.save('flow_coherence_test.npy',flow_coherence)
np.save('semantic_focus_test.npy',semantic_focus)