<a href="https://colab.research.google.com/github/mysertkaya/AI-Generated-Text-Detection/blob/main/Other_Method.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Probability Model

In [None]:
%%capture
# Environment setup cell: %%capture (which must remain the first line of the
# cell) hides the installers' output.
# Install Unsloth with its "colab-new" extra straight from the GitHub repository.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# Choose the xformers requirement from the installed torch version: pin 0.0.27
# when torch is older than 2.4.0, otherwise leave it unpinned.
from torch import __version__; from packaging.version import Version as V
xformers = "xformers==0.0.27" if V(__version__) < V("2.4.0") else "xformers"
# --no-deps installs only the packages named here, not their dependencies.
!pip install --no-deps {xformers} trl peft accelerate bitsandbytes triton

In [None]:
%%capture
# Model-loading cell: defines the notebook-wide `model` and `tokenizer`.
from unsloth import FastLanguageModel
import torch
# Settings forwarded unchanged to FastLanguageModel.from_pretrained below.
max_seq_length = 2048
# NOTE(review): None presumably lets the loader choose the dtype itself —
# confirm against the Unsloth documentation.
dtype = None
load_in_4bit = True

# Load the checkpoint named below together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Scoring Algorithm

In [None]:
import torch
import torch.nn.functional as F

def _rank_in_descending_order(values, index):
    """Return the 1-based position of *index* once *values* is sorted descending."""
    _, order = torch.sort(values, descending=True)
    return (order == index).nonzero(as_tuple=True)[0].item() + 1


def _rank_weighted(rank, value):
    """Return *value* scaled by 1/rank, substituting 0.1 for an exact zero."""
    if value == 0:
        # Keep a small non-zero contribution for tokens whose score is
        # exactly zero so they do not vanish from the aggregate.
        value = 0.1
    return (1 / rank) * value


def _ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when no token was scored."""
    return numerator / denominator if denominator else 0.0


def calculate_rank_and_probability(model, tokenizer, text):
    """Score each token of *text* by how well *model* predicted it.

    The text is tokenized and run through one forward pass. For every token
    after the first, the logits of the preceding position are used to compute
    the token's softmax probability, its raw logit, and its 1-based rank
    among all vocabulary entries. Per-token values and the aggregate
    statistics are printed to stdout.

    Every token position contributes to the statistics, including repeated
    occurrences of the same token text.

    Args:
        model: Callable taking the input-id tensor and returning an object
            whose ``logits`` attribute is indexed as
            ``[batch][position][vocabulary id]``; must expose ``device``.
        tokenizer: Callable as ``tokenizer(text, return_tensors="pt")`` whose
            result has ``.to(device)`` and an ``'input_ids'`` entry; must
            provide ``decode(list_of_ids)``.
        text: The passage to score.

    Returns:
        A tuple ``(rank_counts, token_probabilities, final_results)``:

        * ``rank_counts`` - dict with ``'total_count'`` (scored positions)
          and ``'count_below_10'`` / ``'_20'`` / ``'_50'`` / ``'_20000'``
          (positions whose rank is strictly below that threshold).
        * ``token_probabilities`` - sum over positions of
          ``probability / rank`` (0.1 replaces a probability of exactly 0).
        * ``final_results`` - dict mapping decoded token text to
          ``(rank, probability)``; when a token text occurs several times
          the last occurrence is kept.
    """
    print("Text length:", len(text.split()))
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    input_ids = inputs['input_ids']

    # One forward pass yields next-token logits for every position. Sampling
    # options (temperature, top_k, ...) belong to generate(), not to forward.
    with torch.no_grad():
        outputs = model(input_ids)

    all_logits = outputs.logits[0]  # logits of the single sequence in the batch
    token_ids = input_ids[0].tolist()

    # One (token_text, rank, value) record per scored position. Lists are
    # used rather than dicts keyed by token text so that repeated tokens are
    # all counted.
    scored = []
    scored_raw = []
    for position, target_id in enumerate(token_ids[1:]):
        # Logits at `position` are the prediction for the token that follows.
        logits = all_logits[position]
        probabilities = F.softmax(logits, dim=0)

        target_probability = probabilities[target_id].item()
        target_logit = logits[target_id].item()
        print("p:", target_probability)
        print("pr:", target_logit)

        rank = _rank_in_descending_order(probabilities, target_id)
        rank_raw = _rank_in_descending_order(logits, target_id)

        token_text = tokenizer.decode([target_id])
        scored.append((token_text, rank, target_probability))
        scored_raw.append((token_text, rank_raw, target_logit))

    ranks = [rank for _, rank, _ in scored]
    total_count = len(ranks)
    rank_counts = {
        'total_count': total_count,
        'count_below_10': sum(rank < 10 for rank in ranks),
        'count_below_20': sum(rank < 20 for rank in ranks),
        'count_below_50': sum(rank < 50 for rank in ranks),
        'count_below_20000': sum(rank < 20000 for rank in ranks)
    }

    token_probabilities = sum(
        _rank_weighted(rank, probability) for _, rank, probability in scored
    )
    token_probabilities_raw = sum(logit for _, _, logit in scored_raw)
    token_probabilities_raw_combined = sum(
        _rank_weighted(rank, logit) for _, rank, logit in scored_raw
    )

    print(rank_counts)
    print("Softmax:")
    print("Tp/tc:", _ratio(token_probabilities, total_count))
    print("10/tc:", _ratio(rank_counts['count_below_10'], total_count))
    print("20/tc:", _ratio(rank_counts['count_below_20'], total_count))
    print("50/tc:", _ratio(rank_counts['count_below_50'], total_count))
    print("20000/tc:", _ratio(rank_counts['count_below_20000'], total_count))
    print("Raw:")
    print("Tp/tc:", _ratio(token_probabilities_raw, total_count))
    print("Tpc/tc:", _ratio(token_probabilities_raw_combined, total_count))

    # Returned as a dict for compatibility with existing callers; later
    # occurrences of the same token text replace earlier ones.
    final_results = {
        token_text: (rank, probability)
        for token_text, rank, probability in scored
    }

    return rank_counts, token_probabilities, final_results

# Run

In [None]:
# Sample passage used as scoring input.
# NOTE(review): the variable name suggests the passage is AI-generated — confirm its provenance.
ai_text="""The waves crash vigorously against jagged rocks jutting out from the shore, sending plumes of frothy white spray high into the air. The pale blue water roils and churns, distorting the golden light of the setting sun reflecting off the waves. Each time the waves hit the rocks, they burst forth with a thunderous boom that fills the air, accompanied by the cacophony of screeching seagulls swooping down in search of any exposed creatures. The vast horizon stretches out endlessly before me, the line where pale blue sky meets dark blue sea blurred and indistinct. Patches of burnt orange and gold clouds line the horizon, remnants of the day's light dissolving into the depths of the ocean. In the distance, a lone sailboat navigates the waters, its white triangular sail swaying gently in the offshore breeze. The warm late afternoon air carries the unmistakable tang of salt water and kelp, invading my senses and transporting me fully into this seaside escape. I close my eyes and breathe deeply, feeling the sea mist on my face and listening to the rhythmic crash of wave after wave. When I open my eyes again, the sunset has painted the scene in hues of peach and rose-gold, giving the rocks, water and distant sailboat an almost ethereal glow. I turn away reluctantly, knowing my visit to this magical place must end for now but carrying with me its vivid sights, sounds and smells etched permanently into my mind."""

In [None]:
# Score the sample passage held in `ai_text`; the result is the tuple
# (rank_counts, token_probabilities, final_results).
run_results = calculate_rank_and_probability(model, tokenizer, ai_text)