# Our model

In [None]:
import torch
import re
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from typing import Tuple, Dict, Any, List
import numpy as np

class Detection:
    """
    Heuristic AI-text detector based on GPT-2 language-model statistics.

    The detector scores text with a causal language model and labels it as
    "AI content" when the mean per-line perplexity is below
    ``AI_PERPLEXITY_THRESHOLD``, and as "Human written content" otherwise.
    """

    # A perplexity strictly below this value is classified as AI content.
    AI_PERPLEXITY_THRESHOLD = 40
    # Minimum number of alphanumeric characters required to run the analysis.
    MIN_VALID_CHARS = 100
    # Matches runs of ASCII letters/digits; used to count "valid" characters
    # and to drop lines that contain no such characters.
    _ALNUM_RE = re.compile("[a-zA-Z0-9]+")
    # Split points: right after '.', '?' or '!' followed by ' ', '[' or '(',
    # or right after a newline (also swallowing the whitespace that follows).
    _LINE_SPLIT_RE = re.compile(r'(?<=[.?!][ \[\(])|(?<=\n)\s*')

    def __init__(self, model_id: str = "gpt2"):
        """
        Initialize the Detection class with a GPT-2 model and tokenizer.

        Args:
            model_id: Checkpoint name or path passed to ``from_pretrained``
                for both the model and the tokenizer.
        """
        self.model_id = model_id
        self.model = GPT2LMHeadModel.from_pretrained(model_id)
        # The model is only used for scoring, never trained.
        self.model.eval()
        self.tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
        # Largest number of tokens the model can attend to in one forward pass.
        self.max_length = self.model.config.n_positions
        # Step (in tokens) between consecutive sliding windows in `perplexity`.
        self.stride = 512

    def results(self, threshold: float) -> Tuple[str, int]:
        """
        Map a perplexity score to a human-readable message and a label.

        Args:
            threshold: Despite its name, this is the perplexity score to
                classify; it is compared against ``AI_PERPLEXITY_THRESHOLD``.

        Returns:
            ``("AI content", 0)`` when the score is below the threshold,
            ``("Human written content", 1)`` otherwise (including NaN).
        """
        if threshold < self.AI_PERPLEXITY_THRESHOLD:
            return "AI content", 0
        return "Human written content", 1

    def __call__(self, sentence: str) -> Dict[str, Any]:
        """
        Analyze the input text and predict whether it is AI-generated or human-written.

        Args:
            sentence: The text to analyze.

        Returns:
            ``{"status": <message>}`` when the text has fewer than
            ``MIN_VALID_CHARS`` alphanumeric characters. Otherwise a dict with
            the keys "Perplexity", "Entropy", "Sentence Length Variance",
            "Perplexity per line" (mean), "Burstiness" (max per-line
            perplexity), "label" (0 = AI, 1 = human), "AI Content Proportion"
            and "Human Content Proportion".

        Side effects:
            Prints the AI/human proportions to stdout.
        """
        total_valid_char = sum(len(run) for run in self._ALNUM_RE.findall(sentence))
        if total_valid_char < self.MIN_VALID_CHARS:
            message = f"Please input more text (min {self.MIN_VALID_CHARS} characters)"
            return {"status": message}

        lines = [
            line
            for line in self.split_into_lines(sentence)
            if self._ALNUM_RE.search(line)
        ]

        report: Dict[str, Any] = {}
        report["Perplexity"] = self.perplexity(sentence)
        report["Entropy"] = self.entropy(sentence)
        report["Sentence Length Variance"] = self.sentence_length_variance(lines)

        # A line that tokenizes to a single token has no next-token target, so
        # its perplexity is NaN. Drop such values so that one short line (e.g.
        # a one-word heading) does not turn every aggregate below into NaN.
        line_ppls = [ppl for ppl in map(self.perplexity, lines) if np.isfinite(ppl)]
        if not line_ppls:
            # No line could be scored on its own; fall back to the whole text.
            line_ppls = [report["Perplexity"]]

        report["Perplexity per line"] = float(np.mean(line_ppls))
        report["Burstiness"] = float(np.max(line_ppls))

        _, label = self.results(report["Perplexity per line"])
        report["label"] = label

        total_lines = len(line_ppls)
        ai_content_count = sum(1 for ppl in line_ppls if self.results(ppl)[1] == 0)
        human_content_count = total_lines - ai_content_count

        report["AI Content Proportion"] = ai_content_count / total_lines
        report["Human Content Proportion"] = human_content_count / total_lines

        print(f"AI Content Proportion: {report['AI Content Proportion']:.2%}")
        print(f"Human Content Proportion: {report['Human Content Proportion']:.2%}")

        return report

    def split_into_lines(self, text: str) -> List[str]:
        """
        Split the input text into lines based on punctuation and newlines.

        The text is cut right after '.', '?' or '!' when followed by a space,
        '[' or '(' (that following character stays with the preceding piece),
        and right after each newline (whitespace following the newline is
        discarded). Pieces may be empty or whitespace-only; callers filter.
        """
        return self._LINE_SPLIT_RE.split(text)

    def perplexity(self, sentence: str) -> float:
        """
        Calculate the perplexity of the input text under the language model.

        Text longer than ``self.max_length`` tokens is scored with overlapping
        sliding windows that advance by ``self.stride`` tokens. In every window
        only the tokens not already scored by a previous window contribute to
        the loss; the earlier tokens serve as context only.

        Returns:
            ``exp`` of the mean negative log-likelihood per scored token, or
            NaN when the text has fewer than two tokens (nothing to predict).
        """
        all_ids = self.tokenizer(sentence, return_tensors="pt").input_ids
        seq_len = all_ids.size(1)
        if seq_len < 2:
            return float("nan")

        nll_sum = 0.0
        n_scored = 0
        prev_end_loc = 0
        for begin_loc in range(0, seq_len, self.stride):
            end_loc = min(begin_loc + self.max_length, seq_len)
            # Number of tokens in this window that no earlier window scored.
            trg_len = end_loc - prev_end_loc
            input_ids = all_ids[:, begin_loc:end_loc]
            target_ids = input_ids.clone()
            # -100 labels are ignored by the loss: mask the context-only part.
            target_ids[:, :-trg_len] = -100

            # The model shifts labels by one internally, so the first position
            # of a window is never a prediction target.
            n_valid = int((target_ids[:, 1:] != -100).sum())
            if n_valid > 0:
                with torch.no_grad():
                    outputs = self.model(input_ids, labels=target_ids)
                # `loss` is the mean over the valid targets; undo the mean so
                # windows with different target counts are weighted correctly.
                nll_sum = nll_sum + outputs.loss * n_valid
                n_scored += n_valid

            prev_end_loc = end_loc
            if end_loc == seq_len:
                break

        return torch.exp(nll_sum / n_scored).item()

    def entropy(self, sentence: str) -> float:
        """
        Calculate the mean predictive entropy of the model over the input text.

        For every token position the Shannon entropy (in nats) of the model's
        next-token distribution is computed; the result is the average over
        all positions. Text longer than ``self.max_length`` tokens is processed
        in consecutive, non-overlapping chunks of at most that many tokens.

        Returns:
            The mean entropy, or NaN when the text produces no tokens.
        """
        all_ids = self.tokenizer(sentence, return_tensors="pt").input_ids
        seq_len = all_ids.size(1)
        if seq_len == 0:
            return float("nan")

        entropy_sum = 0.0
        with torch.no_grad():
            for begin_loc in range(0, seq_len, self.max_length):
                chunk = all_ids[:, begin_loc:begin_loc + self.max_length]
                # log_softmax keeps log-probabilities finite even when the
                # probability itself underflows to 0 (avoids 0 * -inf = NaN).
                log_probs = torch.log_softmax(self.model(chunk).logits, dim=-1)[0]
                per_position = -(log_probs.exp() * log_probs).sum(dim=-1)
                entropy_sum += per_position.sum().item()

        return entropy_sum / seq_len

    def sentence_length_variance(self, lines: List[str]) -> float:
        """
        Calculate the (population) variance of the per-line word counts.

        Words are whitespace-separated tokens. Returns 0.0 for an empty list.
        """
        if not lines:
            return 0.0
        lengths = [len(line.split()) for line in lines]
        return float(np.var(lengths))

# Build the detector with the default "gpt2" checkpoint; the model and
# tokenizer are loaded via `from_pretrained` inside `Detection.__init__`.
model = Detection()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



In [None]:
# Example: interactively read the text to analyze from the user.
text1 = input("Enter the text you want to analyze:\n ")

Enter the text you want to analyze:
India, officially the Republic of India, is a country in South Asia. It is the seventh-largest country by area; the most populous country from June 2023 and from the time of its independence in 1947, the world's most populous democracy.


In [None]:
# Run the detector on the text entered in the previous cell (`text1`).
model(text1)

AI Content Proportion: 100.00%
Human Content Proportion: 0.00%


{'Perplexity': 20.425331115722656,
 'Entropy': 3.316962242126465,
 'Sentence Length Variance': 64.0,
 'Perplexity per line': 29.865665435791016,
 'Burstiness': 38.09730911254883,
 'label': 0,
 'AI Content Proportion': 1.0,
 'Human Content Proportion': 0.0}