<a href="https://colab.research.google.com/github/micha-blip/Simple-article-reference-checker/blob/ai-detector/desklib_AI_detector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoConfig, AutoModel, PreTrainedModel

class DesklibAIDetectionModel(PreTrainedModel):
    """Transformer encoder with a mean-pooled, single-logit classification head.

    The encoder is built from the supplied config via ``AutoModel.from_config``
    and its token embeddings are averaged over non-masked positions before
    being passed to a linear layer that emits one logit per sequence.
    """

    config_class = AutoConfig

    def __init__(self, config):
        """Build the encoder and the classifier head from ``config``.

        Args:
            config: Model configuration; must expose ``hidden_size``, which
                sets the input width of the classifier head.
        """
        super().__init__(config)
        # Base transformer model, architecture chosen by the config.
        self.model = AutoModel.from_config(config)
        # Classifier head: one logit per pooled sequence embedding.
        self.classifier = nn.Linear(config.hidden_size, 1)
        # Initialize weights (handled by PreTrainedModel).
        self.init_weights()

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Run the encoder, mean-pool the token embeddings and classify.

        Args:
            input_ids: Token id tensor passed to the encoder.
            attention_mask: Optional mask with the same shape as
                ``input_ids`` (1 = real token, 0 = padding). When omitted,
                every position is treated as a real token.
            labels: Optional binary targets, one per sequence. When given, a
                ``BCEWithLogitsLoss`` is computed against the logits.

        Returns:
            dict: ``{"logits": tensor}`` and, when ``labels`` is provided,
            an additional ``"loss"`` entry.
        """
        # The signature advertises attention_mask=None, but the pooling below
        # needs a real tensor; fall back to "attend to everything".
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        # Forward pass through the transformer; the first output element is
        # used as the per-token hidden states.
        outputs = self.model(input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs[0]

        # Mean pooling: zero out masked positions, sum over the sequence
        # dimension, and divide by the number of unmasked tokens.
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, dim=1)
        # Clamp so a fully masked row cannot cause a division by zero.
        sum_mask = torch.clamp(input_mask_expanded.sum(dim=1), min=1e-9)
        pooled_output = sum_embeddings / sum_mask

        # Classifier
        logits = self.classifier(pooled_output)

        output = {"logits": logits}
        if labels is not None:
            loss_fct = nn.BCEWithLogitsLoss()
            # Flatten both sides so labels shaped (batch,) or (batch, 1) are
            # both accepted against the flattened logits.
            output["loss"] = loss_fct(logits.view(-1), labels.float().reshape(-1))
        return output

def predict_single_text(text, model, tokenizer, device, max_len=768, threshold=0.5):
    """Score one text and return ``(probability, label)``.

    The text is tokenized (padded and truncated to ``max_len``), moved to
    ``device`` and passed through ``model`` in eval mode without gradient
    tracking. The sigmoid of the model's ``"logits"`` output is the
    probability; the label is 1 when it reaches ``threshold``, else 0.

    Args:
        text: The text to classify.
        model: Callable model returning a mapping with a ``"logits"`` entry.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors.
        device: Device the input tensors are moved to.
        max_len: Padding/truncation length handed to the tokenizer.
        threshold: Minimum probability for label 1.

    Returns:
        tuple: ``(probability, label)`` with a float and an int (0 or 1).
    """
    batch = tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=max_len,
        return_tensors='pt',
    )
    ids = batch['input_ids'].to(device)
    mask = batch['attention_mask'].to(device)

    # Inference only: switch off dropout etc. and skip autograd bookkeeping.
    model.eval()
    with torch.no_grad():
        raw_logits = model(input_ids=ids, attention_mask=mask)["logits"]
        score = torch.sigmoid(raw_logits).item()

    if score >= threshold:
        return score, 1
    return score, 0

def main():
    """Load the Desklib detector and print predictions for two sample texts."""
    # --- Model and Tokenizer Directory ---
    model_directory = "desklib/ai-text-detector-v1.01"

    # --- Load tokenizer and model ---
    tokenizer = AutoTokenizer.from_pretrained(model_directory)
    model = DesklibAIDetectionModel.from_pretrained(model_directory)

    # --- Set up device (GPU when available, otherwise CPU) ---
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device_name)
    model.to(device)

    # --- Example input texts ---
    text_ai = "AI detection refers to the process of identifying whether a given piece of content, such as text, images, or audio, has been generated by artificial intelligence. This is achieved using various machine learning techniques, including perplexity analysis, entropy measurements, linguistic pattern recognition, and neural network classifiers trained on human and AI-generated data. Advanced AI detection tools assess writing style, coherence, and statistical properties to determine the likelihood of AI involvement. These tools are widely used in academia, journalism, and content moderation to ensure originality, prevent misinformation, and maintain ethical standards. As AI-generated content becomes increasingly sophisticated, AI detection methods continue to evolve, integrating deep learning models and ensemble techniques for improved accuracy."
    text_human = "It is estimated that a major part of the content in the internet will be generated by AI / LLMs by 2025. This leads to a lot of misinformation and credibility related issues. That is why if is important to have accurate tools to identify if a content is AI generated or human written"

    # --- Run prediction on each sample in turn and report the result ---
    for sample in (text_ai, text_human):
        probability, predicted_label = predict_single_text(sample, model, tokenizer, device)
        verdict = 'AI Generated' if predicted_label == 1 else 'Not AI Generated'
        print(f"Probability of being AI generated: {probability:.4f}")
        print(f"Predicted label: {verdict}")

# Run the demo when this code is executed as the top-level module.
if __name__ == "__main__":
    main()


In [7]:
# Already-loaded detectors keyed by model directory/name, so repeated calls to
# detect_ai_content do not reload the tokenizer and model weights each time.
_DETECTOR_CACHE = {}


def _load_detector(model_directory):
    """Return ``(tokenizer, model, device)`` for ``model_directory``, loading it at most once."""
    if model_directory not in _DETECTOR_CACHE:
        tokenizer = AutoTokenizer.from_pretrained(model_directory)
        model = DesklibAIDetectionModel.from_pretrained(model_directory)
        # Use the GPU when one is available, otherwise stay on the CPU.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)
        _DETECTOR_CACHE[model_directory] = (tokenizer, model, device)
    return _DETECTOR_CACHE[model_directory]


def detect_ai_content(text, model_directory="desklib/ai-text-detector-v1.01", max_len=768, threshold=0.5):
    """
    Detects whether the given text is likely AI-generated.

    The tokenizer and model for ``model_directory`` are loaded on the first
    call and reused on later calls with the same ``model_directory``.

    Args:
        text (str): The input text to analyze.
        model_directory (str): The directory or name of the pre-trained model.
        max_len (int): The maximum sequence length for tokenization.
        threshold (float): The probability threshold for classifying as AI-generated.

    Returns:
        tuple: A tuple containing the probability of being AI-generated and the predicted label (0 for Not AI Generated, 1 for AI Generated).
    """
    tokenizer, model, device = _load_detector(model_directory)

    # Run prediction
    return predict_single_text(text, model, tokenizer, device, max_len=max_len, threshold=threshold)

In [19]:
# Demo cell: score a sample passage with detect_ai_content and print the result.
text_to_check = '''Quantum leap computing at sub-zero temperatures represents a pivotal advancement in the field of quantum information science, enabling unprecedented improvements in coherence times, error reduction, and computational stability. At cryogenic conditions, typically in the millikelvin range achieved through dilution refrigeration, quantum bits (qubits) are effectively isolated from thermal noise, thereby preserving superposition and entanglement states essential for large-scale quantum operations. These ultra-low temperatures suppress phononic and electronic excitations, reducing decoherence rates that otherwise hinder fault-tolerant quantum computation at higher temperatures. Moreover, operating quantum processors at sub-zero temperatures facilitates integration with superconducting circuits, which exhibit zero resistive losses and allow for the implementation of high-fidelity gate operations. Such advancements have positioned cryogenic quantum architectures as a cornerstone in the pursuit of scalable quantum computing, underscoring the necessity of temperature control as a critical parameter in the design of next-generation quantum technologies.'''
# Uses the default model, max_len and threshold of detect_ai_content.
probability, label = detect_ai_content(text_to_check)
# Echo the input, then the score (4 decimal places) and the human-readable label.
print(f"Text: '{text_to_check}'")
print(f"Probability of being AI generated: {probability:.4f}")
print(f"Predicted label: {'AI Generated' if label == 1 else 'Not AI Generated'}")

Text: 'Quantum leap computing at sub-zero temperatures represents a pivotal advancement in the field of quantum information science, enabling unprecedented improvements in coherence times, error reduction, and computational stability. At cryogenic conditions, typically in the millikelvin range achieved through dilution refrigeration, quantum bits (qubits) are effectively isolated from thermal noise, thereby preserving superposition and entanglement states essential for large-scale quantum operations. These ultra-low temperatures suppress phononic and electronic excitations, reducing decoherence rates that otherwise hinder fault-tolerant quantum computation at higher temperatures. Moreover, operating quantum processors at sub-zero temperatures facilitates integration with superconducting circuits, which exhibit zero resistive losses and allow for the implementation of high-fidelity gate operations. Such advancements have positioned cryogenic quantum architectures as a cornerstone in the