# Pipeline to process the user input and determine whether the given text is AI-generated or Human-written

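We turn the user input into the feature vector our models expect: lexical diversity, a Flesch reading-ease score, and the text's averaged Word2Vec embedding reduced with the fitted PCA.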

In [1]:
from textstat.textstat import textstatistics
from nltk.tokenize import sent_tokenize, word_tokenize
from gensim.models import Word2Vec, KeyedVectors
import numpy as np

def lexical_diversity(text):
    """Return the share of distinct tokens among all tokens in ``text``.

    Tokens are obtained by splitting on whitespace, so case and punctuation
    are significant. A text with no tokens yields ``0``.
    """
    words = text.split()
    total = len(words)
    if total == 0:
        return 0
    unique = len(set(words))
    return unique / total
    
def readability_score(text):
    """Return textstat's ``flesch_reading_ease`` score for ``text``."""
    stats = textstatistics()
    score = stats.flesch_reading_ease(text)
    return score

def generate_word2vec_features(text, word2vec_model):
    """Return the mean embedding of the in-vocabulary tokens of ``text``.

    Args:
        text: Input string; tokenised by splitting on whitespace.
        word2vec_model: Embedding lookup supporting ``token in model`` and
            ``model[token]``. If it exposes a ``vector_size`` attribute (as
            gensim ``KeyedVectors`` does), that dimensionality is used;
            otherwise 300 dimensions are assumed.

    Returns:
        A 1-D float array holding the average of the vectors of all tokens
        found in the model, or all zeros when no token is found.
    """
    # Take the dimensionality from the model instead of hard-coding 300 so
    # embeddings of other sizes do not fail on the accumulation below.
    dim = getattr(word2vec_model, "vector_size", 300)
    total = np.zeros((dim,))
    matched = 0
    for token in text.split():
        if token in word2vec_model:
            total += word2vec_model[token]
            matched += 1
    # Avoid dividing by zero when nothing was in the vocabulary.
    if matched:
        total /= matched
    return total

def text_to_features(text, word2vec_model, pca_model):
    """Build the flat feature vector for ``text``.

    The result is ``[lexical diversity, readability score]`` followed by the
    text's averaged word embedding after ``pca_model.transform``.
    """
    scalar_features = [lexical_diversity(text), readability_score(text)]
    mean_embedding = generate_word2vec_features(text, word2vec_model)
    # The PCA transform is given a single-row 2-D array.
    reduced = pca_model.transform(mean_embedding.reshape(1, -1))
    return np.concatenate((scalar_features, reduced.flatten()))

# Function to perform binary classification

In [2]:
def predict_text_classification(text, word2vec_model, pca_model, model):
    """Classify ``text`` as ``"AI-generated"`` or ``"Human-written"``.

    The text is converted to a single-row feature matrix and passed to
    ``model.predict``; a prediction equal to 1 means AI-generated.
    """
    feature_row = text_to_features(text, word2vec_model, pca_model).reshape(1, -1)
    label = model.predict(feature_row)
    if label == 1:
        return "AI-generated"
    return "Human-written"

# Function to perform multiclass classification

In [3]:
# Class labels emitted by the multiclass model, mapped to the LLM they denote.
_LLM_NAME_BY_LABEL = {
    13: "GPT-3.5",
    37: "OPT-1.3B",
    41: "OPT-30B",
    58: "Text-Davinci-002",
    59: "Text-Davinci-003",
}


def predict_text_classification_mc(text, word2vec_model, pca_model, model):
    """Return the name of the LLM that ``model`` attributes ``text`` to.

    Args:
        text: The text to classify.
        word2vec_model: Embedding lookup passed to ``text_to_features``.
        pca_model: Fitted reducer passed to ``text_to_features``.
        model: Classifier whose ``predict`` returns one numeric class label
            for the single feature row.

    Returns:
        The LLM name for the predicted label, or ``"Unknown"`` when the label
        is not one of the known classes. Previously an unlisted label left
        the result unassigned and raised ``UnboundLocalError``.
    """
    features = text_to_features(text, word2vec_model, pca_model).reshape(1, -1)
    prediction = model.predict(features)

    # Only one row is predicted, so collapse the result to a plain scalar
    # that can be used as a dictionary key.
    label = np.asarray(prediction).item()
    return _LLM_NAME_BY_LABEL.get(label, "Unknown")

In [4]:
def main():
    """Interactively classify a text as human- or AI-written.

    Loads the Word2Vec vectors, the binary classifier and the PCA model,
    reads one text from standard input, and prints the binary verdict. When
    the text is judged AI-generated, the multiclass model is loaded as well
    and the predicted source LLM is printed.
    """
    # `pickle` is not imported at the top of the file; import it here so the
    # model-loading calls below do not fail with NameError.
    import pickle

    word2vec_model = KeyedVectors.load_word2vec_format(
        'GoogleNews-vectors-negative300.bin', binary=True
    )

    # NOTE(security): unpickling executes arbitrary code embedded in the
    # file, so only load model files that come from a trusted source.
    with open('binaryclf_rf.pkl', 'rb') as rf_file:
        rf_clf = pickle.load(rf_file)

    with open('mlp_pca.pkl', 'rb') as pca_file:
        pca = pickle.load(pca_file)

    user_input_text = input("Enter the text for classification: ")

    binary_prediction = predict_text_classification(
        user_input_text, word2vec_model, pca, rf_clf
    )
    print(f"The text is predicted as: {binary_prediction}")

    if binary_prediction == "Human-written":
        return

    # The multiclass model is only needed for AI-generated text, so it is
    # loaded lazily.
    with open('multiclassclf_mlp.pkl', 'rb') as mlp_file:
        mlp_clf = pickle.load(mlp_file)

    multiclass_prediction = predict_text_classification_mc(
        user_input_text, word2vec_model, pca, mlp_clf
    )
    print(f"The text is from: {multiclass_prediction} Large Language Model")
    
# Run the interactive pipeline only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Enter the text for classification: In the heart of Gotham City, a shadowy figure emerges from the darkness, a symbol of justice and fear. With the weight of the city's crime pressing upon his shoulders, Batman confronts a mugger in a desolate alley, his gravelly voice commanding respect and fear. Swiftly dispatching the criminal, he retreats to the heights of Gotham's skyline, contemplating the never-ending battle against the city's corruption. Yet, fueled by Alfred's unwavering support and a newfound lead on a case, Batman reaffirms his commitment to bringing light to the darkness that plagues his beloved city, disappearing once more into the night, a silent guardian watching over Gotham's restless streets.




The text is predicted as: AI-generated
The text is from: GPT-3.5 Large Language Model
