In [5]:
#PREPROCESS STEP 

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re

# DATA
train_essays_data = pd.read_csv(r"C:\Users\nh013\Desktop\LLM - Detect AI Generated Text\train_essays.csv")
train_prompts_data = pd.read_csv(r"C:\Users\nh013\Desktop\LLM - Detect AI Generated Text\train_prompts.csv")
test_essays_data = pd.read_csv(r"C:\Users\nh013\Desktop\LLM - Detect AI Generated Text\test_essays.csv")

# FUNCTION TO PREPROCESS TEXT
def preprocess_text(text):
    if isinstance(text, str): 
        # REMOVE URLS
        text = re.sub(r'http\S+', '', text)

        # REMOVE SPECIAL CHARACTERS AND CONVERT TO LOWERCASE
        text = re.sub(r'[^a-zA-Z\s]', '', text).lower()

        # TOKENIZE TEXT INTO WORDS
        words = nltk.word_tokenize(text)

        # REMOVE STOP WORDS
        words = [word for word in words if word not in stopwords.words('english')]

        # DEFINE STEMMER AND LEMMATIZER
        stemmer = PorterStemmer()
        lemmatizer = WordNetLemmatizer()

        # APPLY STEMMER AND LEMMATIZER
        words = [lemmatizer.lemmatize(stemmer.stem(word)) for word in words]

        # JOIN THE WORDS BACK INTO A SINGLE STRING
        processed_text = ' '.join(words)

        return processed_text
    else:
        return text  # RETURN THE INPUT UNCHANGED IF IT'S NOT A STRING

# PERFORM PREPROCESSING ON TRAIN AND TEST DATAFRAMES
train_essays_data['text'] = train_essays_data['text'].apply(preprocess_text)
train_essays_data['generated'] = train_essays_data['generated'].apply(preprocess_text)
train_prompts_data['prompt_name'] = train_prompts_data['prompt_name'].apply(preprocess_text)
train_prompts_data['instructions'] = train_prompts_data['instructions'].apply(preprocess_text)
train_prompts_data['source_text'] = train_prompts_data['source_text'].apply(preprocess_text)
test_essays_data['text'] = test_essays_data['text'].apply(preprocess_text)


print("Preprocessed 'text' column in train_essays_data:")
print(train_essays_data['text'])

print("\nPreprocessed 'generated' column in train_essays_data:")
print(train_essays_data['generated'])

print("\nPreprocessed 'prompt_name' column in train_prompts_data:")
print(train_prompts_data['prompt_name'])

print("\nPreprocessed 'instructions' column in train_prompts_data:")
print(train_prompts_data['instructions'])

print("\nPreprocessed 'source_text' column in train_prompts_data:")
print(train_prompts_data['source_text'])

print("\nPreprocessed 'text' column in test_essays_data:")
print(test_essays_data['text'])


Preprocessed 'text' column in train_essays_data:
0       car car around sinc becam famou henri ford cre...
1       transport larg necess countri worldwid doubt c...
2       america love affair vehicl seem cool say elisa...
3       often ride car drive one motor vehicl work sto...
4       car wonder thing perhap one world greatest adv...
                              ...                        
1373    fuss elector colleg mani peopl get confus work...
1374    limit car usag mani advantag put lot le pollut...
1375    there new trend develop year soon full throttl...
1376    know car big part societi today howev car bigg...
1377    car around sinc popular ever sinc although rec...
Name: text, Length: 1378, dtype: object

Preprocessed 'generated' column in train_essays_data:
0       0
1       0
2       0
3       0
4       0
       ..
1373    0
1374    0
1375    0
1376    0
1377    0
Name: generated, Length: 1378, dtype: int64

Preprocessed 'prompt_name' column in train_prompts_data:
0     

In [3]:
#perform n-grams feature engineering

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
from sklearn.feature_extraction.text import CountVectorizer

# DATA
train_essays_data = pd.read_csv(r"C:\Users\nh013\Desktop\LLM - Detect AI Generated Text\train_essays.csv")
train_prompts_data = pd.read_csv(r"C:\Users\nh013\Desktop\LLM - Detect AI Generated Text\train_prompts.csv")
test_essays_data = pd.read_csv(r"C:\Users\nh013\Desktop\LLM - Detect AI Generated Text\test_essays.csv")

# FUNCTION TO PREPROCESS TEXT
def preprocess_text(text):
    if isinstance(text, str): 
        # REMOVE URLS
        text = re.sub(r'http\S+', '', text)

        # REMOVE SPECIAL CHARACTERS AND CONVERT TO LOWERCASE
        text = re.sub(r'[^a-zA-Z\s]', '', text).lower()

        # TOKENIZE TEXT INTO WORDS
        words = nltk.word_tokenize(text)

        # REMOVE STOP WORDS
        words = [word for word in words if word not in stopwords.words('english')]

        # DEFINE STEMMER AND LEMMATIZER
        stemmer = PorterStemmer()
        lemmatizer = WordNetLemmatizer()

        # APPLY STEMMER AND LEMMATIZER
        words = [lemmatizer.lemmatize(stemmer.stem(word)) for word in words]

        # JOIN THE WORDS BACK INTO A SINGLE STRING
        processed_text = ' '.join(words)

        return processed_text
    else:
        return text  # RETURN THE INPUT UNCHANGED IF IT'S NOT A STRING

# FUNCTION TO EXTRACT N-GRAM FEATURES
def extract_ngram_features(text, ngram_range=(1, 2)):
    vectorizer = CountVectorizer(ngram_range=ngram_range)
    X = vectorizer.fit_transform(text)
    return X

# PERFORM PREPROCESSING ON TRAIN AND TEST DATAFRAMES
train_essays_data['text'] = train_essays_data['text'].apply(preprocess_text)
train_essays_data['generated'] = train_essays_data['generated'].apply(preprocess_text)
train_prompts_data['prompt_name'] = train_prompts_data['prompt_name'].apply(preprocess_text)
train_prompts_data['instructions'] = train_prompts_data['instructions'].apply(preprocess_text)
train_prompts_data['source_text'] = train_prompts_data['source_text'].apply(preprocess_text)
test_essays_data['text'] = test_essays_data['text'].apply(preprocess_text)

# EXTRACT N-GRAM FEATURES
ngram_range = (1, 2)  # Adjust the n-gram range as needed
train_essays_ngram = extract_ngram_features(train_essays_data['text'], ngram_range)
train_prompts_ngram=extract_ngram_features(train_prompts_data['source_text'], ngram_range)
test_essays_ngram = extract_ngram_features(test_essays_data['text'], ngram_range)


print("Preprocessed 'text' column in train_essays_data:")
print(train_essays_data['text'])
print("\nN-gram features for 'text' column in train_essays_data:")
print(train_essays_ngram.toarray())

print("\nPreprocessed 'generated' column in train_essays_data:")
print(train_essays_data['generated'])

print("\nPreprocessed 'prompt_name' column in train_prompts_data:")
print(train_prompts_data['prompt_name'])

print("\nPreprocessed 'instructions' column in train_prompts_data:")
print(train_prompts_data['instructions'])

print("\nPreprocessed 'source_text' column in train_prompts_data:")
print(train_prompts_data['source_text'])
print("\nN-gram features for 'text' column in train_prompts_data:")
print( train_prompts_ngram.toarray())



print("\nPreprocessed 'text' column in test_essays_data:")
print(test_essays_data['text'])
print("\nN-gram features for 'text' column in test_essays_data:")
print(test_essays_ngram.toarray())


Preprocessed 'text' column in train_essays_data:
0       car car around sinc becam famou henri ford cre...
1       transport larg necess countri worldwid doubt c...
2       america love affair vehicl seem cool say elisa...
3       often ride car drive one motor vehicl work sto...
4       car wonder thing perhap one world greatest adv...
                              ...                        
1373    fuss elector colleg mani peopl get confus work...
1374    limit car usag mani advantag put lot le pollut...
1375    there new trend develop year soon full throttl...
1376    know car big part societi today howev car bigg...
1377    car around sinc popular ever sinc although rec...
Name: text, Length: 1378, dtype: object

N-gram features for 'text' column in train_essays_data:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]

Preprocessed 'generated' column in train_essays_data:
0       0
1       0
2       0
3       0
4

In [7]:
#perform tf idf .......
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 

# DATA
train_essays_data = pd.read_csv(r"C:\Users\nh013\Desktop\LLM - Detect AI Generated Text\train_essays.csv")
train_prompts_data = pd.read_csv(r"C:\Users\nh013\Desktop\LLM - Detect AI Generated Text\train_prompts.csv")
test_essays_data = pd.read_csv(r"C:\Users\nh013\Desktop\LLM - Detect AI Generated Text\test_essays.csv")

# FUNCTION TO PREPROCESS TEXT
def preprocess_text(text):
    if isinstance(text, str): 
        # REMOVE URLS
        text = re.sub(r'http\S+', '', text)

        # REMOVE SPECIAL CHARACTERS AND CONVERT TO LOWERCASE
        text = re.sub(r'[^a-zA-Z\s]', '', text).lower()

        # TOKENIZE TEXT INTO WORDS
        words = nltk.word_tokenize(text)

        # REMOVE STOP WORDS
        words = [word for word in words if word not in stopwords.words('english')]

        # DEFINE STEMMER AND LEMMATIZER
        stemmer = PorterStemmer()
        lemmatizer = WordNetLemmatizer()

        # APPLY STEMMER AND LEMMATIZER
        words = [lemmatizer.lemmatize(stemmer.stem(word)) for word in words]

        # JOIN THE WORDS BACK INTO A SINGLE STRING
        processed_text = ' '.join(words)

        return processed_text
    else:
        return text  # RETURN THE INPUT UNCHANGED IF IT'S NOT A STRING

# FUNCTION TO EXTRACT N-GRAM FEATURES
def extract_ngram_features(text, ngram_range=(1, 2)):
    vectorizer = CountVectorizer(ngram_range=ngram_range)
    X = vectorizer.fit_transform(text)
    return X

# FUNCTION TO PERFORM TF-IDF VECTORIZATION
def tfidf_vectorization(text):
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(text)
    return X

# PERFORM PREPROCESSING ON TRAIN AND TEST DATAFRAMES
train_essays_data['text'] = train_essays_data['text'].apply(preprocess_text)
train_essays_data['generated'] = train_essays_data['generated'].apply(preprocess_text)
train_prompts_data['prompt_name'] = train_prompts_data['prompt_name'].apply(preprocess_text)
train_prompts_data['instructions'] = train_prompts_data['instructions'].apply(preprocess_text)
train_prompts_data['source_text'] = train_prompts_data['source_text'].apply(preprocess_text)
test_essays_data['text'] = test_essays_data['text'].apply(preprocess_text)

# EXTRACT N-GRAM FEATURES
ngram_range = (1, 2)
train_essays_ngram = extract_ngram_features(train_essays_data['text'], ngram_range)
test_essays_ngram = extract_ngram_features(test_essays_data['text'], ngram_range)

# PERFORM TF-IDF VECTORIZATION
train_essays_tfidf = tfidf_vectorization(train_essays_data['text'])
test_essays_tfidf = tfidf_vectorization(test_essays_data['text'])



print("TF-IDF representations of text in training data:")
print(train_essays_tfidf.toarray()) 

print("TF-IDF representations of text in test data:")
print(test_essays_tfidf.toarray())  

TF-IDF representations of text in training data:
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.08093555 0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
TF-IDF representations of text in test data:
[[0.72033345 0.54783215 0.42544054 0.         0.        ]
 [0.         0.61980538 0.48133417 0.61980538 0.        ]
 [0.         0.         0.42544054 0.54783215 0.72033345]]


In [10]:
#check x and y  shape are same ?

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer  


# DATA
train_essays_data = pd.read_csv(r"C:\Users\nh013\Desktop\LLM - Detect AI Generated Text\train_essays.csv")
train_prompts_data = pd.read_csv(r"C:\Users\nh013\Desktop\LLM - Detect AI Generated Text\train_prompts.csv")
test_essays_data = pd.read_csv(r"C:\Users\nh013\Desktop\LLM - Detect AI Generated Text\test_essays.csv")

# FUNCTION TO PREPROCESS TEXT
def preprocess_text(text):
    if isinstance(text, str):  # Check if the input is a string
        # REMOVE URLS
        text = re.sub(r'http\S+', '', text)

        # REMOVE SPECIAL CHARACTERS AND CONVERT TO LOWERCASE
        text = re.sub(r'[^a-zA-Z\s]', '', text).lower()

        # TOKENIZE TEXT INTO WORDS
        words = nltk.word_tokenize(text)

        # REMOVE STOP WORDS
        words = [word for word in words if word not in stopwords.words('english')]

        # DEFINE STEMMER AND LEMMATIZER
        stemmer = PorterStemmer()
        lemmatizer = WordNetLemmatizer()

        # APPLY STEMMER AND LEMMATIZER
        words = [lemmatizer.lemmatize(stemmer.stem(word)) for word in words]

        # JOIN THE WORDS BACK INTO A SINGLE STRING
        processed_text = ' '.join(words)

        return processed_text
    else:
        return text  # RETURN THE INPUT UNCHANGED IF IT'S NOT A STRING

# FUNCTION TO EXTRACT N-GRAM FEATURES
def extract_ngram_features(text, ngram_range=(1, 2)):
    vectorizer = CountVectorizer(ngram_range=ngram_range)
    X = vectorizer.fit_transform(text)
    return X

# FUNCTION TO PERFORM TF-IDF VECTORIZATION
def tfidf_vectorization(text):
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(text)
    return X

# PERFORM PREPROCESSING ON TRAIN AND TEST DATAFRAMES
train_essays_data['text'] = train_essays_data['text'].apply(preprocess_text)
train_essays_data['generated'] = train_essays_data['generated'].apply(preprocess_text)
train_prompts_data['prompt_name'] = train_prompts_data['prompt_name'].apply(preprocess_text)
train_prompts_data['instructions'] = train_prompts_data['instructions'].apply(preprocess_text)
train_prompts_data['source_text'] = train_prompts_data['source_text'].apply(preprocess_text)
test_essays_data['text'] = test_essays_data['text'].apply(preprocess_text)

# EXTRACT N-GRAM FEATURES
ngram_range = (1, 2)  # Adjust the n-gram range as needed
train_essays_ngram = extract_ngram_features(train_essays_data['text'], ngram_range)
test_essays_ngram = extract_ngram_features(test_essays_data['text'], ngram_range)

# PERFORM TF-IDF VECTORIZATION
train_essays_tfidf = tfidf_vectorization(train_essays_data['text'])
test_essays_tfidf = tfidf_vectorization(test_essays_data['text'])


X = train_essays_tfidf  
y = train_essays_data['generated']  


print("Shape of feature matrix X:", X.shape)
print("Shape of target vector y:", y.shape)


Shape of feature matrix X: (1378, 11944)
Shape of target vector y: (1378,)


In [2]:

#perform svm model to predict ai generate text

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# DATA
train_essays_data = pd.read_csv(r"C:\Users\nh013\Desktop\LLM - Detect AI Generated Text\train_essays.csv")
train_prompts_data = pd.read_csv(r"C:\Users\nh013\Desktop\LLM - Detect AI Generated Text\train_prompts.csv")
test_essays_data = pd.read_csv(r"C:\Users\nh013\Desktop\LLM - Detect AI Generated Text\test_essays.csv")

# FUNCTION TO PREPROCESS TEXT
def preprocess_text(text):
    if isinstance(text, str):  
        
        # REMOVE URLS
        text = re.sub(r'http\S+', '', text)

        # REMOVE SPECIAL CHARACTERS AND CONVERT TO LOWERCASE
        text = re.sub(r'[^a-zA-Z\s]', '', text).lower()

        # TOKENIZE TEXT INTO WORDS
        words = nltk.word_tokenize(text)

        # REMOVE STOP WORDS
        words = [word for word in words if word not in stopwords.words('english')]

        # DEFINE STEMMER AND LEMMATIZER
        stemmer = PorterStemmer()
        lemmatizer = WordNetLemmatizer()

        # APPLY STEMMER AND LEMMATIZER
        words = [lemmatizer.lemmatize(stemmer.stem(word)) for word in words]

        # JOIN THE WORDS BACK INTO A SINGLE STRING
        processed_text = ' '.join(words)

        return processed_text
    else:
        return text  # RETURN THE INPUT UNCHANGED IF IT'S NOT A STRING

# FUNCTION TO EXTRACT N-GRAM FEATURES
def extract_ngram_features(text, ngram_range=(1, 2)):
    vectorizer = CountVectorizer(ngram_range=ngram_range)
    X = vectorizer.fit_transform(text)
    return X

# FUNCTION TO PERFORM TF-IDF VECTORIZATION
def tfidf_vectorization(text):
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(text)
    return X

# PERFORM PREPROCESSING ON TRAIN AND TEST DATAFRAMES
train_essays_data['text'] = train_essays_data['text'].apply(preprocess_text)
train_essays_data['generated'] = train_essays_data['generated'].apply(preprocess_text)
train_prompts_data['prompt_name'] = train_prompts_data['prompt_name'].apply(preprocess_text)
train_prompts_data['instructions'] = train_prompts_data['instructions'].apply(preprocess_text)
train_prompts_data['source_text'] = train_prompts_data['source_text'].apply(preprocess_text)
test_essays_data['text'] = test_essays_data['text'].apply(preprocess_text)

# EXTRACT N-GRAM FEATURES
ngram_range = (1, 2)  # Adjust the n-gram range as needed
train_essays_ngram = extract_ngram_features(train_essays_data['text'], ngram_range)
test_essays_ngram = extract_ngram_features(test_essays_data['text'], ngram_range)

# PERFORM TF-IDF VECTORIZATION
train_essays_tfidf = tfidf_vectorization(train_essays_data['text'])
test_essays_tfidf = tfidf_vectorization(test_essays_data['text'])


#DEFINE FEATURE VARIABLE X AND TARGET VARIABLE Y
X = train_essays_tfidf  
y = train_essays_data['generated']  

# SPLIT DATA INTO TRAINING AND TESTING SET
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# BUID SVM CLASSIFIRE
svm_classifier = SVC(kernel='linear') 

# TRAIN THE SVM MODEL
svm_classifier.fit(X_train, y_train)

# MAKE PREDICTION ON TEST DATA
y_pred = svm_classifier.predict(X_test)

# EVALUATE THE MODEL
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)


print("Classification results:")
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_rep)


Classification results:
Accuracy: 0.9963768115942029
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       275
           1       0.00      0.00      0.00         1

    accuracy                           1.00       276
   macro avg       0.50      0.50      0.50       276
weighted avg       0.99      1.00      0.99       276



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [3]:
#PERFORM ANN MODEL 

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report 

from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Embedding, Dense, LSTM
from keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Rest of your code...



# DATA
train_essays_data = pd.read_csv(r"C:\Users\nh013\Desktop\LLM - Detect AI Generated Text\train_essays.csv")
train_prompts_data = pd.read_csv(r"C:\Users\nh013\Desktop\LLM - Detect AI Generated Text\train_prompts.csv")
test_essays_data = pd.read_csv(r"C:\Users\nh013\Desktop\LLM - Detect AI Generated Text\test_essays.csv")

# FUNCTION TO PREPROCESS TEXT
def preprocess_text(text):
    if isinstance(text, str): 
        
        # REMOVE URLS
        text = re.sub(r'http\S+', '', text)

        # REMOVE SPECIAL CHARACTERS AND CONVERT TO LOWERCASE
        text = re.sub(r'[^a-zA-Z\s]', '', text).lower()

        # TOKENIZE TEXT INTO WORDS
        words = nltk.word_tokenize(text)

        # REMOVE STOP WORDS
        words = [word for word in words if word not in stopwords.words('english')]

        # DEFINE STEMMER AND LEMMATIZER
        stemmer = PorterStemmer()
        lemmatizer = WordNetLemmatizer()

        # APPLY STEMMER AND LEMMATIZER
        words = [lemmatizer.lemmatize(stemmer.stem(word)) for word in words]

        # JOIN THE WORDS BACK INTO A SINGLE STRING
        processed_text = ' '.join(words)

        return processed_text
    else:
        return text  # RETURN THE INPUT UNCHANGED IF IT'S NOT A STRING

# FUNCTION TO EXTRACT N-GRAM FEATURES
def extract_ngram_features(text, ngram_range=(1, 2)):
    vectorizer = CountVectorizer(ngram_range=ngram_range)
    X = vectorizer.fit_transform(text)
    return X

# FUNCTION TO PERFORM TF-IDF VECTORIZATION
def tfidf_vectorization(text):
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(text)
    return X

# PERFORM PREPROCESSING ON TRAIN AND TEST DATAFRAMES
train_essays_data['text'] = train_essays_data['text'].apply(preprocess_text)
train_essays_data['generated'] = train_essays_data['generated'].apply(preprocess_text)
train_prompts_data['prompt_name'] = train_prompts_data['prompt_name'].apply(preprocess_text)
train_prompts_data['instructions'] = train_prompts_data['instructions'].apply(preprocess_text)
train_prompts_data['source_text'] = train_prompts_data['source_text'].apply(preprocess_text)
test_essays_data['text'] = test_essays_data['text'].apply(preprocess_text)

# EXTRACT N-GRAM FEATURES
ngram_range = (1, 2)  
train_essays_ngram = extract_ngram_features(train_essays_data['text'], ngram_range)
test_essays_ngram = extract_ngram_features(test_essays_data['text'], ngram_range)

# PERFORM TF-IDF VECTORIZATION
train_essays_tfidf = tfidf_vectorization(train_essays_data['text'])
test_essays_tfidf = tfidf_vectorization(test_essays_data['text'])

#DEFINE X FEATURE VARIABLE AND Y TARGET VARIABLE
X = train_essays_tfidf  
y = train_essays_data['generated']  

# SPLIT DATA INTO TRAINING AND TESTING SET
X = train_essays_data['text']
y = train_essays_data['generated']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# TOKENIZE THE TEXT DATA
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

#PAD THE SEQUANCES AT THE SAME LENGTH
max_sequence_length = 100  
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_sequence_length)
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length)


#BUILD ANN MODEL
model = Sequential()

# EMBEDDING LAYER
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100 
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_sequence_length))

# LSTM LAYER
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(64))

#ADD DENSE OUTPUT LAYER
model.add(Dense(1, activation='sigmoid'))

# COMPILE THE MODEL
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])



#TRAIN THE MODEL
# CONVERT LABELS TO BINARY (0 or 1)
y_train = (y_train == 'generated').astype(int)
y_test = (y_test == 'generated').astype(int)

# TRAIN THE MODEL
model.fit(X_train_padded, y_train, epochs=5, batch_size=64, validation_data=(X_test_padded, y_test))


# SAVE THE MODEL
model.save(r"C:\Users\nh013\Desktop\model\model_path.h5")


#MAKE PREDICTION
y_pred = model.predict(X_test_padded)

# EVALUATE THE MODEL
y_pred_binary = (y_pred > 0.5).astype(int)
accuracy = np.mean(np.array(y_pred_binary) == np.array(y_test))


print("Accuracy:", accuracy)


# GENERATE CLASSIFICATION REPORT
classification_rep = classification_report(y_test, y_pred_binary)


print("Classification Report:")
print(classification_rep)



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       276

    accuracy                           1.00       276
   macro avg       1.00      1.00      1.00       276
weighted avg       1.00      1.00      1.00       276

