In [8]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import nltk
from nltk.corpus import wordnet as w
import math
from collections import Counter

# Fetch the NLTK resources the feature functions rely on: the Punkt
# tokenizer models and the WordNet corpus.
for _nltk_package in ('punkt', 'wordnet'):
    nltk.download(_nltk_package)

# Read each CSV and immediately cut it down to a random sample of 300 rows;
# the fixed seed makes every run pick the same rows.
gpt_data = pd.read_csv("data_process/gpt.csv").sample(300, random_state=42)
human_data = pd.read_csv("data_process/human.csv").sample(300, random_state=42)

# Stack the two samples into a single frame with a fresh 0..N-1 index.
data = pd.concat([gpt_data, human_data], ignore_index=True)

# Feature extraction functions
def sentence_length(text):
    """Return the sentence count and mean words-per-sentence of ``text``.

    The input is converted with ``str()`` first, so non-string values are
    measured by their string form. Sentences come from
    ``nltk.sent_tokenize``; words are counted by splitting each sentence on
    whitespace.

    Args:
        text: The document to measure.

    Returns:
        A ``(number_of_sentences, average_words_per_sentence)`` tuple. Text
        that is empty or only whitespace yields ``(0, 0.0)``.
    """
    text = str(text)
    # Blank text contains no sentences; answer directly rather than dividing
    # by a sentence count of zero further down.
    if not text.strip():
        return 0, 0.0
    sentences = nltk.sent_tokenize(text)
    # Defensive: the tokenizer may still report no sentences.
    if not sentences:
        return 0, 0.0
    total_words = sum(len(sentence.split()) for sentence in sentences)
    return len(sentences), total_words / len(sentences)

def repetitivewords(text):
    """Score how often ``text`` uses words that share a WordNet synonym.

    Every lower-cased token is looked up in WordNet, and each synset found
    contributes the set of its lemma names. The score is the number of
    unordered synset pairs whose lemma-name sets overlap, divided by the
    number of tokens.

    Args:
        text: The document to score; converted with ``str()`` first.

    Returns:
        The overlapping-pair count per token as a float, or ``0.0`` when the
        text is blank or produces no tokens.
    """
    text = str(text)
    # Blank text has no tokens, so the per-token ratio below would divide by
    # zero; report "no repetition" instead.
    if not text.strip():
        return 0.0
    tokens = nltk.word_tokenize(text.lower())
    if not tokens:
        return 0.0

    from itertools import combinations

    # Build each synset's lemma-name set once up front instead of rebuilding
    # two sets for every pair that gets compared.
    lemma_sets = [
        {lemma.name() for lemma in synset.lemmas()}
        for token in tokens
        for synset in w.synsets(token)
    ]
    # Count every unordered pair of synsets that has at least one lemma name
    # in common.
    repeat = sum(
        not first.isdisjoint(second)
        for first, second in combinations(lemma_sets, 2)
    )
    return repeat / len(tokens)

def entropy(text):
    """Return the Shannon entropy, in bits, of the token distribution of ``text``.

    The text is converted with ``str()``, lower-cased and split into tokens
    with ``nltk.word_tokenize``. Each distinct token's probability is its
    count divided by the total number of tokens, and the result is
    ``-sum(p * log2(p))`` over those probabilities.

    Args:
        text: The document to measure.

    Returns:
        The entropy as a float; ``0.0`` for blank text or text without tokens.
    """
    text = str(text)
    # Blank text has no tokens and therefore zero entropy; skip tokenizing.
    if not text.strip():
        return 0.0
    token_counts = Counter(nltk.word_tokenize(text.lower()))
    total = sum(token_counts.values())
    # Every count is at least 1, so each probability is strictly positive and
    # log2 is always defined. The 0.0 start keeps the result a float even
    # when there are no tokens at all.
    return sum(
        (
            -(count / total) * math.log2(count / total)
            for count in token_counts.values()
        ),
        0.0,
    )

# Derive the hand-crafted features from the raw text, one new column each.
text_column = data['text']

# sentence_length returns a (count, average) pair per row; split the pairs
# into two separate columns.
sentence_stats = text_column.apply(sentence_length).tolist()
data['sent_length'] = [count for count, _ in sentence_stats]
data['avg_sent_length'] = [average for _, average in sentence_stats]

data['repetitive_words'] = text_column.apply(repetitivewords)
data['text_entropy'] = text_column.apply(entropy)

# Separate the label column from the predictors, then hold out 20% of the
# rows for testing (fixed seed for a reproducible split).
y = data['generated']
X = data.drop(columns=['generated'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Replace missing text with an empty string in both splits before vectorizing.
for _split in (X_train, X_test):
    _split['text'] = _split['text'].fillna('')

# Fit the TF-IDF vocabulary on the training text only, then apply that same
# vocabulary to the test text.
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train['text'])
X_test_tfidf = tfidf.transform(X_test['text'])

# All remaining columns (everything except the raw text) as plain arrays.
X_train_features = X_train.drop(columns=['text']).to_numpy()
X_test_features = X_test.drop(columns=['text']).to_numpy()

# Densify the TF-IDF matrices and append the extra feature columns on the
# right-hand side.
X_train_combined = np.concatenate([X_train_tfidf.toarray(), X_train_features], axis=1)
X_test_combined = np.concatenate([X_test_tfidf.toarray(), X_test_features], axis=1)

# Fit the three classifiers on the same combined training matrix. ``fit``
# returns the estimator itself, so construction and training are chained.
model_rf = RandomForestClassifier(random_state=42).fit(X_train_combined, y_train)
model_lr = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_combined, y_train)
model_svm = SVC(random_state=42).fit(X_train_combined, y_train)

# Model evaluation: for every classifier, print macro-averaged precision,
# recall and F1 plus overall accuracy, first for the training split and then
# for the held-out test split.
models = {
    'Random Forest': model_rf,
    'Logistic Regression': model_lr,
    'SVM': model_svm,
}
for name, model in models.items():
    # Training split: predict, then read the metrics out of the report dict.
    y_pred_train = model.predict(X_train_combined)
    train_report = classification_report(y_train, y_pred_train, output_dict=True)
    train_macro = train_report['macro avg']
    train_precision = train_macro['precision']
    train_recall = train_macro['recall']
    train_f_measure = train_macro['f1-score']
    train_accuracy = train_report['accuracy']

    # One print per split; the trailing "\n" on the last line leaves a blank
    # line after each summary.
    print("\n".join((
        "{} Train Evaluation:".format(name),
        "  Precision: {:.4f}".format(train_precision),
        "  Recall: {:.4f}".format(train_recall),
        "  F-measure: {:.4f}".format(train_f_measure),
        "  Accuracy: {:.4f}\n".format(train_accuracy),
    )))

    # Test split: same procedure on the held-out rows.
    y_pred_test = model.predict(X_test_combined)
    test_report = classification_report(y_test, y_pred_test, output_dict=True)
    test_macro = test_report['macro avg']
    test_precision = test_macro['precision']
    test_recall = test_macro['recall']
    test_f_measure = test_macro['f1-score']
    test_accuracy = test_report['accuracy']

    print("\n".join((
        "{} Test Evaluation:".format(name),
        "  Precision: {:.4f}".format(test_precision),
        "  Recall: {:.4f}".format(test_recall),
        "  F-measure: {:.4f}".format(test_f_measure),
        "  Accuracy: {:.4f}\n".format(test_accuracy),
    )))


[nltk_data] Downloading package punkt to /Users/xinranli/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/xinranli/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Random Forest Train Evaluation:
  Precision: 0.9979
  Recall: 0.9979
  F-measure: 0.9979
  Accuracy: 0.9979

Random Forest Test Evaluation:
  Precision: 0.9265
  Recall: 0.9194
  F-measure: 0.9165
  Accuracy: 0.9167

Logistic Regression Train Evaluation:
  Precision: 0.9465
  Recall: 0.9412
  F-measure: 0.9415
  Accuracy: 0.9417

Logistic Regression Test Evaluation:
  Precision: 0.9511
  Recall: 0.9511
  F-measure: 0.9500
  Accuracy: 0.9500

SVM Train Evaluation:
  Precision: 0.8735
  Recall: 0.8554
  F-measure: 0.8543
  Accuracy: 0.8562

SVM Test Evaluation:
  Precision: 0.8867
  Recall: 0.8629
  F-measure: 0.8567
  Accuracy: 0.8583

