In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load data: read the two source CSV files in a fixed order (gpt first).
gpt_data, human_data = (pd.read_csv(path) for path in ("gpt.csv", "human.csv"))

# Stack both frames into a single dataset with a fresh 0..n-1 index.
data = pd.concat((gpt_data, human_data), ignore_index=True)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs. Both frames must provide "text" and "label".
X_train, X_test, y_train, y_test = train_test_split(
    data["text"],
    data["label"],
    test_size=0.2,
    random_state=42,
)

import nltk
from nltk.tokenize import sent_tokenize
def sentence_stats(text):
    """Return the sentence count and mean sentence length (in words) of *text*.

    Sentences are found with NLTK's ``sent_tokenize``; the length of a
    sentence is the number of whitespace-separated tokens it contains.

    Args:
        text: The string to analyse.

    Returns:
        A ``(num_sentences, avg_sentence_len)`` tuple. Empty or
        whitespace-only text yields ``(0, 0.0)``.
    """
    # Blank text contains no sentences; return early so the average below
    # never divides by zero.
    if not text or not text.strip():
        return 0, 0.0

    sentences = nltk.sent_tokenize(text)
    num_sentences = len(sentences)
    if num_sentences == 0:
        # Defensive guard in case the tokenizer yields nothing for this input.
        return 0, 0.0

    total_words = sum(len(sentence.split()) for sentence in sentences)
    return num_sentences, total_words / num_sentences

# Feature extraction: the vectorizer is fitted on the training text only and
# then reused, unchanged, to encode the held-out text.
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Train models

# Random forest: 5-fold grid search over forest size and the per-split
# feature limit, scored by accuracy; keep the best estimator it found.
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_features': ['sqrt', 'log2'],
}
grid_search_rf = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid=param_grid_rf,
    cv=5,
    n_jobs=-1,
    scoring='accuracy',
).fit(X_train_tfidf, y_train)
model_rf = grid_search_rf.best_estimator_

# Logistic regression: fitted directly, no hyper-parameter search.
model_lr = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_tfidf, y_train)

# SVM: 5-fold grid search over the regularisation strength and kernel type,
# scored by accuracy; keep the best estimator it found.
param_grid_svm = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'poly', 'rbf'],
}
grid_search_svm = GridSearchCV(
    SVC(random_state=42),
    param_grid=param_grid_svm,
    cv=5,
    n_jobs=-1,
    scoring='accuracy',
).fit(X_train_tfidf, y_train)
model_svm = grid_search_svm.best_estimator_

# Model evaluation: report each fitted model's accuracy on the held-out split.
models = {
    'Random Forest': model_rf,
    'Logistic Regression': model_lr,
    'SVM': model_svm,
}
for name, model in models.items():
    # Predict labels for the held-out features, then compare with the truth.
    y_pred = model.predict(X_test_tfidf)
    accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
    print("{} Accuracy: {:.4f}".format(name, accuracy))

# Model testing: score the already-fitted models on a separate labelled CSV.
test_data = pd.read_csv("test.csv")
X_test_data, y_test_data = test_data["text"], test_data["label"]

# Encode with the vectorizer fitted earlier (transform only, no re-fitting).
X_test_tfidf_data = tfidf.transform(X_test_data)

# Test models
for name, model in models.items():
    # Predict labels for the external test features, then measure accuracy.
    y_pred_data = model.predict(X_test_tfidf_data)
    accuracy_data = accuracy_score(y_true=y_test_data, y_pred=y_pred_data)
    print("{} Test Accuracy: {:.4f}".format(name, accuracy_data))
