In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import nltk
import string
from nltk.corpus import wordnet as w
import math
from collections import Counter
from sklearn.model_selection import GridSearchCV

# Read gpt.csv and human.csv and keep a reproducible random sample of
# 1000 rows from each (fixed seed so reruns pick the same rows).
gpt_data = pd.read_csv("data_process/gpt.csv").sample(n=1000, random_state=42)
human_data = pd.read_csv("data_process/human.csv").sample(n=1000, random_state=42)

# Stack both samples into one frame; ignore_index renumbers the rows
data = pd.concat((gpt_data, human_data), ignore_index=True)

# Feature extraction functions
def sentence_length(text):
    """Return the sentence count and the mean number of words per sentence.

    Args:
        text: Value to analyse. It is coerced with ``str()`` first, so
            non-string inputs are processed as their string form.

    Returns:
        A ``(number_of_sentences, average_words_per_sentence)`` tuple, where
        a word is a whitespace-separated chunk of a sentence. When the
        tokenizer yields no sentences at all the result is ``(0, 0)``.
    """
    sentences = nltk.sent_tokenize(str(text))
    numberofsentences = len(sentences)
    if numberofsentences == 0:
        # Nothing to average over; avoid a ZeroDivisionError (the same guard
        # count_punctuation applies).
        return 0, 0
    total_words = sum(len(sentence.split()) for sentence in sentences)
    return numberofsentences, total_words / numberofsentences

def repetitivewords(text):
    """Score how often the words of *text* share WordNet lemmas.

    Every token is looked up in WordNet and each resulting synset is reduced
    to the set of its lemma names. The score is the number of synset pairs
    (taken over the whole text, in order) that have at least one lemma name
    in common, divided by the number of tokens. Synsets are collected per
    token occurrence, so a word that appears twice contributes its synsets
    twice.

    Args:
        text: Value to analyse; coerced with ``str()`` and lower-cased.

    Returns:
        The overlapping-pair count divided by the token count, or ``0.0``
        when the text produces no tokens.
    """
    token = nltk.word_tokenize(str(text).lower())
    if not token:
        # No tokens means nothing can repeat; also avoids dividing by zero.
        return 0.0

    # Build each lemma-name set once instead of rebuilding both sets for
    # every pair inside the quadratic comparison below.
    lemma_sets = [
        {lemma.name() for lemma in synset.lemmas()}
        for word in token
        for synset in w.synsets(word)
    ]

    total_sets = len(lemma_sets)
    repeat = sum(
        1
        for index in range(total_sets)
        for nextindex in range(index + 1, total_sets)
        if not lemma_sets[index].isdisjoint(lemma_sets[nextindex])
    )
    return repeat / len(token)

def entropy(text):
    """Return the base-2 Shannon entropy of the token distribution of *text*.

    The text is coerced with ``str()``, lower-cased and word-tokenized; each
    distinct token's probability is its count divided by the total number of
    tokens. A text without tokens yields ``0.0``.
    """
    words = nltk.word_tokenize(str(text).lower())
    word_total = len(words)

    result = 0.0
    for occurrences in Counter(words).values():
        probability = occurrences / word_total
        if probability > 0:
            result -= probability * math.log(probability, 2)
    return result

def count_punctuation(text):
    """Return the average number of punctuation characters per sentence.

    Punctuation means any character found in ``string.punctuation``; it is
    counted over the whole (string-coerced) text and divided by the number
    of sentences the tokenizer finds. Returns ``0`` when there are no
    sentences.
    """
    content = str(text)
    sentence_total = len(nltk.sent_tokenize(content))
    if sentence_total <= 0:
        return 0
    punctuation_total = sum(1 for symbol in content if symbol in string.punctuation)
    return punctuation_total / sentence_total

def count_numbers(text):
    """Return the average number of numeric expressions per sentence.

    Four regular expressions are applied to every sentence and all their
    matches are added up:

    * ``pattern1`` - spelled-out cardinals/ordinals, optionally followed by
      " hundred", " thousand", " million", " billion";
    * ``pattern2`` - digit sequences with optional thousands separators,
      decimals and a trailing magnitude word;
    * ``pattern3`` - spelled-out tens, optionally followed by a unit word;
    * ``pattern4`` - "half"/"Half" style expressions.

    Args:
        text: Value to analyse; coerced with ``str()`` like the other feature
            extractors so that non-string cells do not crash the tokenizer.

    Returns:
        Total match count divided by the number of sentences, or ``0`` when
        the text has no sentences.
    """
    text = str(text)
    sentences = nltk.sent_tokenize(text)
    num_sentences = len(sentences)
    # "fou?rty" / "nine?ty" accept the correct spellings "forty" / "ninety"
    # while still matching the misspelled "fourty" / "ninty" the original
    # patterns looked for.
    # NOTE(review): none of the patterns use word boundaries, so e.g. "one"
    # also matches inside longer words, and a tens word such as "twenty" is
    # counted by both pattern1 and pattern3 - confirm this is intended.
    pattern1 = re.compile(r'((one|One|two|Two|three|Three|four|Four|five|Five|six|Six|seven|Seven|eight|Eight|nine|Nine|ten|Ten|eleven|Eleven|twelve|Twelve|thirteen|fourteen|fifteen|sixteen|twenty|thirty|fou?rty|fifty|sixty|seventy|eighty|nine?ty|first|second|third|fourth|fifth|sixth|seventh|eight|ninth|tenth)[-]?( hundred)?( thousand)?( million)?( billion)?)')
    pattern2 = re.compile(r'(([0-9]+)((,[0-9]{3})+)?(\.[0-9]+)?[ ]?(hundreds)?(millions)?(hundred)?(million)?)')
    pattern3 = re.compile(r'(fou?rty|twenty|thirty|fifty|sixty|seventy|eighty|nine?ty)[-]?((one|two|three|four|five|six|seven|eight|nine)[-]?(hundred)?(thousand)?(million)?)?')
    pattern4 = re.compile(r'((h)?(H)?alf[ ]?(a)?)')
    patterns = (pattern1, pattern2, pattern3, pattern4)

    count = sum(
        len(pattern.findall(sentence))
        for sentence in sentences
        for pattern in patterns
    )
    return count / num_sentences if num_sentences > 0 else 0

# Derive the hand-crafted features for every row of the 'text' column
sentence_stats = data['text'].apply(sentence_length)
data['sent_length'] = [count for count, _ in sentence_stats]
data['avg_sent_length'] = [average for _, average in sentence_stats]
data['repetitive_words'] = data['text'].apply(repetitivewords)
data['text_entropy'] = data['text'].apply(entropy)

# Separate the feature columns from the 'generated' label column
X = data.drop(columns=['generated'])
y = data['generated']

# Split data into training, validation, and testing sets.
# 15% is held out for testing; 0.1765 of the remaining 85% (about 15% of the
# whole) becomes the validation set, leaving roughly 70% for training.
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.1765, random_state=42)

# Fill NaN values with an empty string. assign() returns new frames instead
# of writing into the slices handed back by train_test_split, which is what
# triggers pandas' SettingWithCopyWarning.
X_train = X_train.assign(text=X_train['text'].fillna(''))
X_val = X_val.assign(text=X_val['text'].fillna(''))
X_test = X_test.assign(text=X_test['text'].fillna(''))

# Extract TF-IDF features; the vocabulary is fitted on the training split
# only and then reused for the validation and test splits.
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train['text'])
X_val_tfidf = tfidf.transform(X_val['text'])
X_test_tfidf = tfidf.transform(X_test['text'])

# Combine TF-IDF features with the extracted features (every column other
# than 'text' is used as-is)
X_train_features = X_train.drop(columns=['text']).to_numpy()
X_val_features = X_val.drop(columns=['text']).to_numpy()
X_test_features = X_test.drop(columns=['text']).to_numpy()

# The sparse TF-IDF matrices are densified so they can be stacked column-wise
X_train_combined = np.hstack((X_train_tfidf.toarray(), X_train_features))
X_val_combined = np.hstack((X_val_tfidf.toarray(), X_val_features))
X_test_combined = np.hstack((X_test_tfidf.toarray(), X_test_features))

# Candidate hyper-parameter values explored for each classifier
rf_params = {
    'n_estimators': [50, 100],
    'max_depth': [None, 30],
}
lr_params = {
    'C': [0.1, 1, 10],
    'solver': ['newton-cg', 'liblinear'],
}
svm_params = {
    'C': [1, 10],
    'kernel': ['linear', 'rbf'],
}

# Base estimators handed to the grid search
model_rf = RandomForestClassifier(random_state=42)
model_lr = LogisticRegression(random_state=42, max_iter=1000)
model_svm = SVC(random_state=42)

# 5-fold grid search for each model, fitted on the validation split
grid_rf = GridSearchCV(estimator=model_rf, param_grid=rf_params, cv=5).fit(X_val_combined, y_val)
best_rf_params = grid_rf.best_params_

grid_lr = GridSearchCV(estimator=model_lr, param_grid=lr_params, cv=5).fit(X_val_combined, y_val)
best_lr_params = grid_lr.best_params_

grid_svm = GridSearchCV(estimator=model_svm, param_grid=svm_params, cv=5).fit(X_val_combined, y_val)
best_svm_params = grid_svm.best_params_

# Rebuild each model with its best parameters and fit it on the training split
model_rf = RandomForestClassifier(random_state=42, **best_rf_params).fit(X_train_combined, y_train)
model_lr = LogisticRegression(random_state=42, max_iter=1000, **best_lr_params).fit(X_train_combined, y_train)
model_svm = SVC(random_state=42, **best_svm_params).fit(X_train_combined, y_train)

# Model evaluation: for every fitted model print macro-averaged precision,
# recall and F1 plus accuracy, first on the training split, then on the
# test split.
models = {'Random Forest': model_rf,
          'Logistic Regression': model_lr,
          'SVM': model_svm}
evaluation_splits = (
    ("{} Train Evaluation:", X_train_combined, y_train),
    ("{} Test Evaluation:", X_test_combined, y_test),
)
for name, model in models.items():
    for header, features, labels in evaluation_splits:
        # Predict the labels for this split and summarise them as a dict
        report = classification_report(labels, model.predict(features), output_dict=True)
        macro_avg = report['macro avg']

        print(header.format(name))
        print("  Precision: {:.5f}".format(macro_avg['precision']))
        print("  Recall: {:.5f}".format(macro_avg['recall']))
        print("  F-measure: {:.5f}".format(macro_avg['f1-score']))
        print("  Accuracy: {:.5f}\n".format(report['accuracy']))

    


Random Forest Train Evaluation:
  Precision: 0.97177
  Recall: 0.96987
  F-measure: 0.96994
  Accuracy: 0.96998

Random Forest Test Evaluation:
  Precision: 0.94692
  Recall: 0.94304
  F-measure: 0.94319
  Accuracy: 0.94333

Logistic Regression Train Evaluation:
  Precision: 0.94766
  Recall: 0.94409
  F-measure: 0.94413
  Accuracy: 0.94425

Logistic Regression Test Evaluation:
  Precision: 0.95779
  Recall: 0.95651
  F-measure: 0.95663
  Accuracy: 0.95667

SVM Train Evaluation:
  Precision: 0.99786
  Recall: 0.99786
  F-measure: 0.99786
  Accuracy: 0.99786

SVM Test Evaluation:
  Precision: 0.96085
  Recall: 0.95986
  F-measure: 0.95997
  Accuracy: 0.96000



## Fake data test

In [5]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import nltk
from nltk.corpus import wordnet as w
import math
from collections import Counter
from sklearn.model_selection import GridSearchCV

# Load data
# NOTE: gpt_data is read here but not referenced again in the visible part
# of this cell.
gpt_data = pd.read_csv("data_process/gpt.csv")
human_data = pd.read_csv("data_process/human.csv")
fake_data = pd.read_csv("data_process/fake.csv")

# Disabled: per-dataset sampling used by the previous experiment
# gpt_data = gpt_data.sample(1000, random_state=42)
# human_data = human_data.sample(1000, random_state=42)

# The training/validation pool is a 2000-row sample of fake.csv; the test
# set is a 10000-row sample drawn from human.csv only.
# NOTE(review): sample(n) without replace=True raises ValueError when the
# frame has fewer than n rows - confirm both files are large enough.
data = fake_data.sample(2000, random_state=42)
test_data = human_data.sample(10000, random_state=42)

# Feature extraction functions
def sentence_length(text):
    """Return the sentence count and the mean number of words per sentence.

    Args:
        text: Value to analyse. It is coerced with ``str()`` first, so
            non-string inputs are processed as their string form.

    Returns:
        A ``(number_of_sentences, average_words_per_sentence)`` tuple, where
        a word is a whitespace-separated chunk of a sentence. When the
        tokenizer yields no sentences at all the result is ``(0, 0)``.
    """
    sentences = nltk.sent_tokenize(str(text))
    numberofsentences = len(sentences)
    if numberofsentences == 0:
        # Nothing to average over; avoid a ZeroDivisionError.
        return 0, 0
    total_words = sum(len(sentence.split()) for sentence in sentences)
    return numberofsentences, total_words / numberofsentences

def repetitivewords(text):
    """Score how often the words of *text* share WordNet lemmas.

    Every token is looked up in WordNet and each resulting synset is reduced
    to the set of its lemma names. The score is the number of synset pairs
    (taken over the whole text, in order) that have at least one lemma name
    in common, divided by the number of tokens. Synsets are collected per
    token occurrence, so a word that appears twice contributes its synsets
    twice.

    Args:
        text: Value to analyse; coerced with ``str()`` and lower-cased.

    Returns:
        The overlapping-pair count divided by the token count, or ``0.0``
        when the text produces no tokens.
    """
    token = nltk.word_tokenize(str(text).lower())
    if not token:
        # No tokens means nothing can repeat; also avoids dividing by zero.
        return 0.0

    # Build each lemma-name set once instead of rebuilding both sets for
    # every pair inside the quadratic comparison below.
    lemma_sets = [
        {lemma.name() for lemma in synset.lemmas()}
        for word in token
        for synset in w.synsets(word)
    ]

    total_sets = len(lemma_sets)
    repeat = sum(
        1
        for index in range(total_sets)
        for nextindex in range(index + 1, total_sets)
        if not lemma_sets[index].isdisjoint(lemma_sets[nextindex])
    )
    return repeat / len(token)

def entropy(text):
    """Return the base-2 Shannon entropy of the token distribution of *text*.

    The text is coerced with ``str()``, lower-cased and word-tokenized; each
    distinct token's probability is its count divided by the total number of
    tokens. A text without tokens yields ``0.0``.
    """
    words = nltk.word_tokenize(str(text).lower())
    word_total = len(words)

    result = 0.0
    for occurrences in Counter(words).values():
        probability = occurrences / word_total
        if probability > 0:
            result -= probability * math.log(probability, 2)
    return result

# Derive the same hand-crafted features for the training pool (data) and
# then for the test frame (test_data); both frames are extended in place.
for frame in (data, test_data):
    sentence_stats = frame['text'].apply(sentence_length)
    frame['sent_length'] = [count for count, _ in sentence_stats]
    frame['avg_sent_length'] = [average for _, average in sentence_stats]
    frame['repetitive_words'] = frame['text'].apply(repetitivewords)
    frame['text_entropy'] = frame['text'].apply(entropy)

# Separate the feature columns from the 'generated' label column. The pool
# sampled from fake.csv supplies training and validation data; the frame
# sampled from human.csv is used unchanged as the test set.
X = data.drop(columns=['generated'])
y = data['generated']
X_test = test_data.drop(columns=['generated'])
y_test = test_data['generated']

# Hold out 17.65% of the pool for validation (same ratio as the previous
# experiment); no test split is taken from the pool here.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1765, random_state=42)

# Fill NaN values with an empty string. assign() returns new frames instead
# of writing into the slices handed back by train_test_split, which is what
# triggers pandas' SettingWithCopyWarning.
X_train = X_train.assign(text=X_train['text'].fillna(''))
X_val = X_val.assign(text=X_val['text'].fillna(''))
X_test = X_test.assign(text=X_test['text'].fillna(''))

# Extract TF-IDF features; the vocabulary is fitted on the training split
# only and then reused for the validation and test splits.
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train['text'])
X_val_tfidf = tfidf.transform(X_val['text'])
X_test_tfidf = tfidf.transform(X_test['text'])

# Combine TF-IDF features with the extracted features (every column other
# than 'text' is used as-is)
X_train_features = X_train.drop(columns=['text']).to_numpy()
X_val_features = X_val.drop(columns=['text']).to_numpy()
X_test_features = X_test.drop(columns=['text']).to_numpy()

# The sparse TF-IDF matrices are densified so they can be stacked column-wise
X_train_combined = np.hstack((X_train_tfidf.toarray(), X_train_features))
X_val_combined = np.hstack((X_val_tfidf.toarray(), X_val_features))
X_test_combined = np.hstack((X_test_tfidf.toarray(), X_test_features))

# Candidate hyper-parameter values explored for each classifier
rf_params = {
    'n_estimators': [50, 100],
    'max_depth': [None, 30],
}
lr_params = {
    'C': [0.1, 1, 10],
    'solver': ['newton-cg', 'liblinear'],
}
svm_params = {
    'C': [1, 10],
    'kernel': ['linear', 'rbf'],
}

# Base estimators handed to the grid search
model_rf = RandomForestClassifier(random_state=42)
model_lr = LogisticRegression(random_state=42, max_iter=1000)
model_svm = SVC(random_state=42)

# 5-fold grid search for each model, fitted on the validation split
grid_rf = GridSearchCV(estimator=model_rf, param_grid=rf_params, cv=5).fit(X_val_combined, y_val)
best_rf_params = grid_rf.best_params_

grid_lr = GridSearchCV(estimator=model_lr, param_grid=lr_params, cv=5).fit(X_val_combined, y_val)
best_lr_params = grid_lr.best_params_

grid_svm = GridSearchCV(estimator=model_svm, param_grid=svm_params, cv=5).fit(X_val_combined, y_val)
best_svm_params = grid_svm.best_params_

# Rebuild each model with its best parameters and fit it on the training split
model_rf = RandomForestClassifier(random_state=42, **best_rf_params).fit(X_train_combined, y_train)
model_lr = LogisticRegression(random_state=42, max_iter=1000, **best_lr_params).fit(X_train_combined, y_train)
model_svm = SVC(random_state=42, **best_svm_params).fit(X_train_combined, y_train)

# Model evaluation: for every fitted model print macro-averaged precision,
# recall and F1 plus accuracy, first on the training split, then on the
# test split.
models = {'Random Forest': model_rf,
          'Logistic Regression': model_lr,
          'SVM': model_svm}

# NOTE(review): the test frame is sampled from human.csv only, so it
# presumably contains a single class - confirm. In that case the macro
# averages on the test split include a class with no true samples, whose
# recall is undefined; accuracy is the meaningful number there.
# zero_division=0 scores such undefined metrics as 0 (the value the default
# "warn" setting also reports) without emitting a warning for every report.
evaluation_splits = (
    ("{} Train Evaluation:", X_train_combined, y_train, {}),
    ("{} Test Evaluation:", X_test_combined, y_test, {'zero_division': 0}),
)
for name, model in models.items():
    for header, features, labels, report_options in evaluation_splits:
        # Predict the labels for this split and summarise them as a dict
        report = classification_report(
            labels, model.predict(features), output_dict=True, **report_options
        )
        macro_avg = report['macro avg']

        print(header.format(name))
        print("  Precision: {:.5f}".format(macro_avg['precision']))
        print("  Recall: {:.5f}".format(macro_avg['recall']))
        print("  F-measure: {:.5f}".format(macro_avg['f1-score']))
        print("  Accuracy: {:.5f}\n".format(report['accuracy']))


Random Forest Train Evaluation:
  Precision: 0.96969
  Recall: 0.96966
  F-measure: 0.96964
  Accuracy: 0.96964



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Random Forest Test Evaluation:
  Precision: 0.50000
  Recall: 0.23755
  F-measure: 0.32208
  Accuracy: 0.47510

Logistic Regression Train Evaluation:
  Precision: 0.95813
  Recall: 0.95810
  F-measure: 0.95810
  Accuracy: 0.95811

Logistic Regression Test Evaluation:
  Precision: 0.50000
  Recall: 0.25680
  F-measure: 0.33932
  Accuracy: 0.51360

SVM Train Evaluation:
  Precision: 0.53880
  Recall: 0.53706
  F-measure: 0.53163
  Accuracy: 0.53673

SVM Test Evaluation:
  Precision: 0.50000
  Recall: 0.31950
  F-measure: 0.38987
  Accuracy: 0.63900



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Error Analysis

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from nltk import FreqDist
from nltk import bigrams, trigrams

In [None]:
# Predict the labels for the validation data with the model chosen for the
# error analysis
best_model = model_svm  # Use the best performing model from your previous analysis
y_pred_val = best_model.predict(X_val_combined)

# Positions (not index labels) of the validation rows that were predicted
# wrongly, and the corresponding rows of X_val
misclassified_indices = np.flatnonzero(y_val != y_pred_val)
misclassified_data = X_val.iloc[misclassified_indices]

# Histogram of each hand-crafted feature over the misclassified rows
# (a 'sent_length' histogram existed here but was left disabled).
histogram_specs = (
    ('avg_sent_length', 'Average Sentence Length',
     'Histogram of Average Sentence Length for Misclassified Instances'),
    ('repetitive_words', 'Repetitive Words',
     'Histogram of Repetitive Words for Misclassified Instances'),
    ('text_entropy', 'Text Entropy',
     'Histogram of Text Entropy for Misclassified Instances'),
)
for column, xlabel, title in histogram_specs:
    misclassified_data[column].hist()
    plt.xlabel(xlabel)
    plt.ylabel('Frequency')
    plt.title(title)
    plt.show()

# Join all misclassified texts into one string and tokenize it (lower-cased)
misclassified_texts = ' '.join(misclassified_data['text'])
tokens = nltk.word_tokenize(misclassified_texts.lower())

# Frequency distributions of single tokens, bigrams and trigrams
fdist = FreqDist(tokens)
bigram_tokens = list(bigrams(tokens))
trigram_tokens = list(trigrams(tokens))
fdist_bigrams = FreqDist(bigram_tokens)
fdist_trigrams = FreqDist(trigram_tokens)

# Part-of-speech tag every token and count how often each tag occurs
pos_tags = nltk.pos_tag(tokens)
pos_freq = FreqDist(tag for _, tag in pos_tags)

# Plot the 30 most frequent entries of every distribution
frequency_plots = (
    (fdist, 'Top 30 Most Common Tokens in Misclassified Instances'),
    (fdist_bigrams, 'Top 30 Most Common Bigrams in Misclassified Instances'),
    (fdist_trigrams, 'Top 30 Most Common Trigrams in Misclassified Instances'),
    (pos_freq, 'Top 30 Most Common POS Tags in Misclassified Instances'),
)
for distribution, title in frequency_plots:
    distribution.plot(30, cumulative=False, title=title)
    plt.show()


In [None]:
# Apply seaborn's 'whitegrid' style with fonts scaled by 1.2; this is a
# global setting, so it affects every figure drawn after this line.
sns.set(style='whitegrid', font_scale=1.2)

def plot_histogram(data, xlabel, ylabel, title, filename):
    """Draw a 30-bin histogram with a KDE curve, save it, then display it.

    Args:
        data: Values to plot; passed straight to ``sns.histplot``.
        xlabel: Label for the x axis.
        ylabel: Label for the y axis.
        title: Figure title.
        filename: Path the figure is written to before it is shown.
    """
    figure, axes = plt.subplots(figsize=(12, 8))
    sns.histplot(data, kde=True, color='darkblue', bins=30, ax=axes)
    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)
    axes.set_title(title)
    figure.savefig(filename)
    plt.show()

# Draw and save the styled histogram of each hand-crafted feature of the
# misclassified rows
for column, xlabel, title, filename in (
    ('avg_sent_length', 'Average Sentence Length',
     'Histogram of Average Sentence Length for Misclassified Instances', 'avg_sent_length.png'),
    ('repetitive_words', 'Repetitive Words',
     'Histogram of Repetitive Words for Misclassified Instances', 'repet_words.png'),
    ('text_entropy', 'Text Entropy',
     'Histogram of Text Entropy for Misclassified Instances', 'entropy.png'),
):
    plot_histogram(misclassified_data[column], xlabel, 'Frequency', title, filename)

In [None]:
# Create a colormap
cmap = sns.light_palette("blue", as_cmap=True)


def _plot_top_counts(distribution, label, title, filename):
    """Bar-plot the 30 most common entries of *distribution* and save the figure.

    Bars are shaded through ``cmap`` by their frequency relative to the
    largest frequency. Returns the DataFrame (columns *label* and
    'Frequency') that was plotted.
    """
    plt.figure(figsize=(12, 8))
    table = pd.DataFrame(distribution.most_common(30), columns=[label, 'Frequency'])
    shades = cmap(table['Frequency'] / table['Frequency'].max())
    sns.barplot(x=label, y='Frequency', data=table, palette=shades)
    plt.title(title)
    plt.xticks(rotation=45)
    plt.savefig(filename, bbox_inches='tight')
    plt.show()
    return table


# Top 30 most common tokens
most_common = _plot_top_counts(
    fdist, 'Token', 'Top 30 Most Common Tokens in Misclassified Instances', 'most_common.png')

# Top 30 most common bigrams
most_common_bigrams = _plot_top_counts(
    fdist_bigrams, 'Bigram', 'Top 30 Most Common Bigrams in Misclassified Instances', 'bigrams.png')

# Top 30 most common trigrams
most_common_trigrams = _plot_top_counts(
    fdist_trigrams, 'Trigram', 'Top 30 Most Common Trigrams in Misclassified Instances', 'trigrams.png')

# Top 30 most common POS tags
most_common_pos_tags = _plot_top_counts(
    pos_freq, 'POS Tag', 'Top 30 Most Common POS Tags in Misclassified Instances', 'pos_tag.png')