In [19]:
import pandas as pd
import numpy as np
import math
import nltk
from collections import Counter
from nltk.corpus import wordnet as w
# Fixed seed constant; presumably meant to be passed as `random_state=` to
# sampling calls so results are repeatable — TODO confirm where it is used.
random_state = 114514

In [5]:
# Read the two source CSVs from disk (one frame per file)
gpt_data, human_data = map(
    pd.read_csv,
    ("data_process/gpt.csv", "data_process/human.csv"),
)

# Stack both frames row-wise and renumber the index from zero
data = pd.concat((gpt_data, human_data), ignore_index=True)

In [6]:
# Feature extraction functions
def sentence_length(text):
    """Count the sentences in ``text`` and the mean number of words per sentence.

    Args:
        text: Value to analyse. It is coerced with ``str()`` first, so
            non-string inputs are measured by their string form.

    Returns:
        tuple: ``(number_of_sentences, average_words_per_sentence)``. When the
        tokenizer finds no sentences (for example an empty string) the result
        is ``(0, 0.0)`` rather than a ``ZeroDivisionError``.
    """
    sentences = nltk.sent_tokenize(str(text))
    numberofsentences = len(sentences)
    if numberofsentences == 0:
        # Nothing to average over; avoid dividing by zero.
        return 0, 0.0
    # A "word" here is any whitespace-delimited chunk of a sentence.
    total_words = sum(len(sentence.split()) for sentence in sentences)
    return numberofsentences, total_words / numberofsentences

def repetitivewords(text):
    """Score how often the WordNet senses of the tokens in ``text`` overlap.

    Every token of the lower-cased text is looked up with ``w.synsets``; each
    synset is reduced to the set of its lemma names. The score is the number
    of unordered synset pairs that share at least one lemma name, divided by
    the number of tokens.

    Args:
        text: Value to analyse. It is coerced with ``str()`` first.

    Returns:
        float: overlapping-pair count per token, or ``0.0`` when the text
        yields no tokens (instead of raising ``ZeroDivisionError``).
    """
    tokens = nltk.word_tokenize(str(text).lower())
    if not tokens:
        return 0.0

    # Build each synset's lemma-name set once and collapse identical sets,
    # remembering how many times each one occurred. Repeated words produce
    # identical synsets, so this shrinks the pairwise comparison considerably.
    lemma_sets = Counter(
        frozenset(lemma.name() for lemma in synset.lemmas())
        for token in tokens
        for synset in w.synsets(token)
    )
    groups = list(lemma_sets.items())

    repeat = 0
    for position, (names, count) in enumerate(groups):
        if names:
            # Copies of the same non-empty set overlap with each other:
            # count * (count - 1) / 2 unordered pairs.
            repeat += count * (count - 1) // 2
        for other_names, other_count in groups[position + 1:]:
            if not names.isdisjoint(other_names):
                # Every copy of one set pairs with every copy of the other.
                repeat += count * other_count
    return repeat / len(tokens)

def entropy(text):
    """Return the base-2 Shannon entropy of the token distribution of ``text``.

    The input is stringified, lower-cased and split with
    ``nltk.word_tokenize``. Each distinct token contributes ``-p * log2(p)``,
    where ``p`` is that token's share of all tokens. Text without tokens
    yields ``0.0``.
    """
    words = nltk.word_tokenize(str(text).lower())
    n_words = len(words)
    result = 0.0
    # Counter preserves first-seen order, so terms accumulate in text order.
    for occurrences in Counter(words).values():
        share = occurrences / n_words
        if share <= 0:
            continue
        result -= share * math.log(share, 2)
    return result

# Derive each feature column from the raw text column
texts = data['text']
sentence_stats = texts.map(sentence_length)  # one (count, average) tuple per row
data['sent_length'], data['avg_sent_length'] = zip(*sentence_stats)
data['repetitive_words'] = texts.map(repetitivewords)
data['text_entropy'] = texts.map(entropy)

# Persist the enriched frame to CSV without writing the row index
data.to_csv('data_process/processed_data.csv', index=False)

KeyboardInterrupt: 

In [48]:
# Reload the feature table from the CSV the extraction cell writes,
# replacing whatever `data` currently holds in memory.
data = pd.read_csv("data_process/processed_data.csv")

In [49]:
# Show how many rows carry each value of the 'generated' label
data['generated'].value_counts()

generated
0    101643
1     10000
Name: count, dtype: int64

In [50]:
# Balance the classes by randomly discarding surplus rows where generated == 0
# until both label values have the same number of rows.
class_counts = data['generated'].value_counts()
num_to_drop = class_counts[0] - class_counts[1]
# Seed the sampler so the same rows are dropped on every run.
surplus_rows = data[data['generated'] == 0].sample(n=num_to_drop, random_state=random_state)
data = data.drop(surplus_rows.index)
data['generated'].value_counts()

generated
1    10000
0    10000
Name: count, dtype: int64

In [52]:
# Save the class-balanced frame to its own CSV without writing the row index
data.to_csv('data_process/processed_balanced_data.csv', index=False)