In [1]:
import re
import numpy as np
import pandas as pd

# Load the raw CNN articles table.
# The path is handed straight to pandas: read_csv then decodes the file as
# UTF-8 (its documented default) instead of whatever encoding a bare open()
# would pick up from the platform locale, and it manages the file handle itself.
path = "cnn2011/CNN_Articels_clean_2/CNN_Articels_clean.csv"
df = pd.read_csv(path)

# Quick sanity check of what was loaded: (rows, columns) and the column names.
print(df.shape)
print(df.columns)

(37949, 11)
Index(['Index', 'Author', 'Date published', 'Category', 'Section', 'Url',
       'Headline', 'Description', 'Keywords', 'Second headline',
       'Article text'],
      dtype='object')


In [2]:
def is_single_person(author):
    """Return True when the byline string looks like one named individual.

    A byline is rejected when it has fewer than two whitespace-separated
    tokens, mentions "Staff" or "Reuters" as a whole word (any casing),
    contains the joiner " and ", or is exactly "By ".
    """
    if len(author.split()) < 2:
        return False
    if re.search(r"\b(Staff|Reuters)\b", author, re.IGNORECASE) is not None:
        return False
    if " and " in author:
        return False
    # Kept for parity with the original rule set; "By " has a single token,
    # so the token-count guard above already rejects it.
    if author == "By ":
        return False
    return True


# Row filtering, applied in this order:
#   1. drop rows with a missing article text or author,
#   2. keep only bylines that is_single_person accepts,
#   3. keep only articles longer than 100 characters.
# Step 1 must come first because the later steps call string methods on
# both columns.
df = (
    df.dropna(subset=["Article text", "Author"])
    .loc[lambda frame: frame["Author"].map(is_single_person)]
    .loc[lambda frame: frame["Article text"].str.len() > 100]
)

In [3]:
# Character count of every article. assign() returns a new frame instead of
# writing a column into `df`, which at this point is the product of boolean
# filtering; item assignment on such a frame can raise SettingWithCopyWarning.
# .str.len() matches the length test already used when filtering short articles.
df = df.assign(**{"Article text length": df["Article text"].str.len()})

# Total number of characters per author, largest first.
author_article_length = (
    df.groupby("Author")["Article text length"].sum().sort_values(ascending=False)
)

# Keep only articles by authors whose combined text exceeds 1,000,000 characters.
df = df[df["Author"].isin(author_article_length[author_article_length > 1000000].index)]

# RANDOMIZATION OF THE DATA
# Shuffle every row once, with a fixed seed so the order is reproducible.
# This allows later train/test splits to be taken purely by position.
df = df.sample(frac=1, random_state=40)

In [4]:
# One long string per author: all of that author's articles joined with a
# single space, then cut down to the first 1,000,000 characters.
topauthors = (
    df.groupby("Author")["Article text"]
    .apply(" ".join)
    .str.slice(stop=1000000)
)

In [5]:
# Length of one text chunk, in characters. Using the rule of thumb
# 1 word ~ 5 characters, a chunk holds roughly 2,000 words.
charcount = 10_000


def split_text(text, charcount):
    """Cut `text` into consecutive pieces of exactly `charcount` characters.

    Pieces are returned in order as a list. A trailing remainder shorter
    than `charcount` is discarded, so a text shorter than `charcount`
    yields an empty list.
    """
    whole_chunks = len(text) // charcount
    return [
        text[k * charcount : (k + 1) * charcount] for k in range(whole_chunks)
    ]

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Read the fixed vocabulary: one term per line of vocab.txt.
with open("vocab.txt", "r") as vocab_file:
    vocab = vocab_file.read().splitlines()
print(vocab)

# Count occurrences of the vocabulary terms only. The custom token pattern
# accepts tokens of a single word character, which the vocabulary needs
# (it contains entries such as "s", "t", "a" and "i").
vectorizer = CountVectorizer(
    token_pattern=r"(?u)\b\w+\b",
    stop_words=None,
    vocabulary=vocab,
)

# Term for each column of the count vectors produced by `vectorizer`.
feature_names = vectorizer.get_feature_names_out()

['the', 'be', 'are', 'been', 'is', 's', 'was', 'were', 'of', 'and', 'to', 'a', 'an', 'in', 'have', 'had', 'has', 'it', 'he', 'his', 'that', 'i', 'for', 'they', 'their', 'you', 'not', 't', 'on', 'she', 'her', 'with', 'as', 'this', 'we', 'at', 'by', 'but', 'from', 'or', 'which', 'will', 'there', 'no', 'can', 'if', 'what', 'would', 'so', 'up']


In [7]:
from sklearn.model_selection import train_test_split

# Load the generated rewrites and split the rows 50/50 with a fixed seed.
ai_df = pd.read_csv("rewrites.csv")
ai_train_df, ai_test_df = train_test_split(ai_df, random_state=40, test_size=0.5)

# Training half: join every generated text with a space, cut the result into
# chunks of `charcount` characters, and store one chunk per row.
ai_train = " ".join(ai_train_df["Generated text"])
ai_train_subsets = split_text(ai_train, charcount)
ai_train_subsets_df = pd.DataFrame(ai_train_subsets, columns=["Text chunk"])

# Test half: same treatment.
ai_test = " ".join(ai_test_df["Generated text"])
ai_test_subsets = split_text(ai_test, charcount)
ai_test_subsets_df = pd.DataFrame(ai_test_subsets, columns=["Text chunk"])

In [8]:
from scipy.stats import skewnorm

def _as_proportions(counts):
    """Divide a count vector by its total; an all-zero vector is returned as is."""
    total = counts.sum()
    return counts / total if total != 0 else counts


# Vocabulary counts for every AI training chunk, stored as one array per row.
ai_train_counts = vectorizer.transform(ai_train_subsets_df["Text chunk"]).toarray()
ai_train_subsets_df["Count"] = list(ai_train_counts)

# Same vectors rescaled to each term's share of the chunk's counted terms.
ai_train_subsets_df["Proportion"] = ai_train_subsets_df["Count"].apply(_as_proportions)

# Fit one skew-normal per vocabulary term to that term's proportions across
# the training chunks; ai_fit_params[i] holds the fit for feature_names[i].
ai_fit_params = []
for i in range(len(feature_names)):
    aidist = ai_train_subsets_df["Proportion"].apply(lambda vec: vec[i])
    ai_fit_params.append(skewnorm.fit(aidist))

In [9]:
def get_p(x, params):
    """Two-sided tail probability of `x` under a skew-normal distribution.

    `params` is the (a, loc, scale) triple describing the distribution.
    The result is twice the smaller of the two tail areas at `x`: 2 * cdf
    when the cdf is at most 0.5, otherwise 2 - 2 * cdf.
    """
    shape, location, spread = params
    cdf_at_x = skewnorm.cdf(x, a=shape, loc=location, scale=spread)
    if cdf_at_x > 0.5:
        return 2 - 2 * cdf_at_x
    return 2 * cdf_at_x


def p_value(vector, fits):
    """Average the per-feature two-sided p-values of `vector`.

    For every index i of the module-level `feature_names`, `vector[i]` is
    scored with `get_p` against the parameters in `fits[i]`; the mean of
    those scores is returned.
    """
    per_feature = [
        get_p(vector[idx], fits[idx]) for idx in range(len(feature_names))
    ]
    return np.mean(per_feature)

In [10]:
def classify(test_subsets, inverse=False, ai_bias=1, human_fits=None, ai_fits=None):
    """Label text chunks as human- or AI-written and count the flagged ones.

    Each chunk is turned into a vector of vocabulary-term proportions and
    scored with `p_value` against the human fits and against the AI fits.
    A chunk is predicted human when ``p_human > ai_bias * p_ai``.

    Parameters
    ----------
    test_subsets : data accepted by ``pd.DataFrame(..., columns=["Text chunk"])``
        The chunks to classify (a list of strings, or a frame that already
        has a "Text chunk" column).
    inverse : bool
        When True the predictions are negated before counting, so the
        returned count is the number of chunks predicted AI.
    ai_bias : number
        Multiplier applied to the AI score before the comparison; values
        above 1 make an AI prediction more likely.
    human_fits, ai_fits : sequence of (a, loc, scale), optional
        Per-feature skew-normal parameters. When omitted, the module-level
        `human_fit_params` / `ai_fit_params` are used, which is how existing
        callers supply them.

    Returns
    -------
    tuple
        (number of chunks whose final prediction flag is True, number of chunks).
    """
    # Fall back to the notebook-level fits so existing calls keep working.
    if human_fits is None:
        human_fits = human_fit_params
    if ai_fits is None:
        ai_fits = ai_fit_params

    test_subsets_df = pd.DataFrame(test_subsets, columns=["Text chunk"])

    # Vectorize once. The earlier version ran the same transform and
    # normalization twice, into "Count"/"Proportion" columns it never read.
    test_subsets_df["Vectors"] = list(
        vectorizer.transform(test_subsets_df["Text chunk"]).toarray()
    )
    # Counts -> proportions; an all-zero vector is left untouched to avoid 0/0.
    test_subsets_df["Vectors"] = test_subsets_df["Vectors"].apply(
        lambda x: x / x.sum() if x.sum() != 0 else x
    )

    test_subsets_df["p_human"] = test_subsets_df["Vectors"].apply(
        lambda x: p_value(x, human_fits)
    )
    test_subsets_df["p_ai"] = test_subsets_df["Vectors"].apply(
        lambda x: p_value(x, ai_fits)
    )

    # True means "predicted human".
    test_subsets_df["Prediction"] = test_subsets_df["p_human"] > ai_bias * test_subsets_df["p_ai"]
    if inverse:
        test_subsets_df["Prediction"] = ~test_subsets_df["Prediction"]

    return test_subsets_df["Prediction"].sum(), len(test_subsets_df)

In [11]:
# Running totals over all authors.
human_correct, human_total, ai_correct, ai_total = 0, 0, 0, 0

# Multiplier on the AI score inside classify(). It is the same for every
# author, so it is set once here rather than on every loop iteration.
ai_bias = 1.2

for author, text in topauthors.items():
    # First half of the author's text is used for fitting, second half for testing.
    i_split = int(len(text) * 0.5)
    train, test = text[:i_split], text[i_split:]
    human_train_subsets_df = pd.DataFrame(
        split_text(train, charcount), columns=["Text chunk"]
    )
    human_test_subsets_df = pd.DataFrame(
        split_text(test, charcount), columns=["Text chunk"]
    )

    # Vocabulary counts per training chunk, then counts -> proportions
    # (all-zero vectors are left untouched to avoid 0/0).
    human_train_subsets_df["Count"] = list(
        vectorizer.transform(human_train_subsets_df["Text chunk"]).toarray()
    )
    human_train_subsets_df["Proportion"] = human_train_subsets_df["Count"].apply(
        lambda x: x / x.sum() if x.sum() != 0 else x
    )

    # One skew-normal fit per vocabulary term for this author.
    # NOTE: classify() picks these up through the module-level name
    # `human_fit_params`, so the name must be rebound here on every iteration.
    human_fit_params = []
    for i in range(len(feature_names)):
        humandist = human_train_subsets_df["Proportion"].apply(lambda x: x[i])
        human_fit_params.append(skewnorm.fit(humandist))

    # Held-out chunks of this author: count those predicted human.
    human_results = classify(human_test_subsets_df, ai_bias=ai_bias)
    human_correct += human_results[0]
    human_total += human_results[1]

    # The full AI test set is re-scored against every author's fit, so
    # ai_total grows by the number of AI test chunks on each iteration.
    ai_results = classify(ai_test_subsets_df, inverse=True, ai_bias=ai_bias)
    ai_correct += ai_results[0]
    ai_total += ai_results[1]

In [12]:
# Report per-class accuracy as a fraction followed by the raw (correct/total) counts.
for label, correct, total in (
    ("Accuracy on humans:", human_correct, human_total),
    ("Accuracy on AI:", ai_correct, ai_total),
):
    print(label, correct / total, f"({correct}/{total})")

Accuracy on humans: 0.9714285714285714 (1020/1050)
Accuracy on AI: 0.9927113702623906 (6129/6174)


In [13]:
def f_beta(precision, recall, beta=1):
    """Return the F-beta score, the weighted harmonic mean of precision and recall.

    Parameters
    ----------
    precision, recall : float
        The two rates being combined.
    beta : float, default 1
        Weight of recall relative to precision: beta=1 gives F1, beta<1
        favours precision, beta>1 favours recall.

    Returns
    -------
    float
        ``(1 + beta**2) * precision * recall / (beta**2 * precision + recall)``,
        or 0.0 when that denominator is zero (for example when precision and
        recall are both 0), where the score is otherwise undefined.
    """
    beta_sq = beta ** 2
    denominator = (beta_sq * precision) + recall
    # Without this guard a classifier with no true positives raises ZeroDivisionError.
    if denominator == 0:
        return 0.0
    return (1 + beta_sq) * (precision * recall) / denominator

# Confusion counts with "AI-written" treated as the positive class.
tp = ai_correct                    # AI chunks predicted AI
fn = ai_total - ai_correct         # AI chunks predicted human
fp = human_total - human_correct   # human chunks predicted AI

recall = tp / (tp + fn)
precision = tp / (tp + fp)

# F0.5 weights precision more heavily than recall; F1 weights them equally.
f05 = f_beta(precision, recall, beta=0.5)
f1 = f_beta(precision, recall)

print(f"Precision: {precision}\nRecall: {recall}\nF1: {f1}\nF0.5: {f05}")

Precision: 0.9951290793960058
Recall: 0.9927113702623906
F1: 0.993918754560934
F0.5: 0.9946445959104189


In [14]:
# Raw confusion counts, in the order: true positives, false positives, false negatives.
confusion_counts = (tp, fp, fn)
print(*confusion_counts)

6129 30 45
