In [None]:
import re
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from gensim.utils import tokenize

## Standardized Project Gutenberg Corpus

### Sample the data

In [None]:
# Read the metadata file.
# A forward-slash path is resolved on Windows, Linux and macOS alike, whereas a
# backslash-separated path only works on Windows.
spgc_metadata = pd.read_csv("data/SPGC-metadata-2018-07-18.csv")

# Due to computational resources select only texts with size less than 1000 KB
spgc_metadata = spgc_metadata[spgc_metadata["file_size"] < 1000]

# Count the number of texts per language
spgc_languages_count = spgc_metadata.value_counts("language")

# Languages that are dropped from the top 20.
# Filtering against a set (instead of calling list.remove) does not raise a
# ValueError when one of these labels is not among the top 20 to begin with.
spgc_languages_excluded = {
    "['eo']",  # Esperanto - embeddings are not available in NLPL repository
    "['tl']",  # Tagalog - embeddings are not available in NLPL repository
    "['zh']",  # Chinese - low coverage during the calculations
}

# Select the top 20 languages (most frequent first) and skip the excluded ones
spgc_languages_chosen = [
    language
    for language in spgc_languages_count.head(20).index.tolist()
    if language not in spgc_languages_excluded
]

# Filter the metadata to include only the chosen languages
spgc_metadata_filtered = spgc_metadata[spgc_metadata["language"].isin(spgc_languages_chosen)]

# Sample 100 texts per language
# If a language has less than 100 texts, sample all of them.
# The groups are sampled one by one and concatenated instead of going through
# `groupby(...).apply(...)`: recent pandas versions deprecate (and later stop)
# passing the grouping column to the applied function, which would remove the
# "language" column that the following cells rely on. Each group is still
# sampled with the same seed, so the selected rows are unchanged.
spgc_sampled_groups = [
    group.sample(n=min(len(group), 100), random_state=213)
    for _, group in spgc_metadata_filtered.groupby("language")
]
spgc_metadata_sampled = (
    pd.concat(spgc_sampled_groups)
    if spgc_sampled_groups
    else spgc_metadata_filtered.iloc[0:0]  # nothing matched: keep an empty frame
)

### Count number of tokens and coverage

In [None]:
# For every chosen language: load its embeddings once, then record the token
# count and the embedding coverage of each sampled text in that language.
for language in spgc_languages_chosen:
    print(f"Processing language: {language}")
    # Filter the metadata for the current language
    spgc_metadata_language = spgc_metadata_sampled[spgc_metadata_sampled["language"] == language]

    # Load embeddings for the current language.
    # The label is a string such as "['en']"; the regex pulls out the first
    # two-letter lowercase run, which is used in the embedding file name.
    language_code = re.findall(r"[a-z]{2}", language)[0]
    # Forward slash: portable, and avoids the invalid "\w" escape sequence that a
    # backslash would introduce inside a non-raw string literal.
    model_current = KeyedVectors.load_word2vec_format(f"embeddings/word2vec_{language_code}.bin", binary=True)

    # Iterate over all texts in the current language
    for index, row in spgc_metadata_language.iterrows():
        file_name = row["id"] + "_tokens.txt"
        file_path = f"data/SGPC/{file_name}"

        # Read the text file (one token per line); `with` closes the handle even
        # if reading fails. Empty lines (e.g. the one produced by a trailing
        # newline) are not tokens and must not inflate the token count.
        with open(file_path, mode="r", encoding="UTF-8") as text_file:
            tokens = [token for token in text_file.read().split("\n") if token]

        # Count the number of tokens in the text
        token_count = len(tokens)
        spgc_metadata_sampled.at[index, "token_count"] = token_count

        # Count the tokens present in the embedding vocabulary; only the count is
        # needed, so the vectors themselves are not materialised.
        covered_count = sum(1 for token in tokens if token in model_current)

        # Calculate coverage (share of tokens with an embedding; 0 for empty texts)
        coverage = covered_count / token_count if token_count > 0 else 0
        spgc_metadata_sampled.at[index, "coverage"] = coverage

In [None]:
# Persist the sampled SPGC metadata (including the token counts and coverage
# computed above) without writing the index as an extra column.
spgc_metadata_sampled.to_csv(
    path_or_buf="data/spgc_metadata_sampled.csv",
    index=False,
)

In [None]:
# Reload the sampled SPGC metadata from disk and keep every row whose
# language label is not Chinese.
spgc_metadata_sampled = pd.read_csv("data/spgc_metadata_sampled.csv").loc[
    lambda frame: frame["language"] != "['zh']"
]

### Descriptive statistics

In [None]:
# Per-language summary of the token counts, restricted to the columns that
# are reported (count, min, max, mean, standard deviation).
spgc_languages_stats_tokens = (
    spgc_metadata_sampled
    .groupby("language")["token_count"]
    .describe()
    [["count", "min", "max", "mean", "std"]]
)
print(spgc_languages_stats_tokens)

In [None]:
# Per-language summary of the embedding coverage, restricted to the columns
# that are reported (count, min, max, mean, standard deviation).
spgc_languages_stats_coverage = (
    spgc_metadata_sampled
    .groupby("language")["coverage"]
    .describe()
    [["count", "min", "max", "mean", "std"]]
)
print(spgc_languages_stats_coverage)

## Human vs LLM Corpus

In [None]:
# All texts are in English
# Load the English word2vec vectors (binary format) once; `model` is the
# vocabulary used by the token/coverage loop of this corpus further below.
model = KeyedVectors.load_word2vec_format("embeddings/word2vec_en.bin", binary=True)

### Sample the data

In [None]:
# Read the human vs LLM text corpus.
# The path uses a forward slash: it is portable across operating systems and
# avoids the invalid "\H" escape sequence that a backslash produces inside a
# non-raw string literal.
df_human_vs_llm = pd.read_csv("data/Human_vs_LLM_Text_Corpus.csv")

In [None]:
# Select a few sources for the analysis
sources_chosen = ["Human", "GPT-3.5", "GPT-4", "LLaMA-7B", "LLaMA-13B", "LLaMA-30B", "LLaMA-65B"]
df_human_vs_llm = df_human_vs_llm[df_human_vs_llm["source"].isin(sources_chosen)]

# Sample 1000 texts per source (all of them when a source has fewer).
# The groups are sampled one by one and concatenated instead of going through
# `groupby(...).apply(...)`: recent pandas versions deprecate (and later stop)
# passing the grouping column to the applied function, which would remove the
# "source" column needed for the per-source statistics. Each group is still
# sampled with the same seed, so the selected rows are unchanged.
human_vs_llm_sampled_groups = [
    group.sample(n=min(len(group), 1000), random_state=213)
    for _, group in df_human_vs_llm.groupby("source")
]

# `ignore_index=True` yields a fresh 0..n-1 index without mutating in place.
df_human_vs_llm_sampled = (
    pd.concat(human_vs_llm_sampled_groups, ignore_index=True)
    if human_vs_llm_sampled_groups
    else df_human_vs_llm.iloc[0:0].reset_index(drop=True)  # no matching source
)

In [None]:
# Persist the sampled Human vs LLM texts without writing the index as an
# extra column.
df_human_vs_llm_sampled.to_csv(
    path_or_buf="data/human_vs_llm_sampled.csv",
    index=False,
)

In [None]:
# Reload the sampled Human vs LLM texts from the CSV written by the
# sampling step.
df_human_vs_llm_sampled = pd.read_csv(
    filepath_or_buffer="data/human_vs_llm_sampled.csv",
)

### Count number of tokens and coverage

In [None]:
# Record, for every sampled text, its token count and the share of its tokens
# that are present in the English embedding vocabulary (`model`).
# `position` counts processed rows from 1, so the progress message does not
# depend on what the index labels of the frame happen to be.
for position, (index, row) in enumerate(df_human_vs_llm_sampled.iterrows(), start=1):
    text = row["text"]

    # Tokenize the text.
    # pandas reads empty CSV fields back as NaN (a float), which the tokenizer
    # cannot process; such rows are treated as texts without tokens.
    tokens = list(tokenize(text, lowercase=True)) if isinstance(text, str) else []

    # Count the number of tokens in the text
    token_count = len(tokens)
    df_human_vs_llm_sampled.at[index, "token_count"] = token_count

    # Count the tokens present in the embedding vocabulary; only the count is
    # needed, so the vectors themselves are not materialised.
    covered_count = sum(1 for token in tokens if token in model)

    # Calculate coverage (share of tokens with an embedding; 0 for empty texts)
    coverage = covered_count / token_count if token_count > 0 else 0
    df_human_vs_llm_sampled.at[index, "coverage"] = coverage

    # Print progress every 1000 texts
    if position % 1000 == 0:
        print(f"Processed {position} texts...")

### Descriptive statistics

In [None]:
# Per-source summary of the token counts, restricted to the columns that are
# reported (count, min, max, mean, standard deviation).
human_vs_llm_stats_tokens = (
    df_human_vs_llm_sampled
    .groupby("source")["token_count"]
    .describe()
    [["count", "min", "max", "mean", "std"]]
)
print(human_vs_llm_stats_tokens)

In [None]:
# Per-source summary of the embedding coverage, restricted to the columns
# that are reported (count, min, max, mean, standard deviation).
human_vs_llm_stats_coverage = (
    df_human_vs_llm_sampled
    .groupby("source")["coverage"]
    .describe()
    [["count", "min", "max", "mean", "std"]]
)
print(human_vs_llm_stats_coverage)

## To LaTeX

In [None]:
# Print the selected SPGC language labels (strings such as "['en']"); handy for
# checking them against the keys of the display-name mapping `languages_dictionary`.
print(spgc_languages_chosen)

In [None]:
# Display names for the raw SPGC language labels used as row labels in the tables.
languages_dictionary = {
    "['en']": "English",
    "['fr']": "French",
    "['fi']": "Finnish",
    "['de']": "German",
    "['nl']": "Dutch",
    "['it']": "Italian",
    "['es']": "Spanish",
    "['pt']": "Portuguese",
    # "['zh']": "Chinese",
    "['el']": "Greek",
    "['sv']": "Swedish",
    "['hu']": "Hungarian",
    "['la']": "Latin",
    "['da']": "Danish",
    "['ca']": "Catalan",
    "['pl']": "Polish",
    "['ja']": "Japanese",
    "['no']": "Norwegian",
}

# `rename` leaves labels without a dictionary entry untouched, so running this
# cell more than once is harmless. Assigning `index.map(languages_dictionary)`
# instead turns every already-translated label into NaN on a second execution.
spgc_languages_stats_tokens = spgc_languages_stats_tokens.rename(index=languages_dictionary)
spgc_languages_stats_coverage = spgc_languages_stats_coverage.rename(index=languages_dictionary)

In [None]:
# Column titles meant for the `header=` argument of the `to_latex` calls; that
# argument is commented out in every call in this notebook, so the list is unused.
# NOTE(review): eight titles are listed, but the statistics frames keep only five
# columns (count, min, max, mean, std) - reconcile before enabling `header=`.
headers = ["Count", "Mean", "St.D.", "Min", "Q1", "Q2", "Q3", "Max"]

In [None]:
# Render the SPGC token-count statistics as LaTeX using pandas' defaults.
# Disabled options, kept for reference:
#   float_format="%.1f"
#   header=headers
#   label="tab:spgc_languages_stats_tokens"
#   caption="Statistics of token count for the sampled SPGC texts in different languages."
latex_spgc_languages_stats_tokens = spgc_languages_stats_tokens.to_latex()
print(latex_spgc_languages_stats_tokens)

In [None]:
# Render the SPGC coverage statistics as LaTeX using pandas' defaults.
# Disabled options, kept for reference:
#   float_format="%.1f"
#   header=headers
#   label="tab:spgc_languages_stats_coverage"
#   caption="Statistics of coverage of tokens by NLPL embeddings for the sampled SPGC texts in different languages."
latex_spgc_languages_stats_coverage = spgc_languages_stats_coverage.to_latex()
print(latex_spgc_languages_stats_coverage)

In [None]:
# Render the Human vs LLM token-count statistics as LaTeX using pandas' defaults.
# Disabled options, kept for reference:
#   float_format="%.1f"
#   header=headers
#   label="tab:human_ai_stats_counts"
#   caption="Statistics of token count for the sampled Human vs LLM texts."
latex_human_vs_llm_stats_tokens = human_vs_llm_stats_tokens.to_latex()
print(latex_human_vs_llm_stats_tokens)

In [None]:
# Render the Human vs LLM coverage statistics as LaTeX using pandas' defaults.
# Disabled options, kept for reference:
#   float_format="%.1f"
#   header=headers
#   label="tab:human_ai_stats_coverage"
#   caption="Statistics of coverage of tokens by NLPL embeddings for the sampled Human vs LLM texts."
latex_human_vs_llm_stats_coverage = human_vs_llm_stats_coverage.to_latex()
print(latex_human_vs_llm_stats_coverage)

In [None]:
# Load the table describing the embedding spaces and print it as a LaTeX
# table, with floats formatted without decimals plus a caption and a label.
nlpl_properties = pd.read_csv("data/embeddings_space_sizes.csv")
nlpl_properties_latex = nlpl_properties.to_latex(
    caption="Properties of the embedding spaces for different languages.",
    label="tab:embeddings_space_sizes",
    float_format="%.0f",
)
print(nlpl_properties_latex)