In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

import utils

# Silence every warning category for the remainder of the session.
warnings.simplefilter("ignore")

# Project helper; presumably makes sure the NLTK resources used later are
# available -- confirm against utils.checkForNLTKResources.
utils.checkForNLTKResources()

# Locations of the three CSV inputs.
trainDataset = "_data/train_essays.csv"
testDataset = "_data/test_essays.csv"
promtDataset = "_data/train_prompts.csv"

# Read the CSVs, in the order listed, into one DataFrame each.
trainDf, testDf, promptDf = (
    pd.read_csv(csvPath)
    for csvPath in (trainDataset, testDataset, promtDataset)
)

In [None]:
# Summarise the columns, dtypes and non-null counts of each loaded frame.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
print("--------trainDf-------")
trainDf.info()
print("\n")

print("--------testDf-------")
testDf.info()
print("\n")

print("--------promptDf-------")
promptDf.info()

In [None]:
# Bar chart of how many training essays belong to each prompt_id.
sns.set(style="darkgrid")
promptAx = sns.countplot(data=trainDf, x='prompt_id')
promptAx.set_title('Prompt ID Distribution')
plt.show()

In [None]:
# Target distribution: the printed labels treat generated == 0 as human-written
# and generated == 1 as AI-written.
generated = trainDf['generated'].value_counts().to_dict()
# .get(..., 0) reports a class that is absent from the frame as zero instead of
# raising KeyError.
print(f"Human: {generated.get(0, 0)}")
print(f"AI: {generated.get(1, 0)}")
targetAx = sns.countplot(data=trainDf, x='generated')
targetAx.set_title('Target Distribution')
plt.show()

In [None]:
# External dataset used to complete the training set.
externalLLMGeneratedTextsDf = pd.read_csv("_data/LLM-Mistral-7B-Instruct-texts/Mistral7B_CME_v7.csv")
print("------LLM Mistral 7B Dataset------")
# info() prints its own report and returns None, so it is not wrapped in print().
externalLLMGeneratedTextsDf.info()
print("\n")

print("------LLM Mistral 7B Disitrbution------")
print(externalLLMGeneratedTextsDf['prompt_name'].value_counts())
print("\n")

# Keep only the external essays whose prompt also appears in the train prompts.
promptNames = promptDf['prompt_name'].to_list()
promptId_map_dict = {2: 0, 12: 1}  # matching prompt_id columns with the train dataset
# Build the frame in one chain so every step works on a fresh object rather
# than assigning into a filtered slice (which triggers SettingWithCopyWarning).
# NOTE(review): Series.map yields NaN for any prompt_id missing from
# promptId_map_dict -- confirm that 2 and 12 are the only ids left after the filter.
externalLLMDf = (
    externalLLMGeneratedTextsDf
    .loc[externalLLMGeneratedTextsDf['prompt_name'].isin(promptNames)]
    .assign(
        generated=1,  # every row of this dataset is labelled as generated
        prompt_id=lambda frame: frame["prompt_id"].map(promptId_map_dict),
    )
    .drop(columns=['prompt_name'])
)
print("------External Dataset------")
externalLLMDf.info()
print("\n")
print("------External Dataset Distribution------")
print(externalLLMDf['prompt_id'].value_counts())

# Prepare the train dataset to be concatenated with the external dataset.
# errors='ignore' keeps this cell re-runnable: on a second execution 'id' is
# already gone and a plain drop would raise KeyError.
trainDf = trainDf.drop(columns=['id'], errors='ignore')

In [None]:
# Concatenate the train dataset with the external LLM dataset, renumbering the
# rows with a fresh 0..n-1 index.
newTrainDf = pd.concat([trainDf, externalLLMDf], ignore_index=True)
print("------Concatenated Dataset------")
# Series.info() prints its own report and returns None, so it is called
# directly; print() around it would add a stray "None" line.
newTrainDf['generated'].info()
print("\n")
print("------Concatenated Dataset Generated Distribution------")
print(newTrainDf['generated'].value_counts())
combinedAx = sns.countplot(data=newTrainDf, x='generated')
combinedAx.set_title('Target Distribution')
plt.show()

In [None]:
# Report the row count, keep only the first occurrence of each distinct essay
# text, then report the row count again.
print(f"Before dropping duplicates: {newTrainDf.shape[0]}")
newTrainDf = newTrainDf.drop_duplicates(subset=['text'], keep='first')
print(f"After dropping duplicates: {newTrainDf.shape[0]}")

In [None]:
# Word count of each raw essay (whitespace-separated tokens).
newTrainDf['length'] = newTrainDf['text'].apply(lambda essay: len(essay.split()))
# Cleaned version of each essay from the project helper, and its word count.
newTrainDf['cleaned_text'] = newTrainDf['text'].apply(utils.cleanText)
newTrainDf['cleaned_length'] = newTrainDf['cleaned_text'].apply(lambda essay: len(essay.split()))

# One figure, two side-by-side panels: raw lengths on the left, cleaned on the right.
lengthFig, (rawAx, cleanedAx) = plt.subplots(1, 2, figsize=(12, 6))

sns.histplot(newTrainDf['length'], bins=50, kde=True, ax=rawAx)
rawAx.set_title('Length Distribution (before cleaning)')

sns.histplot(newTrainDf['cleaned_length'], bins=50, kde=True, ax=cleanedAx)
cleanedAx.set_title('Length Distribution (after cleaning)')

# Adjust the panel spacing so titles and labels fit inside the figure, then render.
lengthFig.tight_layout()
plt.show()