In [1]:
import pandas as pd

# Read the raw data, tag every row with a 1-based 'index' column taken from its
# position in the file, then discard rows that lack either readability metric.
df = pd.read_csv('dataset.csv')
df = df.assign(index=df.index + 1).dropna(subset=['difficult_words', 'lexicon_count'])

# Keep only the rows that still have no 'text_clarity' label
filtered_df = df.loc[df['text_clarity'].isna()]

# Take the first 100 unlabeled rows as the batch to print
selected_data = filtered_df.iloc[:100]

# Instructions and answer template that precede the records in the printed prompt
instructions = "You are a category classifier AI. Determine if the paragraph is clear enough for the category. Thoroughly read and evaluate the content of each paragraph, comparing it with the category mentioned. This evaluation will assess its relevance and clarity in relation to the specified category. Look for key terms, check the logical flow of the text, and determine whether it accurately represents the concepts and themes relevant to its category.\nCheck each following contents are clear enough or not one by one.\nIf the content is clear enough, you can only type either the word 'clear_enough' else 'not_clear_enough' with its respected index like mentioned in following example. Remember, DO NOT type other way around. DO NOT need to explain why. Strictly follow the given template example.\n"
template = "Example template:\n\nindex,text_clarity\n1,clear_enough\n2,not_clear_enough\n\nHere are the contents of the paragraph you need to check." + "\n"*2
separator = "----------\n"

# Header: instructions, template, then the first separator line
print(instructions, template, separator, sep="\n")

# One section per record: 'index', 'category' and 'paragraph', closed by a separator.
# sep="\n" reproduces the newline that each individual print() call would have added.
for _, record in selected_data.iterrows():
    print(
        f"Index:\n{record['index']}\n",
        f"Category:\n{record['category']}\n",
        f"Paragraph:\n{record['paragraph']}\n",
        separator,
        sep="\n",
    )

# Number of records included in the prompt
print(len(selected_data))

You are a category classifier AI. Determine if the paragraph is clear enough for the category. Thoroughly read and evaluate the content of each paragraph, comparing it with the category mentioned. This evaluation will assess its relevance and clarity in relation to the specified category. Look for key terms, check the logical flow of the text, and determine whether it accurately represents the concepts and themes relevant to its category.
Check each following contents are clear enough or not one by one.
If the content is clear enough, you can only type either the word 'clear_enough' else 'not_clear_enough' with its respected index like mentioned in following example. Remember, DO NOT type other way around. DO NOT need to explain why. Strictly follow the given template example.

Example template:

index,text_clarity
1,clear_enough
2,not_clear_enough

Here are the contents of the paragraph you need to check.


----------

Index:
10

Category:
biographies

Paragraph:
Pepys' diary provides

In [2]:
import pandas as pd

# Merge a batch of acquired 'text_clarity' labels into the main dataset.
#
# Reads 'dataset.csv' and 'labeled_dataset.csv', copies the labels (matched on
# the 1-based 'index' column) into the main frame, and writes the result to
# 'updated_dataset.csv'.

# Upper bound on how many labeled rows are merged in one run
MAX_NEW_LABELS = 100

# Load the datasets
main_df = pd.read_csv('dataset.csv')
# 1-based row id derived from each row's position in the file
main_df['index'] = main_df.index + 1
labeled_df = pd.read_csv('labeled_dataset.csv')

# Ensure both datasets have 'index' and 'text_clarity' columns. This runs before
# either column is accessed so that a missing column is reported here rather
# than as a KeyError further down.
assert 'index' in main_df.columns and 'text_clarity' in main_df.columns, \
    "dataset.csv must provide a 'text_clarity' column"
assert 'index' in labeled_df.columns and 'text_clarity' in labeled_df.columns, \
    "labeled_dataset.csv must provide 'index' and 'text_clarity' columns"

print('Initial labeled values:', main_df['text_clarity'].notnull().sum())
print('Acquired labels:', labeled_df['text_clarity'].notnull().sum())

# Captured before the merge so it can be compared with the file written below
unique_index_before = main_df['index'].nunique()

# Drop rows without a label first (a null label must never overwrite an existing
# one), then keep the first occurrence of each index and limit the batch size.
selected_labeled_data = (
    labeled_df
    .dropna(subset=['text_clarity'])
    .drop_duplicates(subset='index')
    .head(MAX_NEW_LABELS)
)

# Create a dictionary for efficient lookup: index -> label
labeled_data_dict = selected_labeled_data.set_index('index')['text_clarity'].to_dict()

# Update the 'text_clarity' column in the main dataset, touching only rows whose
# index has an acquired label
rows_to_update = main_df['index'].isin(labeled_data_dict)
main_df.loc[rows_to_update, 'text_clarity'] = (
    main_df.loc[rows_to_update, 'index'].map(labeled_data_dict)
)

# Write the updated main dataset to a new CSV file (or overwrite the existing one)
main_df.to_csv('updated_dataset.csv', index=False)  # Replace with the desired filename if needed

# Read the written file back so the final check inspects what is actually on disk
updated_df = pd.read_csv('updated_dataset.csv')

print('Sum of labeled rows:', main_df['text_clarity'].notnull().sum())
print('Unique index in main dataset:', unique_index_before)
print('Unique index in updated dataset:', updated_df['index'].nunique())

Initial labeled values: 9
Acquired labels: 100
Sum of labeled rows: 109
Unique index in main dataset: 9347
Unique index in updated dataset: 9347
