# Postprocess Notebook

In [None]:
import os
from tqdm import tqdm
from helper import postprocess_topics
import pandas as pd

### Rating

In [None]:
# Rating step: folder holding the original topic files and the folder that
# receives the rating results.
input_folder = 'Data/Topics/Original'
output_folder = 'Data/Topics/Processed/Ratings'

In [None]:
# Prompt text handed to postprocess_topics for the rating step.
# NOTE(review): the text below looks like a placeholder — confirm the real prompt.
prompt = """
Coherence rating prompt
"""

In [None]:
postprocess_topics(input_folder, output_folder, prompt)

### Word Intrusion

In [None]:
# Word-intrusion step: folder holding the original topic files and the folder
# that receives the intrusion results.
input_folder = 'Data/Topics/Original'
output_folder = 'Data/Topics/Processed/Intrusion'

In [None]:
# Prompt text handed to postprocess_topics for the word-intrusion step.
# NOTE(review): the text below looks like a placeholder — confirm the real prompt.
prompt = """
Word intrusion prompt
"""

In [None]:
postprocess_topics(input_folder, output_folder, prompt)

### Labeling

In [None]:
# Labeling step: folder holding the original topic files and the folder that
# receives the topic labels.
input_folder = 'Data/Topics/Original'
output_folder = 'Data/Topics/Processed/Labels'

In [None]:
# Prompt text handed to postprocess_topics for the labeling step.
# NOTE(review): the text below looks like a placeholder — confirm the real prompt.
prompt = """
Topic labels prompt
"""

In [None]:
postprocess_topics(input_folder, output_folder, prompt)

### Result Compilation

In [28]:
# Result compilation settings.
# input_folder: parent folder of the processed results.
input_folder = 'Data/Topics/Processed/'
# result_folders: sub-folders of input_folder to compile; index 0 holds the
# ratings and index 1 the word-intrusion results.
result_folders = ['Ratings', 'Intrusion']
# output_folder: intended destination for compiled output (same directory as
# input_folder here).
output_folder = 'Data/Topics/Processed/'

### Ratings

In [32]:
# Compile the mean coherence rating of every result file in the Ratings folder
# into one DataFrame row per file.
result_folder = os.path.join(input_folder, result_folders[0])

# Parallel column buffers, filled in directory-listing order.
datasets = []
vectorizers = []
models = []
ratings = []


def map_values(value):
    """Translate a textual relatedness label into its numeric score.

    'Very Related' maps to 1, 'Somewhat Related' to 2 and
    'Not Very Related' to 3. Any other value yields None.
    """
    label_scores = (
        ('Very Related', 1),
        ('Somewhat Related', 2),
        ('Not Very Related', 3),
    )
    for label, score in label_scores:
        if value == label:
            return score
    return None


for file in os.listdir(result_folder):
    # Each result file is header-less; column 0 carries the textual labels.
    rating_df = pd.read_csv(os.path.join(result_folder, file), header=None)
    rating_df['Rating'] = rating_df[0].apply(map_values)

    # The file name is split on '-': the dataset is the text after the last
    # '_' of the first piece, the vectorizer is the second piece and the
    # model is the last piece with '.csv' removed.
    name_parts = file.split('-')
    datasets.append(name_parts[0].rsplit('_', 1)[-1])
    vectorizers.append(name_parts[1])
    models.append(name_parts[-1].replace('.csv', ''))

    # Mean score of the file, rounded to two decimals.
    ratings.append(round(rating_df['Rating'].mean(), 2))

df = pd.DataFrame({
    'Dataset': datasets,
    'Vectorizer': vectorizers,
    'Model': models,
    'Rating': ratings,
})

### Word Intrusion

In [37]:
# Add the mean word-intrusion value of every result file to `df` as the
# 'Intrusions' column.
result_folder = os.path.join(input_folder, result_folders[1])

# Folder whose listing produced the rows of `df` in the Ratings cell.
ratings_folder = os.path.join(input_folder, result_folders[0])

intrusions = []

# os.listdir returns entries in arbitrary order, so listing the intrusion
# folder independently could attach values to the wrong row. Instead, walk the
# rating files (the listing `df` was built from) and read the intrusion file
# with the same name, which keeps every value aligned with its row.
# NOTE(review): assumes both folders use identical file names — a missing
# intrusion file raises FileNotFoundError instead of silently misaligning.
for file in os.listdir(ratings_folder):
    file_path = os.path.join(result_folder, file)

    # Header-less result file; column 0 carries the numeric intrusion values.
    intrusion_df = pd.read_csv(file_path, header=None)
    intrusions.append(round(intrusion_df[0].mean(), 2))

df['Intrusions'] = intrusions

In [40]:
df.to_csv('Data/Topics/Processed/compilation.csv', index=False)

### Topic Words and Labels

In [59]:
# Topic/label merge settings.
# topic_folder: folder holding the original topic files.
topic_folder = 'Data/Topics/Original'
# label_folder: folder holding the label CSV files.
label_folder = 'Data/Topics/Processed/Labels'
# output_folder: destination for the combined topic/label CSV files.
output_folder = 'Data/Topics/Processed/Topics and Labels'

In [60]:
# Pair every topic file with its label file and write the combined table
# (columns 'Topics' and 'Labels') to output_folder.

# Make sure the destination exists so the first to_csv call cannot fail on a
# missing directory.
os.makedirs(output_folder, exist_ok=True)

for file in os.listdir(topic_folder):

    topic_file_path = os.path.join(topic_folder, file)

    # One topic per line; blank lines are dropped.
    with open(topic_file_path) as f:
        topics = [topic for topic in f.read().split('\n') if topic]

    df = pd.DataFrame({'Topics': topics})

    # The label file shares the topic file's base name with a '.csv'
    # extension. `file` is already the bare name from os.listdir, so no path
    # splitting is needed (splitting the joined path on a backslash only
    # worked on Windows), and splitext swaps just the trailing extension.
    label_file = os.path.splitext(file)[0] + '.csv'
    label_file_path = os.path.join(label_folder, label_file)

    # Header-less label file; column 0 carries one label per topic. pandas
    # raises ValueError below if the label count differs from the topic count.
    labels = pd.read_csv(label_file_path, header=None)
    labels = labels[0].values.tolist()

    df['Labels'] = labels

    save_path = os.path.join(output_folder, label_file)
    df.to_csv(save_path, index=False)