# Preprocess Notebook

In [None]:
import os
from helper import extract_keywords, sample_csv, dataset_stats

In [None]:
# Prompt text handed to extract_keywords for every dataset in this notebook.
# The leading and trailing newlines are part of the string.
prompt = (
    "\n"
    "Keyword Extraction prompt\n"
)

## BBC News Business

In [None]:
# Dataset identifier; the cells below use it to build the Raw/Processed paths.
dataset = "BBC_News"

### TTM

In [None]:
# Join the BBC business news text files into a single corpus: one list entry
# per article, with each article flattened onto a single line.
input_path = os.path.join('Data/Raw', dataset, 'business')
output_path = os.path.join('Data/Processed', dataset, 'TTM', dataset + '.txt')

corpus = []
# os.listdir returns names in arbitrary order; sort them so the corpus (and
# anything derived from its line order) is reproducible across runs/machines.
for file in sorted(os.listdir(input_path)):
    file_path = os.path.join(input_path, file)

    # Skip anything that is not a regular file (e.g. sub-directories), which
    # open() below could not read.
    if not os.path.isfile(file_path):
        continue

    # NOTE(review): files are decoded with the platform default encoding;
    # confirm the raw files' actual encoding before pinning one here.
    with open(file_path) as f:
        document = f.read()

    # Replace internal newlines with spaces so each article fits on one line.
    corpus.append(document.replace('\n', ' ').strip())

In [None]:
# Write the corpus to disk, one document per line, then report its statistics.
# open(..., 'w') does not create missing parent directories, so make sure the
# destination folder exists first.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

with open(output_path, 'w') as f:
    f.write('\n'.join(corpus))

# dataset_stats returns three values, unpacked here in this order.
average_count, num_strings, total_words = dataset_stats(corpus)
print('- Dataset Characteristics -')
print("Number of documents:", num_strings)
print("Average word count:", average_count)
print("Total words:", total_words)

### LLM

In [None]:
# Short description of the document type, passed to extract_keywords together
# with the dataset name and the shared prompt.
context = "finance news article"
extract_keywords(dataset, prompt, context)

In [None]:
# Load the processed LLM file for the current dataset, treating each line as
# one document, and report its statistics.
# Decode explicitly as UTF-8, as the ChatGPT_Tweets processed files in this
# notebook are read; without it open() falls back to the platform's locale
# encoding, which differs between machines.
# NOTE(review): presumably this file is produced by extract_keywords - confirm
# in helper that it is written as UTF-8.
with open(f'Data/Processed/{dataset}/LLM/{dataset}.txt', encoding='utf-8') as f:
    documents = f.read().split('\n')

# dataset_stats returns three values, unpacked here in this order.
average_count, num_strings, total_words = dataset_stats(documents)
print('- Dataset Characteristics -')
print("Number of documents:", num_strings)
print("Average word count:", average_count)
print("Total words:", total_words)

## Arxiv Abstracts

In [None]:
# Dataset identifier; passed to the helper calls and used in the Processed paths below.
dataset = "Arxiv_Abstracts"

### TTM

In [None]:
# Arguments for sample_csv: the CSV column(s) to keep and the value passed as
# num_samples (presumably the number of rows to sample - confirm in helper).
columns_to_retain = ["summaries"]  # Abstracts
num_samples = 500

sample_csv(dataset, num_samples, columns_to_retain)

In [None]:
# Load the processed TTM file for the current dataset, treating each line as
# one document, and report its statistics.
# Decode explicitly as UTF-8, as the ChatGPT_Tweets processed files in this
# notebook are read; without it open() falls back to the platform's locale
# encoding, which differs between machines.
# NOTE(review): presumably this file is produced by sample_csv - confirm in
# helper that it is written as UTF-8.
with open(f'Data/Processed/{dataset}/TTM/{dataset}.txt', encoding='utf-8') as f:
    documents = f.read().split('\n')

# dataset_stats returns three values, unpacked here in this order.
average_count, num_strings, total_words = dataset_stats(documents)
print('- Dataset Characteristics -')
print("Number of documents:", num_strings)
print("Average word count:", average_count)
print("Total words:", total_words)

### LLM

In [None]:
# Short description of the document type, passed to extract_keywords together
# with the dataset name and the shared prompt.
context = "scientific abstract"
extract_keywords(dataset, prompt, context)

In [None]:
# Load the processed LLM file for the current dataset, treating each line as
# one document, and report its statistics.
# Decode explicitly as UTF-8, as the ChatGPT_Tweets processed files in this
# notebook are read; without it open() falls back to the platform's locale
# encoding, which differs between machines.
# NOTE(review): presumably this file is produced by extract_keywords - confirm
# in helper that it is written as UTF-8.
with open(f'Data/Processed/{dataset}/LLM/{dataset}.txt', encoding='utf-8') as f:
    documents = f.read().split('\n')

# dataset_stats returns three values, unpacked here in this order.
average_count, num_strings, total_words = dataset_stats(documents)
print('- Dataset Characteristics -')
print("Number of documents:", num_strings)
print("Average word count:", average_count)
print("Total words:", total_words)

## ChatGPT Tweets

In [None]:
# Dataset identifier; passed to the helper calls and used in the Processed paths below.
dataset = "ChatGPT_Tweets"

### TTM

In [None]:
# Arguments for sample_csv: the CSV column(s) to keep and the value passed as
# num_samples (presumably the number of rows to sample - confirm in helper).
columns_to_retain = ["content"]  # Tweets
num_samples = 1000

sample_csv(dataset, num_samples, columns_to_retain)

In [None]:
# Read the processed TTM file for the current dataset as UTF-8 and treat each
# line as one document.
ttm_path = f'Data/Processed/{dataset}/TTM/{dataset}.txt'
with open(ttm_path, encoding='utf-8') as handle:
    raw_text = handle.read()
documents = raw_text.split('\n')

# dataset_stats returns three values; keep the notebook-level names for them.
stats = dataset_stats(documents)
average_count, num_strings, total_words = stats

# Print the summary in a fixed order: document count, average, total.
print('- Dataset Characteristics -')
for label, value in (
    ("Number of documents:", num_strings),
    ("Average word count:", average_count),
    ("Total words:", total_words),
):
    print(label, value)

### LLM

In [None]:
# Short description of the document type, passed to extract_keywords together
# with the dataset name and the shared prompt.
context = "tweet about a large language model"
extract_keywords(dataset, prompt, context)

In [None]:
# Read the processed LLM file for the current dataset as UTF-8 and treat each
# line as one document.
llm_path = f'Data/Processed/{dataset}/LLM/{dataset}.txt'
with open(llm_path, encoding='utf-8') as handle:
    raw_text = handle.read()
documents = raw_text.split('\n')

# dataset_stats returns three values; keep the notebook-level names for them.
stats = dataset_stats(documents)
average_count, num_strings, total_words = stats

# Print the summary in a fixed order: document count, average, total.
print('- Dataset Characteristics -')
for label, value in (
    ("Number of documents:", num_strings),
    ("Average word count:", average_count),
    ("Total words:", total_words),
):
    print(label, value)