# Evaluate chunks with GPT-4o

In [None]:
import openai
import os
import pandas as pd
import tqdm

# Preparations
## Load text with Langchain

In [None]:
# Source: https://python.langchain.com/docs/modules/data_connection/document_loaders/file_directory
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import TextLoader

# Root folder holding the .txt files (placeholder -- replace with the real path).
path = r' path to \data\txt_files'

# Extra keyword arguments forwarded to every TextLoader instance.
# NOTE(review): presumably lets TextLoader fall back to encoding detection
# when the default decoding fails -- confirm against the LangChain docs.
text_loader_kwargs={'autodetect_encoding': True}

# Load every file matching "**/*.txt" below `path` with TextLoader,
# showing a progress bar while loading.
loader = DirectoryLoader(path, glob="**/*.txt", show_progress=True, loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)

documents = loader.load()

# File path of each loaded document, taken from its 'source' metadata entry.
doc_sources = [doc.metadata['source']  for doc in documents ]

## Chunk text with Langchain

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from tqdm import tqdm

# Initialize the text splitter.
# The separator list contains a regular expression ("(?<=\\. )", a lookbehind
# for a period followed by a space). RecursiveCharacterTextSplitter escapes its
# separators and treats them as literal strings unless is_separator_regex=True,
# so without the flag the sentence separator can never match and the splitter
# falls straight through from "\n" to " ".
# NOTE: enabling the flag changes chunk boundaries, and therefore the number
# and position of rows in every table derived from `chunks`.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
    separators=["\n\n", "\n", "(?<=\\. )", " ", ""],
    length_function=len,
    is_separator_regex=True,
)

# Split the documents one at a time so that tqdm can report progress.
chunks = []
for document in tqdm(documents, desc="Splitting documents"):
    chunks.extend(splitter.split_documents([document]))

In [None]:
print (f'The numbers of chunks to be evaluated are: {len(chunks)}')

## Build dataframe with year, author, title, and chunks from lists

In [None]:
import pandas as pd
# Parallel lists, one entry per chunk; they become DataFrame columns below.
# NOTE: source_list is filled but is not included in df_source.
source_list = []
year_list = []
author_list = []
title_list = []
text_list = []

# Wrap the chunks iterable with tqdm to add a progress bar
for i in tqdm(chunks, desc="Processing chunks"):
    # Flatten newlines to spaces and repair mis-decoded (mojibake) character
    # sequences by literal replacement.
    # The order of the chain matters: the bare 'Ã' -> 'à' replacement runs
    # after the specific 'Ã?' pairs and before the 'à?' pairs that rely on it.
    # '\xa0' (no-break space) is removed, not replaced by a space.
    # NOTE(review): '§' -> 'ç' rewrites every section sign, and a sequence
    # 'Ã§' would come out as 'àç' -- verify against the actual corpus.
    text = (i.page_content.replace('\n',' ').replace('Ãˆ','È').replace('Ãª', 'ê').replace('â€™','’')
            .replace('Å“', 'œ').replace('Ã©', 'é').replace('Ã®','î')
            .replace('Ã‰', 'É').replace('Ã¨', 'è').replace('Ã', 'à')
            .replace('à¢', 'â').replace('â€¦', '…').replace('à”', 'Ô')
            .replace('à¹','ù').replace('\xa0', '').replace('§','ç').replace('â€”', '—'))
    
    # Source: file name without directory and without '.txt'.
    # Splits on backslashes only, so a path using '/' separators is kept whole;
    # replace() removes every occurrence of '.txt', not just the suffix.
    source = i.metadata['source'].split('\\')[-1].replace('.txt','')
    
    # Year: first whitespace-separated token of the file name.
    year = source.split()[0]
    
    # Author: second token (a single word); raises IndexError if the file
    # name has fewer than two tokens.
    author = source.split()[1]
    
    # Title: all remaining tokens joined by single spaces.
    title = ' '.join(source.split()[2:])
    
    text_list.append(text)
    source_list.append(source)
    year_list.append(year)
    author_list.append(author)
    title_list.append(title)

# One row per chunk with its parsed year, author, title and cleaned text.
df_source = pd.DataFrame({'year': year_list, 'author': author_list, 'title': title_list, 'text': text_list})

## Read the keywords from the text file

In [None]:

with open(r'path to \key_word_lists\technology_list.txt', 'r', encoding='utf-8-sig') as file:
    keywords = [line.strip() for line in file]

# Check whether a text contains one of the keywords.
# Matching is done on whitespace-separated tokens: a keyword only matches when
# it equals a whole token. Substrings do not count, and neither do tokens with
# attached punctuation ("lamp," does not match "lamp"); keywords containing
# spaces can never match.

def contains_keyword(text, keyword_list=None):
    """Return the first keyword that occurs as a whole token in ``text``.

    Args:
        text: The string to search; it is tokenised with ``str.split()``.
        keyword_list: Keywords to look for, in priority order. Defaults to
            the module-level ``keywords`` list.

    Returns:
        The first keyword (in ``keyword_list`` order) found among the tokens,
        or ``0`` if none is present.
    """
    if keyword_list is None:
        keyword_list = keywords

    # Tokenise once instead of once per keyword; a set gives O(1) membership.
    tokens = set(text.split())
    for keyword in keyword_list:
        if keyword in tokens:
            return keyword
    return 0

# Apply the function to the DataFrame
df_source['keyword'] = df_source['text'].apply(contains_keyword)


## Subset data to chunks holding a keyword

In [None]:
df_source[df_source['keyword']!=0]

In [None]:
# Work on an independent copy of df_source, starting at positional row 44204.
# NOTE(review): the hard-coded offset presumably resumes an earlier,
# interrupted evaluation run -- confirm before re-running from scratch.
df = df_source.iloc[44204:, :].copy()

# Evaluate with GPT

In [None]:

import openai
from openai import OpenAI
from tqdm import tqdm
tqdm.pandas()
import os
from dotenv import load_dotenv, find_dotenv
import random
import time
import pandas as pd
import copy

# Prefer a key supplied through the environment (or a .env file) over a
# hard-coded one, so the secret does not have to live in the notebook.
# The placeholder is only used when OPENAI_API_KEY is not set.
load_dotenv(find_dotenv())
openai.api_key = os.getenv('OPENAI_API_KEY', 'your-api-key-goes-here')

# OpenAI() does not read the module-level openai.api_key; it only looks at its
# own api_key argument and the OPENAI_API_KEY environment variable, so the key
# is passed explicitly.
client = openai.OpenAI(api_key=openai.api_key)

assistant_description = "You are an AI language model designed to analyze text for specific themes and emotions. \
                        Your task is to evaluate text chunks to determine if artificial lighting contributes to \
                        romantic or loving emotions. You should carefully analyze the provided text and identify \
                        instances where illumination by artificial lighting evokes feelings of love or romance between people."

def get_completion(prompt, model="gpt-4o"):
    """Send ``prompt`` to the chat completions endpoint and return the reply.

    The module-level ``assistant_description`` is sent as the system message.
    It was previously attached as an extra ``assistant_description`` key on
    the user message, which is not a field of a chat message.

    Args:
        prompt: Text of the user message.
        model: Name of the chat model to query.

    Returns:
        The text content of the first choice in the response.
    """
    messages = [
        {"role": "system", "content": assistant_description},
        {"role": "user", "content": prompt},
    ]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0,  # keep the answers as reproducible as possible
    )
    return response.choices[0].message.content

#####################

def chat_gpt_evaluation(text):
    """Ask the model whether artificial lighting evokes romance in ``text``.

    The chunk is embedded in a fixed instruction prompt and sent through
    ``get_completion``. After the reply arrives the function pauses for a
    random 0-2 seconds, prints the pause length, and returns the reply.
    """
    prompt = f"""
    You will be provided with a text chunk that is delimited with ''' '''.

    Please take your time to carefully analyze the following text chunks and identify 
    instances where illumination by artificial lighting contributes to romantic or loving emotions between people.
    
    Begin your response with 'Yes.' if such instances are found, or 'No.' if they are not.
    
    Follow your initial response with a short argument based on the text explaining why the 
    answer is 'Yes' or 'No'. 
    
    
    Text: '''{text}'''
    """

    reply = get_completion(prompt)

    # Short random pause between consecutive requests.
    pause_seconds = random.randint(0, 2)
    time.sleep(pause_seconds)
    print(f'Sleep for {pause_seconds} sec.')

    return reply

# Create a directory to save the DataFrame copies
output_dir = r'D:\lfl_dataframe_copies_5'
os.makedirs(output_dir, exist_ok=True)

# Evaluate every chunk that matched a keyword and checkpoint after each call.
for index, row in df.iterrows():
    # Rows without a keyword hit (keyword == 0) are not sent to the model.
    if row['keyword'] == 0:
        continue

    # Store the model's answer in the 'gpt_evaluation' column of this row.
    df.at[index, 'gpt_evaluation'] = chat_gpt_evaluation(row['text'])

    # Checkpoint the whole frame so an interrupted run loses no answers.
    # to_csv does not modify df, so the frame is written directly instead of
    # deep-copying it on every iteration. The file name uses the row's index
    # label plus one.
    df.to_csv(os.path.join(output_dir, f'df_copy_{index+1}.csv'), index=False)

# Print the final DataFrame
print("Final DataFrame:")
print(df)


## Store the evaluations 

In [None]:
df.to_csv('gpt_evaluation_chunks_with_keywords.csv', index=False)

In [None]:
# Read the evaluations
import pandas as pd
df = pd.read_csv('gpt_evaluation_chunks_with_keywords.csv')

In [None]:
df

## If the evaluation string starts with 'No.' add 0; if it starts with 'Yes.' add 1
## Store the evaluations again, this time with the 0/1 column added

In [None]:
def add_one_or_zero(x):
    """Map a model answer to a binary label.

    Args:
        x: The evaluation text; any value is accepted and converted with
            ``str()`` (so a missing value such as NaN is handled too).

    Returns:
        ``0`` if the answer starts with ``'No.'``, ``1`` if it starts with
        ``'Yes.'``, and ``None`` for anything else (e.g. rows that were never
        evaluated).
    """
    # Ignore leading whitespace/newlines the model may emit before its answer.
    answer = str(x).lstrip()
    if answer.startswith('No.'):
        return 0
    if answer.startswith('Yes.'):
        return 1
    return None

# Label each evaluation with 0/1 (None where the answer is neither).
# Plain apply is used instead of progress_apply: progress_apply only exists
# after tqdm.pandas() has been called, which is not the case when this cell is
# run in a fresh kernel that merely reloaded the CSV.
df['sum'] = df['gpt_evaluation'].apply(add_one_or_zero)

# NOTE(review): the file name reads "evaluatio" (sic); it is kept unchanged
# because other code may read the file under this name.
df.to_csv('gpt_evaluatio_chunks_with_keywords_sum_added.csv', index=False)

In [None]:
# Inspect df
df