# Continuation with parsing (updated 17.12.2023)

Parsing the first k files in the input folder and creating a folder with the output files

In [None]:
!unzip data_CBSD.zip

In [3]:
import csv
import os
from collections import Counter

def process_files(input_folder, output_folder, k):
    """Deduplicate up to ``k`` answer files and write one row per question.

    Each input file is read as tab-separated text and its first line is
    skipped as a header. The relation label is taken from the last column of
    the first data row (only when that row has more than six columns). Rows
    with at least seven columns are grouped by their first four columns; for
    each group the most frequent (5th column, 6th column) combination is
    kept, ties going to the combination seen first.

    The output file has the same name as the input file. Its first line holds
    the relation; every following line holds the four key columns followed by
    the winning 5th/6th column values.

    Args:
        input_folder: Directory containing the tab-separated input files.
        output_folder: Directory for the results; created if it is missing.
        k: Maximum number of input files to process.
    """
    # os.listdir() returns entries in arbitrary order, so sort them to make
    # the "first k" selection reproducible. Sub-directories are skipped
    # because they cannot be opened as files and would otherwise use up one
    # of the k slots.
    candidates = sorted(
        name for name in os.listdir(input_folder)
        if os.path.isfile(os.path.join(input_folder, name))
    )
    os.makedirs(output_folder, exist_ok=True)

    for file in candidates[:k]:
        # key (first 4 columns) -> Counter of (5th, 6th) column combinations
        answers_by_key = {}
        relation = ""

        with open(os.path.join(input_folder, file), 'r', newline='') as infile:
            reader = csv.reader(infile, delimiter='\t')
            next(reader, None)  # Skip header if exists

            for i, row in enumerate(reader):
                if i == 0:  # Relation comes from the first data row
                    relation = row[-1] if len(row) > 6 else ""
                if len(row) >= 7:  # Ignore rows that lack the answer columns
                    key = tuple(row[:4])
                    least_most = tuple(row[4:6])
                    answers_by_key.setdefault(key, Counter())[least_most] += 1

        with open(os.path.join(output_folder, file), 'w', newline='') as outfile:
            writer = csv.writer(outfile, delimiter='\t')
            writer.writerow([relation])  # Write the relation as the first line
            for key, counts in answers_by_key.items():
                # Every stored key has at least one counted answer, so
                # most_common(1) is never empty here.
                most_common = counts.most_common(1)[0][0]
                writer.writerow(list(key) + list(most_common))

# Example usage: run process_files on the raw answer files.
# NOTE: input_folder and output_folder are module-level names that later
# cells reassign, so re-run this cell before calling process_files again.
input_folder = '/content/Testing/Phase2Answers'  # Folder with the raw tab-separated answer files
output_folder = '/content/output_files'  # Folder that receives the deduplicated files
k = 8  # Number of files to process
process_files(input_folder, output_folder, k)

In [4]:
def create_no_most_least_files(input_folder, output_folder):
    """Copy every file in ``input_folder`` keeping only the first four columns.

    Files are read and written as tab-separated text. The first line of each
    file (the relation) is copied unchanged; every later line is truncated to
    its first four columns. The copy is stored in ``output_folder`` under the
    original name with '.txt' replaced by '_no_least_most.txt'.

    Args:
        input_folder: Directory whose files are read.
        output_folder: Directory for the stripped copies; created if missing.
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    for name in os.listdir(input_folder):
        source_path = os.path.join(input_folder, name)
        target_path = os.path.join(output_folder, name.replace('.txt', '_no_least_most.txt'))

        with open(source_path, 'r') as infile, open(target_path, 'w', newline='') as outfile:
            rows = csv.reader(infile, delimiter='\t')
            writer = csv.writer(outfile, delimiter='\t')

            relation_row = next(rows, None)
            if relation_row is None:
                # Empty input: leave the (already created) output file empty
                continue

            writer.writerow(relation_row)  # Relation line is copied as is
            writer.writerows(row[:4] for row in rows)  # Drop the answer columns

# Example usage: read the files written to '/content/output_files' and store
# copies without the 5th/6th columns in a separate folder.
input_folder = '/content/output_files'  # Adjust to your input folder path
output_folder = '/content/output_no_least_most'  # Adjust to your output folder path
create_no_most_least_files(input_folder, output_folder)

### GPT feeding

In [None]:
# This cell is a draft useful for the report, don't run it

def process_files_and_query_gpt(input_folder, output_folder, api_key):
    """Draft: ask gpt-3.5-turbo about each file in batches of up to 20 rows.

    For every file in ``input_folder`` the first tab-separated line is read
    as the relation and the remaining lines as rows to send. The rows are
    sent in batches: each request holds one system message with the
    instructions plus one user message per row. The assistant replies are
    written to ``output_folder`` in a file named after the input file with
    '.txt' replaced by '_gpt.txt' (relation on the first line, then one line
    per reply).

    Files that are empty or start with a blank line are skipped with a
    printed notice, because there is no relation to build the prompt from.

    Args:
        input_folder: Directory with the tab-separated input files.
        output_folder: Directory for the reply files; created if missing.
        api_key: OpenAI API key, assigned to ``openai.api_key``.
    """
    openai.api_key = api_key  # Set the API key for OpenAI

    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    for file in os.listdir(input_folder):
        # Read everything up front so the input file is not kept open while
        # the (slow) API requests run.
        with open(os.path.join(input_folder, file), 'r') as infile:
            reader = csv.reader(infile, delimiter='\t')
            header = next(reader, None)  # First row for the relation
            pairs = list(reader)

        if not header:
            # next(reader, [])[0] used to raise IndexError here
            print(f"Skipping '{file}': no relation line found.")
            continue
        relation = header[0]

        # Earlier, longer wording of the instructions (kept for the report):
        # instructions = ("I'm going to give you several lines of the same type. "
        #                 "Your task is for each line to output the least illustrative "
        #                 "and the most illustrative representation of this relation: '"
        #                 + relation + "' (the order is important here!). "
        #                 "So, the output should be multiple lines with 2 pairs: "
        #                 "least illustrative and most illustrative. Output only this "
        #                 "information without any other comments.")
        instructions = ("For each line, output the least illustrative "
                        "and the most illustrative representation of this relation: '"
                        + relation + "'. The output should be two pairs: "
                        "least illustrative and most illustrative.")

        # Divide into batches of max 20 lines
        batches = [pairs[i:i + 20] for i in range(0, len(pairs), 20)]
        responses = []

        for batch in batches:
            # One system message with the instructions, one user message per row
            messages = [{"role": "system", "content": instructions}]
            messages.extend([{"role": "user", "content": " ".join(row)} for row in batch])

            # Make API calls using chat completions
            chat_completion = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=messages
            )
            # Keep the reply only when it is attributed to the assistant
            assistant_message = chat_completion['choices'][0]['message']
            if assistant_message['role'] == 'assistant':
                responses.append(assistant_message['content'])

        # Write to new file
        with open(os.path.join(output_folder, file.replace('.txt', '_gpt.txt')), 'w', newline='') as outfile:
            writer = csv.writer(outfile, delimiter='\t')
            writer.writerow([relation])
            for response in responses:
                writer.writerow([response])

# Example usage of the draft function above (the cell is marked "don't run").
input_folder = '/content/output_no_least_most'  # Files without the 5th/6th columns
output_folder = '/content/output_gpt'  # Folder that receives the model replies
api_key = ''  # Replace with your actual API key (avoid committing a real key to the notebook)
process_files_and_query_gpt(input_folder, output_folder, api_key)

In [6]:
# This is the core cell, the instruction is sufficient

!pip install openai==0.28
import openai

def process_files_and_query_gpt(input_folder, output_folder, api_key):
    """Ask gpt-4, row by row, for the least/most illustrative pair.

    For every file in ``input_folder`` the first tab-separated line is read
    as the relation. Each following row is sent in its own chat completion
    request (system message: instructions containing the relation; user
    message: the row's fields joined by spaces). The assistant replies are
    written to ``output_folder`` in a file named after the input file with
    '.txt' replaced by '_gpt.txt': the relation on the first line, then one
    line per reply.

    Files that are empty or start with a blank line are skipped with a
    printed notice, because there is no relation to build the prompt from.

    Args:
        input_folder: Directory with the tab-separated input files.
        output_folder: Directory for the reply files; created if missing.
        api_key: OpenAI API key, assigned to ``openai.api_key``.
    """
    openai.api_key = api_key  # Set the API key for OpenAI

    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    for file in os.listdir(input_folder):
        # Read everything up front so the input file is not kept open while
        # the (slow) per-row API requests run.
        with open(os.path.join(input_folder, file), 'r') as infile:
            reader = csv.reader(infile, delimiter='\t')
            header = next(reader, None)  # First row for the relation
            rows = list(reader)

        if not header:
            # next(reader, [])[0] used to raise IndexError here
            print(f"Skipping '{file}': no relation line found.")
            continue
        relation = header[0]

        # Adjacent string literals are concatenated without a separator, so
        # each sentence must carry its own trailing space (the original text
        # ran "accordingly.The output" and "format:pair1" together).
        instructions = ("In this line, based on the pairs provided, choose among them the least illustrative "
                        "and the most illustrative representation for this relation: '"
                        + relation + "' (the order of the relation matters). The output should be these four pairs "
                        "and the least illustrative and the most illustrative as the 5th and 6th column, accordingly. "
                        "The output should be written in one line, 6 pairs overall in the following format: "
                        "pair1, pair2, pair3, pair4, least_illustrative, most_illustrative "
                        "And that's it, no brackets, no quotes, nothing else, it must be in this format.")

        responses = []

        for pairs in rows:
            # Prepare the message for API call, including the instructions and the line
            message = [{"role": "system", "content": instructions},
                       {"role": "user", "content": " ".join(pairs)}]

            # Make API calls for each line
            chat_completion = openai.ChatCompletion.create(
                model="gpt-4",
                messages=message
            )
            # Keep the reply only when it is attributed to the assistant
            assistant_message = chat_completion['choices'][0]['message']
            if assistant_message['role'] == 'assistant':
                responses.append(assistant_message['content'])

        # Write to new file
        with open(os.path.join(output_folder, file.replace('.txt', '_gpt.txt')), 'w', newline='') as outfile:
            writer = csv.writer(outfile, delimiter='\t')
            writer.writerow([relation])
            for response in responses:
                writer.writerow([response])

# Example usage: query the model for every file in the stripped folder.
input_folder = '/content/output_no_least_most'  # Files without the 5th/6th columns
output_folder = '/content/output_gpt'  # Folder that receives the model replies
api_key = ''  # Replace with your actual API key (avoid committing a real key to the notebook)
process_files_and_query_gpt(input_folder, output_folder, api_key)

Collecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl (76 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 76.5/76.5 kB 1.8 MB/s eta 0:00:00
Installing collected packages: openai
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
llmx 0.0.15a0 requires cohere, which is not installed.
llmx 0.0.15a0 requires tiktoken, which is not installed.
Successfully installed openai-0.28.0


In [49]:
import os
import csv

def create_lists(combination, path_correct, path_gpt):
    """Load the gold and GPT answers for one relation as parallel lists.

    The gold file ``Phase2Answers-<combination>.txt`` is read from
    ``path_correct`` as tab-separated text; its 5th and 6th columns are
    collected. The GPT file
    ``Phase2Answers-<combination>_no_least_most_gpt.txt`` is read from
    ``path_gpt`` as comma-separated text; after stripping whitespace from
    every field, its 5th and 6th fields are collected. In both files the
    first line (the relation) is skipped, as are completely blank lines.

    A GPT line with fewer than six fields contributes an empty-string
    placeholder to both GPT lists instead of being dropped. Dropping it would
    shift every later answer by one position and break an element-wise
    comparison with the gold lists.

    Args:
        combination: Relation identifier used in the file names, e.g. '10b'.
        path_correct: Directory containing the gold answer file.
        path_gpt: Directory containing the GPT answer file.

    Returns:
        A tuple ``(leasts_correct, mosts_correct, leasts_gpt, mosts_gpt)`` of
        lists of strings.
    """
    # Filenames based on the combination
    filename_correct = f'Phase2Answers-{combination}.txt'
    filename_gpt = f'Phase2Answers-{combination}_no_least_most_gpt.txt'

    leasts_correct, mosts_correct = [], []
    leasts_gpt, mosts_gpt = [], []

    # Process the file from the correct folder
    with open(os.path.join(path_correct, filename_correct), 'r', newline='') as file:
        reader = csv.reader(file, delimiter='\t')
        next(reader, None)  # Skip the first line (relation); tolerate an empty file
        for row in reader:
            if not row:  # Blank line, e.g. at the end of a hand-edited file
                continue
            leasts_correct.append(row[4])
            mosts_correct.append(row[5])

    # Process the file from the gpt folder
    with open(os.path.join(path_gpt, filename_gpt), 'r', newline='') as file:
        reader = csv.reader(file, delimiter=',')
        next(reader, None)  # Skip the first line (relation); tolerate an empty file
        for row in reader:
            if not row:
                continue
            cleaned_row = [element.strip() for element in row]
            if len(cleaned_row) >= 6:
                leasts_gpt.append(cleaned_row[4])
                mosts_gpt.append(cleaned_row[5])
            else:
                # Malformed reply: keep the slot so later rows stay aligned
                leasts_gpt.append('')
                mosts_gpt.append('')

    return leasts_correct, mosts_correct, leasts_gpt, mosts_gpt

# Example usage: load the gold and GPT answers for one relation.
combination = '10b'  # Relation identifier that appears in the file names
path_correct = '/content/output_files'  # Folder with the gold answer files
path_gpt = '/content/output_gpt'  # Folder with the GPT reply files
leasts_correct, mosts_correct, leasts_gpt, mosts_gpt = create_lists(combination, path_correct, path_gpt)

In [52]:
# Length and contents of the gold vs. GPT "least illustrative" lists
print(len(leasts_correct))
print(leasts_correct)
print(len(leasts_gpt))
print(leasts_gpt)

print('##############')

# Length and contents of the gold vs. GPT "most illustrative" lists
print(len(mosts_correct))
print(mosts_correct)
print(len(mosts_gpt))
print(mosts_gpt)

100
['friendliness:wink', 'growl:danger', 'discourse:relationship', 'frown:anger', 'sigh:exhaustion', 'wink:friendliness', 'wave:acknowledgment', 'friendliness:wink', 'hilarity:laugh', 'exhaustion:sigh', 'glare:anger', 'cough:sickness', 'anger:slap', 'hilarity:laugh', 'hilarity:laugh', 'exhaustion:sigh', 'growl:danger', 'glare:anger', 'anger:slap', 'glare:anger', 'hilarity:laugh', 'frown:sadness', 'wink:friendliness', 'kiss:passion', 'anger:slap', 'lie:dishonesty', 'growl:danger', 'sorrow:tears', 'friendliness:wink', 'wave:acknowledgment', 'friendliness:wink', 'anger:slap', 'snarl:anger', 'glare:anger', 'cough:illness', 'hilarity:laugh', 'wink:friendliness', 'sigh:exhaustion', 'sorrow:tears', 'slap:anger', 'hilarity:laugh', 'frown:discontent', 'frown:distaste', 'exhaustion:sigh', 'friendliness:wink', 'yawn:boredom', 'wink:friendliness', 'anger:slap', 'friendliness:wink', 'exhaustion:sigh', 'nod:agreement', 'crying:sadness', 'frown:distaste', 'discourse:relationship', 'nod:agreement', '

In [53]:
def compare_lists(list1, list2):
    """Print every position where two equally long lists differ.

    When the lengths differ, a notice is printed and nothing is compared.
    Otherwise one line is printed per differing position, followed by a
    summary line with the number of equal positions.

    Args:
        list1: First sequence (labelled "List1" in the output).
        list2: Second sequence (labelled "List2" in the output).

    Returns:
        None; all results are printed.
    """
    total = len(list1)
    if total != len(list2):
        print("Lists are of different lengths. Cannot compare element-wise.")
        return

    matched = 0
    for index, (left, right) in enumerate(zip(list1, list2)):
        if left == right:
            matched += 1
            continue
        print(f"Mismatch at index {index}: '{left}' (List1) vs '{right}' (List2)")

    print(f"Total matches: {matched} out of {total}")

# Example usage with your lists: compare the gold answers with the GPT
# answers position by position, first for "least", then for "most".
print("Comparing least illustrative pairs:")
compare_lists(leasts_correct, leasts_gpt)

print("\nComparing most illustrative pairs:")
compare_lists(mosts_correct, mosts_gpt)

Comparing least illustrative pairs:
Mismatch at index 1: 'growl:danger' (List1) vs 'anger:slap' (List2)
Mismatch at index 2: 'discourse:relationship' (List1) vs 'punch:hatred' (List2)
Mismatch at index 3: 'frown:anger' (List1) vs 'handshake:cordiality' (List2)
Mismatch at index 4: 'sigh:exhaustion' (List1) vs 'friendliness:wink' (List2)
Mismatch at index 5: 'wink:friendliness' (List1) vs 'exhaustion:sigh' (List2)
Mismatch at index 6: 'wave:acknowledgment' (List1) vs 'kiss:passion' (List2)
Mismatch at index 9: 'exhaustion:sigh' (List1) vs 'discourse:relationship' (List2)
Mismatch at index 10: 'glare:anger' (List1) vs 'sorrow:tears' (List2)
Mismatch at index 13: 'hilarity:laugh' (List1) vs 'burp:gas' (List2)
Mismatch at index 16: 'growl:danger' (List1) vs 'punch:hatred' (List2)
Mismatch at index 17: 'glare:anger' (List1) vs 'handshake:cordiality' (List2)
Mismatch at index 19: 'glare:anger' (List1) vs 'wave:acknowledgment' (List2)
Mismatch at index 20: 'hilarity:laugh' (List1) vs 'slap:an

In [51]:
# Statistics for matching data

# 1c
# least - 51 out of 105
# most - 25 out of 105
# Modification made:
# leasts_gpt.insert(20, '')
# mosts_gpt.insert(1, '')

# 2a
# least - 46 out of 110
# most - 44 out of 110

# 2g
# least - 51 out of 108
# most - 57 out of 108

# 4b
# least - 26 out of 88
# most - 36 out of 88

# 4d
# least - 40 out of 75
# most - 43 out of 75

# 6d
# least - 34 out of 113
# most - 45 out of 113

# 10b
# Modification made:
# leasts_gpt.insert(35, '')
# mosts_gpt.insert(35, '')
# least - 45 out of 100
# most - 32 out of 100

In [54]:
from prettytable import PrettyTable

# Match counts per relation id, copied from the statistics notes; "notes"
# records the manual list insertions that were applied before comparing.
data = [
    {
        "id": "1c",
        "least": "51 out of 105",
        "most": "25 out of 105",
        "notes": "leasts_gpt.insert(20, ''), mosts_gpt.insert(1, '')",
    },
    {"id": "2a", "least": "46 out of 110", "most": "44 out of 110", "notes": ""},
    {"id": "2g", "least": "51 out of 108", "most": "57 out of 108", "notes": ""},
    {"id": "4b", "least": "26 out of 88", "most": "36 out of 88", "notes": ""},
    {"id": "4d", "least": "40 out of 75", "most": "43 out of 75", "notes": ""},
    {"id": "6d", "least": "34 out of 113", "most": "45 out of 113", "notes": ""},
    {
        "id": "10b",
        "least": "45 out of 100",
        "most": "32 out of 100",
        "notes": "leasts_gpt.insert(35, ''), mosts_gpt.insert(35, '')",
    },
]

# Build the table: one column per dictionary key, in display order
table = PrettyTable()
table.field_names = ["ID", "Least Illustrative", "Most Illustrative", "Modification Notes"]

columns = ("id", "least", "most", "notes")
for entry in data:
    table.add_row([entry[column] for column in columns])

print(table)

+-----+--------------------+-------------------+-----------------------------------------------------+
|  ID | Least Illustrative | Most Illustrative |                  Modification Notes                 |
+-----+--------------------+-------------------+-----------------------------------------------------+
|  1c |   51 out of 105    |   25 out of 105   |  leasts_gpt.insert(20, ''), mosts_gpt.insert(1, '') |
|  2a |   46 out of 110    |   44 out of 110   |                                                     |
|  2g |   51 out of 108    |   57 out of 108   |                                                     |
|  4b |    26 out of 88    |    36 out of 88   |                                                     |
|  4d |    40 out of 75    |    43 out of 75   |                                                     |
|  6d |   34 out of 113    |   45 out of 113   |                                                     |
| 10b |   45 out of 100    |   32 out of 100   | leasts_gpt.insert(35, ''