# Import libraries

In [1]:
import pandas as pd
import numpy as np
import openai
import time
import nltk
import re
import ast
import os

from datetime import datetime
from openai.error import RateLimitError
from collections import Counter

# OpenAI Key
You need your own OpenAI key to run the LLM-based sentence classification task. You can create your own API key here: https://platform.openai.com/api-keys

In [2]:
openai.api_key = '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'

# Path to regulations_dataset.csv

In [4]:
# Getting path of current working directory
current_directory = os.getcwd()
print(f"Current directory: {current_directory}")

# Path to regulations_dataset
regulations_dataset_path = os.path.normpath(os.path.join(current_directory, '..', 'datasets', 'regulations_dataset.csv'))
print(f"Path to regulations_dataset.csv: {regulations_dataset_path}")

Current directory: /Users/niclasgriesshaber/Desktop/guilds-llm/02_llm_classification
Path to regulations_dataset.csv: /Users/niclasgriesshaber/Desktop/guilds-llm/datasets/llm_classification_results.csv


# Load regulations dataset

In [6]:
# Load the dataset
df = pd.read_csv(regulations_dataset_path)

In [6]:
# View dataset
df.head()

Unnamed: 0,country,year,guild,text
0,mexico,1757,cotton-weavers,"1.—Ordenanza primera. Primeramente, que las ma..."
1,mexico,1620,bakers,"1.—Primeramente, antes todas cosas, todos los ..."
2,mexico,1592,cloth-makers,1.— Que cualquiera persona de cualquiera calid...
3,mexico,1605,cloth-finishers,Primeramente que al principio de cada un año s...
4,mexico,1706,tallow,"Primeramente, que en cada un año por principio..."


# Clean text from LLM digitization pipeline

In [7]:
# Function to clean the OCR text provided by the large language model
def clean_text(text):
    
    # Remove line breaks
    text_no_line_breaks = text.replace('\n', ' ')

    # Remove Arabic numeral patterns followed by a period
    text_no_arabic_numerals = re.sub(r'\b\d+\.\s*', '', text_no_line_breaks)

    # Use a regular expression to split on periods that are not part of "etc." or similar abbreviations
    sentences = re.split(r'(?<!\betc)\.\s+(?=[A-Z])', text_no_arabic_numerals)

    # Filter sentences with less than 4 words
    cleaned_sentences = [sentence.strip() for sentence in sentences if len(sentence.split()) >= 4]
    cleaned_text = '. '.join(cleaned_sentences)

    return cleaned_text

In [8]:
# Apply the function to create the new column 'cleaned_text'
df['cleaned_text'] = df['text'].apply(clean_text)

In [9]:
# View dataframe
df.head()

Unnamed: 0,country,year,guild,text,cleaned_text
0,mexico,1757,cotton-weavers,"1.—Ordenanza primera. Primeramente, que las ma...","Primeramente, que las mantas ordinarias se han..."
1,mexico,1620,bakers,"1.—Primeramente, antes todas cosas, todos los ...","—Primeramente, antes todas cosas, todos los pa..."
2,mexico,1592,cloth-makers,1.— Que cualquiera persona de cualquiera calid...,— Que cualquiera persona de cualquiera calidad...
3,mexico,1605,cloth-finishers,Primeramente que al principio de cada un año s...,Primeramente que al principio de cada un año s...
4,mexico,1706,tallow,"Primeramente, que en cada un año por principio...","Primeramente, que en cada un año por principio..."


# Pre-processing dataset

In [10]:
# Split the cleaned text into sentences
df['sentences'] = df['cleaned_text'].apply(nltk.sent_tokenize)

# Count the number of sentences in the ordinance
df['sentence_count'] = df['sentences'].apply(lambda x: len(x))

# Initialize the column 'classification_count' with NaN
df['classification_count'] = np.nan
df['classification_count'] = df['classification_count'].astype(object)

In [11]:
# Check number of total sentences in the regulations dataset
df['sentence_count'].sum()

1044

In [12]:
df.head()

Unnamed: 0,country,year,guild,text,cleaned_text,sentences,sentence_count,classification_count
0,mexico,1757,cotton-weavers,"1.—Ordenanza primera. Primeramente, que las ma...","Primeramente, que las mantas ordinarias se han...","[Primeramente, que las mantas ordinarias se ha...",40,
1,mexico,1620,bakers,"1.—Primeramente, antes todas cosas, todos los ...","—Primeramente, antes todas cosas, todos los pa...","[—Primeramente, antes todas cosas, todos los p...",5,
2,mexico,1592,cloth-makers,1.— Que cualquiera persona de cualquiera calid...,— Que cualquiera persona de cualquiera calidad...,[— Que cualquiera persona de cualquiera calida...,5,
3,mexico,1605,cloth-finishers,Primeramente que al principio de cada un año s...,Primeramente que al principio de cada un año s...,[Primeramente que al principio de cada un año ...,9,
4,mexico,1706,tallow,"Primeramente, que en cada un año por principio...","Primeramente, que en cada un año por principio...","[Primeramente, que en cada un año por principi...",10,


# GPT-4 Multi-Category Classification Prompt

In [13]:
# Task prompt to instruct GPT-4 to classify the sentence
task_prompt = """Clasifica la siguiente oración en español según las categorías a continuación: 
0: La oración menciona alguna forma de discriminación, por ejemplo, excluyendo a personas específicas, como negros, mulatos o indios del gremio, o de ser promovidos o que no pueden abrir su propia tienda. 
1: La oración menciona alguna forma de capital humano, como educación, aprendices, aprendizajes o exámenes de oficio dentro del gremio. 
2: La oración menciona la calidad del producto, quizás detallando el proceso de fabricación. 
3: La oración menciona cualquier cosa relacionada con los mercados económicos, posiblemente mencionando precios, suministro u otros conceptos de mercado similares. 
4: La oración menciona alguna forma de castigo, multas o autoridades que hacen cumplir las ordenanzas. 
5: La oración se refiere a alguna forma de religión, cofradías o hermandades. 
6: La oración no encaja en ninguna de las categorías anteriores. 
ALWAYS use the following format: 34 or 015. NEVER use commas (,), line breaks (\n), or whitespaces ( ) in your response.
If a sentence belongs to one category, only return a single number."""

# Functions for the LLM-based sentence classification task

Temperature parameter is set to 0 for deterministic outputs.

In [14]:
# Function to send a sentence with a task prompt to the OpenAI API
def classify_sentence(task_prompt, text):
    """
    Classifies a given sentence `text` based on a set of categories described in `task_prompt`.
    
    Args:
    - task_prompt (str): A detailed description of the categories for classification.
    - text (str): The sentence to classify.
    
    Returns:
    - str: The classification result.
    """
    
    message = {
        "role": "system",
        "content": f"{task_prompt}\nOración para clasificar: {text}"
    }
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[message],
        temperature=0.0, # Set to 0.0 to get deterministic results
        seed = 42,
    )
    return response['choices'][0]['message']['content'].strip()

In [15]:
# Classifying every sentence of an ordinance using GPT-4
def classify_and_count(task_prompt, sentences, delay_time=5):
    classifications = []
    
    for i, sentence in enumerate(sentences):
        print(f'Analysing new sentence {i}')
        
        while True:
            try:
                classification = classify_sentence(task_prompt, sentence)
                classifications.append(classification)
                break  # Exit the loop if the function call was successful
            except Exception as e:
                print(f'Error occurred: {e}. Retrying at:', datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
                time.sleep(delay_time)
                print('Retrying now at:', datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        
    counter = Counter(classifications)
    return dict(counter)

In [16]:
# Iterate over the whole dataset for LLM-based sentence classification task
def process_dataframe(df, task_prompt, delay_time=5):
    length = len(df)
    for i in range(length):
        print(f'Starting new ordinance: {i}')
        try:
            current_sentences = df.loc[i]['sentences']
            my_dict = classify_and_count(task_prompt, current_sentences, delay_time)
            df.at[i, 'classification_count'] = my_dict
            print('Dictionary added to dataframe:', my_dict)
            
        except RateLimitError as e:
            print(f"Rate limit reached. Waiting for {e.retry_after}ms.")
            time.sleep(e.retry_after / 1000)  # Convert ms to seconds.
        except Exception as e:
            print(f"Error: {e}. Skipping sentence at index {i}.")

# Run LLM-based sentence classification task using GPT-4
This may take a while.

In [18]:
# Run LLM-based sentence classification task using GPT-4
process_dataframe(df, task_prompt)
print('Finished processing dataframe.')

Starting new ordinance: 0
Analysing new sentence 0
Analysing new sentence 1
Analysing new sentence 2
Analysing new sentence 3
Analysing new sentence 4
Analysing new sentence 5
Analysing new sentence 6
Analysing new sentence 7
Analysing new sentence 8
Analysing new sentence 9
Analysing new sentence 10
Analysing new sentence 11
Analysing new sentence 12
Analysing new sentence 13
Analysing new sentence 14
Analysing new sentence 15
Analysing new sentence 16
Analysing new sentence 17
Analysing new sentence 18
Analysing new sentence 19
Analysing new sentence 20
Analysing new sentence 21
Analysing new sentence 22
Analysing new sentence 23
Analysing new sentence 24
Analysing new sentence 25
Analysing new sentence 26
Analysing new sentence 27
Analysing new sentence 28
Analysing new sentence 29
Analysing new sentence 30
Analysing new sentence 31
Analysing new sentence 32
Analysing new sentence 33
Analysing new sentence 34
Analysing new sentence 35
Analysing new sentence 36
Analysing new sentence

In [19]:
# View dataset to check classification results
df.head()

Unnamed: 0,country,year,guild,text,cleaned_text,sentences,sentence_count,classification_count
0,mexico,1757,cotton-weavers,"1.—Ordenanza primera. Primeramente, que las ma...","Primeramente, que las mantas ordinarias se han...","[Primeramente, que las mantas ordinarias se ha...",40,"{'4': 11, '2': 17, '14': 5, '24': 3, '04': 1, ..."
1,mexico,1620,bakers,"1.—Primeramente, antes todas cosas, todos los ...","—Primeramente, antes todas cosas, todos los pa...","[—Primeramente, antes todas cosas, todos los p...",5,"{'1': 1, '34': 1, '3': 1, '4': 2}"
2,mexico,1592,cloth-makers,1.— Que cualquiera persona de cualquiera calid...,— Que cualquiera persona de cualquiera calidad...,[— Que cualquiera persona de cualquiera calida...,5,"{'24': 3, '4': 2}"
3,mexico,1605,cloth-finishers,Primeramente que al principio de cada un año s...,Primeramente que al principio de cada un año s...,[Primeramente que al principio de cada un año ...,9,"{'14': 3, '1': 4, '134': 1, '04': 1}"
4,mexico,1706,tallow,"Primeramente, que en cada un año por principio...","Primeramente, que en cada un año por principio...","[Primeramente, que en cada un año por principi...",10,"{'4': 1, '14': 1, '1': 2, '2': 2, '3': 2, '15'..."


# Post-processing of LLM-based sentence classification task

# Check if every row contains a valid dictionary

In [20]:
# Check if each entry is a dictionary
is_dict = df['classification_count'].apply(lambda x: isinstance(x, dict))

# There should be 54 valid dictionaries
print(f'Number of valid dictionaries: {is_dict.sum()}')

Number of valid dictionaries: 54


# Check unique characters in dictionary keys

In [21]:
# Assuming df is your DataFrame and 'classification_count' is the column with dictionaries
unique_keys = set().union(*df['classification_count'].apply(lambda x: x.keys()))

# Convert the set to a list if you need a list
unique_keys_list = list(unique_keys)

# Print the unique keys
print(unique_keys_list)

['245', '34', '24, 4, 5', '1', '014', '6', '2', '042', '45', '05', '04', '12', '4', '1254', '0154', '23', '1345', '14', '3', '145', '234', '2 4', '15', '24', '134', '01', '5', '0']


# Remove potential whitespaces from dictionary keys

In [22]:
# Function to remove whitespaces from dictionary keys
def remove_whitespace_from_keys(d):
    return {k.replace(' ', ''): v for k, v in d.items()}

# Apply the function to each dictionary in the 'classification_count' column
df['classification_count'] = df['classification_count'].apply(remove_whitespace_from_keys)

In [23]:
# Assuming df is your DataFrame and 'classification_count' is the column with dictionaries
unique_keys = set().union(*df['classification_count'].apply(lambda x: x.keys()))

# Convert the set to a list if you need a list
unique_keys_list = list(unique_keys)

print(unique_keys_list)

['245', '34', '1', '014', '6', '2', '042', '45', '05', '04', '12', '4', '1254', '0154', '23', '1345', '14', '3', '145', '234', '24,4,5', '15', '24', '134', '01', '5', '0']


# Transform classification count to classification dictionary
For example, disentangle "04" into "0" and "4" in a new classification dictionary

In [7]:
# Initialise the column 'classification_dict' with NaN
df['classification_dict'] = np.nan

In [8]:
# Disentangle multi-classifications such as "04" into "0" and "4"
def transform_dict(original_dict):
    
    # Initialize new dictionary
    new_dict = {str(i): 0 for i in range(7)}  # Adjusted range to 0-7 for single-digit numbers
    
    # Ensure the original_dict is an actual dictionary
    if isinstance(original_dict, str):
        original_dict = ast.literal_eval(original_dict)
    
    # Count occurrences
    for key, value in original_dict.items():
        if len(key) > 1:  # If the key has more than one digit
            numbers = list(key)  # Directly convert the string to a list of its characters
        else:
            numbers = [key]  # If it's a single digit, make it a list
        for num in numbers:
            num = num.strip()  # Remove any leading or trailing whitespace
            if num in new_dict:  # Check if num is a valid key in new_dict
                new_dict[num] += value
            else:
                print(f"Unexpected number found in key: {num}")  # For debugging

    return new_dict

In [9]:
# Assuming df is your DataFrame and 'classification_count' is the column with dictionaries
# Apply the transformation to each row in the 'classification_count' column
df['classification_dict'] = df['classification_count'].apply(transform_dict)

In [10]:
df.head()

Unnamed: 0,country,year,guild,text,cleaned_text,sentences,sentence_count,classification_count,classification_dict
0,mexico,1757,cotton-weavers,"1.—Ordenanza primera. Primeramente, que las ma...","Primeramente, que las mantas ordinarias se han...","['Primeramente, que las mantas ordinarias se h...",40,"{'24': 1, '2': 19, '14': 7, '4': 9, '04': 1, '...","{'0': 1, '1': 7, '2': 20, '3': 2, '4': 19, '5'..."
1,mexico,1620,bakers,"1.—Primeramente, antes todas cosas, todos los ...","—Primeramente, antes todas cosas, todos los pa...","['—Primeramente, antes todas cosas, todos los ...",5,"{'1': 1, '34': 1, '3': 1, '4': 2}","{'0': 0, '1': 1, '2': 0, '3': 2, '4': 3, '5': ..."
2,mexico,1592,cloth-makers,1.— Que cualquiera persona de cualquiera calid...,— Que cualquiera persona de cualquiera calidad...,['— Que cualquiera persona de cualquiera calid...,5,"{'24': 1, '4': 4}","{'0': 0, '1': 0, '2': 1, '3': 0, '4': 5, '5': ..."
3,mexico,1605,cloth-finishers,Primeramente que al principio de cada un año s...,Primeramente que al principio de cada un año s...,['Primeramente que al principio de cada un año...,9,"{'14': 2, '1': 5, '2': 1, '04': 1}","{'0': 1, '1': 7, '2': 1, '3': 0, '4': 3, '5': ..."
4,mexico,1706,tallow,"Primeramente, que en cada un año por principio...","Primeramente, que en cada un año por principio...","['Primeramente, que en cada un año por princip...",10,"{'4': 1, '14': 1, '1': 2, '2': 2, '3': 2, '15'...","{'0': 1, '1': 5, '2': 2, '3': 2, '4': 2, '5': ..."


# Add century variable

In [11]:
# Function to determine the century
def determine_century(year):
    if year == 1801:
        return 18
    return (year - 1) // 100 + 1

In [12]:
# Determine the century for each year
df['century'] = df['year'].apply(determine_century)

# Classification count per category

In [13]:
# Initialize the totals dictionary
totals = {str(i): 0 for i in range(7)}  

# Calculate totals
for i in range(len(df)):
    for key in totals.keys():
        totals[key] += df['classification_dict'][i].get(key, 0)

# Define the labels
labels = [
    "Entry Barriers", 
    "Human Capital", 
    "Product Quality", 
    "Markets",
    "Punishment, Fines and Enforcement", 
    "Religion",
    "Other"
]

# Print totals with labels
for label, key in zip(labels, totals.keys()):
    print(f"{label}: {totals[key]}")

Entry Barriers: 43
Human Capital: 294
Product Quality: 226
Markets: 96
Punishment, Fines and Enforcement: 481
Religion: 37
Other: 67


# Save llm classification results as csv

In [14]:
save_path = os.path.normpath(os.path.join(current_directory, '..', 'datasets', 'llm_classification_results.csv'))
print(f"Path to save the classification results: {save_path}")

Path to save the classification results: /Users/niclasgriesshaber/Desktop/guilds-llm/datasets/llm_classification_results.csv


In [15]:
# Save classification results
df.to_csv(save_path, index=False)