# Import libraries

In [1]:
import pandas as pd
import numpy as np
import os
import re
import nltk

from scipy import stats

# Load classifications dataset

In [2]:
# Getting path of current working directory
current_directory = os.getcwd()
print(f"Current directory: {current_directory}")

# Path to regulations_dataset.csv
regulations_dataset_path = os.path.normpath(os.path.join(current_directory, '..', 'regulations_dataset.csv'))
print(f"Path to llm_classification_results.csv: {regulations_dataset_path}")

Current directory: /Users/niclasgriesshaber/Desktop/guilds-llm/datasets/summary_statistics
Path to llm_classification_results.csv: /Users/niclasgriesshaber/Desktop/guilds-llm/datasets/regulations_dataset.csv


In [3]:
# Load regulations dataset for general analysis
df = pd.read_csv(regulations_dataset_path)

In [4]:
df.head()

Unnamed: 0,country,year,guild,text
0,mexico,1757,cotton-weavers,"1.—Ordenanza primera. Primeramente, que las ma..."
1,mexico,1620,bakers,"1.—Primeramente, antes todas cosas, todos los ..."
2,mexico,1592,cloth-makers,1.— Que cualquiera persona de cualquiera calid...
3,mexico,1605,cloth-finishers,Primeramente que al principio de cada un año s...
4,mexico,1706,tallow,"Primeramente, que en cada un año por principio..."


# Clean text from LLM digitization pipeline

In [5]:
# Function to clean the OCR text provided by the large language model
def clean_text(text):
    
    # Remove line breaks
    text_no_line_breaks = text.replace('\n', ' ')

    # Remove Arabic numeral patterns followed by a period
    text_no_arabic_numerals = re.sub(r'\b\d+\.\s*', '', text_no_line_breaks)

    # Use a regular expression to split on periods that are not part of "etc." or similar abbreviations
    sentences = re.split(r'(?<!\betc)\.\s+(?=[A-Z])', text_no_arabic_numerals)

    # Filter sentences with less than 4 words
    cleaned_sentences = [sentence.strip() for sentence in sentences if len(sentence.split()) >= 4]
    cleaned_text = '. '.join(cleaned_sentences)

    return cleaned_text

In [6]:
# Apply the function to create the new column 'cleaned_text'
df['cleaned_text'] = df['text'].apply(clean_text)

# Add sentences variable

In [7]:
# Split the cleaned text into sentences
df['sentences'] = df['cleaned_text'].apply(nltk.sent_tokenize)

# Count the number of sentences in the ordinance
df['sentence_count'] = df['sentences'].apply(lambda x: len(x))

# Add century variable

In [8]:
# Function to determine the century
def determine_century(year):
    if year == 1801:
        return 18
    return (year - 1) // 100 + 1

In [9]:
# Determine the century for each year
df['century'] = df['year'].apply(determine_century)

In [10]:
# Check dataframe
df.head()

Unnamed: 0,country,year,guild,text,cleaned_text,sentences,sentence_count,century
0,mexico,1757,cotton-weavers,"1.—Ordenanza primera. Primeramente, que las ma...","Primeramente, que las mantas ordinarias se han...","[Primeramente, que las mantas ordinarias se ha...",40,18
1,mexico,1620,bakers,"1.—Primeramente, antes todas cosas, todos los ...","—Primeramente, antes todas cosas, todos los pa...","[—Primeramente, antes todas cosas, todos los p...",5,17
2,mexico,1592,cloth-makers,1.— Que cualquiera persona de cualquiera calid...,— Que cualquiera persona de cualquiera calidad...,[— Que cualquiera persona de cualquiera calida...,5,16
3,mexico,1605,cloth-finishers,Primeramente que al principio de cada un año s...,Primeramente que al principio de cada un año s...,[Primeramente que al principio de cada un año ...,9,17
4,mexico,1706,tallow,"Primeramente, que en cada un año por principio...","Primeramente, que en cada un año por principio...","[Primeramente, que en cada un año por principi...",10,18


# Compute absolute word counts by country and century

In [11]:
# Compute total number of words for the whole dataset
df['word_count'] = df['cleaned_text'].str.split().str.len()
total_words = df['word_count'].sum()

print(f"Total number of words in the dataset: {total_words}")

# Tabulate word count by century and country
word_count_by_century_country = df.groupby(['century', 'country'])['word_count'].sum().unstack(fill_value=0)

print("\nWord count by century and country:")
print(word_count_by_century_country)

# If you want to see the total for each century
word_count_by_century = word_count_by_century_country.sum(axis=1)
print("\nTotal word count by century:")
print(word_count_by_century)

# If you want to see the total for each country
word_count_by_country = word_count_by_century_country.sum()
print("\nTotal word count by country:")
print(word_count_by_country)

Total number of words in the dataset: 94141

Word count by century and country:
country  mexico   peru
century               
16        11735   7246
17         4845  24667
18        18910  26738

Total word count by century:
century
16    18981
17    29512
18    45648
dtype: int64

Total word count by country:
country
mexico    35490
peru      58651
dtype: int64


# Average word counts for the whole dataset and by country and century

In [12]:
# Average word count for the whole sample
avg_word_count_total = df['word_count'].mean()
print(f"Average word count per observation for the whole sample: {avg_word_count_total:.2f}")

# Average word count by country
avg_word_count_by_country = df.groupby('country')['word_count'].mean()
print("\nAverage word count per observation by country:")
print(avg_word_count_by_country)

# Average word count by century
avg_word_count_by_century = df.groupby('century')['word_count'].mean()
print("\nAverage word count per observation by century:")
print(avg_word_count_by_century)

# Average word count by both country and century
avg_word_count_by_country_century = df.groupby(['country', 'century'])['word_count'].mean().unstack(fill_value=0)
print("\nAverage word count per observation by country and century:")
print(avg_word_count_by_country_century)

Average word count per observation for the whole sample: 1743.35

Average word count per observation by country:
country
mexico    1690.00000
peru      1777.30303
Name: word_count, dtype: float64

Average word count per observation by century:
century
16    1460.076923
17    1475.600000
18    2173.714286
Name: word_count, dtype: float64

Average word count per observation by country and century:
century           16           17        18
country                                    
mexico   1173.500000   807.500000  3782.000
peru     2415.333333  1761.928571  1671.125


# Compute word count per ordinance

In [13]:
# Function to print guild and word count for a specific country
def print_guild_word_count(country):
    country_df = df[df['country'] == country]
    print(f"\nGuild and Word Count for {country}:")
    for _, row in country_df.iterrows():
        print(f"Guild: {row['guild']}, Word Count: {row['word_count']}")
    
    # Calculate and print average word count for the country
    avg_word_count = country_df['word_count'].mean()
    print(f"\nAverage word count for {country}: {avg_word_count:.2f}")

# Print results for Mexico
print_guild_word_count('mexico')

# Print results for Peru
print_guild_word_count('peru')


Guild and Word Count for mexico:
Guild: cotton-weavers, Word Count: 2351
Guild: bakers, Word Count: 496
Guild: cloth-makers, Word Count: 733
Guild: cloth-finishers, Word Count: 1362
Guild: tallow, Word Count: 739
Guild: hatters, Word Count: 521
Guild: candle-makers, Word Count: 859
Guild: cloth-weavers, Word Count: 857
Guild: gold, Word Count: 974
Guild: harness-makers, Word Count: 346
Guild: sackcloth-weavers, Word Count: 4504
Guild: hatters, Word Count: 663
Guild: bakers, Word Count: 925
Guild: silk, Word Count: 1029
Guild: tailors, Word Count: 1219
Guild: silver-smiths, Word Count: 7464
Guild: hatters, Word Count: 3007
Guild: gold-weavers, Word Count: 1150
Guild: candle-makers, Word Count: 1696
Guild: turners, Word Count: 743
Guild: bakers, Word Count: 3852

Average word count for mexico: 1690.00

Guild and Word Count for peru:
Guild: blacksmiths, Word Count: 896
Guild: butchers, Word Count: 827
Guild: button-makers, Word Count: 583
Guild: hatters, Word Count: 915
Guild: fireworks-

# Check sentences in ordinance i

In [14]:
i = 0

# Convert string to list
data_list = df['sentences'][i]

# Print each entry in the list
for entry in data_list:
    print(entry)

Primeramente, que las mantas ordinarias se han de tejer en veinte y cuatro lienzos, y para que queden en marco de tres cuar- tas su arán de tejer en peine de treinta y en vara, se penal al que tal contrario hiciere, de cinco pesos de oro de minas, aplicados por cuartas partes, Cámara, Ciudad, Juez y denunciador.
—Item, que las mantas de siete ochavas, se han de tejer en peine de treinta y dos lienzos, y no de otra manera, so la dicha pena, aplicada como dicho es.
—Item, que en los mismos peines de treinta y dos, se puedan tejer las mantas de vara de ancho.
—Item, que los mantales, servilletas, huipiles, ceñidores y otros tejidos semejantes, se arreglen a la cuenta, marca y disposición de la manta, y siendo cosas menudas, se arreglen a de treinta, manera como decir hupilas de legítima cali- dad, ceñidores, etc., en peine de peine y dos en vara, y mienteras más fino mejor, por ser a favor del arte.
—Item, que los chapanecos, petatillos, etc., tejan en peines de veinte y seis, en vara.
—I

In [15]:
for i in range(len(df)):
    # Convert string representation of list to an actual list
    sentences_list = df['sentences'][i]
    
    # Print the number of sentences in each ordinance
    print(f'Number of sentences in ordinance {i}: {len(sentences_list)}')

Number of sentences in ordinance 0: 40
Number of sentences in ordinance 1: 5
Number of sentences in ordinance 2: 5
Number of sentences in ordinance 3: 9
Number of sentences in ordinance 4: 10
Number of sentences in ordinance 5: 4
Number of sentences in ordinance 6: 9
Number of sentences in ordinance 7: 11
Number of sentences in ordinance 8: 12
Number of sentences in ordinance 9: 9
Number of sentences in ordinance 10: 59
Number of sentences in ordinance 11: 5
Number of sentences in ordinance 12: 7
Number of sentences in ordinance 13: 5
Number of sentences in ordinance 14: 8
Number of sentences in ordinance 15: 52
Number of sentences in ordinance 16: 17
Number of sentences in ordinance 17: 18
Number of sentences in ordinance 18: 18
Number of sentences in ordinance 19: 20
Number of sentences in ordinance 20: 42
Number of sentences in ordinance 21: 8
Number of sentences in ordinance 22: 11
Number of sentences in ordinance 23: 7
Number of sentences in ordinance 24: 8
Number of sentences in 

# Compute average sentence length of dataset

In [16]:
# Function to calculate average sentence length in words
def avg_sentence_length(text):
    sentences = text.split('.')
    return np.mean([len(sentence.split()) for sentence in sentences if sentence.strip()])

In [17]:
# Calculate average sentence length for each entry
df['avg_sentence_length'] = df['cleaned_text'].apply(avg_sentence_length)

# Calculate log of sentence length
df['log_sentence_length'] = np.log(df['avg_sentence_length'])

# Separate data for Mexico and Peru
data = df['avg_sentence_length']
mexico_data = df[df['country'] == 'mexico']['log_sentence_length']
peru_data = df[df['country'] == 'peru']['log_sentence_length']

# Calculate averages
average_length = data.mean()
mexico_avg = mexico_data.mean()
peru_avg = peru_data.mean()

In [18]:
# View dataset with average sentence length
df.head()

Unnamed: 0,country,year,guild,text,cleaned_text,sentences,sentence_count,century,word_count,avg_sentence_length,log_sentence_length
0,mexico,1757,cotton-weavers,"1.—Ordenanza primera. Primeramente, que las ma...","Primeramente, que las mantas ordinarias se han...","[Primeramente, que las mantas ordinarias se ha...",40,18,2351,56.02381,4.025777
1,mexico,1620,bakers,"1.—Primeramente, antes todas cosas, todos los ...","—Primeramente, antes todas cosas, todos los pa...","[—Primeramente, antes todas cosas, todos los p...",5,17,496,99.2,4.597138
2,mexico,1592,cloth-makers,1.— Que cualquiera persona de cualquiera calid...,— Que cualquiera persona de cualquiera calidad...,[— Que cualquiera persona de cualquiera calida...,5,16,733,146.6,4.987708
3,mexico,1605,cloth-finishers,Primeramente que al principio de cada un año s...,Primeramente que al principio de cada un año s...,[Primeramente que al principio de cada un año ...,9,17,1362,151.333333,5.019485
4,mexico,1706,tallow,"Primeramente, que en cada un año por principio...","Primeramente, que en cada un año por principio...","[Primeramente, que en cada un año por principi...",10,18,739,73.9,4.302713


# Two sample Welch-test to check if log-sentence length differs between Mexico and Peru
Welch test assumes that the samples are normally distributed. It does the assumption that both sample have equal variances.

In [19]:
print(f'Average sentence length for the whole dataset: {average_length}')
print(f"Average log-sentence length for Mexico: {mexico_avg:.2f}")
print(f"Average log-sentence length for Peru: {peru_avg:.2f}")

# Perform Welch's t-test
t_statistic, p_value = stats.ttest_ind(mexico_data, peru_data, equal_var=False)

print(f"Welch's t-statistic: {t_statistic:.4f}")
print(f"P-value: {p_value:.4f}")

if p_value < 0.05:
    print("The difference in log-sentence length is statistically significant.")
else:
    print("The difference in log-sentence length is not statistically significant.")

Average sentence length for the whole dataset: 92.09258905489041
Average log-sentence length for Mexico: 4.51
Average log-sentence length for Peru: 4.41
Welch's t-statistic: 0.7645
P-value: 0.4514
The difference in log-sentence length is not statistically significant.
