This notebook transforms llm_classification_results.csv into a sentence-level dataset that is used to test for statistical significance.

# Import libraries

In [9]:
import pandas as pd
import ast
import os

# Load llm_classification_results.csv
We transform the classification_dict in this dataset into a sentence-level dataset for our probit regression analysis.

In [10]:
# Directory the notebook is being executed from
current_directory = os.getcwd()
print(f"Current directory: {current_directory}")

# The classification results sit in the sibling `datasets` folder, one level up
llm_classification_results_path = os.path.normpath(
    os.path.join(current_directory, '..', 'datasets', 'llm_classification_results.csv')
)
print(f"Path to llm_classification_results.csv: {llm_classification_results_path}")

Current directory: /Users/niclasgriesshaber/Desktop/guilds-llm/03_probit_regression
Path to llm_classification_results.csv: /Users/niclasgriesshaber/Desktop/guilds-llm/datasets/llm_classification_results.csv


In [11]:
# Read the LLM classification results CSV (path resolved above) into a DataFrame
df = pd.read_csv(llm_classification_results_path)

In [12]:
# Preview the first five rows of the loaded data
df.head()

Unnamed: 0,country,year,guild,text,cleaned_text,sentences,sentence_count,classification_count,classification_dict,century
0,mexico,1757,cotton-weavers,"1.—Ordenanza primera. Primeramente, que las ma...","Primeramente, que las mantas ordinarias se han...","['Primeramente, que las mantas ordinarias se h...",40,"{'24': 1, '2': 19, '14': 7, '4': 9, '04': 1, '...","{'0': 1, '1': 7, '2': 20, '3': 2, '4': 19, '5'...",18
1,mexico,1620,bakers,"1.—Primeramente, antes todas cosas, todos los ...","—Primeramente, antes todas cosas, todos los pa...","['—Primeramente, antes todas cosas, todos los ...",5,"{'1': 1, '34': 1, '3': 1, '4': 2}","{'0': 0, '1': 1, '2': 0, '3': 2, '4': 3, '5': ...",17
2,mexico,1592,cloth-makers,1.— Que cualquiera persona de cualquiera calid...,— Que cualquiera persona de cualquiera calidad...,['— Que cualquiera persona de cualquiera calid...,5,"{'24': 1, '4': 4}","{'0': 0, '1': 0, '2': 1, '3': 0, '4': 5, '5': ...",16
3,mexico,1605,cloth-finishers,Primeramente que al principio de cada un año s...,Primeramente que al principio de cada un año s...,['Primeramente que al principio de cada un año...,9,"{'14': 2, '1': 5, '2': 1, '04': 1}","{'0': 1, '1': 7, '2': 1, '3': 0, '4': 3, '5': ...",17
4,mexico,1706,tallow,"Primeramente, que en cada un año por principio...","Primeramente, que en cada un año por principio...","['Primeramente, que en cada un año por princip...",10,"{'4': 1, '14': 1, '1': 2, '2': 2, '3': 2, '15'...","{'0': 1, '1': 5, '2': 2, '3': 2, '4': 2, '5': ...",18


# Convert string into dictionary
Saving a DataFrame as CSV has the side effect that dictionaries and lists are stored as strings, so they must be parsed back into Python objects after loading.

In [13]:
def ensure_dict(variable):
    """Parse a stringified Python literal (e.g. a dict read from CSV) back into an object.

    Parameters
    ----------
    variable : Any
        Value to convert. Only ``str`` inputs are parsed.

    Returns
    -------
    Any
        * For a string holding a valid Python literal: the parsed object.
          Despite the function's name this is not checked to be a ``dict``;
          a string such as ``"[1, 2]"`` comes back as a list.
        * For a string that cannot be parsed: the original string, unchanged.
        * For any non-string input: the input itself, unchanged.
    """
    # Non-strings (already-parsed dicts, missing values, ...) pass straight through
    if not isinstance(variable, str):
        return variable

    try:
        return ast.literal_eval(variable)
    except (ValueError, SyntaxError, TypeError):
        # literal_eval raises TypeError for well-formed but unbuildable
        # literals such as "{[1]: 2}" (unhashable key); treat that like any
        # other failed conversion and hand the string back as-is.
        return variable

In [14]:
# Parse the stringified 'classification_dict' column back into Python dictionaries
# (values that fail to parse are left as the original strings by ensure_dict)
df['classification_dict'] = df['classification_dict'].apply(ensure_dict)

# Build sentence-level dataset
Unit of analysis: The sentence. This allows us to perform a probit regression.

In [15]:
def build_dataset(df, category_key, category_label):
    """Expand document-level category counts into one row per sentence.

    For every row of ``df`` this emits ``sentence_count`` output rows: the
    first ``n`` carry a 1 in the category column and the remaining ones a 0,
    where ``n`` is the value stored under ``category_key`` in that row's
    ``classification_dict`` (0 when the key is absent).

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``country``, ``guild``, ``century``, ``year``,
        ``sentence_count`` and ``classification_dict`` (already parsed into
        dicts). The frame is only read, never modified.
    category_key : str
        Key to look up in each ``classification_dict``.
    category_label : str
        Human-readable category name; lower-cased and with spaces replaced by
        underscores it becomes the name of the 0/1 outcome column.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``country``, ``guild``, ``century``, ``year``, the
        outcome column, ``dummy_mexico``, ``dummy17``, ``dummy18``,
        ``mexico_x_17``, ``mexico_x_18`` and a 1-based ``index``. An input
        that yields no sentences gives an empty frame with the same columns.

    Notes
    -----
    * Within a document the 1-rows always come first, so a row position does
      not correspond to one particular sentence of the source text.
    * If a count exceeds ``sentence_count`` no 0-rows are produced and the
      document contributes ``count`` rows instead of ``sentence_count``.
    """
    # Column name of the outcome variable, e.g. "Entry Barriers" -> "entry_barriers"
    outcome_column = category_label.lower().replace(" ", "_")
    base_columns = ['country', 'guild', 'century', 'year']

    # Per-document number of sentences in this category. Kept in a local
    # Series so the caller's DataFrame does not gain a helper column per call.
    positives = df['classification_dict'].apply(lambda x: x.get(category_key, 0))

    # One dict per sentence; the DataFrame is built once at the end
    rows = []
    for country, guild, century, year, n_sentences, n_positive in zip(
        df['country'], df['guild'], df['century'], df['year'],
        df['sentence_count'], positives,
    ):
        shared = {'country': country, 'guild': guild, 'century': century, 'year': year}

        # Sentences classified into this category (depvar = 1)
        rows.extend({**shared, 'depvar': 1} for _ in range(n_positive))

        # All remaining sentences of the document (depvar = 0);
        # range() of a negative number is simply empty
        rows.extend({**shared, 'depvar': 0} for _ in range(n_sentences - n_positive))

    # Passing the column list explicitly keeps the schema intact even when
    # `rows` is empty (otherwise the column look-ups below raise KeyError)
    new_df = pd.DataFrame(rows, columns=base_columns + ['depvar'])

    # Dummy variables for 'mexico' and for the 17th / 18th century
    new_df['dummy_mexico'] = (new_df['country'] == 'mexico').astype(int)
    new_df['dummy17'] = (new_df['century'] == 17).astype(int)
    new_df['dummy18'] = (new_df['century'] == 18).astype(int)

    # Country x century interaction terms
    new_df['mexico_x_17'] = new_df['dummy_mexico'] * new_df['dummy17']
    new_df['mexico_x_18'] = new_df['dummy_mexico'] * new_df['dummy18']

    # 1-based running row number
    new_df['index'] = range(1, len(new_df) + 1)

    # Name the outcome column after the category
    new_df = new_df.rename(columns={'depvar': outcome_column})

    return new_df

In [16]:
# Category labels. A label's position in this list is used as the key that
# build_dataset looks up in classification_dict, so the order must not change.
labels = [
    "Entry Barriers",
    "Human Capital",
    "Product Quality",
    "Markets",
    "Enforcement",
    "Religion",
    "Other",
]

# One sentence-level frame per category
dfs = [build_dataset(df, str(position), name) for position, name in enumerate(labels)]

# Final column layout of the combined dataset
new_column_order = [
    'index', 'country', 'guild', 'century', 'year',
    'entry_barriers', 'human_capital', 'product_quality', 'markets',
    'enforcement', 'religion', 'other',
    'dummy_mexico', 'dummy17', 'dummy18', 'mexico_x_17', 'mexico_x_18',
]

# Place the per-category frames side by side (rows are aligned on the row
# index, not merged on key columns), keep only the first copy of every
# repeated column name, then arrange the columns in the desired order.
merged_df = (
    pd.concat(dfs, axis=1)
    .loc[:, lambda frame: ~frame.columns.duplicated()]
    [new_column_order]
)

In [17]:
# Destination of the sentence-level dataset inside the sibling `datasets` folder
save_path = os.path.normpath(
    os.path.join(current_directory, '..', 'datasets', 'sentence_dataset.csv')
)
print(f"Save sentence-level_dataset under path: {save_path}")

Save sentence-level_dataset under path: /Users/niclasgriesshaber/Desktop/guilds-llm/datasets/sentence_dataset.csv


In [18]:
# Write the combined dataset to disk, leaving out the pandas row index
merged_df.to_csv(save_path, index=False)

# Report where the file was written and what it contains
print(
    f"Merged dataset saved to: {save_path}",
    f"Shape of the merged dataset: {merged_df.shape}",
    "Columns in the merged dataset:",
    merged_df.columns.tolist(),
    sep="\n",
)

Merged dataset saved to: /Users/niclasgriesshaber/Desktop/guilds-llm/datasets/sentence_dataset.csv
Shape of the merged dataset: (1044, 17)
Columns in the merged dataset:
['index', 'country', 'guild', 'century', 'year', 'entry_barriers', 'human_capital', 'product_quality', 'markets', 'enforcement', 'religion', 'other', 'dummy_mexico', 'dummy17', 'dummy18', 'mexico_x_17', 'mexico_x_18']
