In [None]:
import pandas as pd
import json
from google.colab import files
import requests
import zipfile
import os

In [None]:
# Download the zipped JSON catalogue from GitHub
zip_url = "https://github.com/rae-drt/Copy1Hackathon/raw/refs/heads/main/Metadata/COPY1_catalogue.zip"
zip_filename = "COPY_1_processed_json.zip"

print(f"Downloading {zip_filename}...")
# A timeout stops the cell from hanging forever on a stalled connection.
response = requests.get(zip_url, timeout=60)
# Fail here on an HTTP error (404, 5xx, ...) instead of saving an error page
# under a .zip name and hitting a confusing BadZipFile during extraction.
response.raise_for_status()
with open(zip_filename, 'wb') as f:
    f.write(response.content)
print("Download complete.")

# Extract the archive into a sub-directory of the current working directory
extract_dir = "extracted_json_data/"
os.makedirs(extract_dir, exist_ok=True)

print(f"Extracting {zip_filename} to {extract_dir}...")
with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)
print("Extraction complete.")

# Read every JSON file in the extracted directory and collect its content in one list
data_list = []
json_files_dir = os.path.join(extract_dir, "FinalData")  # JSON files are read from the nested FinalData directory
print(f"Reading JSON files from {json_files_dir}...")
file_count = 0
# Only list the directory if it is really there
if os.path.isdir(json_files_dir):
    # os.listdir returns names in arbitrary order; sorting keeps the row order
    # of the resulting data reproducible from run to run.
    for filename in sorted(os.listdir(json_files_dir)):
        # Files whose name starts with 'reject' are deliberately left out
        if filename.startswith('reject'):
            continue
        if not filename.endswith(".json"):
            continue
        filepath = os.path.join(json_files_dir, filename)
        print(f"Reading file: {filepath}")
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except json.JSONDecodeError as e:
            # A malformed file is reported and skipped; the remaining files are still read
            print(f"Error decoding JSON in file {filepath}: {e}")
            continue
        # A file may hold either a list of records or a single record
        if isinstance(data, list):
            data_list.extend(data)
        else:
            data_list.append(data)
        file_count += 1
else:
    print(f"Directory not found: {json_files_dir}")

print(f"Finished reading {file_count} JSON files.")
print(f"Total items in data_list: {len(data_list)}")

In [None]:
# Unwrap the records: each item of data_list is expected to be a dictionary whose
# values are themselves dictionaries; those inner dictionaries become the table rows.
processed_data = []
for record in data_list:
    if not isinstance(record, dict):
        print(f"Warning: Item in data_list is not a dictionary: {record}")
        continue
    for key, value in record.items():
        if not isinstance(value, dict):
            print(f"Warning: Value for key '{key}' is not a dictionary: {value}")
            continue
        processed_data.append(value)


# One DataFrame row per inner dictionary
df = pd.DataFrame(processed_data)

# Flatten the two nested columns into top-level columns.
# This is all-or-nothing: if either step fails, df keeps its nested columns unchanged.
try:
    description_fields_df = pd.json_normalize(df['DescriptionFields'])
    scope_content_df = pd.json_normalize(df['scopeContent'])

    # Swap the nested columns for their flattened counterparts
    flat_base_df = df.drop(columns=['DescriptionFields', 'scopeContent'])
    df = pd.concat([flat_base_df, description_fields_df, scope_content_df], axis=1)
except KeyError as e:
    # One of the nested columns is absent from df
    print(f"KeyError during normalization: {e}. Skipping normalization for these columns.")
except Exception as e:
    print(f"An error occurred during normalization: {e}. Skipping normalization.")


# Preview the resulting dataframe
display(df.head())

In [None]:
def split_column_by_first_comma(df, column_name):
    """Split a text column at its first comma into a name part and an address part.

    Two columns are added to ``df`` (which is modified in place and also returned):

    * ``<column_name>_name``    - text before the first comma, whitespace-stripped.
    * ``<column_name>_address`` - text after the first comma, whitespace-stripped
      and with trailing full stops removed; missing when the value has no comma.

    Missing source values (None / NaN) stay missing in both new columns instead of
    being turned into the literal strings 'nan' or 'None'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to split.
    column_name : str
        Name of the column to split. If it is not in ``df``, ``df`` is returned unchanged.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the two columns added when ``column_name`` exists.
    """
    if column_name not in df.columns:
        return df

    name_column = f'{column_name}_name'
    address_column = f'{column_name}_address'

    source = df[column_name]
    # Remember which rows were missing *before* astype(str) turns them into text
    is_missing = source.isna()
    split_data = source.astype(str).str.split(',', n=1, expand=True)

    # An empty frame yields a split result with no columns at all, so check for
    # each part before reading it.
    if 0 in split_data.columns:
        df[name_column] = split_data[0].str.strip().mask(is_missing)
    else:
        df[name_column] = None

    # Column 1 only exists when at least one value contained a comma
    if 1 in split_data.columns:
        # Stay on the .str accessor (no astype(str)) so rows without a comma
        # remain missing rather than becoming the string 'None'/'nan'.
        df[address_column] = split_data[1].str.strip().str.rstrip('.')
    else:
        df[address_column] = None
    return df

# Add the <column>_name / <column>_address pair for the owner column, then the author column
for party_column in ('CopyrightOwner', 'CopyrightAuthor'):
    df = split_column_by_first_comma(df, party_column)

In [None]:
# Current column order, de-duplicated, with the four derived name/address columns left out.
# NOTE(review): nothing later in the visible notebook reads `reordered_columns`;
# the following cells rebuild the order from df.columns - confirm whether this cell is still needed.
original_columns = df.columns.tolist()
derived_split_columns = (
    'CopyrightOwner_name',
    'CopyrightOwner_address',
    'CopyrightAuthor_name',
    'CopyrightAuthor_address',
)
# dict.fromkeys keeps the first occurrence of each name and preserves order
reordered_columns = list(dict.fromkeys(
    col for col in original_columns if col not in derived_split_columns
))



In [None]:
# Build the target column order: every column in its current position, with each
# copyright column immediately followed by its derived name/address columns.
# (The derived columns are therefore listed twice for now.)
columns_to_insert_after = {
    'CopyrightOwner': ['CopyrightOwner_name', 'CopyrightOwner_address'],
    'CopyrightAuthor': ['CopyrightAuthor_name', 'CopyrightAuthor_address'],
}
final_columns = []
for col in df.columns:
    final_columns.append(col)
    final_columns.extend(columns_to_insert_after.get(col, []))

In [None]:
# Keep only the first occurrence of each column name (dict keys preserve insertion order)
final_columns = [*dict.fromkeys(final_columns)]

In [None]:
# Rearrange the dataframe so its columns follow final_columns
df = df.loc[:, final_columns]

In [None]:
# Drop every column whose name is 'C' followed by digits, plus two named columns.
# DataFrame.filter applies the regex to the column labels directly, so no
# throwaway one-element Series is built per column and non-string labels
# do not raise.
columns_to_drop = df.filter(regex=r'^C\d+', axis=1).columns.tolist()
columns_to_drop.extend(['ephemera', 'placeNames'])

# De-duplicate the drop list while keeping a stable order
columns_to_drop = list(dict.fromkeys(columns_to_drop))

# errors='ignore' tolerates names in the list that are not in df
df = df.drop(columns=columns_to_drop, errors='ignore')

# Rename 'description' column to 'scopecontent'
if 'description' in df.columns:
    df = df.rename(columns={'description': 'scopecontent'})
    print("Renamed 'description' column to 'scopecontent'")
else:
    print("'description' column not found.")


# Preview the modified dataframe to confirm the changes
display(df.head())

In [None]:
from ipywidgets import IntRangeSlider, interact
from IPython.display import display
import numpy as np

# Work out which years the data covers from the first four characters of each
# numeric 'coveringFromDate' value. Values that cannot be parsed as numbers are ignored.
# NOTE(review): assumes the numeric values start with a four-digit year - confirm against the data.
numeric_dates = (
    pd.to_numeric(df['coveringFromDate'], errors='coerce')
    .dropna()
    .astype(int)
    .astype(str)
    .str[:4]
    # Back to integers so min()/max() compare numerically, not alphabetically
    .astype(int)
)
min_year = int(numeric_dates.min()) if not numeric_dates.empty else 1900 # Default if no valid years
max_year = int(numeric_dates.max()) if not numeric_dates.empty else 2000 # Default if no valid years

# Bounds offered by the slider
slider_min_year = 1860
slider_max_year = 1912

# Clamp both ends of the data range into the slider bounds. Clamping only one
# side per end could hand the widget a value outside [min, max], or a lower
# handle above the upper one, when the data lies outside the slider range.
initial_start_year = min(max(min_year, slider_min_year), slider_max_year)
initial_end_year = max(min(max_year, slider_max_year), slider_min_year)

# Create a range slider for the year range
year_range_slider = IntRangeSlider(
    value=[initial_start_year, initial_end_year],
    min=slider_min_year,
    max=slider_max_year,
    step=1,
    description='Select Year Range:',
    continuous_update=False,
    orientation='horizontal',
    readout=True,
    readout_format='d'
)

display(year_range_slider)

In [None]:
# Read the chosen year range off the slider (lower handle, upper handle)
start_year, end_year = year_range_slider.value

# First four characters of each 'coveringFromDate' once parsed as a nullable integer;
# unparseable values end up as non-digit text
covering_from_year_str = (
    pd.to_numeric(df['coveringFromDate'], errors='coerce')
    .astype('Int64')
    .astype(str)
    .str[:4]
)

# Rows whose extracted year text consists of digits only
is_digit_mask = covering_from_year_str.str.isdigit()

# Range test on the digit-only rows; every other row counts as "not in range"
years_in_range = covering_from_year_str[is_digit_mask].astype(int).between(start_year, end_year)
mask = years_in_range.reindex(df.index, fill_value=False)

# Keep only the rows inside the selected range
df_filtered = df[mask]

print(f"DataFrame filtered for years between {start_year} and {end_year}.")
display(df_filtered.head())

In [None]:
# Write the filtered rows to a CSV file named after the selected year range
output_csv_file = f"COPY 1_json_combined_split_{start_year}_{end_year}.csv"
df_filtered.to_csv(output_csv_file, index=False)

print(f"DataFrame saved to '{output_csv_file}'")

# Hand the CSV to the browser as a download; a failure is reported, not raised
from google.colab import files
try:
    files.download(output_csv_file)
except Exception as download_error:
    print(f"Error downloading file: {download_error}")
    print("Please ensure the file path is correct.")