In [17]:
import json
import os
import re
import hashlib
import chardet
import yaml
from frictionless import Resource, validate

def get_format_and_mediatype(file_extension):
    """Return the ``(format, mediatype)`` pair for a file extension.

    The lookup is case-insensitive and expects the extension without a
    leading dot (e.g. ``'csv'``, not ``'.csv'``). Extensions that are not
    in the table yield ``('unknown', 'application/octet-stream')``.
    """
    # For every supported type the format label is the extension itself,
    # so only the media type needs to be stored.
    mediatype_by_extension = {
        'csv': 'text/csv',
        'json': 'application/json',
        'xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        'xls': 'application/vnd.ms-excel',
        'tsv': 'text/tab-separated-values',
        'txt': 'text/plain',
    }
    extension = file_extension.lower()
    if extension in mediatype_by_extension:
        return extension, mediatype_by_extension[extension]
    return 'unknown', 'application/octet-stream'

def compute_md5(file_path):
    """Return the hexadecimal MD5 digest of the file at ``file_path``.

    The file is read in binary mode in 4096-byte blocks, so the whole
    file is never held in memory at once.
    """
    digest = hashlib.md5()
    with open(file_path, "rb") as stream:
        while True:
            block = stream.read(4096)
            if not block:
                # An empty read signals end of file.
                break
            digest.update(block)
    return digest.hexdigest()

def detect_encoding(file_path):
    """Guess the character encoding of the file at ``file_path``.

    The entire file is read into memory as bytes and handed to
    ``chardet.detect``; the ``'encoding'`` entry of its result is returned.
    """
    with open(file_path, 'rb') as stream:
        payload = stream.read()
    return chardet.detect(payload)['encoding']

def get_file_size(file_path):
    """Return the size, in bytes, of the file at ``file_path``."""
    file_stats = os.stat(file_path)
    return file_stats.st_size

def load_schema(schema_path):
    """Load a schema definition from a JSON or YAML file.

    Parameters:
    - schema_path (str): Path to a ``.json``, ``.yaml`` or ``.yml`` file.
      The extension is matched case-insensitively.

    Returns the parsed content of the file (whatever ``json.load`` or
    ``yaml.safe_load`` produces for it).

    Raises:
    - ValueError: if the file extension is not one of the supported ones.
    - json.JSONDecodeError / yaml.YAMLError: if the file cannot be parsed.
    - OSError: if the file cannot be opened.
    """
    # Match on a lowercased copy so that 'schema.JSON' or 'schema.YML' are
    # accepted as well; the original path is still the one that is opened.
    lowered_path = schema_path.lower()

    # Read as UTF-8 explicitly: without it, open() falls back to the
    # platform's default text encoding, which can mis-decode non-ASCII
    # schema content.
    with open(schema_path, 'r', encoding='utf-8') as file:
        if lowered_path.endswith('.json'):
            return json.load(file)
        if lowered_path.endswith(('.yaml', '.yml')):
            return yaml.safe_load(file)
        raise ValueError("Unsupported schema file format. Please provide a .json or .yaml file.")

def sanitize_name(name):
    """Normalise ``name`` into a lowercase, hyphen-separated identifier.

    The name is lowercased, every character other than ``a-z``, ``0-9``,
    ``.``, ``_``, ``/`` and ``-`` is replaced by a hyphen, and hyphens at
    either end of the result are removed. Hyphens inside the name are kept
    as they are, including consecutive ones.
    """
    lowered = name.lower()
    hyphenated = re.sub(r'[^a-z0-9._/-]', '-', lowered)
    return hyphenated.strip('-')

def frictionless_resource_maker(schema_path, input_file_path, description='', user_format=None, user_mediatype=None, title=None):
    """
    Creates a frictionless resource file with the given parameters.

    Parameters:
    - schema_path (str): Path to the JSON or YAML schema file.
    - input_file_path (str): Path to the input data file.
    - description (str): Description of the resource (optional).
    - user_format (str): User-specified format for the resource (optional).
      When omitted, the format is derived from the input file's extension.
    - user_mediatype (str): User-specified media type for the resource
      (optional). When omitted, the media type is derived from the input
      file's extension.
    - title (str): Title of the resource (optional).

    This function generates a YAML file with metadata about the resource including
    format, media type, encoding, file size, MD5 hash, and schema. The file is
    named '<sanitized input file name>.resource.yaml' and is written relative to
    the current working directory. It is then loaded back and validated.

    Returns:
    - None. If the schema file or input file does not exist, or the schema
      cannot be loaded, a message is printed and the function returns early
      without writing anything.

    Raises:
    - RuntimeError: if the written resource file does not pass validation.
      The resource file is left on disk in that case.
    """
    # Check if the schema file exists
    if not os.path.exists(schema_path):
        print(f"Schema file '{schema_path}' does not exist.")
        return

    # Check if the input file exists
    if not os.path.exists(input_file_path):
        print(f"Input file '{input_file_path}' does not exist.")
        return

    # Read the schema file
    try:
        schema = load_schema(schema_path)
    except (json.JSONDecodeError, yaml.YAMLError, ValueError) as e:
        print(f"Error loading schema file: {e}")
        return

    # Extract the resource name and file extension from the input file path
    resource_name, file_extension = os.path.splitext(os.path.basename(input_file_path))
    file_extension = file_extension.lstrip('.')

    # Sanitize the resource name
    resource_name = sanitize_name(resource_name)

    # Determine the format and mediatype. Each user-supplied value overrides
    # its detected counterpart independently, so passing only one of the two
    # is honoured instead of being silently discarded.
    detected_format, detected_mediatype = get_format_and_mediatype(file_extension)
    resource_format = user_format or detected_format
    mediatype = user_mediatype or detected_mediatype

    # Compute the MD5 hash of the input file
    file_hash = compute_md5(input_file_path)

    # Detect the encoding of the input file
    encoding = detect_encoding(input_file_path)

    # Get the file size in bytes
    file_size = get_file_size(input_file_path)

    # Define the structure of the frictionless resource file.
    # Key order is preserved in the output because sort_keys=False below.
    resource = {
        'name': resource_name,
        'path': input_file_path,
        'description': description,
        'format': resource_format,
        'mediatype': mediatype,
        'encoding': encoding,
        'bytes': file_size,
        'hash': file_hash,
        'schema': schema
    }

    if title:
        resource['title'] = title

    # Convert the dictionary to a YAML string with indentation for readability
    resource_yaml = yaml.dump(resource, sort_keys=False, default_flow_style=False)

    # Write the YAML string to a file
    resource_filename = f'{resource_name}.resource.yaml'
    print(f"Writing resource file to {resource_filename}")

    with open(resource_filename, 'w', encoding='utf-8') as file:
        file.write(resource_yaml)

    # Load the descriptor back from disk so that validation exercises the
    # file that was actually written rather than the in-memory dictionary.
    print("Validating resource on disk (including data and schema). This may take some time.")
    resource_on_disk = Resource(resource_filename)
    results = resource_on_disk.validate()
    if results.valid:
        print("Resource is valid\n")
    else:
        print("ERROR: Resource is NOT valid. Errors follow.\n")
        print(results)
        raise RuntimeError(f"Resource file '{resource_filename}' failed validation.")

# Example usage
# This call runs as soon as the cell/module is executed: it writes
# 'land_use_types_descriptions.resource.yaml' relative to the working
# directory and validates it. Both user_format and user_mediatype are
# supplied here, so the values detected from the '.csv' extension are not used.
frictionless_resource_maker(
    schema_path='Land_Use_Types_descriptions.schema.yaml',
    input_file_path='Land_Use_Types_descriptions.csv',
    description='An example dataset',
    user_format='custom_format',
    user_mediatype='application/custom',
    title='Example Resource Title'
)


Writing resource file to land_use_types_descriptions.resource.yaml
Validating resource on disk (including data and schema). This may take some time.
Resource is valid

