# In [9]:
from functools import partial
import pandas as pd
import numpy as np
from pathlib import Path
import yaml
import platform
import os

# Input CSV (raw export) and output CSV (converted file) locations
NDARread_path = r"S:\MEGLAB_STUDIES\DATA_MANAGEMENT\NDAR\20519_Brains Keep Changing\EdgarR01BrainsKeepCh-NDARGUIDCreator_DATA_2024-12-11_1506.csv"
NDARconverted_path = r"S:\MEGLAB_STUDIES\DATA_MANAGEMENT\NDAR\20519_Brains Keep Changing\Converted_NDAR_GUID_12_11_2024.csv"

# The only columns of the export that the rest of the script uses
keep_cols = [
    'study_id',
    'redcap_event_name',
    'redcap_repeat_instrument',
    'redcap_repeat_instance',
    'con_name',
    'participant_name',
    'demo_dob',
    'demo_birthcity',
    'city_of_birth_pviq',
    'demo_biosex',
]

# Read the export, keep the needed columns, collapse to one row per study_id
# and rename the birth-city / sex columns.
# NOTE: GroupBy.first() takes, for each column, the first non-null value in
# the group (not simply the first row), so values spread over several rows
# of the same study_id are merged into one row.
ndar = (
    pd.read_csv(NDARread_path)[keep_cols]
    .groupby('study_id', as_index=False)
    .first()
    .rename(columns={'demo_birthcity': 'COB', 'demo_biosex': 'SEX'})
)

# Recode SEX to single letters; any value other than 'FEMALE' / 'MALE'
# becomes NaN because Series.map leaves unmapped values missing
ndar['SEX'] = ndar['SEX'].map({'FEMALE': 'F', 'MALE': 'M'})

# Break demo_dob on '/' into three pieces stored as MOB, DOB and YOB
# (assumes demo_dob is formatted month/day/year — TODO confirm)
ndar[['MOB', 'DOB', 'YOB']] = ndar['demo_dob'].str.split('/', n=2, expand=True)

# Function to split names into components
def split_name(name):
    """Split a full name into first, middle and last name components.

    Semicolons are removed and the name is split on whitespace. Anything
    that is not a string (e.g. NaN or None) is treated as an empty name.

    Rules, by number of whitespace-separated tokens:
      * 0 tokens: every component is empty.
      * 1 token: first name only.
      * 2 tokens: first and last name.
      * 3 tokens: first, middle, last. If the second token contains '('
        it is treated as a nickname and skipped, leaving no middle name.
      * 4+ tokens:
          - second token contains '(' (nickname): the nickname is skipped,
            the last token is the last name and everything in between is
            the middle name;
          - second token contains '.' or any token equals 'THE': the last
            token is the last name and everything between first and last
            is the middle name;
          - otherwise: the second token is the middle name and all
            remaining tokens form the last name.

    Args:
        name: Full name; non-string values are handled as ''.

    Returns:
        pd.Series: [firstname, middlename, has_middlename, lastname],
        where has_middlename is the string 'YES' or 'NO'.
    """
    # Non-string input (NaN, None, numbers) is treated as an empty name
    name = name.strip() if isinstance(name, str) else ''
    name = name.replace(';', '')

    parts = name.split()

    firstname = parts[0] if parts else ''
    middlename = ''
    lastname = ''
    has_middlename = 'NO'

    if len(parts) == 2:  # First Last
        lastname = parts[1]
    elif len(parts) == 3:
        lastname = parts[2]
        # A parenthesised second token is a nickname, not a middle name;
        # skip it here just as the 4+ token branch does.
        if '(' not in parts[1]:  # First Middle Last
            middlename = parts[1]
            has_middlename = 'YES'
    elif len(parts) > 3:
        if '(' in parts[1]:  # First (Nickname) Middle... Last
            middlename = ' '.join(parts[2:-1])
            lastname = parts[-1]
        elif '.' in parts[1] or 'THE' in parts:  # initials or a 'THE' token
            middlename = ' '.join(parts[1:-1])
            lastname = parts[-1]
        else:  # First Middle multi-word-Last
            middlename = parts[1]
            lastname = ' '.join(parts[2:])
        has_middlename = 'YES'

    return pd.Series([firstname, middlename, has_middlename, lastname])

# Treat empty-string participant names as missing: NaN -> '' -> NaN leaves
# both NaN and '' entries as NaN
ndar['participant_name'] = ndar['participant_name'].fillna('').replace('', np.nan)
# Use participant_name where present, otherwise fall back to con_name
ndar['name_to_split'] = ndar['participant_name'].combine_first(ndar['con_name'])

# split_name yields four values per row; they become these four columns
ndar[['FIRSTNAME', 'MIDDLENAME', 'SUBJECTHASMIDDLENAME', 'LASTNAME']] = ndar['name_to_split'].apply(split_name)

name_columns = ['FIRSTNAME', 'MIDDLENAME', 'LASTNAME']

# Normalise the casing of each name column.
# NOTE(review): the regex pass uppercases the first letter and any letter that
# follows a non-word, non-space, non-hyphen character, but the following
# .str.lower() undoes that and .str.capitalize() then leaves only the first
# character uppercase (e.g. "O'BRIEN" ends up as "O'brien"). Confirm whether
# that is the intended output before changing the order of these calls.
for col in name_columns:
    ndar[col] = (
        ndar[col]
        .str.replace(r'([^\w\s-]|^)(\w)', lambda m: m.group(0).upper(), regex=True)
        .str.lower()
        .str.capitalize()
    )

# Helper / source columns that are not part of the converted file
columns_to_drop = [
    'name_to_split',
    'con_name',
    'participant_name',
    'redcap_event_name',
    'demo_dob',
    'city_of_birth_pviq',
    'redcap_repeat_instrument',
    'redcap_repeat_instance',
]
ndar = ndar.drop(columns=columns_to_drop)

# Number the rows 1..N in their current order
ndar['ID'] = range(1, ndar.shape[0] + 1)

# Reorder columns
def reorder_columns(df, column_order):
    """Return ``df`` indexed by ``column_order``.

    With a list of column labels this yields the listed columns in the
    given order; columns not listed are left out. ``df`` is not modified.
    """
    reordered = df[column_order]
    return reordered

# Final column layout of the converted file
desired_order = [
    'study_id',
    'ID',
    'FIRSTNAME',
    'MIDDLENAME',
    'LASTNAME',
    'MOB',
    'DOB',
    'YOB',
    'COB',
    'SEX',
    'SUBJECTHASMIDDLENAME',
]
ndar = reorder_columns(ndar, desired_order)

# Write the result to the converted-file path, without the index column
ndar.to_csv(NDARconverted_path, index=False)