In [13]:
import pandas as pd
import numpy as np

In [14]:
df = pd.read_excel("eleyo_files/missiongraduates_sacc_invoice_details_breakdown_08-13-2024_11-25_17235735371076493.xlsx")

In [15]:
#From rate
keyword_to_school = {
    "Alvarado": "Alvarado Elementary School",
    "Bryant": "Bryant Elementary School",
    "Everett": "Everett Beacon Middle School",
    "Flynn": "Flynn Elementary School",
    "Marshall": "Marshall Elementary School",
    "Sanchez": "Sanchez Elementary School",
    "TECA": "Thomas Edison Charter Academy",
    "Bessie": "Bessie Carmichael Beacon TK-8",
    "Cleveland": "Cleveland Elementary School"
}
def find_substring_in_rate(rate):
    matches = [keyword_to_school[keyword] for keyword in keyword_to_school if keyword in rate]
    if len(matches) > 1:
        print(f"Error: Multiple matches found in row with Rate '{rate}': {matches}")
        return np.nan
    elif len(matches) == 1:
        return matches[0]
    return np.nan

In [16]:
df['Site_Filled'] = df['Rate'].apply(lambda rate: find_substring_in_rate(str(rate)))

In [17]:
#Copy over site column
df['Site_Filled'] = df.apply(lambda row: row['Site'] if pd.notna(row['Site']) else row['Site_Filled'], axis=1)

In [18]:
#By name
def reverse_name(name):
    if ',' in name:
        parts = name.split(',', 1)
        return ' '.join(part.strip() for part in reversed(parts))
    return name

def extract_after_for(text):
    return text.split('for ', 1)[1].strip() if 'for ' in text else text

def process_user(user, rate):
    if pd.isna(user):
        return user
    
    if not isinstance(user, str):
        return user
    if 'and' in user.lower():
        if isinstance(rate, str) and 'Prorate' in rate and 'to' in rate:
            return user
        if isinstance(rate, str) and 'Prorate' in rate:
            return extract_after_for(rate)
    if ', and' in user.lower():
        return ""
    else:
        return reverse_name(user)


# Assuming you have a DataFrame called 'df' with columns 'user' and 'rate'
# If not, you can create one like this:
# df = pd.DataFrame({
#     'user': ['Smith, John', 'Jane Doe and Jack Doe', 'Brown, Robert A.', 'Alice Johnson and Bob Johnson'],
#     'rate': ['Regular', 'Prorate - 75%', 'Prorate - 50%', 'Regular']
# })

# Apply the function to the DataFrame
df['processed_user'] = df.apply(lambda row: process_user(row['User'], row['Rate']), axis=1)

In [19]:
# Assuming you have a DataFrame 'df' with columns 'processed_user' and 'Site'
# If not, you can create a sample one like this:
# df = pd.DataFrame({
#     'processed_user': ['John Smith', 'Jane Doe', 'John Smith', 'Alice Johnson'],
#     'Site': ['Site A', 'Site B', 'Site C', 'Site A']
# })

def create_user_site_dict(df):
    user_site_dict = {}
    
    for _, row in df.iterrows():
        user = row['processed_user']
        site = row['Site_Filled']
        
        if pd.isna(user):
            continue
        if pd.isna(site):
            continue
        if user not in user_site_dict:
            user_site_dict[user] = []
        
        if site not in user_site_dict[user]:
            user_site_dict[user].append(site)
    
    return user_site_dict

# Create the dictionary
user_to_sites = create_user_site_dict(df)

In [20]:
filtered_dict = {key: value for key, value in user_to_sites.items() if len(value) > 1}

# Output the filtered dictionary
print(filtered_dict)

{'': ['Flynn Elementary School', 'Bryant Elementary School', 'Alvarado Elementary School', 'Bessie Carmichael Beacon TK-8', 'Sanchez Elementary School', 'Thomas Edison Charter Academy', 'Cleveland Elementary School', 'Marshall Elementary School', 'Everett Beacon Middle School'], 'Avery Farr': ['Alvarado Elementary School', 'Marshall Elementary School'], 'Flynn McKernan': ['Alvarado Elementary School', 'Flynn Elementary School'], 'Edwin Steven Alarcon linares and Dylan Geovany Alarcon linares': ['Everett Beacon Middle School', 'Sanchez Elementary School'], 'Josue Lopez': ['Alvarado Elementary School', 'Everett Beacon Middle School'], 'Keila Sarai Berrios and Esteban Asael Lainez': ['Sanchez Elementary School', 'Everett Beacon Middle School'], 'Natany sofia aguilera and Margaret G Caldas': ['Alvarado Elementary School', 'Everett Beacon Middle School'], 'Isabella Lucia Luna': ['Alvarado Elementary School', 'Marshall Elementary School'], 'Ixchel Carrio and Malila Carrio': ['Marshall Elemen

In [21]:
# Function to look up site in the dictionary
def lookup_site(user):
    site = user_to_sites.get(user, np.nan)
    if isinstance(site, list) and len(site) == 1:
        return site[0]  # Return the single site in the list
    return np.nan  # Return NaN if the list has more than one element or if no entry is found

# Check if 'Site_Filled' exists
if 'Site_Filled' not in df.columns:
    raise ValueError("The 'Site_Filled' column does not exist in the DataFrame.")

# Fill 'Site_Filled' using the dictionary only if it is currently NaN
df['Site_Filled'] = df.apply(
    lambda row: row['Site_Filled'] if pd.notna(row['Site_Filled']) else lookup_site(row['processed_user']),
    axis=1
)

In [22]:
def process_user(user, site_filled):
    if pd.isna(user):
        return site_filled
    
    if not(pd.isna(site_filled)):
        return site_filled
    
    user_str = str(user)
    
    if " and " in user_str:
        users = user_str.split(" and ")
        sites = [user_to_sites.get(u, []) for u in users]
        
        # If one site is found and the other is empty, return the valid site
        if len(sites[0]) == 1 and not sites[1]:
            return sites[0][0]
        elif len(sites[1]) == 1 and not sites[0]:
            return sites[1][0]
        elif len(sites[0]) == 1 and len(sites[1]) == 1 and sites[0] == sites[1]:
            if pd.isna(site_filled):
                return sites[0][0]
        else:
            print(f"Error: Multiple sites found for users '{user}': {sites}")
            return np.nan
    else:
        site = user_to_sites.get(user_str, np.nan)
        if isinstance(site, list) and len(site) == 1:
            return site[0]  # Return the single site in the list
    return site_filled  # Return original value if no match or conditions are met

# Apply the function to the DataFrame
df['Site_Filled'] = df.apply(
    lambda row: process_user(row['processed_user'], row['Site_Filled']),
    axis=1
)

Error: Multiple sites found for users 'Liliana Nicita, Gabriela J NICITA, and Liliana J NICITA': [[], []]
Error: Multiple sites found for users 'Liliana Nicita, Gabriela J NICITA, and Liliana J NICITA': [[], []]
Error: Multiple sites found for users 'Natany sofia aguilera and Margaret G Caldas': [['Everett Beacon Middle School'], ['Alvarado Elementary School']]
Error: Multiple sites found for users 'Sarabi Jael Herrera Hernandez and Rony García': [['Everett Beacon Middle School'], ['Sanchez Elementary School']]
Error: Multiple sites found for users 'Samantha Georgopoulos and Oliver Georgopoulos': [['Everett Beacon Middle School'], ['Sanchez Elementary School']]
Error: Multiple sites found for users 'Samantha Georgopoulos and Oliver Georgopoulos': [['Everett Beacon Middle School'], ['Sanchez Elementary School']]
Error: Multiple sites found for users 'Hermita Lucero Garcia Juarez and Linda Estrella Garcia Juarez': [['Bryant Elementary School', 'Sanchez Elementary School'], ['Bryant Eleme