# Maroon Demographic Information

In [44]:
# Imports
from inflection import pluralize, singularize, underscore
import polars as pl

from util.files import write_to_json

# Constants
# Directory the survey CSV is read from and the JSON outputs are written to.
BASE_DIR = "../scratch"
# Column name given to the tally column of each value-counts table.
COUNTS_COLUMN = "counts"
# Answers equal to these markers are skipped by the parsing helpers.
EMPTY_STRING = ""
NOT_APPLICABLE = "n/a"

In [45]:
# Helper functions
def list_to_parsed_list(unparsed_list, fuzzy_func):
    """Flatten raw answers into a list of normalized, title-cased values.

    Every answer is lower-cased and stripped of surrounding whitespace.
    Answers that are missing (``None``), empty, equal to ``NOT_APPLICABLE``
    or equal to ``"undecided"`` are dropped. The remaining answers are split
    on commas and each non-empty piece becomes its own entry, so
    ``"CS, Math"`` yields two values.

    Args:
        unparsed_list: Iterable of raw answers. Entries are expected to be
            strings; ``None`` entries are skipped.
        fuzzy_func: Optional callable mapping a normalized (lower-cased,
            stripped) value to its canonical spelling. Any falsy value
            disables canonicalization.

    Returns:
        A list of title-cased strings, one per parsed value, in input order.
    """
    skipped_answers = {EMPTY_STRING, NOT_APPLICABLE, "undecided"}

    parsed_list = []
    for element in unparsed_list:
        # Null cells arrive as None; they carry no answer.
        if element is None:
            continue

        # Normalize before comparing so padded markers such as " N/A " are
        # recognized as well.
        normalized = element.lower().strip()
        if normalized in skipped_answers:
            continue

        # split(",") returns the whole string when there is no comma, so one
        # loop handles both single and multi-valued answers. Each *piece* is
        # formatted (not the whole answer), and empties left by stray or
        # trailing commas are dropped after stripping.
        for piece in normalized.split(","):
            piece = piece.strip()
            if piece == EMPTY_STRING:
                continue
            if fuzzy_func:
                piece = fuzzy_func(piece)
            parsed_list.append(piece.title())
    return parsed_list


def list_to_parsed_list_non_split(unparsed_list, fuzzy_func):
    """Normalize raw answers without splitting them on commas.

    Answers equal to ``EMPTY_STRING`` or (case-insensitively) to
    ``NOT_APPLICABLE`` are dropped. Every other answer is lower-cased,
    stripped, optionally passed through ``fuzzy_func`` and title-cased.

    Args:
        unparsed_list: Iterable of raw string answers.
        fuzzy_func: Optional callable applied to the lower-cased, stripped
            answer; a falsy value disables it.

    Returns:
        A list of title-cased strings in input order.
    """

    def _normalize(raw):
        # Lower-case and strip first, then canonicalize if a matcher is given.
        cleaned = raw.lower().strip()
        canonical = fuzzy_func(cleaned) if fuzzy_func else cleaned
        return canonical.title()

    return [
        _normalize(raw)
        for raw in unparsed_list
        if not (raw == EMPTY_STRING or raw.lower() == NOT_APPLICABLE)
    ]


def fuzzy_study_match(major):
    """Map a lower-cased field-of-study answer to its canonical name.

    Rules are tried in order and the first match wins. An "exact" rule
    matches when ``major`` equals one of the listed aliases; a "contains"
    rule matches when the fragment occurs anywhere in ``major``.

    Args:
        major: The normalized (lower-cased, stripped) answer.

    Returns:
        The canonical name of the first matching rule, or ``major``
        unchanged when no rule matches.
    """
    rules = (
        ("exact", ("cs",), "computer science"),
        (
            "exact",
            ("lls", "llso", "law letters & society"),
            "law letters and society",
        ),
        ("contains", "econ", "economics"),
        ("contains", "public policy", "public policy"),
        ("contains", "history", "history"),
        ("contains", "visual art", "visual art"),
        ("contains", "creative writing", "english and creative writing"),
        ("exact", ("taps",), "theater and performance studies"),
    )

    for kind, pattern, canonical in rules:
        if kind == "exact":
            matched = major in pattern
        else:
            matched = pattern in major
        if matched:
            return canonical

    return major


def fuzzy_language_match(lang):
    """Collapse a language answer onto a canonical language name.

    Args:
        lang: The normalized (lower-cased) answer.

    Returns:
        ``"cantonese"`` if that word occurs in ``lang``, otherwise
        ``"mandarin"`` if that word occurs in ``lang``, otherwise ``lang``
        unchanged.
    """
    # Order matters: "cantonese" takes precedence when both words appear.
    for canonical in ("cantonese", "mandarin"):
        if canonical in lang:
            return canonical
    return lang


def pl_count_to_obj_list(pl_dict):
    """Turn a column-oriented counts dict into a list of per-row dicts.

    Args:
        pl_dict: Mapping of column name to a list of values. It must contain
            the ``COUNTS_COLUMN`` key; the last key that is not
            ``COUNTS_COLUMN`` is used as the label column.

    Returns:
        One dict per entry of the counts column, holding the label value
        under the label column's name and the count under ``"weight"``.
    """
    # If several non-count keys exist, the last one wins; with none, the
    # label key falls back to the empty string.
    label_keys = [key for key in pl_dict.keys() if key != COUNTS_COLUMN]
    label_key = label_keys[-1] if label_keys else ""

    return [
        {label_key: pl_dict[label_key][idx], "weight": weight}
        for idx, weight in enumerate(pl_dict[COUNTS_COLUMN])
    ]

def isolate_column_counts(
    original_col_name: str, file_name: str = "", fuzzy_func=None
):
    """Tally the parsed values of one survey column and write them to JSON.

    The column is read from the module-level ``df``, parsed with
    ``list_to_parsed_list``, counted, and written to
    ``{BASE_DIR}/{file_name}.json`` as a list of
    ``{<column>: value, "weight": count}`` objects.

    Args:
        original_col_name: Name of the column in ``df`` to tally.
        file_name: Base name (without extension) of the output file. When
            empty, the pluralized snake_case form of the column name is used.
        fuzzy_func: Optional callable used to canonicalize each parsed value.
            ``None`` (the default) keeps the previous behaviour of using
            ``fuzzy_study_match``; pass e.g. ``fuzzy_language_match`` for
            columns that need a different matcher.
    """
    # Default preserved for existing callers, which rely on the study matcher.
    if fuzzy_func is None:
        fuzzy_func = fuzzy_study_match

    new_col_name = underscore(singularize(original_col_name))

    parsed_values = list_to_parsed_list(
        df[original_col_name].to_list(), fuzzy_func
    )
    col_value_counts = pl.Series(parsed_values).value_counts()
    # Rename both columns so the output keys do not depend on polars' defaults.
    col_value_counts.columns = [new_col_name, COUNTS_COLUMN]

    if not file_name:
        file_name = pluralize(new_col_name)

    write_to_json(
        f"{BASE_DIR}/{file_name}.json",
        pl_count_to_obj_list(col_value_counts.to_dict(as_series=False)),
    )

In [46]:
# Load the survey responses and parse their timestamps
# Read the raw CSV from BASE_DIR
df = pl.read_csv(f"{BASE_DIR}/Maroon_2023_Staff_Survey.csv")

# Expression that parses the "Timestamp" text column into a Datetime;
# strict=False is passed so parsing runs in non-strict mode.
timestamp_expr = (
    pl.col("Timestamp")
    .str.strptime(
        dtype=pl.Datetime, format="%Y/%m/%d %l:%M:%S %p %Z", strict=False
    )
    .cast(pl.Datetime)
)

# Replace the text column with its parsed counterpart
df = df.with_columns(timestamp_expr)

In [47]:
# Tally "Majors" answers; no file_name is given, so the output name is derived from the column name.
isolate_column_counts("Majors")

In [48]:
# Tally "Minors" answers; no file_name is given, so the output name is derived from the column name.
isolate_column_counts("Minors")

In [49]:
# Tally "FluentLanguages" answers; the output name is derived from the column name.
# NOTE(review): this call is normalized with fuzzy_study_match, so
# fuzzy_language_match is not applied to these answers - confirm that is intended.
isolate_column_counts("FluentLanguages")

In [50]:
# Tally "MaroonInAWord" answers and write them to maroon_in_a_word.json under BASE_DIR.
isolate_column_counts("MaroonInAWord", "maroon_in_a_word")

In [51]:
# Tally "RaceEthnicity" answers and write them to race_ethnicity.json under BASE_DIR.
isolate_column_counts("RaceEthnicity", "race_ethnicity")

In [52]:
# Tally "HispanicLatino" answers and write them to hispanic_latino.json under BASE_DIR.
isolate_column_counts("HispanicLatino", "hispanic_latino")

In [53]:
# Tally "HighSchoolType" answers; the output name is derived from the column name.
isolate_column_counts("HighSchoolType")

In [54]:
# Tally "USRegion" answers and write them to us_region.json under BASE_DIR.
isolate_column_counts("USRegion", "us_region")

In [55]:
# Tally "GenderIdentity" answers; the output name is derived from the column name.
isolate_column_counts("GenderIdentity")

In [56]:
# Tally "SexualOrientation" answers; the output name is derived from the column name.
isolate_column_counts("SexualOrientation")

In [57]:
# Tally "FGLI" answers and write them to fgli.json under BASE_DIR.
isolate_column_counts("FGLI", "fgli")

In [58]:
# Tally "FederalFinancialAide" answers and write them to federal_aide.json under BASE_DIR.
isolate_column_counts("FederalFinancialAide", "federal_aide")

In [59]:
# Tally "FamilyIncome" answers; the output name is derived from the column name.
# NOTE(review): answers are split on "," during parsing - confirm the income
# brackets in the survey contain no commas (e.g. thousands separators).
isolate_column_counts("FamilyIncome")

In [60]:
# Tally "RSOsExternalOrgs" answers and write them to rsos.json under BASE_DIR;
# comma-separated answers are counted as separate entries.
isolate_column_counts("RSOsExternalOrgs", "rsos")

In [61]:
# Tally "CanVoteAtMaroon" answers and write them to can_vote.json under BASE_DIR.
isolate_column_counts("CanVoteAtMaroon", "can_vote")

In [62]:
# Tally "MaroonHelpApplications" answers and write them to helps_applications.json under BASE_DIR.
isolate_column_counts("MaroonHelpApplications", "helps_applications")

In [63]:
# Tally "CitizenshipStatus" answers; the output name is derived from the column name.
isolate_column_counts("CitizenshipStatus")