# Maroon Demographic Information

In [9]:
# Imports
import polars as pl

from util.files import write_to_json

# Constants
# Directory, relative to this notebook, that holds the survey CSV and receives
# every JSON export written below.
BASE_DIR = "../scratch"
# Name assigned to the tally column of each value_counts() result; also the
# key pl_count_to_obj_list reads the counts from.
COUNTS_COLUMN = "counts"
# Placeholder answers that the parsing helpers drop (the "n/a" comparison is
# made against the lower-cased answer).
EMPTY_STRING = ""
NOT_APPLICABLE = "n/a"

In [10]:
# Helper functions
def list_to_parsed_list(unparsed_list, fuzzy_func):
    """Normalise free-text answers into a flat list of title-cased values.

    Every answer is lower-cased and stripped. An answer containing commas is
    split on "," and each non-blank piece becomes its own entry. Missing
    (``None``), empty, "n/a" and "undecided" answers are skipped.

    Args:
        unparsed_list: Iterable of raw answers (strings or ``None``).
        fuzzy_func: Optional callable that maps a lower-cased, stripped value
            to its canonical spelling. Pass ``None`` to keep values as typed.

    Returns:
        A list of title-cased strings, one per answer or comma-separated
        piece, in input order.
    """
    parsed_list = []
    for element in unparsed_list:
        # None has no .lower(); treat it like a blank answer.
        if element is None:
            continue

        # Strip before comparing so " N/A " and whitespace-only answers are
        # recognised as placeholders too.
        normalized = element.lower().strip()
        if normalized in (EMPTY_STRING, NOT_APPLICABLE, "undecided"):
            continue

        # split(",") on a comma-free answer yields the answer itself, so one
        # loop covers both the single-value and multi-value cases.
        for piece in normalized.split(","):
            # Format the individual piece, not the whole answer: the earlier
            # version appended the full answer once per piece.
            piece = piece.strip()
            if piece == EMPTY_STRING:
                continue
            if fuzzy_func:
                piece = fuzzy_func(piece)
            parsed_list.append(piece.title())
    return parsed_list


def list_to_parsed_list_non_split(unparsed_list, fuzzy_func):
    """Normalise answers to title case without splitting them on commas.

    Every answer is lower-cased and stripped. Missing (``None``), empty and
    "n/a" answers are skipped; everything else is kept as a single entry even
    when it contains commas.

    Args:
        unparsed_list: Iterable of raw answers (strings or ``None``).
        fuzzy_func: Optional callable that maps a lower-cased, stripped value
            to its canonical spelling. Pass ``None`` to keep values as typed.

    Returns:
        A list of title-cased strings, one per retained answer, in input
        order.
    """
    parsed_list = []
    for element in unparsed_list:
        # None has no .lower(); callers only filter out "" beforehand, so a
        # missing value would otherwise raise AttributeError here.
        if element is None:
            continue

        # Strip before comparing so " n/a " and whitespace-only answers are
        # dropped instead of being counted as values.
        fmt_element = element.lower().strip()
        if fmt_element in (EMPTY_STRING, NOT_APPLICABLE):
            continue

        if fuzzy_func:
            fmt_element = fuzzy_func(fmt_element)
        parsed_list.append(fmt_element.title())
    return parsed_list


def fuzzy_study_match(major):
    """Map a lower-cased field-of-study answer onto a canonical name.

    Exact abbreviations are resolved first; otherwise the first keyword found
    inside the answer decides the canonical name. Answers that match nothing
    are returned unchanged.

    Args:
        major: Lower-cased, stripped field-of-study string.

    Returns:
        The canonical field-of-study name, or ``major`` itself when no rule
        applies.
    """
    # Whole-string aliases; none of them contains a keyword from the rules
    # below, so checking them up front cannot change which rule fires.
    exact_aliases = {
        "cs": "computer science",
        "lls": "law letters and society",
        "llso": "law letters and society",
        "law letters & society": "law letters and society",
        "taps": "theater and performance studies",
    }
    if major in exact_aliases:
        return exact_aliases[major]

    # Substring rules, tried in order: the first keyword present wins.
    keyword_rules = (
        ("econ", "economics"),
        ("public policy", "public policy"),
        ("history", "history"),
        ("visual art", "visual art"),
        ("creative writing", "english and creative writing"),
    )
    for keyword, canonical in keyword_rules:
        if keyword in major:
            return canonical

    return major


def fuzzy_language_match(lang):
    """Collapse answers that mention Cantonese or Mandarin to that name.

    Args:
        lang: Lower-cased, stripped language string.

    Returns:
        "cantonese" if the answer contains it, otherwise "mandarin" if the
        answer contains that, otherwise ``lang`` unchanged.
    """
    # Order matters: an answer naming both resolves to "cantonese".
    for known_language in ("cantonese", "mandarin"):
        if known_language in lang:
            return known_language
    return lang


def pl_count_to_obj_list(pl_dict):
    """Turn a column-oriented counts dict into a list of per-row dicts.

    Args:
        pl_dict: Mapping of column name to a list of values. It must contain
            the COUNTS_COLUMN key; the value column is indexed at every
            position of the counts column.

    Returns:
        One dict per counts entry, shaped ``{<value column>: value,
        "weight": count}``. When several non-count columns exist, the last
        one is used; when none exist, the value column name is "".
    """
    value_columns = [name for name in pl_dict.keys() if name != COUNTS_COLUMN]
    column_name = value_columns[-1] if value_columns else ""

    return [
        {column_name: pl_dict[column_name][row], "weight": weight}
        for row, weight in enumerate(pl_dict[COUNTS_COLUMN])
    ]

In [11]:
# Dealing with and parsing data
# Read the survey export from BASE_DIR into a polars DataFrame; every cell
# below pulls its column from this `df`.
df = pl.read_csv(f"{BASE_DIR}/Maroon_2023_Staff_Survey.csv")

# Make Timestamp a DateTime
# The string column is replaced in place by its parsed form.
# NOTE(review): strict=False presumably turns values that do not match the
# format into nulls rather than raising — confirm against the installed
# polars version. The trailing .cast(pl.Datetime) looks redundant after
# strptime(dtype=pl.Datetime) — confirm before removing.
df = df.with_columns(
    pl.col("Timestamp")
    .str.strptime(
        dtype=pl.Datetime, format="%Y/%m/%d %l:%M:%S %p %Z", strict=False
    )
    .cast(pl.Datetime)
)

In [12]:
# Tally the normalised, comma-split answers of the "Majors" column and write
# the result to majors.json.
major_answers = list_to_parsed_list(df["Majors"].to_list(), fuzzy_study_match)
majors = pl.Series(major_answers).value_counts()
majors.columns = ["major", COUNTS_COLUMN]
major_records = pl_count_to_obj_list(majors.to_dict(as_series=False))
write_to_json(f"{BASE_DIR}/majors.json", major_records)

In [None]:
# Tally the normalised, comma-split answers of the "Minors" column and write
# the result to minors.json.
minor_answers = list_to_parsed_list(df["Minors"].to_list(), fuzzy_study_match)
minors = pl.Series(minor_answers).value_counts()
minors.columns = ["minor", COUNTS_COLUMN]
minor_records = pl_count_to_obj_list(minors.to_dict(as_series=False))
write_to_json(f"{BASE_DIR}/minors.json", minor_records)

In [None]:
# Tally the normalised, comma-split answers of the "FluentLanguages" column
# (empty strings dropped first) and write the result to languages.json.
language_answers = [
    answer
    for answer in df["FluentLanguages"].to_list()
    if answer != EMPTY_STRING
]
languages = pl.Series(
    list_to_parsed_list(language_answers, fuzzy_language_match)
).value_counts()
languages.columns = ["language", COUNTS_COLUMN]
language_records = pl_count_to_obj_list(languages.to_dict(as_series=False))
write_to_json(f"{BASE_DIR}/languages.json", language_records)

In [None]:
# Tally the comma-split answers of the "MaroonInAWord" column (empty strings
# dropped first, no fuzzy matching) and write them to maroon_in_a_word.json.
word_answers = [
    answer
    for answer in df["MaroonInAWord"].to_list()
    if answer != EMPTY_STRING
]
maroon_in_word = pl.Series(
    list_to_parsed_list(word_answers, None)
).value_counts()
maroon_in_word.columns = ["descriptiveWord", COUNTS_COLUMN]
word_records = pl_count_to_obj_list(maroon_in_word.to_dict(as_series=False))
write_to_json(f"{BASE_DIR}/maroon_in_a_word.json", word_records)

In [None]:
# Tally the answers of the "RaceEthnicity" column as whole values (no comma
# splitting, empty strings dropped) and write them to race_ethnicity.json.
race_ethnicity_answers = [
    answer
    for answer in df["RaceEthnicity"].to_list()
    if answer != EMPTY_STRING
]
race_ethnicity = pl.Series(
    list_to_parsed_list_non_split(race_ethnicity_answers, None)
).value_counts()
race_ethnicity.columns = ["raceEthnicity", COUNTS_COLUMN]
race_ethnicity_records = pl_count_to_obj_list(
    race_ethnicity.to_dict(as_series=False)
)
write_to_json(f"{BASE_DIR}/race_ethnicity.json", race_ethnicity_records)

In [None]:
# Tally the answers of the "HispanicLatino" column as whole values (empty
# strings dropped) and write them to hispanic_latino.json.
hispanic_latino_answers = [
    answer
    for answer in df["HispanicLatino"].to_list()
    if answer != EMPTY_STRING
]
hispanic_latino = pl.Series(
    list_to_parsed_list_non_split(hispanic_latino_answers, None)
).value_counts()
hispanic_latino.columns = ["hispanicLatino", COUNTS_COLUMN]
hispanic_latino_records = pl_count_to_obj_list(
    hispanic_latino.to_dict(as_series=False)
)
write_to_json(f"{BASE_DIR}/hispanic_latino.json", hispanic_latino_records)

In [None]:
# Tally the answers of the "HighSchoolType" column as whole values (empty
# strings dropped) and write them to high_school_type.json.
high_school_answers = [
    answer
    for answer in df["HighSchoolType"].to_list()
    if answer != EMPTY_STRING
]
high_school = pl.Series(
    list_to_parsed_list_non_split(high_school_answers, None)
).value_counts()
high_school.columns = ["highSchoolType", COUNTS_COLUMN]
high_school_records = pl_count_to_obj_list(
    high_school.to_dict(as_series=False)
)
write_to_json(f"{BASE_DIR}/high_school_type.json", high_school_records)

In [None]:
# Tally the answers of the "USRegion" column as whole values (empty strings
# dropped) and write them to us_region.json.
region_answers = [
    answer for answer in df["USRegion"].to_list() if answer != EMPTY_STRING
]
region = pl.Series(
    list_to_parsed_list_non_split(region_answers, None)
).value_counts()
region.columns = ["us_region", COUNTS_COLUMN]
region_records = pl_count_to_obj_list(region.to_dict(as_series=False))
write_to_json(f"{BASE_DIR}/us_region.json", region_records)

In [None]:
# NOTE(review): this cell repeats the preceding "USRegion" export verbatim
# (same column, same key, same output file). It may be a copy-paste leftover
# or a cell meant for a different column — confirm before removing.
# Tally the answers of the "USRegion" column as whole values (empty strings
# dropped) and write them to us_region.json.
region_answers = [
    answer for answer in df["USRegion"].to_list() if answer != EMPTY_STRING
]
region = pl.Series(
    list_to_parsed_list_non_split(region_answers, None)
).value_counts()
region.columns = ["us_region", COUNTS_COLUMN]
region_records = pl_count_to_obj_list(region.to_dict(as_series=False))
write_to_json(f"{BASE_DIR}/us_region.json", region_records)

In [None]:
# Tally the answers of the "GenderIdentity" column as whole values (empty
# strings dropped) and write them to gender.json.
gender_answers = [
    answer
    for answer in df["GenderIdentity"].to_list()
    if answer != EMPTY_STRING
]
gender = pl.Series(
    list_to_parsed_list_non_split(gender_answers, None)
).value_counts()
gender.columns = ["gender", COUNTS_COLUMN]
gender_records = pl_count_to_obj_list(gender.to_dict(as_series=False))
write_to_json(f"{BASE_DIR}/gender.json", gender_records)

In [None]:
# Tally the answers of the "SexualOrientation" column as whole values (empty
# strings dropped) and write them to orientation.json.
orientation_answers = [
    answer
    for answer in df["SexualOrientation"].to_list()
    if answer != EMPTY_STRING
]
orientation = pl.Series(
    list_to_parsed_list_non_split(orientation_answers, None)
).value_counts()
orientation.columns = ["orientation", COUNTS_COLUMN]
orientation_records = pl_count_to_obj_list(
    orientation.to_dict(as_series=False)
)
write_to_json(f"{BASE_DIR}/orientation.json", orientation_records)

In [None]:
# Tally the answers of the "FGLI" column as whole values (empty strings
# dropped) and write them to fgli.json.
fgli_answers = [
    answer for answer in df["FGLI"].to_list() if answer != EMPTY_STRING
]
fgli = pl.Series(
    list_to_parsed_list_non_split(fgli_answers, None)
).value_counts()
fgli.columns = ["fgli", COUNTS_COLUMN]
fgli_records = pl_count_to_obj_list(fgli.to_dict(as_series=False))
write_to_json(f"{BASE_DIR}/fgli.json", fgli_records)

In [None]:
# Tally the answers of the "FederalFinancialAide" column as whole values
# (empty strings dropped) and write them to federal_aid.json.
federal_aid_answers = [
    answer
    for answer in df["FederalFinancialAide"].to_list()
    if answer != EMPTY_STRING
]
federal_aid = pl.Series(
    list_to_parsed_list_non_split(federal_aid_answers, None)
).value_counts()
federal_aid.columns = ["federal_fa", COUNTS_COLUMN]
federal_aid_records = pl_count_to_obj_list(
    federal_aid.to_dict(as_series=False)
)
write_to_json(f"{BASE_DIR}/federal_aid.json", federal_aid_records)

In [None]:
# Tally the answers of the "FamilyIncome" column as whole values (empty
# strings dropped) and write them to family_income.json.
family_income_answers = [
    answer
    for answer in df["FamilyIncome"].to_list()
    if answer != EMPTY_STRING
]
family_income = pl.Series(
    list_to_parsed_list_non_split(family_income_answers, None)
).value_counts()
family_income.columns = ["family_income", COUNTS_COLUMN]
family_income_records = pl_count_to_obj_list(
    family_income.to_dict(as_series=False)
)
write_to_json(f"{BASE_DIR}/family_income.json", family_income_records)

In [None]:
# Tally the answers of the "RSOsExternalOrgs" column as whole values (no
# comma splitting, empty strings dropped) and write them to rsos.json.
rsos_answers = [
    answer
    for answer in df["RSOsExternalOrgs"].to_list()
    if answer != EMPTY_STRING
]
rsos = pl.Series(
    list_to_parsed_list_non_split(rsos_answers, None)
).value_counts()
rsos.columns = ["rsos", COUNTS_COLUMN]
rsos_records = pl_count_to_obj_list(rsos.to_dict(as_series=False))
write_to_json(f"{BASE_DIR}/rsos.json", rsos_records)

In [None]:
# NOTE(review): this cell repeats the preceding "RSOsExternalOrgs" export
# verbatim (same column, same key, same output file). It may be a copy-paste
# leftover or a cell meant for a different column — confirm before removing.
# Tally the answers of the "RSOsExternalOrgs" column as whole values (no
# comma splitting, empty strings dropped) and write them to rsos.json.
rsos_answers = [
    answer
    for answer in df["RSOsExternalOrgs"].to_list()
    if answer != EMPTY_STRING
]
rsos = pl.Series(
    list_to_parsed_list_non_split(rsos_answers, None)
).value_counts()
rsos.columns = ["rsos", COUNTS_COLUMN]
rsos_records = pl_count_to_obj_list(rsos.to_dict(as_series=False))
write_to_json(f"{BASE_DIR}/rsos.json", rsos_records)

In [None]:
# Tally the answers of the "CanVoteAtMaroon" column as whole values (empty
# strings dropped) and write them to can_vote.json.
can_vote_answers = [
    answer
    for answer in df["CanVoteAtMaroon"].to_list()
    if answer != EMPTY_STRING
]
can_vote = pl.Series(
    list_to_parsed_list_non_split(can_vote_answers, None)
).value_counts()
can_vote.columns = ["can_vote", COUNTS_COLUMN]
can_vote_records = pl_count_to_obj_list(can_vote.to_dict(as_series=False))
write_to_json(f"{BASE_DIR}/can_vote.json", can_vote_records)

In [None]:
# Tally the answers of the "MaroonHelpApplications" column as whole values
# (empty strings dropped) and write them to help_applications.json.
help_application_answers = [
    answer
    for answer in df["MaroonHelpApplications"].to_list()
    if answer != EMPTY_STRING
]
help_applications = pl.Series(
    list_to_parsed_list_non_split(help_application_answers, None)
).value_counts()
help_applications.columns = ["help_applications", COUNTS_COLUMN]
help_application_records = pl_count_to_obj_list(
    help_applications.to_dict(as_series=False)
)
write_to_json(f"{BASE_DIR}/help_applications.json", help_application_records)