# Maroon Demographic Information

In [7]:
# Imports
import polars as pl

from util.files import write_to_json

In [8]:
# Helper functions
def list_to_parsed_list(unparsed_list, fuzzy_func):
    """Flatten raw survey answers into a list of title-cased entries.

    Each answer may hold several comma-separated entries. Every entry is
    lower-cased, stripped of surrounding whitespace, optionally passed
    through ``fuzzy_func`` and finally title-cased. Missing answers
    (``None``), blank entries and the placeholders "n/a" / "undecided" are
    dropped, whether they make up the whole answer or one comma-separated
    part of it.

    Args:
        unparsed_list: Iterable of raw answer strings; ``None`` items are
            skipped.
        fuzzy_func: Optional callable mapping a lower-cased, stripped entry
            to its canonical spelling. Pass ``None`` to skip normalization.

    Returns:
        A list of title-cased entries in input order, duplicates kept.
    """
    skipped = {"", "n/a", "undecided"}
    parsed_list = []
    for answer in unparsed_list:
        # Null cells arrive as None and have no string methods.
        if answer is None:
            continue
        # An answer without a comma splits into a single part, so one loop
        # covers both the single-entry and multi-entry cases.
        for part in answer.lower().split(","):
            # Strip before filtering so " " and " n/a " are caught as well.
            entry = part.strip()
            if entry in skipped:
                continue
            if fuzzy_func:
                entry = fuzzy_func(entry)
            parsed_list.append(entry.title())
    return parsed_list


def fuzzy_study_match(major):
    """Map a lower-cased field-of-study string to its canonical name.

    Exact abbreviations are resolved first; otherwise the first keyword
    contained in ``major`` decides the canonical name. Strings matching
    neither are returned unchanged.
    """
    law_letters = "law letters and society"
    exact_aliases = {
        "cs": "computer science",
        "lls": law_letters,
        "llso": law_letters,
        "law letters & society": law_letters,
        "taps": "theater and performance studies",
    }
    # Order matters: the first keyword found in the string wins.
    keyword_rules = (
        ("econ", "economics"),
        ("public policy", "public policy"),
        ("history", "history"),
        ("visual art", "visual art"),
        ("creative writing", "english and creative writing"),
    )

    if major in exact_aliases:
        return exact_aliases[major]

    for keyword, canonical in keyword_rules:
        if keyword in major:
            return canonical

    return major


def fuzzy_language_match(lang):
    """Collapse any string mentioning Cantonese or Mandarin to that name.

    "cantonese" is checked first, so a string naming both resolves to
    "cantonese". Other strings are returned unchanged.
    """
    for dialect in ("cantonese", "mandarin"):
        if dialect in lang:
            return dialect
    return lang

In [9]:
# Dealing with and parsing data
# Read file
df = pl.read_csv("scratch/Maroon_2023_Staff_Survey.csv")

# Make Timestamp a DateTime
df = df.with_columns(
    pl.col("Timestamp")
    .str.strptime(
        dtype=pl.Datetime, format="%Y/%m/%d %l:%M:%S %p %Z", strict=False
    )
    .cast(pl.Datetime)
)


def _count_and_write(column, label, path, fuzzy_func=None):
    """Count the parsed entries of one survey column and dump them to JSON.

    Args:
        column: Name of the column in ``df`` holding the raw answers.
        label: Name given to the value column in the output.
        path: Destination passed to ``write_to_json``.
        fuzzy_func: Optional normalizer forwarded to
            ``list_to_parsed_list``.

    Returns:
        The counts as a dict of lists: ``{label: [...], "counts": [...]}``.
    """
    # Drop null (None) and empty answers up front: the parser calls string
    # methods on every item, so a None would raise.
    answers = [v for v in df[column].to_list() if v]
    counts = pl.Series(list_to_parsed_list(answers, fuzzy_func)).value_counts()
    counts.columns = [label, "counts"]
    counts = counts.to_dict(as_series=False)
    write_to_json(path, counts)
    return counts


# Get majors data
majors = _count_and_write(
    "Majors", "major", "scratch/majors.json", fuzzy_study_match
)

# Get minors data
minors = _count_and_write(
    "Minors", "minor", "scratch/minors.json", fuzzy_study_match
)

# Get languages data
languages = _count_and_write(
    "FluentLanguages",
    "language",
    "scratch/languages.json",
    fuzzy_language_match,
)

# Get info for word bubble
maroon_in_word = _count_and_write(
    "MaroonInAWord", "descriptiveWord", "scratch/maroon_in_a_word.json"
)