In [95]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import pycountry
from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer
# Let pandas render up to 500 columns before truncating wide frames.
pd.options.display.max_columns = 500
# Seed NumPy's legacy global RNG so random draws repeat between runs.
np.random.seed(seed=42)

In [96]:
# Show which raw inputs exist under data/ (relative to the working directory).
os.listdir("data/")

['ppp.csv', 'salaries.csv', 'stack_overflow']

In [97]:
def read_ppp(path: str = "data/ppp.csv", years: tuple = ("2019", "2020", "2021", "2022")) -> pd.DataFrame:
    """
    Reads the PPP csv and returns the requested year columns as a data frame.

    Data Manipulations:
    - The line at position 2 (0-based; pandas skips blank lines by default) is
      used as the header, and the lines above it are discarded.
    - "Country Code" becomes the index. These are 3-letter codes that should
      match pycountry's alpha_3 codes (TODO confirm; aggregate codes such as
      AFE/AFW also appear in the file).
    - Only the requested year columns are kept.
    - Missing values are left as NaN. They are NOT replaced with a "Null"
      string, so downstream code should test with pd.isna rather than by type.

    Inputs:
        path: location of the csv file (defaults to the original fixed path)
        years: column labels to keep, given as strings
    Output: pd.DataFrame indexed by "Country Code"
    Raises: KeyError if a requested year column is not present in the file
    """
    ppp = pd.read_csv(path, header=2, index_col="Country Code")
    # list(...) so that a tuple selects several columns instead of being
    # interpreted as a single (tuple-valued) column label.
    return ppp[list(years)]

In [98]:
def walrus_helper(salaries: pd.DataFrame) -> dict:
    """
    Builds a mapping from every unique job title to a simplified category.

    Each title is checked against the keyword rules below in order, and the
    first rule with a keyword contained in the title wins (case-sensitive
    substring match). Titles matching no rule map to "Other".
    Missing / non-string titles (NaN, None) are skipped, so they get no entry
    in the mapping instead of raising a TypeError on the substring test.

    Input: pd.DataFrame with a "job_title" column
    Output: dict{str: str} of original title -> simplified title
    """
    # Order matters: e.g. "Data Science Manager" is a "Scientist" because the
    # Scientist rule is checked before the Manager rule.
    rules = (
        ("Analyst", ("Analyst",)),
        ("Engineer", ("Engineer",)),
        ("Scientist", ("Scientist", "Science")),
        ("Architect", ("Architect",)),
        ("Manager", ("Manager",)),
        ("Developer", ("Developer",)),
    )

    mapping = {}
    for job in salaries["job_title"].unique():
        if not isinstance(job, str):
            # unique() reports NaN/None as a value; there is nothing to classify.
            continue
        mapping[job] = next(
            (label for label, keywords in rules if any(word in job for word in keywords)),
            "Other",
        )
    return mapping

In [102]:
def read_salaries(path: str = "data/salaries.csv", before_year: int = 2024) -> pd.DataFrame:
    """
    Reads the salaries csv and returns the cleaned rows as a data frame.

    Data Manipulation:
    - 2-letter country codes in "employee_residence" and "company_location"
      are replaced by pycountry's 3-letter codes for uniformity; values that
      are not a known 2-letter code are left unchanged.
    - "job_title" is replaced by the simplified names from walrus_helper.
    - Only rows with work_year < before_year are kept (2020 - 2023 with the
      default, since there is no data on 2024).
    - The original row index is kept (it is not reset after filtering).

    Input:
        path: location of the csv file (defaults to the original fixed path)
        before_year: exclusive upper bound on "work_year"
    Output: pd.DataFrame (an independent copy, safe to modify)
    """
    salaries = pd.read_csv(path)
    alpha2_to_alpha3 = {country.alpha_2: country.alpha_3 for country in pycountry.countries}
    title_mapping = walrus_helper(salaries)

    country_columns = ["employee_residence", "company_location"]
    salaries[country_columns] = salaries[country_columns].replace(alpha2_to_alpha3)
    salaries["job_title"] = salaries["job_title"].replace(title_mapping)

    # .copy() detaches the filtered rows from the parent frame so that adding or
    # changing columns on the result does not raise SettingWithCopyWarning.
    return salaries[salaries["work_year"] < before_year].copy()

In [103]:
# Load the PPP table, bind it to `ppp`, and display it as the cell output.
ppp = read_ppp()
ppp

Unnamed: 0_level_0,2019,2020,2021,2022
Country Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ABW,1.422507,1.391948,1.265626,1.222731
AFE,,,,
AFG,17.946128,18.948446,18.648551,
AFW,,,,
AGO,136.214650,148.932143,197.863546,214.989164
...,...,...,...,...
XKX,0.334320,0.334571,0.339767,0.338762
YEM,,,,
ZAF,6.707215,6.969045,7.102923,6.951546
ZMB,4.649869,5.220797,6.261902,6.206963


In [104]:
# Display the cleaned salaries frame; the result is not bound to a name here.
read_salaries()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
926,2023,SE,FT,Other,258700,USD,258700,USA,0,USA,M
927,2023,SE,FT,Other,146600,USD,146600,USA,0,USA,M
928,2023,MI,FT,Scientist,190000,USD,190000,USA,100,USA,M
929,2023,MI,FT,Scientist,160000,USD,160000,USA,100,USA,M
930,2023,EN,FT,Analyst,90000,USD,90000,USA,100,USA,M
...,...,...,...,...,...,...,...,...,...,...,...
11377,2020,SE,FT,Scientist,412000,USD,412000,USA,100,USA,L
11378,2021,MI,FT,Scientist,151000,USD,151000,USA,100,USA,L
11379,2020,EN,FT,Scientist,105000,USD,105000,USA,100,USA,S
11380,2020,EN,CT,Analyst,100000,USD,100000,USA,100,USA,L
