## Imports

In [3]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import pycountry
from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer
from func_library import StackOverflowDataTester, StackOverflowData, read_ppp
import pingouin as pg

# import warnings

# warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 500)
np.random.seed(42)

## Functions

In [None]:
def walrus_helper(salaries: pd.DataFrame) -> dict:
    """
    Helper function that is just a for loop that goes through unique job titles and assigns a basic name

    Input: pd.DataFrame
    Output: dict{str: str}
    """
    mapping = {}
    for job in list(salaries["job_title"].unique()):
        if (short := "Analyst") in job:
            mapping[job] = short.lower()
    
        elif (short := "Engineer") in job:
            mapping[job] = short.lower() + "_other"
            
        elif (short := "Data Scientist") in job or "Data Science" in job:
                mapping[job] = '_'.join(short.lower().split(" "))
            
        elif "Architect" in job:
            mapping[job] = "systems_architect"
    
        elif "Manager" in job:
            mapping[job] = "management"
    
        elif (short := "Developer") in job:
            mapping[job] = short.lower()
            
        elif "math" in job.lower() or "stat" in job.lower():
            mapping[job] = "mathematician_statistician"
            
        else:
            mapping[job] = "scientist_other"
    return mapping

In [None]:
def read_salaries() -> pd.DataFrame:
    """
    Reads the salaries from ai-net and returns them into a dataframe
    Data Manipulation:
    - Change 2 letter country names into 3 letter names for uniformity
    - Map above function in job_title to simpler names
    - Only taking 2020 - 2023, we have no data on 2024
    
    Input: None
    Output: pd.DataFrame
    """
    salaries = pd.read_csv("data/salaries.csv")
    country_abbreviations = {country.alpha_2: country.alpha_3 for country in pycountry.countries}
    mapping = walrus_helper(salaries)
    
    salaries[["employee_residence", "company_location"]] = salaries[["employee_residence", "company_location"]].replace(country_abbreviations)
    salaries["job_title"] = salaries["job_title"].replace(mapping)
    salaries = salaries[salaries["work_year"] < 2024]
    
    return salaries

In [None]:
def create_onehot_skills(frames: dict) -> None:
    """
    Given a dictionary of pandas dataframes we want to one hot the skills in particular.
    We want to take the skills in the different columns and one hot them such we can sum them for groupby operations.
    We get a dictionary of pandas DataFrames and perform an inplace operation such that we don't have to create new memory.
    Return a dictionary of a list of strings for a couple reasons:
        - there's no way we will remember all of these so automation by putting these into a list seemed like the best idea
        - the keys will match those in the input in case we want to do something with these later per year
        - hashing onto a dictionary should allow for ease of access since no 2 years will have the same EXACT one hot columns, hence the list
    The above is deprecated, after merging with similar columns these will all be useless to us

    We also drop the _Empty for EVERYTHING since that information is useless to us
    
    Input: frames dict{str: pd.DataFrames}
    Ouput: None

    https://stackoverflow.com/questions/45312377/how-to-one-hot-encode-from-a-pandas-column-containing-a-list

    Rough example flow of function for one sample:
    C; C++; Perl -> [C, C++, Perl] -> [1, 1, 1, 0]
    Python       -> [Python]       -> [0, 0, 0, 1]
    """
    # some constants
    standard = [("language", "lg"), ("database", "db"), ("platform", "pf"), ("webframe", "wf"), ("misctech", "mt")]
    status = [("wanttoworkwith", "www"), ("haveworkedwith", "hww")]
    
    for key, frame in frames.items():
        new_cols = []
        for stan, abv in standard:
            for stat, abr in status:
                coi = stan + stat # coi = column of interest
                abbr = abv + abr + "_"
                mlb = MultiLabelBinarizer(sparse_output=True) # saves ram
                frame[coi] = frame[coi].str.split(";")
                transformed = mlb.fit_transform(frame.pop(coi))
                new_cois = [abbr + name for name in mlb.classes_]
                frame = frame.join(
                            pd.DataFrame.sparse.from_spmatrix(
                                transformed,
                                index=frame.index,
                                columns=new_cois
                            )
                        )
                new_cois.remove(abbr + "Empty")
                new_cols += new_cois
                frame = frame.drop(abbr + "Empty", axis=1)
        # this needs to be here, if not throse Sparse type errors
        # # Sparse types don't allow normal groupby operations (ie reshape) so we need to turn them into ints
        # # int8 don't take up a ton and it's just 0's and 1's
        # # for all intents and purposes these are sparse matrices, we just want to avoid the object
        frame[new_cols] = frame[new_cols].fillna(0)
        frame[new_cols] = frame[new_cols].astype('int8')
        frames[key] = frame

## Reading (Run Once)

In [None]:
stack_overflow, skills_list, employments = StackOverflowData.make_aggregate_df(only_data_science_devs=True)
ppp = read_ppp()
salaries = read_salaries()



## Analysis

In [None]:
# more workable until we go by skills
s_o = stack_overflow.drop(skills_list, axis=1)
list(s_o.columns)

['yearscode',
 'edlevel',
 'comptotal',
 'country',
 'employment',
 'convertedcompyearly',
 'mainbranch',
 'sopartfreq',
 'sovisitfreq',
 'age',
 'soaccount',
 'year',
 'yearscodepro',
 'count',
 'socomm',
 'surveyease',
 'orgsize',
 'surveylength',
 'analyst',
 'data scientist',
 'developer',
 'engineer_other',
 'management',
 'scientist_other',
 'systems_architect']

In [None]:
salaries.columns

Index(['work_year', 'experience_level', 'employment_type', 'job_title',
       'salary', 'salary_currency', 'salary_in_usd', 'employee_residence',
       'remote_ratio', 'company_location', 'company_size'],
      dtype='object')

### Descriptive Statistics

### One Way ANOVA on Location and Comp Salary

#### Helper Functions

In [None]:
def get_similar_countries(df: pd.DataFrame, want: int = 10) -> list:
    """
    Get the number of countries that we want so we can index for ANOVA
    Consistent across the years for a good comparison
    """
    groupped = df.groupby('year')
    loc_list = list()
    head = 1
    
    while len(loc_list) < want:
        loc_set = set()
        for year, frame in groupped:
            grouped = frame.groupby("country").size().sort_values(ascending=False)
            if not loc_set:
                loc_set = set(grouped.head(head).index)
                continue
            temp_set = set(grouped.head(head).index)
            loc_set = loc_set.intersection(temp_set)
        loc_list = list(loc_set)
        head += 1
    return loc_list

#### ANOVA

In [None]:
loc_so = s_o[["year", "country", "convertedcompyearly"]]
loc_so.loc[:, "year"] = loc_so.loc[:, "year"].astype('int64')

loc_sal = salaries[["work_year", "employee_residence", "salary_in_usd"]]
loc_sal.loc[:, "salary_in_usd"] = loc_sal.loc[:, "salary_in_usd"].astype("float64")
loc_sal.columns = loc_so.columns


location = pd.concat([loc_so, loc_sal], ignore_index=True)
location
loc_so.dtypes, loc_sal.dtypes

(year                    object
 country                 object
 convertedcompyearly    float64
 dtype: object,
 year                    int64
 country                object
 convertedcompyearly     int64
 dtype: object)

In [None]:
countries = get_similar_countries(location, 20)

In [None]:
for year, frame in groupped:
    print(year)
    frame = frame[frame["country"].isin(countries)]
    aov = pg.anova(data=frame, dv="convertedcompyearly", between="country", detailed=True)
    display(aov)

2019


Unnamed: 0,Source,SS,DF,MS,F,p-unc,np2
0,country,104474400000000.0,19,5498651000000.0,51.971066,6.123095000000001e-190,0.069654
1,Within,1395425000000000.0,13189,105802200000.0,,,


2020


Unnamed: 0,Source,SS,DF,MS,F,p-unc,np2
0,country,43773200000000.0,19,2303853000000.0,34.972102,1.3319250000000001e-123,0.071909
1,Within,564960100000000.0,8576,65876880000.0,,,


2021


Unnamed: 0,Source,SS,DF,MS,F,p-unc,np2
0,country,47769610000000.0,19,2514190000000.0,14.355395,3.937604e-46,0.027638
1,Within,1680634000000000.0,9596,175139000000.0,,,


2022


Unnamed: 0,Source,SS,DF,MS,F,p-unc,np2
0,country,89768990000000.0,19,4724684000000.0,6.682907,7.586757e-18,0.013915
1,Within,6361409000000000.0,8998,706980300000.0,,,


2023


Unnamed: 0,Source,SS,DF,MS,F,p-unc,np2
0,country,12712510000000.0,19,669079400000.0,55.024831,4.76594e-200,0.085467
1,Within,136029300000000.0,11187,12159590000.0,,,


### T-Tests On Salary if Needed

### One Way ANOVA on Degree

### One Way ANOVA on Experience

### One Way ANOVA on Location

### One Way ANOVA on Year

### Salary Skew Overall

### Identify any outliers

## Charts

### Choropleth Chart on Salary by Location (need)

### Line Chart of Salary by Location (need)

### Blox Plot of a Country to Show Skew if Any (need)

### Histogram of Countries with Highest Response Rates (need)

### Sunburst Plot (want)