## Imports

In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import pycountry
from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer
from func_library import StackOverflowDataTester, StackOverflowData, AISalariesData, read_ppp, get_2023_usd_equivalent
from typing import TypeVar, Union
import pingouin as pg




# warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 500)
np.random.seed(42)

Number = TypeVar('Number', int, float, np.float16, np.float32, np.float64, np.float128, np.int16, np.int32, np.int64, np.int8, np.uint16, np.uint32, np.uint64, np.uint8)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## Reading (Run Once)

In [3]:
ppp = read_ppp()
ai_salaries_df = AISalariesData.generate_df()
stack_overflow, skills_list, employments = StackOverflowData.generate_aggregate_df()


In [4]:
ai_jobs_converter = lambda row: get_2023_usd_equivalent(year=row["work_year"],
                                                        country_code=row["company_location"],
                                                        salary_val=row["salary"],
                                                        ppp_df=ppp)
ai_salaries_df["usd_2023_"] = ai_salaries_df.apply(ai_jobs_converter, axis=1)



Country code US or year 2023 not found in the PPP DataFrame


KeyError: 2023

Unnamed: 0,country,year,age
292,,2020,35-44 years old
292,,2020,35-44 years old
292,,2020,35-44 years old
410,,2020,18-24 years old
410,,2020,18-24 years old
410,,2020,18-24 years old
550,,2020,25-34 years old
550,,2020,25-34 years old
550,,2020,25-34 years old
802,,2020,


## Analysis

In [None]:
# more workable until we go by skills
s_o = stack_overflow.drop(skills_list, axis=1)
list(s_o.columns)

['yearscode',
 'edlevel',
 'comptotal',
 'country',
 'employment',
 'convertedcompyearly',
 'mainbranch',
 'sopartfreq',
 'sovisitfreq',
 'age',
 'soaccount',
 'year',
 'yearscodepro',
 'count',
 'socomm',
 'surveyease',
 'orgsize',
 'surveylength',
 'analyst',
 'data scientist',
 'developer',
 'engineer_other',
 'management',
 'scientist_other',
 'systems_architect']

In [None]:
ai_salaries_df.columns

Index(['work_year', 'experience_level', 'employment_type', 'job_title',
       'salary', 'salary_currency', 'salary_in_usd', 'employee_residence',
       'remote_ratio', 'company_location', 'company_size'],
      dtype='object')

### Descriptive Statistics

### One Way ANOVA on Location and Comp Salary

#### Helper Functions

In [None]:
def get_similar_countries(df: pd.DataFrame, want: int = 10) -> list:
    """
    Get the number of countries that we want so we can index for ANOVA
    Consistent across the years for a good comparison
    """
    groupped = df.groupby('year')
    loc_list = list()
    head = 1
    
    while len(loc_list) < want:
        loc_set = set()
        for year, frame in groupped:
            grouped = frame.groupby("country").size().sort_values(ascending=False)
            if not loc_set:
                loc_set = set(grouped.head(head).index)
                continue
            temp_set = set(grouped.head(head).index)
            loc_set = loc_set.intersection(temp_set)
        loc_list = list(loc_set)
        head += 1
    return loc_list

#### ANOVA

In [None]:
loc_so = s_o[["year", "country", "convertedcompyearly"]]
loc_so.loc[:, "year"] = loc_so.loc[:, "year"].astype('int64')

loc_sal = ai_salaries_df[["work_year", "employee_residence", "salary_in_usd"]]
loc_sal.loc[:, "salary_in_usd"] = loc_sal.loc[:, "salary_in_usd"].astype("float64")
loc_sal.columns = loc_so.columns


location = pd.concat([loc_so, loc_sal], ignore_index=True)
location
loc_so.dtypes, loc_sal.dtypes

(year                    object
 country                 object
 convertedcompyearly    float64
 dtype: object,
 year                    int64
 country                object
 convertedcompyearly     int64
 dtype: object)

In [None]:
countries = get_similar_countries(location, 20)

In [None]:
for year, frame in groupped:
    print(year)
    frame = frame[frame["country"].isin(countries)]
    aov = pg.anova(data=frame, dv="convertedcompyearly", between="country", detailed=True)
    display(aov)

2019


Unnamed: 0,Source,SS,DF,MS,F,p-unc,np2
0,country,104474400000000.0,19,5498651000000.0,51.971066,6.123095000000001e-190,0.069654
1,Within,1395425000000000.0,13189,105802200000.0,,,


2020


Unnamed: 0,Source,SS,DF,MS,F,p-unc,np2
0,country,43773200000000.0,19,2303853000000.0,34.972102,1.3319250000000001e-123,0.071909
1,Within,564960100000000.0,8576,65876880000.0,,,


2021


Unnamed: 0,Source,SS,DF,MS,F,p-unc,np2
0,country,47769610000000.0,19,2514190000000.0,14.355395,3.937604e-46,0.027638
1,Within,1680634000000000.0,9596,175139000000.0,,,


2022


Unnamed: 0,Source,SS,DF,MS,F,p-unc,np2
0,country,89768990000000.0,19,4724684000000.0,6.682907,7.586757e-18,0.013915
1,Within,6361409000000000.0,8998,706980300000.0,,,


2023


Unnamed: 0,Source,SS,DF,MS,F,p-unc,np2
0,country,12712510000000.0,19,669079400000.0,55.024831,4.76594e-200,0.085467
1,Within,136029300000000.0,11187,12159590000.0,,,


### T-Tests On Salary if Needed

### One Way ANOVA on Degree

### One Way ANOVA on Experience

### One Way ANOVA on Location

### One Way ANOVA on Year

### Salary Skew Overall

### Identify any outliers

## Charts

### Choropleth Chart on Salary by Location (need)

### Line Chart of Salary by Location (need)

### Blox Plot of a Country to Show Skew if Any (need)

### Histogram of Countries with Highest Response Rates (need)

### Sunburst Plot (want)