In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import pycountry
from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer
from func_library import StackOverflowDataTester
# Allow up to 500 columns to be shown when a wide DataFrame is displayed.
pd.set_option('display.max_columns', 500)
# Seed NumPy's global random generator so random operations repeat between runs.
np.random.seed(42)

In [3]:
# Names of the entries found directly under data/.
stack_overflow_files = (os.listdir("data/"))
# The 2011-2014 surveys are not worth using: they contain no data scientists.
# Decision: analyse 2020-2023, and possibly 2019 as well.

# Build the name -> three-letter (alpha_3) code lookups from pycountry instead of
# typing out every country by hand (idea suggested by GPT).
# _1 is keyed by `name`, _2 by `official_name`.
country_abbreviations_1 = {country.name: country.alpha_3 for country in pycountry.countries}
# NOTE(review): assumes every pycountry record exposes `official_name` — confirm for the installed version.
country_abbreviations_2 = {country.official_name: country.alpha_3 for country in pycountry.countries}

# Shown as the cell output; this lists data/ a second time instead of reusing stack_overflow_files.
os.listdir("data/")



['data_test_metrics.json',
 'ppp.csv',
 '.DS_Store',
 'stack_overflow',
 'ai-jobs_salaries.csv']

In [4]:
# Test each year's survey data to ensure it is consistent with the data used in the analysis and report.
stack_overflow_survey_years = ["2019", "2020", "2021", "2022", "2023"]
for so_year in stack_overflow_survey_years:
    # One tester per survey year; StackOverflowDataTester comes from the local func_library module.
    so_data_tester = StackOverflowDataTester(so_year)
    so_data_tester.perform_tests()

All tests passed for 2019 survey data.


  sqr = _ensure_numeric((avg - values) ** 2)


All tests passed for 2020 survey data.
All tests passed for 2021 survey data.
All tests passed for 2022 survey data.
All tests passed for 2023 survey data.


TODO: move the function docstrings into markdown cells
- the same text can then also be used to explain the visualizations

Ideas we could try:
- merge on the salary amount after rounding
  - then check how much data is left afterwards
- something like worldwide -> skills
- write a function that merges and then checks the resulting distribution

## Functions

In [5]:
def create_onehot_skills(frames: dict) -> None:
    """
    One-hot encode the ";"-separated skill columns of every frame in ``frames``.

    For each of the ten ``<area><status>`` columns (language/database/platform/
    webframe/misctech x wanttoworkwith/haveworkedwith) the column is removed and
    replaced by one int8 0/1 column per distinct skill, named
    ``<area abbr><status abbr>_<skill>`` (e.g. ``lgwww_Python``), so the skills
    can be summed in groupby operations.

    The ``..._Empty`` placeholder column is dropped for every group because it
    carries no information. A group that has no "Empty" entry is left as is.

    The dictionary is updated in place: each key is re-bound to its encoded frame.

    Input: frames dict{str: pd.DataFrame}
    Output: None

    NOTE(review): assumes missing answers were already replaced by the string
    "Empty" upstream; a NaN cell would not be splittable into labels - confirm.

    https://stackoverflow.com/questions/45312377/how-to-one-hot-encode-from-a-pandas-column-containing-a-list

    Rough example flow of function for one sample:
    C; C++; Perl -> [C, C++, Perl] -> [1, 1, 1, 0]
    Python       -> [Python]       -> [0, 0, 0, 1]
    """
    # (column prefix, abbreviation) pairs; every prefix is combined with every status
    standard = [("language", "lg"), ("database", "db"), ("platform", "pf"), ("webframe", "wf"), ("misctech", "mt")]
    status = [("wanttoworkwith", "www"), ("haveworkedwith", "hww")]

    for key, frame in frames.items():
        new_cols = []
        for stan, abv in standard:
            for stat, abr in status:
                coi = stan + stat  # coi = column of interest
                abbr = abv + abr + "_"
                empty_col = abbr + "Empty"
                mlb = MultiLabelBinarizer(sparse_output=True)  # saves ram
                # take the raw column out of the frame and turn "a;b;c" into [a, b, c]
                labels = frame.pop(coi).str.split(";")
                transformed = mlb.fit_transform(labels)
                new_cois = [abbr + name for name in mlb.classes_]
                frame = frame.join(
                            pd.DataFrame.sparse.from_spmatrix(
                                transformed,
                                index=frame.index,
                                columns=new_cois
                            )
                        )
                # the placeholder is only present when at least one row was "Empty";
                # removing it unconditionally used to raise for fully answered columns
                if empty_col in new_cois:
                    new_cois.remove(empty_col)
                    frame = frame.drop(empty_col, axis=1)
                new_cols += new_cois
        # This needs to be here, otherwise it throws Sparse type errors:
        # Sparse types don't allow normal groupby operations (i.e. reshape), so we turn them into ints.
        # int8 doesn't take up much memory and the values are only 0's and 1's.
        frame[new_cols] = frame[new_cols].fillna(0)
        frame[new_cols] = frame[new_cols].astype('int8')
        frames[key] = frame

In [6]:
def abbr_education(frames: dict) -> None:
    """
    Replace the long education-level answers with short labels, in place.

    The distinct ``edlevel`` values of each frame are sorted and matched by
    position against a hard-coded list of eight short labels. Any level whose
    text contains 'doctoral' becomes "Doctoral" and does not consume a slot of
    that list. Matching stops at the string 'nan'.

    Input: frames dict{str: pd.DataFrame}
    Output: None
    """
    # short labels, in the order the sorted long answers are expected to appear
    short_names = ["Associate's", "Bachelor's", "Master's", "Elementary", "Professional", "Secondary", "Some College", "Else"]

    for year_key, survey in frames.items():
        # fold the "no formal education" answer into the catch-all bucket first
        survey['edlevel'] = survey['edlevel'].replace({'I never completed any formal education': 'Something else'})

        # the positional match below depends on this sort order
        ordered_levels = sorted(survey['edlevel'].unique())

        doctoral_seen = 0  # how many slots to shift back because of doctoral entries
        mapping = {}
        for position, level in enumerate(ordered_levels):
            if level == 'nan':
                break
            short = short_names[position - doctoral_seen]
            if 'doctoral' in level:
                mapping[level] = "Doctoral"
                doctoral_seen += 1
            else:
                mapping[level] = short

        survey['edlevel'] = survey['edlevel'].replace(mapping)
        frames[year_key] = survey

In [7]:
def bin_ages(frames: dict) -> None:
    """
    Convert numeric ages into age-bracket labels, in place.

    Frames whose ``age`` column is numeric are binned into the seven brackets
    listed in ``labels``; frames whose ``age`` column is not numeric are left as
    they are. In both cases the column ends up as strings.

    Brackets are closed on the left, so 18 falls in '18-24 years old' and 25 in
    '25-34 years old'. Ages that are not in the range (0, 100] get no bracket.

    Input: frames dict{str: pd.DataFrame}
    Output: None
    """
    # left-closed edges: [0, 18), [18, 25), ..., [65, 101)
    bins = [0, 18, 25, 35, 45, 55, 65, 101]
    labels = ['Under 18 years old', '18-24 years old', '25-34 years old', '35-44 years old', '45-54 years old', '55-64 years old', '65 years or older']
    for year, frame in frames.items():
        ages = frame["age"]
        # any numeric dtype counts, not only float64, so integer ages are binned too
        if pd.api.types.is_numeric_dtype(ages):
            # keep the accepted range (0, 100]; everything else becomes missing
            plausible = ages.where((ages > 0) & (ages <= 100))
            frame["age"] = pd.cut(plausible, bins=bins, labels=labels, right=False)
        frame["age"] = frame["age"].astype('str')

        frames[year] = frame

In [8]:
def find_similar_col(frames) -> list:
    """
    Return the column names that every frame in ``frames`` shares.

    Ideally this set is as large as possible relative to the frames' columns,
    since these are the columns the frames can be merged on.

    Input: frames dict{str: pd.DataFrame}
    Output: list of shared column names, sorted so the order is the same on
            every run; an empty list when ``frames`` is empty.
    """
    column_sets = [set(frame.columns) for frame in frames.values()]
    if not column_sets:
        # nothing to compare; indexing the first element used to raise IndexError
        return []
    shared = set.intersection(*column_sets)
    # key=str keeps the sort usable even if column labels are of mixed types
    return sorted(shared, key=str)




In [9]:
# NOTE(review): read_stackoverflow is not defined in any cell visible here; it must be
# defined (or imported) before this cell for Restart & Run All to work.
# The call is unpacked into exactly three values.
df, skills, employment = read_stackoverflow()

In [16]:


def read_ppp(path: str = "data/ppp.csv", years=("2019", "2020", "2021", "2022")) -> pd.DataFrame:
    """
    Read the PPP csv and return only the requested year columns.

    Data manipulations:
    - The header row is the third row of the file (header=2).
    - Rows are indexed by the "Country Code" column.
      NOTE(review): presumably these match the pycountry alpha_3 codes - confirm.
    - Only the columns named in ``years`` are kept; earlier years are not needed
      unless the analysis goes the historic route.
    - Missing values are left as NaN. Filling them with the string "Null" was
      tried and is disabled, so callers must handle NaN themselves.

    Inputs:
        path: location of the csv file; defaults to the project's data/ppp.csv
        years: column labels (strings) to keep; defaults to 2019-2022
    Output: pd.DataFrame indexed by country code, one column per requested year
    Raises: KeyError if a requested year is not a column of the file
    """
    ppp = pd.read_csv(path, header=2, index_col="Country Code")
    # list(...) so that a tuple of labels selects columns rather than a single key
    return ppp[list(years)]


def usd_to_2023_usd(val, year):
    """
    Rescale a USD amount from ``year`` using the 2023-based factor table below.

    Inputs:
        val: amount in USD (anything that supports division by a float)
        year: survey year, 2017-2023, given as a string or an int
    Output: ``val`` divided by the factor for ``year``
    Raises: KeyError if ``year`` is not in the factor table
    """
    usd_2023_inflation_factors = {
        "2023": 1,
        "2022": 1.047,
        "2021": 1.072,
        "2020": 1.101,
        "2019": 1.128,
        "2018": 1.158,
        "2017": 1.194,
    }
    # NOTE(review): the factors grow for older years and the amount is divided by them,
    # which makes older amounts smaller. If the factors mean "2023 prices / that year's
    # prices", converting to 2023 dollars would need a multiplication - confirm intent.
    # str(...) lets callers pass the year as an int as well as a string
    return val / usd_2023_inflation_factors[str(year)]





((39366, 150), (54756, 150))

In [18]:
# Read the full PPP table (every year column), indexed by country code, and display it.
# Same file and read options as read_ppp, but without the year selection.
ppp = pd.read_csv("data/ppp.csv", header=2, index_col="Country Code")
ppp

Unnamed: 0_level_0,Country Name,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,Unnamed: 67
Country Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1
ABW,Aruba,"PPP conversion factor, GDP (LCU per internatio...",PA.NUS.PPP,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,9.234558e-01,9.434030e-01,9.576392e-01,0.985048,1.025098,1.037939,1.052499,1.072983,1.130995,1.139703,1.126002,1.163179,1.196121,1.208735,1.210213,1.232879,1.237648,1.267346,1.296500,1.310185,1.278788,1.302941,1.317801,1.285141,1.308843,1.362318,1.355045,1.350690,1.364849,1.422507,1.391948,1.265626,1.222731,
AFE,Africa Eastern and Southern,"PPP conversion factor, GDP (LCU per internatio...",PA.NUS.PPP,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
AFG,Afghanistan,"PPP conversion factor, GDP (LCU per internatio...",PA.NUS.PPP,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,9.197498,10.070732,10.912901,11.735787,12.204165,14.560002,14.585482,14.179058,14.545134,16.613478,17.242264,17.509419,17.035878,17.022520,17.445828,17.205558,17.149657,17.946128,18.948446,18.648551,,
AFW,Africa Western and Central,"PPP conversion factor, GDP (LCU per internatio...",PA.NUS.PPP,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
AGO,Angola,"PPP conversion factor, GDP (LCU per internatio...",PA.NUS.PPP,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.059561e-08,2.114473e-08,1.191866e-07,0.000001,0.000026,0.000498,0.023966,0.046048,0.063458,0.411439,2.084113,4.205856,10.041779,19.096783,24.817238,34.259053,38.921754,39.530353,46.297752,38.291825,50.047312,64.605751,65.681671,66.019371,65.000671,68.182045,80.778969,92.951721,116.336716,136.214650,148.932143,197.863546,214.989164,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
XKX,Kosovo,"PPP conversion factor, GDP (LCU per internatio...",PA.NUS.PPP,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.309302,0.298625,0.313941,0.326939,0.330216,0.332721,0.339622,0.339128,0.339378,0.340077,0.337071,0.334320,0.334571,0.339767,0.338762,
YEM,"Yemen, Rep.","PPP conversion factor, GDP (LCU per internatio...",PA.NUS.PPP,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,6.114709e+00,6.624071e+00,7.329543e+00,8.332318,9.853719,14.066185,18.892420,21.040487,19.070445,25.097901,30.271455,30.418071,32.560590,35.408381,39.349382,45.223997,49.831727,53.803492,63.548612,57.647042,70.414895,81.476761,88.792992,93.631317,,,,,,,,,,
ZAF,South Africa,"PPP conversion factor, GDP (LCU per internatio...",PA.NUS.PPP,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.258557e+00,1.407964e+00,1.579144e+00,1.742831,1.873827,2.034262,2.157941,2.293984,2.451646,2.584119,2.758894,2.908572,3.223016,3.362784,3.472747,3.555619,3.658372,3.855828,4.080267,4.405504,4.619970,4.776628,5.107817,5.295738,5.571891,5.825256,6.159093,6.426701,6.526439,6.707215,6.969045,7.102923,6.951546,
ZMB,Zambia,"PPP conversion factor, GDP (LCU per internatio...",PA.NUS.PPP,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,9.566932e-03,1.782822e-02,4.628519e-02,0.110166,0.194954,0.249402,0.304553,0.375443,0.434034,0.504717,0.654497,0.802215,0.943074,1.087662,1.268074,1.434236,1.593633,1.752957,1.902978,1.995984,2.247433,2.446346,2.651491,2.832861,3.064822,3.366482,3.878827,4.192580,4.397596,4.649869,5.220797,6.261902,6.206963,


## The Data

In [None]:
# Same call as the earlier load cell, but the third value is bound to job_titles here
# (the earlier cell named it employment).
# NOTE(review): read_stackoverflow is not defined in any cell visible here.
df, skills, job_titles = read_stackoverflow()

In [None]:
# Job titles treated as data-science roles; each one appears in the job_titles array displayed below.
data_science_jobs_cols = ["Data scientist or machine learning specialist", "Data or business analyst", "Engineer, data", "Statistician"]

# Display every job title returned by read_stackoverflow.
job_titles



array(['Academic researcher', 'Blockchain',
       'Cloud infrastructure engineer', 'Data or business analyst',
       'Data scientist or machine learning specialist',
       'Database administrator', 'Designer', 'DevOps specialist',
       'Developer, QA or test', 'Developer, back-end',
       'Developer, desktop or enterprise applications',
       'Developer, embedded applications or devices',
       'Developer, front-end', 'Developer, full-stack',
       'Developer, game or graphics', 'Developer, mobile', 'Educator',
       'Engineer, data', 'Engineer, site reliability',
       'Engineering manager', 'Marketing or sales professional',
       'Other (please specify):', 'Product manager', 'Project manager',
       'Scientist', 'Security professional',
       'Senior Executive (C-Suite, VP, etc.)', 'Senior executive/VP',
       'Student', 'System administrator'], dtype=object)

In [None]:
# df = frames_dict["df_data_2019"]
# skills

In [None]:
# df.dtypes[df. dtypes == 'Sparse[int32, 0]']

In [None]:
# list(frames_dict["df_data_2019"].dtypes)

In [None]:
# Print, for every yearly frame, its number of rows and the columns whose name contains `query`,
# to see how the columns could be standardised further.
# This is largely superseded now that the skills are one-hot encoded.
# NOTE(review): frames_dict is not defined in any visible cell (the recorded output is a NameError).

query = "Web"
for key, frame in frames_dict.items():
    # column names containing the query text
    lang = [col for col in frame.columns if query in col]
    print(f"{key}\t{len(frame)}\t{lang}")

NameError: name 'frames_dict' is not defined

In [None]:
# print(frames_dict["df_data_2019"].columns)

In [None]:
# print(frames_dict["df_data_2020"].columns)

In [None]:
# print(frames_dict["df_data_2021"].columns)

In [None]:
# print(frames_dict["df_data_2022"].columns)

In [None]:
# print(frames_dict["df_data_2023"].columns)

## Similarity with columns per the dataframes

In [None]:
# do they have similar columns?
def find_similar_col(frames) -> set:
    """
    Collect the column names that all frames in ``frames`` have in common.

    The larger this set is relative to the frames' columns, the more there is
    to merge on.

    NOTE(review): this redefines the find_similar_col declared earlier in the
    notebook, which returns a list instead of a set.
    """
    column_sets = [set(survey.columns) for survey in frames.values()]
    # start from the first frame's columns and narrow down with all the others
    return column_sets[0].intersection(*column_sets[1:])

In [None]:
# find_similar_col(frames_dict)

## Countries given a cull factor

In [None]:
# Play around with the threshold (10 below) and see if this is the spread that we want.
# NOTE(review): frames_dict is not defined in any visible cell.
for key, frame in frames_dict.items():
    print(key)
    # non-null counts per country for every column
    grouped = frame.groupby("country").count()
    # keep only countries with more than 10 non-null "mainbranch" answers
    grouped = grouped[grouped["mainbranch"] > 10]
    length = len(grouped)
    # number of remaining countries, then the countries with the most and the fewest answers
    print(f"""{key}: {length}
    max: {grouped['mainbranch'].idxmax()}, {grouped['mainbranch'].max()}
    min: {grouped['mainbranch'].idxmin()}, {grouped['mainbranch'].min()}""")

In [None]:
# do they have similar columns?
def find_similar_country(frames: dict, cull_factor=20) -> set:
    """
    Return the countries that pass the minimum response count in every frame.

    A country passes in a frame when it has more than ``cull_factor`` non-null
    "mainbranch" answers there.
    """
    qualifying = []
    for survey in frames.values():
        per_country = survey.groupby("country").count()
        enough_answers = per_country["mainbranch"] > cull_factor
        qualifying.append(set(per_country[enough_answers].index))

    # countries present in the first frame's set and in all the others
    return qualifying[0].intersection(*qualifying[1:])

def show_country_dist(frames: dict, countries: list, cull_factor: int) -> None:
    """
    Plot one bar chart per frame with the response counts of ``countries``.

    Inputs:
        frames: dict{str: pd.DataFrame}; the last four characters of each key are
                used as that chart's title
        countries: countries to show, e.g. the result of find_similar_country;
                   each must be present in every frame
        cull_factor: the minimum used to select ``countries``; only shown in the title
    Output: None (the figure is displayed)
    """
    ncols = 2
    # ceil division, so an even number of frames does not get an empty extra row
    nrows = max(1, -(-len(frames) // ncols))
    # squeeze=False always gives a 2-D array of axes, even for a single row
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(15, 15), squeeze=False)
    # len(countries) counts countries, so the title says so
    fig.suptitle(f"{len(countries)} countries consistent across surveys with more than {cull_factor} responses")
    flat_axes = axes.reshape(-1)
    for (key, frame), ax in zip(frames.items(), flat_axes):
        grouped = frame.groupby("country").count()
        grouped = grouped.loc[list(countries)].sort_values("mainbranch")
        grouped.plot(y="mainbranch", ax=ax, kind="bar", legend=False)
        ax.set_title(key[-4:])
    # hide the panels that have no frame to show
    for ax in flat_axes[len(frames):]:
        ax.set_visible(False)

    plt.show()

In [None]:
# Countries that pass the minimum response count in every data set.
# Where is the US? The UK? Their names are inconsistent across the years,
# i.e. United States vs United States of America; UK vs United Kingdom (see the mapping above).
# NOTE(review): frames_dict is not defined in any visible cell.
cull_factor = 20
country_sim = find_similar_country(frames_dict, cull_factor)
# show_country_dist(frames_dict, list(country_sim), cull_factor)

## One Hot Testing for Skills (deprecated)

In [None]:
# basically with every one of these is separated by a ;
# goal of next function:
# # find the sub-strings separated by ; nans will have to be replaced by "None" or "Empty"
# # one hot the entries for example, if C appears in one of these queries, for that particular
# # subject there will be a 1 for yes and 0 for no essentially
# # this is why we need the None/Empty so we can add them up
# # Eventually after one hotting we drop the None/Empty since it's a dummy column
# # we would then be able to add them up using count or something and put onto a graph/analysis


# standard = ["language", "database", "platform", "webframe", "misctech"]
# want = "wanttoworkwith"
# have = "haveworkedwith"
# for key, frame in frames_dict.items():
#     print(key)
#     for stan in standard:
#         print(f"{stan}: {frame[stan + want].isna().sum()} {frame[stan + want].sample(n=1).values}")
#     print()

In [None]:
# df = frames_dict["df_data_2019"].copy(deep=True) # don't want this to point at the frame in dict

In [None]:
# coi = 'languagewanttoworkwith'
# df[coi] = df[coi].str.split(";")
# mlb = MultiLabelBinarizer(sparse_output=True) # saves ram

# transformed = mlb.fit_transform(df.pop(coi))
# columns = ["langwork_" + name for name in mlb.classes_]

# df = df.join(
#             pd.DataFrame.sparse.from_spmatrix(
#                 transformed,
#                 index=df.index,
#                 columns=columns))

In [None]:
# frame = frames_dict["df_data_2019"].copy(deep=True)

In [None]:
# df.groupby('country').sum()[mlb.classes_]

In [None]:
# def create_onehot_skills(frames: dict):
#     # some constants
#     standard = [("language", "lg"), ("database", "db"), ("platform", "pf"), ("webframe", "wf"), ("misctech", "mt")]
#     status = [("wanttoworkwith", "www"), ("haveworkedwith", "hww")]

#     new_cols_per_year = {}
    
#     for key, frame in frames.items():
#         new_cols = []
#         print(key)
#         for stan, abv in standard:
#             for stat, abr in status:
#                 coi = stan + stat # coi = column of interest
#                 abbr = abv + abr + "_"
#                 mlb = MultiLabelBinarizer(sparse_output=True) # saves ram
#                 frame[coi] = frame[coi].str.split(";")
#                 transformed = mlb.fit_transform(frame.pop(coi))
#                 new_cois = [abbr + name for name in mlb.classes_]
#                 frame = frame.join(
#                             pd.DataFrame.sparse.from_spmatrix(
#                                 transformed,
#                                 index=frame.index,
#                                 columns=new_cois
#                             )
#                         )
#                 new_cois.remove(abbr + "Empty")
#                 new_cols += new_cois
#                 frame.drop(abbr + "Empty", axis=1)
#         frames[key] = frame
#         new_cols_per_year[key] = new_cols
#     return new_cols_per_year

In [None]:
# import copy
# cp_dict = copy.deepcopy(frames_dict)

In [None]:
# new_cols = create_onehot_skills(cp_dict)

## Ed Level Processing (deprecated)

In [None]:
# one-hot education for same reason
# same thing
# we have nans and doctoral degrees missing from 2023 

In [None]:
# import copy
# cp_dict = copy.deepcopy(frames_dict)
# abbr_education(cp_dict)

In [None]:
# for key, frame in cp_dict.items():
#     frame['edlevel'] = frame['edlevel'].replace({'I never completed any formal education': 'Something else'})
    
#     do = list(frame['edlevel'].unique())
#     print(key, len(do))
#     do.sort()
#     display(do)
#     print()

In [None]:
# for key, frame in cp_dict.items():
#     lb = LabelBinarizer(sparse_output=True) # saves ram
#     transformed = lb.fit_transform(frame.pop('edlevel'))
#     frame = frame.join(
#                 pd.DataFrame.sparse.from_spmatrix(
#                     transformed,
#                     index=frame.index,
#                     columns=lb.classes_
#                 )
#             )
#     if 'phd' not in frame.columns:
#         frame['phd'] = [0] * len(frame)
#     print(frame.columns[-10:])

## Employment (deprecated)

In [None]:
# find_similar_col(frames_dict)

In [None]:
# For each survey year 2019-2023, print the dtype, the number of missing values and
# the distinct values of the column under inspection.
# NOTE(review): frames_dict is not defined in any visible cell.
col = "devtype"
for year in range(2019, 2024):
    # work on a copy so the frame stored in frames_dict is not touched
    frame = frames_dict[f"df_data_{year}"].copy()
    unique = frame[col].unique()
    # unique.sort()  (sorting is disabled)
    print(year, frame[col].dtypes, frame[col].isna().sum())
    print(unique)

In [None]:
# Per-country aggregate for the 2019 frame: sum of "count", mean and std of "convertedcompyearly".
# NOTE(review): this rebinds the global df, and frames_dict is not defined in any visible cell.
df = frames_dict["df_data_2019"]
grouped = df.groupby('country').agg({"count":["sum"], "convertedcompyearly":["mean", "std"]})

In [None]:
# Inspect the column labels of the aggregate; the cull below addresses one as the tuple ("count", "sum").
grouped.columns

In [None]:
# This is how we would cull: keep only countries whose summed "count" exceeds cull_factor.
# Not awful, but also not the best approach.
# Re-running this cell filters the already-filtered frame again, since it overwrites grouped.
grouped = grouped[grouped[("count", "sum")] > cull_factor]
# grouped

In [None]:
# Frequency of each devtype answer in the 2020 frame.
# NOTE(review): frames_dict is not defined in any visible cell.
frames_dict["df_data_2020"]["devtype"].value_counts()