In [7]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import pycountry
from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer
pd.set_option('display.max_columns', 500)
np.random.seed(42)

In [8]:
# Entries of the top-level data directory, listed once and reused for the cell output below.
stack_overflow_files = os.listdir("data/")
# not worth going from 2011-2014. No data scientists.
# ok, so decision to do (2019 maybe) 2020-2023 for analysis

# GPT gave me this idea instead of going through every possible country manually
# Lookup tables from a country's name / official name to its alpha-3 code.
country_abbreviations_1 = {country.name: country.alpha_3 for country in pycountry.countries}
# Not every pycountry record carries an official_name; depending on the pycountry
# version that attribute access may raise AttributeError, so such records are skipped.
country_abbreviations_2 = {
    country.official_name: country.alpha_3
    for country in pycountry.countries
    if getattr(country, "official_name", None)
}
stack_overflow_files

['ppp.csv', 'salaries.csv', 'stack_overflow']

TODO: move the docstrings into markdown cells
- they will also be used to explain the visualizations

We could
- merge on the money after rounding
  - check how much data we have afterwards
- like worldwide -> skills
- function to merge and check the distribution afterwards

## Functions

In [78]:
def create_onehot_skills(frames: dict) -> None:
    """
    One-hot encode the ";"-separated skill columns of every frame in ``frames``.

    For each skill family (language, database, platform, webframe, misctech) and
    each status (wanttoworkwith, haveworkedwith) the column ``<family><status>``
    is removed and replaced by one int8 indicator column per distinct skill,
    named ``<abv><abr>_<skill>`` (e.g. ``lghww_Python``), so the indicators can be
    summed in groupby operations.

    The placeholder "Empty" (the value missing answers are filled with) never gets
    an indicator column since that information is useless to us. A column in which
    nobody answered "Empty" is handled as well.

    The dictionary is updated in place: every key is rebound to its encoded frame.
    The DataFrame objects that were passed in are left untouched.

    Input: frames dict{str: pd.DataFrame}, every frame holding all ten skill columns
    Ouput: None

    https://stackoverflow.com/questions/45312377/how-to-one-hot-encode-from-a-pandas-column-containing-a-list

    Rough example flow of function for one sample:
    C;C++;Perl -> [C, C++, Perl] -> [1, 1, 1, 0]
    Python     -> [Python]       -> [0, 0, 0, 1]
    """
    # some constants
    standard = [("language", "lg"), ("database", "db"), ("platform", "pf"), ("webframe", "wf"), ("misctech", "mt")]
    status = [("wanttoworkwith", "www"), ("haveworkedwith", "hww")]

    for key, frame in frames.items():
        # Work on a positionally indexed copy: the joins below then match rows
        # one-to-one even if the incoming index holds repeated labels, and the
        # caller's frame object is not mutated.
        original_index = frame.index
        work = frame.reset_index(drop=True)
        new_cols = []
        for stan, abv in standard:
            for stat, abr in status:
                coi = stan + stat # coi = column of interest
                abbr = abv + abr + "_"
                mlb = MultiLabelBinarizer(sparse_output=True) # saves ram
                # missing answers are treated like the "Empty" placeholder
                tokens = work.pop(coi).fillna("Empty").str.split(";")
                transformed = mlb.fit_transform(tokens)
                names = [abbr + name for name in mlb.classes_]
                encoded = pd.DataFrame.sparse.from_spmatrix(
                    transformed,
                    index=work.index,
                    columns=names
                )
                # "Empty" is only present when somebody skipped the question, so it
                # is filtered out instead of being removed unconditionally
                keep = [name for name in names if name != abbr + "Empty"]
                work = work.join(encoded[keep])
                new_cols += keep
        # this needs to be here, if not throse Sparse type errors
        # # Sparse types don't allow normal groupby operations (ie reshape) so we need to turn them into ints
        # # int8 don't take up a ton and it's just 0's and 1's
        if new_cols:
            work[new_cols] = work[new_cols].fillna(0).astype('int8')
        work.index = original_index
        frames[key] = work

In [79]:
def abbr_education(frames: dict) -> None:
    """
    Abbreviate the education levels in the 'edlevel' column of every frame, in place.

    Each distinct level is matched against a keyword (case-insensitive) and
    replaced by a short label, e.g. anything containing "bachelor" becomes
    "Bachelor's". Matching by keyword instead of by sorted position keeps the
    labels correct when a level is absent from one year's frame.

    'I never completed any formal education' is first folded into
    'Something else' (and therefore ends up as "Else"). The placeholder "nan",
    non-string values and levels that match no keyword are left unchanged.

    Input: frames dict{str: pd.DataFrames}, each with an 'edlevel' column
    Ouput: None
    """
    # (keyword, abbreviation) pairs, tried in order against the lower-cased level
    keyword_to_abbr = [
        ("doctoral", "Doctoral"),
        ("associate", "Associate's"),
        ("bachelor", "Bachelor's"),
        ("master", "Master's"),
        ("primary", "Elementary"),
        ("elementary", "Elementary"),
        ("professional", "Professional"),
        ("secondary", "Secondary"),
        ("some college", "Some College"),
        ("something else", "Else"),
    ]

    for key, frame in frames.items():
        # easier to replace this, makes it much easier to work with
        frame['edlevel'] = frame['edlevel'].replace({'I never completed any formal education': 'Something else'})

        # dictionary to feed into replace function
        replace_dict = {}
        for level in frame['edlevel'].unique():
            if not isinstance(level, str) or level == 'nan':
                continue
            lowered = level.lower()
            for keyword, abbr in keyword_to_abbr:
                if keyword in lowered:
                    replace_dict[level] = abbr
                    break

        if replace_dict:
            frame['edlevel'] = frame['edlevel'].replace(replace_dict)
        frames[key] = frame

In [80]:
def bin_ages(frames: dict) -> None:
    """
    Turn the 'age' column of every frame into age-bracket strings, in place.

    Numeric ages are cut into the brackets listed in ``labels``; columns that
    are not numeric are only converted to strings. Numeric ages outside
    (0, 100] fall in no bracket.

    Input: frames dict{str: pd.DataFrames}, each with an 'age' column
    Ouput: None
    """
    # pd.cut intervals are right-closed, so each edge is the LAST age of its
    # bracket: (0, 17] -> under 18, (17, 24] -> 18-24, ...
    bins = [0, 17, 24, 34, 44, 54, 64, 100]
    labels = ['Under 18 years old', '18-24 years old', '25-34 years old', '35-44 years old', '45-54 years old', '55-64 years old', '65 years or older']
    for year, frame in frames.items():
        # any numeric dtype (not just float64) holds raw ages that need binning
        if pd.api.types.is_numeric_dtype(frame["age"]):
            frame["age"] = pd.cut(frame["age"], bins=bins, labels=labels)
        frame["age"] = frame["age"].astype('str')

        frames[year] = frame

In [81]:
def find_similar_col(frames) -> list:
    """
    Return the columns shared by every frame, ideally we maximize the ratio of this to merge.

    The result keeps the column order of the first frame, so it is identical
    from run to run (a plain list(set) is not). An empty ``frames`` gives [].

    Input: frames dict{str: pd.DataFrames}
    Output: list of column labels present in all frames
    """
    column_lists = [list(frame.columns) for frame in frames.values()]
    if not column_lists:
        return []

    shared = set(column_lists[0]).intersection(*column_lists[1:])
    # dict.fromkeys drops repeated labels while keeping first-seen order
    return [col for col in dict.fromkeys(column_lists[0]) if col in shared]

In [82]:
def encode_devtype(df: pd.DataFrame) -> (pd.DataFrame, list):
    """
    One-hot encode the ";"-separated 'devtype' column.

    The 'devtype' column is replaced by one int8 indicator column per distinct
    developer type. Rows are matched by position rather than by index label, so
    a frame whose index holds repeated labels (e.g. several yearly frames
    concatenated without resetting the index) keeps exactly one output row per
    input row. The original index is restored on the result and the frame that
    was passed in is not modified.

    Input: df pd.DataFrame with a 'devtype' column of ";"-separated strings
    Output: tuple(pd.DataFrame, list[str]) - encoded frame and the new column names
    """
    coi = "devtype"
    mlb = MultiLabelBinarizer(sparse_output=True) # saves ram
    transformed = mlb.fit_transform(df[coi].str.split(";"))
    new_cols = list(mlb.classes_)

    # positional (unique) index on both sides -> the join is strictly one-to-one
    positional = df.drop(columns=coi).reset_index(drop=True)
    encoded = pd.DataFrame.sparse.from_spmatrix(
        transformed,
        index=positional.index,
        columns=new_cols
    )
    out = positional.join(encoded)
    # Sparse columns don't allow normal groupby operations, so store plain int8
    if new_cols:
        out[new_cols] = out[new_cols].fillna(0).astype('int8')
    out.index = df.index
    return out, new_cols

In [100]:
def read_stackoverflow() -> (pd.DataFrame, list, list):
    """
    Reads the survey CSVs in data/stack_overflow/ and keeps the data professionals.
    Rows with an empty job title (devtype) or salary are dropped so we will always
    have that data. Other columns may have nans.
    Data Manipulation:
    - Renaming the salary column ConvertedComp to ConvertedCompYearly so all years agree
    - Lowering column names since there was some weird camel case going on
    - Converting specific columns that mean the same thing per year into a singular name
    - Fill in nans for language/skill specific values with "Empty"
      - this is so we can one hot later on, see create_onehot_skills
    - Replacing country names with the codes from the two pycountry lookup tables
    - Fill in nans for edlevel and orgsize with "nan", shorten the "I don't know" org size to IDK
    - Dropping nans from salary and devtype, keeping rows whose devtype mentions "data"
    - Adding a constant count column (for groupby sums) and a year column
    - Abbreviate education levels (abbr_education) and bin ages (bin_ages)
    - Binarize the different skills per year, see create_onehot_skills
    - Keep only the columns every year shares and concatenate the years into one frame
    - Encode devtype to binarize as well since it's very difficult to parse through ; every single time
    - Lastly we return the skills columns really quick to save headache later on

    Inputs: Nothing
    Outputs: tuple(pd.DataFrame, list[str], list[str]) - the merged frame, the skill
             indicator column names, and the devtype column names from encode_devtype
    """
    frames = {}
    survey_dir = "data/stack_overflow/"
    # os.listdir returns entries in arbitrary order and may include hidden files or
    # folders (e.g. .DS_Store, .ipynb_checkpoints); keep visible files only and sort
    # them so the merged frame comes out in the same row order on every run
    stack_o_files = sorted(
        file for file in os.listdir(survey_dir)
        if not file.startswith(".") and os.path.isfile(os.path.join(survey_dir, file))
    )
    for file in stack_o_files:
        # assumes the file name ends in <year> plus a 4-character extension such as ".csv"
        year = file[-8:-4]
        df = pd.read_csv(f"data/stack_overflow/{file}", encoding='ISO-8859-1')

        # standardize compensation columns
        if 'ConvertedComp' in df.columns:
            df = df.rename(columns={'ConvertedComp': 'ConvertedCompYearly'})

        # standardize some columns
        # using camel case resulted in errors with webframe where sometimes F was capitalized
        standard = ["language", "database", "platform", "webframe", "misctech"]
        df.columns = df.columns.str.lower()
        for stan in standard:
            if f"{stan}workedwith" in df.columns:
                df = df.rename(columns={f'{stan}workedwith': f'{stan}haveworkedwith', f'{stan}desirenextyear':f'{stan}wanttoworkwith'})
            df[f"{stan}haveworkedwith"] = df[f"{stan}haveworkedwith"].fillna(value="Empty")
            df[f"{stan}wanttoworkwith"] = df[f"{stan}wanttoworkwith"].fillna(value="Empty")

        # standardize some country names, now they should match with Kaggle dataset
        df["country"] = df["country"].replace(country_abbreviations_1)
        df["country"] = df["country"].replace(country_abbreviations_2)

        # we have some numbers so we can't just do entire df
        df[['edlevel', 'orgsize']] = df[['edlevel', 'orgsize']].fillna(value="nan")
        # the key is the mis-decoded form the apostrophe takes when read as ISO-8859-1
        df['orgsize'] = df['orgsize'].replace({'I donâ\x80\x99t know': 'IDK'})

        df = df.dropna(subset=["devtype", "convertedcompyearly"])
        # explicit copy: the column assignments below must write to an independent
        # frame, not to a filtered view of the raw survey
        df = df[df["devtype"].str.contains("data", case=False)].copy()
        df["count"] = 1 # this is for our groupby so that we can say count > cull when we sum or count
        df["year"] = year
        frames[f"df_data_{year}"] = df

    abbr_education(frames)
    bin_ages(frames)
    create_onehot_skills(frames)
    similar = find_similar_col(frames)

    # finally going to standardize to merge devtypes
    for key, frame in frames.items():
        frames[key] = frame[similar]
    df = pd.concat(list(frames.values()), axis=0)
    df, employment = encode_devtype(df)

    # skill indicators are named "<lg|db|pf|wf|mt><www|hww>_<skill>"; match that
    # prefix exactly so an unrelated column that merely contains e.g. "db" is not
    # mistaken for a skill
    skill_prefixes = tuple(
        f"{family}{status}_"
        for family in ('lg', 'db', 'pf', 'wf', 'mt')
        for status in ('www', 'hww')
    )
    skills = [col for col in df.columns if col.startswith(skill_prefixes)]

    return df, skills, employment

## The Data

In [101]:
# Build the merged survey frame together with the skill column names and the
# devtype (job title) column names returned by read_stackoverflow().
df, skills, job_titles = read_stackoverflow()

In [107]:
# list(df.columns)

In [108]:
# df = frames_dict["df_data_2019"]
# skills

In [15]:
# df.dtypes[df. dtypes == 'Sparse[int32, 0]']

In [92]:
# list(frames_dict["df_data_2019"].dtypes)

In [17]:
# this is the number of entries we are working with in our frames
# seeing how to standardize the columns some more
# this is kind of useless now with one hotting everything

query = "Web"
# NOTE(review): frames_dict is not assigned in any cell visible here, so this
# cell depends on state from elsewhere - confirm it survives Restart & Run All.
# For every yearly frame: row count and the columns whose name contains `query`.
for key, frame in frames_dict.items():
    lang = [col for col in frame.columns if query in col]
    print(f"{key}\t{len(frame)}\t{lang}")

df_data_2019	13393	['lgwww_WebAssembly', 'lghww_WebAssembly']
df_data_2020	8294	[]
df_data_2021	9272	[]
df_data_2022	6921	[]
df_data_2023	2480	['pfwww_Amazon Web Services (AWS)', 'pfhww_Amazon Web Services (AWS)']


In [18]:
# print(frames_dict["df_data_2019"].columns)

In [19]:
# print(frames_dict["df_data_2020"].columns)

In [20]:
# print(frames_dict["df_data_2021"].columns)

In [21]:
# print(frames_dict["df_data_2022"].columns)

In [22]:
# print(frames_dict["df_data_2023"].columns)

## Similarity with columns per the dataframes

In [23]:
# do they have similar columns?
def find_similar_col(frames) -> set:
    """
    Returns the set of columns that the frames all share, ideally we maximize the ratio of this to merge.

    An empty ``frames`` gives an empty set instead of raising.

    NOTE(review): an earlier cell already defines find_similar_col with a list
    return value; running this cell rebinds the name to this set-returning
    version. read_stackoverflow uses the result as a column indexer, which a set
    may not be accepted for - TODO confirm and consider deleting this duplicate.

    Input: frames dict{str: pd.DataFrames}
    Output: set of column labels present in all frames
    """
    column_sets = [set(frame.columns) for frame in frames.values()]
    if not column_sets:
        return set()
    return set.intersection(*column_sets)

In [109]:
# find_similar_col(frames_dict)

## Countries given a cull factor

In [25]:
# play around with the number and see if this is the spread that we want
# For every yearly frame: how many countries have more than 10 non-null
# mainbranch answers, and which of them has the most / the fewest.
for key, frame in frames_dict.items():
    print(key)
    grouped = (
        frame
        .groupby("country")
        .count()
        .loc[lambda counts: counts["mainbranch"] > 10]
    )
    length = len(grouped)
    print(f"""{key}: {length}
    max: {grouped['mainbranch'].idxmax()}, {grouped['mainbranch'].max()}
    min: {grouped['mainbranch'].idxmin()}, {grouped['mainbranch'].min()}""")

df_data_2019
df_data_2019: 83
    max: USA, 3856
    min: ARM, 11
df_data_2020
df_data_2020: 69
    max: USA, 2081
    min: BLR, 11
df_data_2021
df_data_2021: 70
    max: USA, 2144
    min: BIH, 11
df_data_2022
df_data_2022: 63
    max: USA, 1702
    min: EGY, 11
df_data_2023
df_data_2023: 36
    max: USA, 687
    min: CHN, 11


In [26]:
# which countries clear the response threshold in every frame?
def find_similar_country(frames: dict, cull_factor=20) -> set:
    """
    Given a particular minimum (cull_factor) find the countries in common among
    frames.

    A country qualifies in a frame when it has MORE than ``cull_factor``
    non-null 'mainbranch' entries there; the result holds the countries that
    qualify in every frame. An empty ``frames`` gives an empty set.

    Input: frames dict{str: pd.DataFrames} with 'country' and 'mainbranch' columns
    Output: set of country labels
    """
    country_sets = []
    for frame in frames.values():
        # only mainbranch is needed, so count that column alone
        responses = frame.groupby("country")["mainbranch"].count()
        country_sets.append(set(responses[responses > cull_factor].index))

    if not country_sets:
        return set()
    return set.intersection(*country_sets)

def show_country_dist(frames: dict, countries: list, cull_factor: int) -> None:
    """
    Plot one bar chart per frame with the number of non-null 'mainbranch'
    answers for each of the given countries, sorted ascending.

    The charts are laid out two per row; grid cells without a frame are hidden.

    Input: frames dict{str: pd.DataFrames}, keys ending in the 4-character year
           countries - labels to plot, each must occur in every frame
           cull_factor - threshold the countries were selected with (title only)
    Ouput: None
    """
    countries = list(countries)
    n_cols = 2
    # ceil(len(frames) / n_cols) without an extra empty row, at least one row
    n_rows = max(1, -(-len(frames) // n_cols))
    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(15,15), squeeze=False)
    fig.suptitle(f"{len(countries)} countries consistent across surveys with more than {cull_factor} responses")
    flat_axes = axes.reshape(-1)
    for (key, frame), ax in zip(frames.items(), flat_axes):
        grouped = frame.groupby("country")[["mainbranch"]].count()
        grouped = grouped.loc[countries].sort_values("mainbranch")
        grouped.plot(y="mainbranch", ax=ax, kind="bar", legend=False)
        ax.set_title(key[-4:])
    # an odd number of frames leaves the last grid cell unused
    for ax in flat_axes[len(frames):]:
        ax.set_visible(False)

    plt.show()

In [27]:
# Countries that exceed cull_factor responses in every frame of frames_dict.
# where is US? UK? They have different, inconsistent names throughout the years
# # i.e. United States vs United States of America; UK vs United Kingdom, see above mapping
# NOTE(review): frames_dict is not assigned in any cell visible here - confirm
# this cell survives Restart & Run All.
cull_factor = 20
country_sim = find_similar_country(frames_dict, cull_factor)
# show_country_dist(frames_dict, list(country_sim), cull_factor)

## One Hot Testing for Skills (deprecated)

In [28]:
# basically with every one of these is separated by a ;
# goal of next function:
# # find the sub-strings separated by ; nans will have to be replaced by "None" or "Empty"
# # one hot the entries for example, if C appears in one of these queries, for that particular
# # subject there will be a 1 for yes and 0 for no essentially
# # this is why we need the None/Empty so we can add them up
# # Eventually after one hotting we drop the None/Empty since it's a dummy column
# # we would then be able to add them up using count or something and put onto a graph/analysis


# standard = ["language", "database", "platform", "webframe", "misctech"]
# want = "wanttoworkwith"
# have = "haveworkedwith"
# for key, frame in frames_dict.items():
#     print(key)
#     for stan in standard:
#         print(f"{stan}: {frame[stan + want].isna().sum()} {frame[stan + want].sample(n=1).values}")
#     print()

In [29]:
# df = frames_dict["df_data_2019"].copy(deep=True) # don't want this to point at the frame in dict

In [30]:
# coi = 'languagewanttoworkwith'
# df[coi] = df[coi].str.split(";")
# mlb = MultiLabelBinarizer(sparse_output=True) # saves ram

# transformed = mlb.fit_transform(df.pop(coi))
# columns = ["langwork_" + name for name in mlb.classes_]

# df = df.join(
#             pd.DataFrame.sparse.from_spmatrix(
#                 transformed,
#                 index=df.index,
#                 columns=columns))

In [31]:
# frame = frames_dict["df_data_2019"].copy(deep=True)

In [32]:
# df.groupby('country').sum()[mlb.classes_]

In [33]:
# def create_onehot_skills(frames: dict):
#     # some constants
#     standard = [("language", "lg"), ("database", "db"), ("platform", "pf"), ("webframe", "wf"), ("misctech", "mt")]
#     status = [("wanttoworkwith", "www"), ("haveworkedwith", "hww")]

#     new_cols_per_year = {}
    
#     for key, frame in frames.items():
#         new_cols = []
#         print(key)
#         for stan, abv in standard:
#             for stat, abr in status:
#                 coi = stan + stat # coi = column of interest
#                 abbr = abv + abr + "_"
#                 mlb = MultiLabelBinarizer(sparse_output=True) # saves ram
#                 frame[coi] = frame[coi].str.split(";")
#                 transformed = mlb.fit_transform(frame.pop(coi))
#                 new_cois = [abbr + name for name in mlb.classes_]
#                 frame = frame.join(
#                             pd.DataFrame.sparse.from_spmatrix(
#                                 transformed,
#                                 index=frame.index,
#                                 columns=new_cois
#                             )
#                         )
#                 new_cois.remove(abbr + "Empty")
#                 new_cols += new_cois
#                 frame.drop(abbr + "Empty", axis=1)
#         frames[key] = frame
#         new_cols_per_year[key] = new_cols
#     return new_cols_per_year

In [34]:
# import copy
# cp_dict = copy.deepcopy(frames_dict)

In [35]:
# new_cols = create_onehot_skills(cp_dict)

## Ed Level Processing (deprecated)

In [36]:
# one-hot education for same reason
# same thing
# we have nans and doctoral degrees missing from 2023 

In [37]:
# import copy
# cp_dict = copy.deepcopy(frames_dict)
# abbr_education(cp_dict)

In [38]:
# for key, frame in cp_dict.items():
#     frame['edlevel'] = frame['edlevel'].replace({'I never completed any formal education': 'Something else'})
    
#     do = list(frame['edlevel'].unique())
#     print(key, len(do))
#     do.sort()
#     display(do)
#     print()

In [39]:
# for key, frame in cp_dict.items():
#     lb = LabelBinarizer(sparse_output=True) # saves ram
#     transformed = lb.fit_transform(frame.pop('edlevel'))
#     frame = frame.join(
#                 pd.DataFrame.sparse.from_spmatrix(
#                     transformed,
#                     index=frame.index,
#                     columns=lb.classes_
#                 )
#             )
#     if 'phd' not in frame.columns:
#         frame['phd'] = [0] * len(frame)
#     print(frame.columns[-10:])

## Employment (deprecated)

In [110]:
# find_similar_col(frames_dict)

In [41]:
col = "devtype"
# Per survey year: dtype, number of missing values and the distinct raw devtype strings.
for year in (2019, 2020, 2021, 2022, 2023):
    frame = frames_dict[f"df_data_{year}"].copy()
    devtype_values = frame[col]
    unique = devtype_values.unique()
    print(year, devtype_values.dtypes, devtype_values.isna().sum())
    print(unique)

2019 object 0
['Data or business analyst;Data scientist or machine learning specialist;Database administrator;Engineer, data'
 'Database administrator;Developer, back-end;Developer, front-end;Developer, full-stack;Developer, QA or test;DevOps specialist'
 'Data or business analyst;Data scientist or machine learning specialist;Database administrator;Developer, back-end;Developer, desktop or enterprise applications;Developer, front-end;Developer, full-stack;Developer, game or graphics;Educator'
 ...
 'Data scientist or machine learning specialist;Engineer, data;Engineering manager;Product manager'
 'Data scientist or machine learning specialist;Developer, desktop or enterprise applications;Developer, full-stack;DevOps specialist'
 'Academic researcher;Database administrator;Developer, back-end;Developer, desktop or enterprise applications;Developer, embedded applications or devices;Developer, front-end;Developer, full-stack;Developer, game or graphics;Developer, mobile;Educator;Marketing

In [42]:
# NOTE(review): this rebinds `df`, which the "The Data" cell assigns from
# read_stackoverflow(); cells that run afterwards see the 2019 frame instead.
df = frames_dict["df_data_2019"]
# Per country: sum of the count column plus mean and standard deviation of convertedcompyearly.
grouped = df.groupby('country').agg({"count":["sum"], "convertedcompyearly":["mean", "std"]})

In [43]:
# Aggregating with a dict of lists yields two-level columns: (source column, statistic).
grouped.columns

MultiIndex([(              'count',  'sum'),
            ('convertedcompyearly', 'mean'),
            ('convertedcompyearly',  'std')],
           )

In [111]:
# this is how we would cull, not awful but also not best thing in the world:
# keep the countries whose summed count is greater than cull_factor.
# Re-running this cell filters the already filtered `grouped` again.
grouped = grouped[grouped[("count", "sum")] > cull_factor]
# grouped

In [45]:
# Frequency of each raw (";"-joined) devtype combination in the 2020 frame.
frames_dict["df_data_2020"]["devtype"].value_counts()

devtype
Data scientist or machine learning specialist                                                                                                                                                  228
Developer, back-end;Engineer, data                                                                                                                                                             184
Data or business analyst                                                                                                                                                                       157
Data or business analyst;Data scientist or machine learning specialist                                                                                                                         129
Engineer, data                                                                                                                                                                                 118
                 