In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import pycountry
pd.set_option('display.max_columns', 500)
random_state = 42

In [2]:
stack_overflow_files = (os.listdir("data/"))
# not worth going from 2011-2014. No data scientists.
# ok, so decision to do (2019 maybe) 2020-2023 for analysis

# GPT gave me this idea instead of going through every possible country manually
country_abbreviations_1 = {country.name: country.alpha_2 for country in pycountry.countries}
country_abbreviations_2 = {country.name: country.official_name for country in pycountry.countries}
os.listdir("data/")



['ppp.csv', 'salaries.csv', 'stack_overflow']

In [22]:
def read_stackoverflow() -> dict:
    """
    Reads CSVs and gets the numbe of data professionals. Any empty values are dropped from job title and 
    salary so we will always have data. Other columns may have nans.
    Data Manipulation:
    - dropping nans from salary and devtype combined
    - Changing the salary column to ConvertedCompYearly so we can merge all data frames comes time
    - Lowering column names since there was some weird camel case going on
    - Converting specific columns that mean the same thing per year into a singular name
    - Fill in nans for language/skill specific values with "Empty"
      - this is so we can one hot later on for a more concise analysis, more later on
    """
    frames = {}
    stack_o_files = os.listdir("data/stack_overflow/")
    for file in stack_o_files:
        year = file[-8:-4]
        df = pd.read_csv(f"data/stack_overflow/{file}", encoding='ISO-8859-1')

        # standardize compensation columns
        if 'ConvertedComp' in df.columns:
            df = df.rename(columns={'ConvertedComp': 'ConvertedCompYearly'})

        # standardize some columns
        # using camel case resulted in errors with webframe where sometimes F was capitalized
        standard = ["language", "database", "platform", "webframe", "misctech"]
        df.columns = df.columns.str.lower()
        for stan in standard:
            if f"{stan}workedwith" in df.columns:
                df = df.rename(columns={f'{stan}workedwith': f'{stan}haveworkedwith', f'{stan}desirenextyear':f'{stan}wanttoworkwith'})
            df[f"{stan}haveworkedwith"] = df[f"{stan}haveworkedwith"].fillna(value="Empty")
            df[f"{stan}wanttoworkwith"] = df[f"{stan}wanttoworkwith"].fillna(value="Empty")

        # standardize some country names, now they should match with Kaggle dataset
        df["country"] = df["country"].replace(country_abbreviations_1)
        df["country"] = df["country"].replace(country_abbreviations_2)
        
        
        df = df.dropna(subset=["devtype", "convertedcompyearly"])
        df = df[df["devtype"].str.contains("data", case=False)]
        df["year"] = [year] * len(df)
        frames[f"df_data_{year}"] = df
    return frames

In [23]:
frames_dict = read_stackoverflow()

In [5]:
# this is the number of entries we are working with in our frames
# seeing how to standardize the columns some more

query = "Web"
for key, frame in frames_dict.items():
    lang = []
    for col in frame.columns:
        lang.append(col) if query in col else None
    print(f"{key}\t{len(frame)}\t{lang}")

df_data_2019	13393	[]
df_data_2020	8294	[]
df_data_2021	9272	[]
df_data_2022	6921	[]
df_data_2023	2480	[]


In [6]:
# print(frames_dict["df_data_2019"].columns)

In [7]:
# print(frames_dict["df_data_2020"].columns)

In [8]:
# print(frames_dict["df_data_2021"].columns)

In [9]:
# print(frames_dict["df_data_2022"].columns)

In [10]:
# print(frames_dict["df_data_2023"].columns)

In [11]:
# do they have similar columns?
def find_similar_col(frames) -> set:
    """
    Returns the set of columns that the all share, ideally we maximize the ratio of this to merge.
    """
    union = []
    for key, frame in frames.items():
        union.append(set(frame.columns))
        
    standard = union[0]
    for cols in union[1:]:
        standard = standard.intersection(cols)
    return standard

In [40]:
# find_similar_col(frames_dict)

In [13]:
# play around with the number and see if this is the spread that we want
for key, frame in frames_dict.items():
    grouped = (frame.groupby("country").count())
    grouped = grouped[grouped["mainbranch"] > 10]
    length = len(grouped)
    print(f"""{key}: {length}
    max: {grouped['mainbranch'].idxmax()}, {grouped['mainbranch'].max()}
    min: {grouped['mainbranch'].idxmin()}, {grouped['mainbranch'].min()}""")

df_data_2019: 83
    max: US, 3856
    min: AM, 11
df_data_2020: 69
    max: US, 2081
    min: BY, 11
df_data_2021: 70
    max: United States of America, 2144
    min: BA, 11
df_data_2022: 63
    max: United States of America, 1702
    min: EG, 11
df_data_2023: 36
    max: United States of America, 687
    min: CN, 11


In [14]:
# do they have similar columns?
def find_similar_country(frames: dict, cull_factor=20) -> set:
    """
    Given a particular minimum (cull_factor) find the countries in common among
    frames.
    """
    union = []
    for key, frame in frames.items():
        grouped = frame.groupby("country").count()
        grouped = grouped[grouped["mainbranch"] > cull_factor]
        union.append(set(grouped.index))
        
    standard = union[0]
    for cols in union[1:]:
        standard = standard.intersection(cols)
    return standard

def show_country_dist(frames: dict, countries: list, cull_factor: int) -> None:
    """
    Just plot a bar chart for our country distributions using the above function.
    """
    rows = len(frames)//2 + 1
    fig, axes = plt.subplots(nrows=rows, ncols=2, figsize=(15,15))
    fig.suptitle(f"{len(countries)} respondents consistent across surveys greater than {cull_factor} responses")
    for (key, frame), ax in zip(frames.items(), axes.reshape(-1)):
        grouped = frame.groupby("country").count()
        grouped = grouped.loc[list(countries)].sort_values("mainbranch")
        grouped.plot(y="mainbranch", ax=ax, kind="bar", legend=False)
        ax.set_title(key[-4:])
    
    plt.show()

In [15]:
# across all data sets here are the countries that are here most often
# where is US? UK? They have different, inconsistent names throughout the years
# # i.e. United States vs United States of America; UK vs United Kingdom, see above mapping
cull_factor = 20
country_sim = find_similar_country(frames_dict, cull_factor)
# show_country_dist(frames_dict, list(country_sim), cull_factor)

In [16]:
df_2019 = frames_dict["df_data_2019"]
df_2019.columns

Index(['respondent', 'mainbranch', 'hobbyist', 'opensourcer', 'opensource',
       'employment', 'country', 'student', 'edlevel', 'undergradmajor',
       'eduother', 'orgsize', 'devtype', 'yearscode', 'age1stcode',
       'yearscodepro', 'careersat', 'jobsat', 'mgridiot', 'mgrmoney',
       'mgrwant', 'jobseek', 'lasthiredate', 'lastint', 'fizzbuzz',
       'jobfactors', 'resumeupdate', 'currencysymbol', 'currencydesc',
       'comptotal', 'compfreq', 'convertedcompyearly', 'workweekhrs',
       'workplan', 'workchallenge', 'workremote', 'workloc', 'impsyn',
       'coderev', 'coderevhrs', 'unittests', 'purchasehow', 'purchasewhat',
       'languagehaveworkedwith', 'languagewanttoworkwith',
       'databasehaveworkedwith', 'databasewanttoworkwith',
       'platformhaveworkedwith', 'platformwanttoworkwith',
       'webframehaveworkedwith', 'webframewanttoworkwith',
       'misctechhaveworkedwith', 'misctechwanttoworkwith', 'devenviron',
       'opsys', 'containers', 'blockchainorg', 'b

In [38]:
# basically with every one of these is separated by a ;
# goal of next function:
# # find the sub-strings separated by ;, nans will have to be replaced by "None" or "Empty"
# # one hot the entries for example, if C appears in one of these queries, for that particular
# # subject there will be a 1 for yes and 0 for no essentially
# # this is why we need the None/Empty so we can add them up
# # Eventually after one hotting we drop the None/Empty since it's a dummy column
# # we would then be able to add them up using count or something and put onto a graph/analysis
standard = ["language", "database", "platform", "webframe", "misctech"]
want = "wanttoworkwith"
have = "haveworkedwith"
for key, frame in frames_dict.items():
    print(key)
    for stan in standard:
        print(f"{stan}: {frame[stan + want].isna().sum()} {frame[stan + want].sample(n=1).values}")
    print()

df_data_2019
language: 0 ['HTML/CSS;JavaScript;Python']
database: 0 ['MongoDB;PostgreSQL;SQLite;Other(s):']
platform: 0 ['Empty']
webframe: 0 ['Angular/Angular.js;jQuery']
misctech: 0 ['.NET;.NET Core;TensorFlow;Unity 3D']

df_data_2020
language: 0 ['HTML/CSS;Java;JavaScript;Python;SQL']
database: 0 ['MySQL']
platform: 0 ['Google Cloud Platform;Linux']
webframe: 0 ['Angular;Django']
misctech: 0 ['Node.js']

df_data_2021
language: 0 ['C#;Python;SQL']
database: 0 ['Empty']
platform: 0 ['AWS']
webframe: 0 ['Empty']
misctech: 0 ['.NET Framework;.NET Core / .NET 5']

df_data_2022
language: 0 ['Bash/Shell;Haskell;Python;Rust']
database: 0 ['Empty']
platform: 0 ['DigitalOcean']
webframe: 0 ['Django']
misctech: 0 ['Electron;Keras;NumPy;Pandas;React Native;Scikit-learn;TensorFlow;Torch/PyTorch']

df_data_2023
language: 0 ['C;Python;Rust']
database: 0 ['Elasticsearch;PostgreSQL;Redis']
platform: 0 ['Empty']
webframe: 0 ['Next.js;Node.js;Nuxt.js;Qwik;React;Remix;Solid.js;Vue.js']
misctech: 0 ['Ap