In [14]:
import pandas as pd
import numpy as np
import math
import random
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

In [27]:
# Load the filtered master dataset that the rest of the notebook cleans and encodes.
# The file is decoded as Latin-1; low_memory=False is passed through to pandas' CSV reader.
master_data = pd.read_csv("../Data/filtered/master-data.csv", 
                          sep=",", 
                          encoding='Latin-1', 
                          low_memory=False)

In [44]:
def clean_up_prof_lang(row):
    """Canonicalise one raw language label and return it in lower case.

    A handful of upper-case spellings are replaced by a canonical spelling
    (punctuation removed, "UNKNOWN"/"NONE" folded into "OTHER"); every other
    value is passed through unchanged. Matching is exact and case-sensitive.
    The result is always ``.lower()``-ed, so ``row`` must be a string.
    """
    # (accepted raw spellings, canonical replacement), checked in order.
    replacements = (
        (("OBJECTIVE_-C",), "OBJECTIVE-C"),
        (("HTML/CSS",), "HTML_CSS"),
        (("ARDUINO_/_RASPBERRY_PI",), "ARDUINO_RASPBERRY_PI"),
        ((".NET",), "DOTNET"),
        (("UNKNOWN", "NONE"), "OTHER"),
        (("NODE.JS",), "NODEJS"),
    )
    for raw_spellings, canonical in replacements:
        if row in raw_spellings:
            return canonical.lower()
    # No special case applied: keep the label, just lower-case it.
    return row.lower()

def categorize_prof_lang(row):
    """Collapse a cleaned (lower-case) language label into a coarser category.

    Related labels are merged into one bucket (for example "c" and "c++"
    become "c_c++"), and "dotnet", "sas" and "arduino_raspberry_pi" are
    folded into "other". Labels that belong to no bucket are returned as-is.
    Matching is exact and case-sensitive.
    """
    # (member labels, bucket name), checked in order; first match wins.
    buckets = (
        (("c", "c++"), "c_c++"),
        (("javascript", "nodejs"), "javascript_nodejs"),
        (("css", "html_css"), "html_css"),
        (("sql", "sql_server"), "sql"),
        (("dotnet", "sas", "arduino_raspberry_pi"), "other"),
    )
    return next((bucket for members, bucket in buckets if row in members), row)

In [60]:
def clean_up_occupation(row):
    """Canonicalise one raw occupation label and return it in lower case.

    Four upper-case spellings are replaced ("UNKNOWN" -> "OTHER",
    "DEVELOPER_OTHER" -> "DEVELOPER", "MOBILE DEVELOPER" ->
    "MOBILE_DEVELOPER", "ANALYST" -> "DATA_ANALYST"); anything else is kept.
    Matching is exact and case-sensitive, and ``row`` must be a string
    because the result is ``.lower()``-ed.
    """
    # (accepted raw spellings, canonical replacement), checked in order.
    replacements = (
        (("UNKNOWN",), "OTHER"),
        (("DEVELOPER_OTHER",), "DEVELOPER"),
        (("MOBILE DEVELOPER",), "MOBILE_DEVELOPER"),
        (("ANALYST",), "DATA_ANALYST"),
    )
    canonical = next(
        (target for raw_spellings, target in replacements if row in raw_spellings),
        row,
    )
    return canonical.lower()

In [29]:
# Step 1 for the language column: canonicalise raw spellings and lower-case every label.
master_data["prof_lang_1"] = master_data["prof_lang_1"].apply(clean_up_prof_lang)

In [45]:
# Step 2 for the language column: merge the lower-cased labels into coarser buckets.
# categorize_prof_lang matches lower-case labels only, so this must run after the clean-up step.
master_data["prof_lang_1"] = master_data["prof_lang_1"].apply(categorize_prof_lang)

In [61]:
# Canonicalise and lower-case the occupation labels in place of the raw values.
master_data["occupation_norm"] = master_data["occupation_norm"].apply(clean_up_occupation)

In [62]:
# Give the two cleaned columns the names that the encoding step refers to.
master_data = master_data.rename(
    columns={
        'prof_lang_1': 'proficient_language',
        'occupation_norm': 'occupation',
    }
)

In [71]:
# Row count per IT_experience_in_years category, to check the category values before encoding.
master_data.groupby("IT_experience_in_years")["IT_experience_in_years"].count()

IT_experience_in_years
2_to_5      64691
6_to_10     70248
above_11    91487
below_2     26314
Name: IT_experience_in_years, dtype: int64

In [72]:
# Categorical columns to one-hot encode. Their order fixes the x0..x6 indices
# that appear in the generated column names below.
categorical_columns = ['gender', 'age', 'company_size', 'desktop_OS',
                       'proficient_language', 'occupation', 'IT_experience_in_years']

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old spelling. Try the current name first and fall back to
# the old one so the cell runs on either side of that change.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False)

transformer = make_column_transformer(
    (one_hot_encoder, categorical_columns),
    remainder='passthrough',
    verbose_feature_names_out=False)

transformed = transformer.fit_transform(master_data)

# ColumnTransformer.get_feature_names() was removed in scikit-learn 1.2, and
# get_feature_names_out() produces differently formatted names. The rename cell
# further down keys on the old "onehotencoder__x<i>_<category>" format, so the
# names are rebuilt here in exactly that format from the fitted encoder.
fitted_encoder = transformer.named_transformers_['onehotencoder']
encoded_names = [
    f"onehotencoder__x{position}_{category}"
    for position, categories in enumerate(fitted_encoder.categories_)
    for category in categories
]
# Passthrough columns are appended after the encoded ones, in their original order.
passthrough_names = [
    column for column in master_data.columns if column not in categorical_columns
]

transformed_df = pd.DataFrame(
    transformed,
    columns=encoded_names + passthrough_names
)



In [73]:
# Show the generated column names; the rename mapping in the next cell keys on these.
transformed_df.columns

Index(['onehotencoder__x0_female', 'onehotencoder__x0_male',
       'onehotencoder__x0_non_conforming', 'onehotencoder__x0_transgender',
       'onehotencoder__x0_unknown', 'onehotencoder__x1_18_to_24',
       'onehotencoder__x1_25_to_34', 'onehotencoder__x1_35_to_44',
       'onehotencoder__x1_45_to_54', 'onehotencoder__x1_55_to_64',
       'onehotencoder__x1_above_65', 'onehotencoder__x1_below_18',
       'onehotencoder__x2_1000_to_4999', 'onehotencoder__x2_100_to_499',
       'onehotencoder__x2_5000_to_9999', 'onehotencoder__x2_500_to_999',
       'onehotencoder__x2_Unknown', 'onehotencoder__x2_above_10000',
       'onehotencoder__x2_below_100', 'onehotencoder__x3_linux',
       'onehotencoder__x3_mac', 'onehotencoder__x3_unix',
       'onehotencoder__x3_windows', 'onehotencoder__x4_c#',
       'onehotencoder__x4_c_c++', 'onehotencoder__x4_hadoop',
       'onehotencoder__x4_html_css', 'onehotencoder__x4_java',
       'onehotencoder__x4_javascript_nodejs', 'onehotencoder__x4_lua',
  

In [77]:
# Replace the generated "onehotencoder__x<i>_<category>" names with readable
# "is_<feature>_<category>" flags. The index i follows the column order given
# to the encoder: x0 gender, x1 age, x2 company size, x3 desktop OS,
# x4 proficient language, x5 occupation, x6 IT experience.
# NOTE(review): a few target names are irregular ('is_gender_non_confirming',
# 'is_age_25_to34', 'is_age_35_44', 'is_age_45_54'). They are kept exactly as
# they are because they end up as column headers in the written CSV files.
transformed_df = transformed_df.rename(
    columns={
        # x0: gender
        'onehotencoder__x0_female': 'is_female',
        'onehotencoder__x0_male': 'is_male',
        'onehotencoder__x0_non_conforming': 'is_gender_non_confirming',
        'onehotencoder__x0_transgender': 'is_transgender',
        'onehotencoder__x0_unknown': 'is_gender_unknown',
        # x1: age
        'onehotencoder__x1_18_to_24': 'is_age_18_to_24',
        'onehotencoder__x1_25_to_34': 'is_age_25_to34',
        'onehotencoder__x1_35_to_44': 'is_age_35_44',
        'onehotencoder__x1_45_to_54': 'is_age_45_54',
        'onehotencoder__x1_55_to_64': 'is_age_55_to_64',
        'onehotencoder__x1_above_65': 'is_age_above_65',
        'onehotencoder__x1_below_18': 'is_age_below_18',
        # x2: company size
        'onehotencoder__x2_below_100': 'is_company_size_below_100',
        'onehotencoder__x2_100_to_499': 'is_company_size_100_to_499',
        'onehotencoder__x2_500_to_999': 'is_company_size_500_to_999',
        'onehotencoder__x2_1000_to_4999': 'is_company_size_1000_to_4999',
        'onehotencoder__x2_5000_to_9999': 'is_company_size_5000_to_9999',
        'onehotencoder__x2_above_10000': 'is_company_size_above_10000',
        'onehotencoder__x2_Unknown': 'is_company_size_unknown',
        # x3: desktop OS
        'onehotencoder__x3_linux': 'is_desktop_os_linux',
        'onehotencoder__x3_mac': 'is_desktop_os_mac',
        'onehotencoder__x3_unix': 'is_desktop_os_unix',
        'onehotencoder__x3_windows': 'is_desktop_os_windows',
        # x4: proficient language
        "onehotencoder__x4_c#": "is_lang_c#",
        "onehotencoder__x4_c_c++": "is_lang_c_c++",
        "onehotencoder__x4_hadoop": "is_lang_hadoop",
        "onehotencoder__x4_html_css": "is_lang_html_css",
        "onehotencoder__x4_java": "is_lang_java",
        "onehotencoder__x4_javascript_nodejs": "is_lang_javascript_nodejs",
        "onehotencoder__x4_lua": "is_lang_lua",
        "onehotencoder__x4_objective-c": "is_lang_objective_C",
        "onehotencoder__x4_other": "is_lang_other",
        "onehotencoder__x4_perl": "is_lang_perl",
        "onehotencoder__x4_php": "is_lang_php",
        "onehotencoder__x4_powershell": "is_lang_powershell",
        "onehotencoder__x4_python": "is_lang_python",
        "onehotencoder__x4_r": "is_lang_r",
        "onehotencoder__x4_ruby": "is_lang_ruby",
        "onehotencoder__x4_scala": "is_lang_scala",
        "onehotencoder__x4_sql": "is_lang_sql",
        "onehotencoder__x4_swift": "is_lang_swift",
        "onehotencoder__x4_visual_basic": "is_lang_visual_basic",
        # x5: occupation
        "onehotencoder__x5_c_suite": "is_occupation_c_suite",
        "onehotencoder__x5_data_analyst": "is_occupation_data_analyst",
        "onehotencoder__x5_data_scientist": "is_occupation_data_scientist",
        "onehotencoder__x5_desktop_developer": "is_occupation_desktop_developer",
        "onehotencoder__x5_developer": "is_occupation_developer",
        "onehotencoder__x5_engineer": "is_occupation_engineer",
        "onehotencoder__x5_manager": "is_occupation_manager",
        "onehotencoder__x5_mobile_developer": "is_occupation_mobile_developer",
        "onehotencoder__x5_not_in_tech": "is_occupation_not_in_tech",
        "onehotencoder__x5_other": "is_occupation_other",
        "onehotencoder__x5_system_admin": "is_occupation_system_admin",
        "onehotencoder__x5_web_developer": "is_occupation_web_developer",
        # x6: IT experience in years
        "onehotencoder__x6_2_to_5": "is_IT_expr_2_to_5",
        "onehotencoder__x6_6_to_10": "is_IT_expr_6_to_10",
        "onehotencoder__x6_above_11": "is_IT_expr_above_11",
        "onehotencoder__x6_below_2": "is_IT_expr_below_2",
    }
)

In [79]:
# Preview the first rows of the encoded frame after the column rename.
transformed_df.head()

Unnamed: 0,is_female,is_male,is_gender_non_confirming,is_transgender,is_gender_unknown,is_age_18_to_24,is_age_25_to34,is_age_35_44,is_age_45_54,is_age_55_to_64,...,is_occupation_web_developer,is_IT_expr_2_to_5,is_IT_expr_6_to_10,is_IT_expr_above_11,is_IT_expr_below_2,compensation,year,country,job_satisfaction,GDP
0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,above-160000,2019,United States,8,65094.79943
1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,above-160000,2019,United States,8,65094.79943
2,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,above-160000,2019,United States,8,65094.79943
3,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,above-160000,2019,United States,8,65094.79943
4,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,above-160000,2019,United States,8,65094.79943


In [80]:
# Write the encoded data for model building as `number_of_chunks` CSV files.
# The split is done on row positions and sliced with .iloc, rather than by
# passing the DataFrame itself to np.array_split: that path goes through
# DataFrame.swapaxes, which pandas has deprecated. The chunk boundaries are the
# same ones np.array_split would produce (nearly equal, earlier chunks get the
# extra rows).
number_of_chunks = 3
row_position_chunks = np.array_split(np.arange(len(transformed_df)), number_of_chunks)
for idx, row_positions in enumerate(row_position_chunks):
    chunk = transformed_df.iloc[row_positions]
    chunk.to_csv(f'../Data/model_data/model-encoded-data-{idx}.csv', index=False)

In [81]:
# Verification: read the chunk files back into dataframes.
# The chunks are written by DataFrame.to_csv without an `encoding` argument,
# which means UTF-8. They must be read back as UTF-8 too: decoding them as
# Latin-1 would garble any non-ASCII text (for example in `country`).
files = ["../Data/model_data/model-encoded-data-0.csv", 
         "../Data/model_data/model-encoded-data-1.csv", 
         "../Data/model_data/model-encoded-data-2.csv"]
model_dfs = [pd.read_csv(f, sep=",", 
                          encoding='utf-8', 
                          low_memory=False ) for f in files]

# Combine the list of dataframes into one frame with a fresh 0..n-1 index.
model_data = pd.concat(model_dfs, ignore_index=True)

In [82]:
# List the column names of the re-assembled frame to confirm the renamed headers round-tripped.
model_data.columns.tolist()

['is_female',
 'is_male',
 'is_gender_non_confirming',
 'is_transgender',
 'is_gender_unknown',
 'is_age_18_to_24',
 'is_age_25_to34',
 'is_age_35_44',
 'is_age_45_54',
 'is_age_55_to_64',
 'is_age_above_65',
 'is_age_below_18',
 'is_company_size_1000_to_4999',
 'is_company_size_100_to_499',
 'is_company_size_5000_to_9999',
 'is_company_size_500_to_999',
 'is_company_size_unknown',
 'is_company_size_above_10000',
 'is_company_size_below_100',
 'is_desktop_os_linux',
 'is_desktop_os_mac',
 'is_desktop_os_unix',
 'is_desktop_os_windows',
 'is_lang_c#',
 'is_lang_c_c++',
 'is_lang_hadoop',
 'is_lang_html_css',
 'is_lang_java',
 'is_lang_javascript_nodejs',
 'is_lang_lua',
 'is_lang_objective_C',
 'is_lang_other',
 'is_lang_perl',
 'is_lang_php',
 'is_lang_powershell',
 'is_lang_python',
 'is_lang_r',
 'is_lang_ruby',
 'is_lang_scala',
 'is_lang_sql',
 'is_lang_swift',
 'is_lang_visual_basic',
 'is_occupation_c_suite',
 'is_occupation_data_analyst',
 'is_occupation_data_scientist',
 'is_oc

In [84]:
# Preview the first rows of the frame read back from the chunk files.
model_data.head()

Unnamed: 0,is_female,is_male,is_gender_non_confirming,is_transgender,is_gender_unknown,is_age_18_to_24,is_age_25_to34,is_age_35_44,is_age_45_54,is_age_55_to_64,...,is_occupation_web_developer,is_IT_expr_2_to_5,is_IT_expr_6_to_10,is_IT_expr_above_11,is_IT_expr_below_2,compensation,year,country,job_satisfaction,GDP
0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,above-160000,2019,United States,8,65094.79943
1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,above-160000,2019,United States,8,65094.79943
2,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,above-160000,2019,United States,8,65094.79943
3,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,above-160000,2019,United States,8,65094.79943
4,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,above-160000,2019,United States,8,65094.79943


In [85]:
# (rows, columns) of the re-assembled frame.
model_data.shape

(252740, 63)