In [50]:
import pandas as pd
import numpy as np
import math
import random
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

In [51]:
# Load the filtered survey data. The file is decoded as Latin-1 and read in a
# single pass (low_memory=False) rather than in internally chunked pieces.
# NOTE(review): the preview of this frame shows IT_experience_in_years values
# such as 41310 and 41435 next to 11 -- these look like spreadsheet date
# serials rather than years; confirm against the upstream filtering step.
read_options = dict(sep=",", encoding='Latin-1', low_memory=False)
master_data = pd.read_csv("../Data/filtered/master-data.csv", **read_options)

In [52]:
# Preview the first five rows of the loaded survey data.
master_data.head(n=5)

Unnamed: 0,compensation,year,country,age,gender,IT_experience_in_years,company_size,occupation,proficient_languages,desktop_OS,job_satisfaction
0,80000-100000,2011,Other Asia,35_to_44,male,11,below_100,"Executive (VP of Eng, CTO, CIO, etc.)","['JavaScript', 'CSS', 'PHP', 'SQL', 'C++', 'C'...",Linux,It pays the bills
1,20000-40000,2011,United States of America,18_to_24,male,41310,below_100,"Executive (VP of Eng, CTO, CIO, etc.)","['JavaScript', 'CSS', 'PHP', 'Python', 'SQL', ...",Windows 7,It pays the bills
2,80000-100000,2011,United States of America,18_to_24,male,41435,1000_to_4999,Web Application Developer,"['JavaScript', 'CSS', 'PHP', 'SQL']",Linux,I enjoy going to work
3,80000-100000,2011,Germany,35_to_44,male,11,500_to_999,Desktop Application Developer,['C#'],Windows 7,I enjoy going to work
4,60000-80000,2011,United Kingdom,35_to_44,male,11,500_to_999,IT Staff / System Administrator,"['SQL', 'C#', 'C++', 'C']",Windows 7,I enjoy going to work


In [53]:
# One-hot encode the categorical demographic columns. Every other column is
# passed through unchanged and placed after the encoded columns.
encoded_columns = ['gender', 'age', 'company_size']

transformer = make_column_transformer(
    (OneHotEncoder(), encoded_columns),
    remainder='passthrough',
    verbose_feature_names_out=False)

transformed = transformer.fit_transform(master_data)

# ColumnTransformer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2, so the labels are rebuilt from the fitted encoder instead.
# They deliberately keep the legacy "onehotencoder__x<i>_<category>" form,
# because the rename in the next cell is keyed on exactly those labels.
fitted_encoder = transformer.named_transformers_['onehotencoder']
encoded_names = [
    f'onehotencoder__x{position}_{category}'
    for position, categories in enumerate(fitted_encoder.categories_)
    for category in categories
]
# Passthrough columns keep their original names and their original order.
passthrough_names = [
    column for column in master_data.columns if column not in encoded_columns
]

transformed_df = pd.DataFrame(
    transformed,
    columns=encoded_names + passthrough_names
)



In [54]:
# Replace the generated one-hot labels with readable indicator names, grouped
# by the source column they were encoded from (x0 = gender, x1 = age,
# x2 = company_size). Labels not present in the frame are ignored by rename.
# NOTE(review): several target names are spelled inconsistently
# ('is_gender_non_confirming', 'is_age_25_to34', 'is_age_35_44',
# 'is_age_45_54'). They are kept exactly as they are because they end up in
# the written CSV files -- confirm with downstream readers before changing.
gender_labels = {
    'onehotencoder__x0_female': 'is_female',
    'onehotencoder__x0_male': 'is_male',
    'onehotencoder__x0_non_conforming': 'is_gender_non_confirming',
    'onehotencoder__x0_transgender': 'is_transgender',
    'onehotencoder__x0_unknown': 'is_gender_unknown',
}
age_labels = {
    'onehotencoder__x1_18_to_24': 'is_age_18_to_24',
    'onehotencoder__x1_25_to_34': 'is_age_25_to34',
    'onehotencoder__x1_35_to_44': 'is_age_35_44',
    'onehotencoder__x1_45_to_54': 'is_age_45_54',
    'onehotencoder__x1_55_to_64': 'is_age_55_to_64',
    'onehotencoder__x1_above_65': 'is_age_above_65',
    'onehotencoder__x1_below_18': 'is_age_below_18',
}
company_size_labels = {
    'onehotencoder__x2_below_100': 'is_company_size_below_100',
    'onehotencoder__x2_100_to_499': 'is_company_size_100_to_499',
    'onehotencoder__x2_500_to_999': 'is_company_size_500_to_999',
    'onehotencoder__x2_1000_to_4999': 'is_company_size_1000_to_4999',
    'onehotencoder__x2_5000_to_9999': 'is_company_size_5000_to_9999',
    'onehotencoder__x2_above_10000': 'is_company_size_above_10000',
    'onehotencoder__x2_Unknown': 'is_company_size_unknown',
}

transformed_df.rename(
    columns={**gender_labels, **age_labels, **company_size_labels},
    inplace=True,
)

In [55]:
# List every column label after the rename to confirm the new names.
list(transformed_df.columns)

['is_female',
 'is_male',
 'is_gender_non_confirming',
 'is_transgender',
 'is_gender_unknown',
 'is_age_18_to_24',
 'is_age_25_to34',
 'is_age_35_44',
 'is_age_45_54',
 'is_age_55_to_64',
 'is_age_above_65',
 'is_age_below_18',
 'is_company_size_1000_to_4999',
 'is_company_size_100_to_499',
 'is_company_size_5000_to_9999',
 'is_company_size_500_to_999',
 'is_company_size_unknown',
 'is_company_size_above_10000',
 'is_company_size_below_100',
 'compensation',
 'year',
 'country',
 'IT_experience_in_years',
 'occupation',
 'proficient_languages',
 'desktop_OS',
 'job_satisfaction']

In [56]:
# Preview the first five rows of the encoded frame.
transformed_df.head(n=5)

Unnamed: 0,is_female,is_male,is_gender_non_confirming,is_transgender,is_gender_unknown,is_age_18_to_24,is_age_25_to34,is_age_35_44,is_age_45_54,is_age_55_to_64,...,is_company_size_above_10000,is_company_size_below_100,compensation,year,country,IT_experience_in_years,occupation,proficient_languages,desktop_OS,job_satisfaction
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,80000-100000,2011,Other Asia,11,"Executive (VP of Eng, CTO, CIO, etc.)","['JavaScript', 'CSS', 'PHP', 'SQL', 'C++', 'C'...",Linux,It pays the bills
1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,20000-40000,2011,United States of America,41310,"Executive (VP of Eng, CTO, CIO, etc.)","['JavaScript', 'CSS', 'PHP', 'Python', 'SQL', ...",Windows 7,It pays the bills
2,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,80000-100000,2011,United States of America,41435,Web Application Developer,"['JavaScript', 'CSS', 'PHP', 'SQL']",Linux,I enjoy going to work
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,80000-100000,2011,Germany,11,Desktop Application Developer,['C#'],Windows 7,I enjoy going to work
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,60000-80000,2011,United Kingdom,11,IT Staff / System Administrator,"['SQL', 'C#', 'C++', 'C']",Windows 7,I enjoy going to work


In [57]:
# Writes the encoded data for model building, split row-wise into
# `number_of_chunks` CSV files of near-equal size.
number_of_chunks = 3

# Split the row positions rather than the DataFrame itself: calling
# np.array_split on a DataFrame goes through DataFrame.swapaxes, which is
# deprecated since pandas 2.1. The chunk boundaries are the same either way.
row_groups = np.array_split(np.arange(len(transformed_df)), number_of_chunks)

for idx, rows in enumerate(row_groups):
    chunk = transformed_df.iloc[rows]
    # UTF-8 is already the to_csv default; it is spelled out so that readers
    # of these files know which encoding to use.
    chunk.to_csv(f'../Data/model_data/model-encoded-data-{idx}.csv',
                 index=False,
                 encoding='utf-8')

In [58]:
# Verification: read the chunk files back into dataframes and check that they
# reproduce the encoded frame.
# The file list follows number_of_chunks so it cannot drift from the writer.
files = [f"../Data/model_data/model-encoded-data-{chunk_idx}.csv"
         for chunk_idx in range(number_of_chunks)]

# The chunks are written by DataFrame.to_csv, which encodes as UTF-8 by
# default. Reading them back as Latin-1 never fails but silently garbles any
# non-ASCII characters, so the files are decoded as UTF-8 here.
model_dfs = [pd.read_csv(f, sep=",",
                          encoding='utf-8',
                          low_memory=False ) for f in files]

# combine the list of dataframes
model_data = pd.concat(model_dfs, ignore_index=True)

# Actual checks: the round trip must keep every row and every column label.
assert len(model_data) == len(transformed_df), \
    "row count changed between the encoded frame and the chunk files"
assert model_data.columns.tolist() == transformed_df.columns.tolist(), \
    "column labels changed between the encoded frame and the chunk files"

In [59]:
# List the column labels of the frame rebuilt from the chunk files.
list(model_data.columns)

['is_female',
 'is_male',
 'is_gender_non_confirming',
 'is_transgender',
 'is_gender_unknown',
 'is_age_18_to_24',
 'is_age_25_to34',
 'is_age_35_44',
 'is_age_45_54',
 'is_age_55_to_64',
 'is_age_above_65',
 'is_age_below_18',
 'is_company_size_1000_to_4999',
 'is_company_size_100_to_499',
 'is_company_size_5000_to_9999',
 'is_company_size_500_to_999',
 'is_company_size_unknown',
 'is_company_size_above_10000',
 'is_company_size_below_100',
 'compensation',
 'year',
 'country',
 'IT_experience_in_years',
 'occupation',
 'proficient_languages',
 'desktop_OS',
 'job_satisfaction']

In [60]:
# Preview the first five rows of the frame rebuilt from the chunk files.
model_data.head(n=5)

Unnamed: 0,is_female,is_male,is_gender_non_confirming,is_transgender,is_gender_unknown,is_age_18_to_24,is_age_25_to34,is_age_35_44,is_age_45_54,is_age_55_to_64,...,is_company_size_above_10000,is_company_size_below_100,compensation,year,country,IT_experience_in_years,occupation,proficient_languages,desktop_OS,job_satisfaction
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,80000-100000,2011,Other Asia,11,"Executive (VP of Eng, CTO, CIO, etc.)","['JavaScript', 'CSS', 'PHP', 'SQL', 'C++', 'C'...",Linux,It pays the bills
1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,20000-40000,2011,United States of America,41310,"Executive (VP of Eng, CTO, CIO, etc.)","['JavaScript', 'CSS', 'PHP', 'Python', 'SQL', ...",Windows 7,It pays the bills
2,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,80000-100000,2011,United States of America,41435,Web Application Developer,"['JavaScript', 'CSS', 'PHP', 'SQL']",Linux,I enjoy going to work
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,80000-100000,2011,Germany,11,Desktop Application Developer,['C#'],Windows 7,I enjoy going to work
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,60000-80000,2011,United Kingdom,11,IT Staff / System Administrator,"['SQL', 'C#', 'C++', 'C']",Windows 7,I enjoy going to work
