In [9]:
import pandas as pd
import numpy as np
import math
import random
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

In [22]:
# Load the filtered master dataset.
# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
master_csv_path = "../Data/filtered/master-data.csv"
master_data = pd.read_csv(
    master_csv_path,
    sep=",",
    encoding='Latin-1',
    low_memory=False,
)

In [23]:
transformer = make_column_transformer(
    (OneHotEncoder(), ['gender', 'age']),
    remainder='passthrough',
    verbose_feature_names_out=False)

transformed = transformer.fit_transform(master_data)
transformed_df = pd.DataFrame(
    transformed, 
    columns=transformer.get_feature_names()
)



In [24]:
transformed_df.rename(columns={'onehotencoder__x0_female': 'is_female', 
                               'onehotencoder__x0_male': 'is_male',
                              'onehotencoder__x0_non_conforming': 'is_gender_non_confirming',
                              'onehotencoder__x0_transgender': 'is_transgender',
                              'onehotencoder__x0_unknown': 'is_gender_unknown',
                              'onehotencoder__x1_18_to_24': 'is_age_18_to_24',
                              'onehotencoder__x1_25_to_34': 'is_age_25_to34',
                              'onehotencoder__x1_35_to_44': 'is_age_35_44',
                              'onehotencoder__x1_45_to_54': 'is_age_45_54',
                              'onehotencoder__x1_55_to_64': 'is_age_55_to_64',
                              'onehotencoder__x1_above_65': 'is_age_above_65',
                              'onehotencoder__x1_below_18': 'is_age_below_18'}, inplace=True)

In [25]:
# Preview the first rows of the encoded frame to check the renamed columns.
transformed_df.head()

Unnamed: 0,is_female,is_male,is_gender_non_confirming,is_transgender,is_gender_unknown,is_age_18_to_24,is_age_25_to34,is_age_35_44,is_age_45_54,is_age_55_to_64,...,is_age_below_18,compensation,year,country,IT_experience_in_years,company_size,occupation,proficient_languages,desktop_OS,job_satisfaction
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,Student / Unemployed,2011,Africa,<2,Start Up (1-25),Web Application Developer,['JavaScript'],Linux,FML
1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,,2011,Other Europe,41310,Mature Small Business (25-100),Server Programmer,"['SQL', 'C']",Windows 7,So happy it hurts
2,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,,2011,India,41435,Mid Sized (100-999),Server Programmer,"['JavaScript', 'SQL']",Linux,
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,Student / Unemployed,2011,Germany,41310,Student,Student,['Haskell'],Linux,I enjoy going to work
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,"$80,000 - $100,000",2011,Other Asia,11,Start Up (1-25),"Executive (VP of Eng, CTO, CIO, etc.)","['JavaScript', 'CSS', 'PHP', 'SQL', 'C++', 'C'...",Linux,It pays the bills


In [26]:
# Show the column labels of the encoded frame.
transformed_df.columns

Index(['is_female', 'is_male', 'is_gender_non_confirming', 'is_transgender',
       'is_gender_unknown', 'is_age_18_to_24', 'is_age_25_to34',
       'is_age_35_44', 'is_age_45_54', 'is_age_55_to_64', 'is_age_above_65',
       'is_age_below_18', 'compensation', 'year', 'country',
       'IT_experience_in_years', 'company_size', 'occupation',
       'proficient_languages', 'desktop_OS', 'job_satisfaction'],
      dtype='object')

In [31]:
# Writes the encoded data for model building, split row-wise into
# `number_of_chunks` CSV files named model-encoded-data-<idx>.csv.
number_of_chunks = 3
# Split the row *positions* rather than the DataFrame itself: handing a
# DataFrame to np.array_split routes through DataFrame.swapaxes, which pandas
# has deprecated. Splitting an index range yields the same chunk boundaries
# (the first `len % number_of_chunks` chunks get one extra row).
row_positions = np.array_split(np.arange(len(transformed_df)), number_of_chunks)
for idx, rows in enumerate(row_positions):
    chunk = transformed_df.iloc[rows]
    chunk.to_csv(f'../Data/model_data/model-encoded-data-{idx}.csv', index=False)

In [33]:
# Load each exported chunk back into its own DataFrame.
files = [
    "../Data/model_data/model-encoded-data-0.csv",
    "../Data/model_data/model-encoded-data-1.csv",
    "../Data/model_data/model-encoded-data-2.csv",
]
model_dfs = []
for csv_path in files:
    chunk_df = pd.read_csv(csv_path, sep=",", encoding='Latin-1', low_memory=False)
    model_dfs.append(chunk_df)

# Stack the chunks in file order and renumber the rows from zero.
model_data = pd.concat(model_dfs, ignore_index=True)

In [35]:
# List the column names of the re-assembled model data as a plain Python list.
model_data.columns.tolist()

['is_female',
 'is_male',
 'is_gender_non_confirming',
 'is_transgender',
 'is_gender_unknown',
 'is_age_18_to_24',
 'is_age_25_to34',
 'is_age_35_44',
 'is_age_45_54',
 'is_age_55_to_64',
 'is_age_above_65',
 'is_age_below_18',
 'compensation',
 'year',
 'country',
 'IT_experience_in_years',
 'company_size',
 'occupation',
 'proficient_languages',
 'desktop_OS',
 'job_satisfaction']