In [1]:
import pandas as pd
import numpy as np
import math
import random

In [74]:
# Load the filtered master data set; the file is decoded as Latin-1 and read
# in a single pass (low_memory=False) rather than in internally chunked pieces.
master_data = pd.read_csv(
    "../../Data/filtered/master-data.csv",
    sep=",",
    encoding='Latin-1',
    low_memory=False,
)

In [75]:
# Preview the first five rows of the loaded frame.
master_data.head(n=5)

Unnamed: 0,compensation,year,country,age,gender,IT_experience_in_years,company_size,occupation,proficient_languages,desktop_OS,job_satisfaction
0,Student / Unemployed,2011,Africa,< 20,female,<2,Start Up (1-25),Web Application Developer,['JavaScript'],Linux,FML
1,,2011,Other Europe,25-29,male,41310,Mature Small Business (25-100),Server Programmer,"['SQL', 'C']",Windows 7,So happy it hurts
2,,2011,India,25-29,female,41435,Mid Sized (100-999),Server Programmer,"['JavaScript', 'SQL']",Linux,
3,Student / Unemployed,2011,Germany,< 20,female,41310,Student,Student,['Haskell'],Linux,I enjoy going to work
4,"$80,000 - $100,000",2011,Other Asia,35-39,male,11,Start Up (1-25),"Executive (VP of Eng, CTO, CIO, etc.)","['JavaScript', 'CSS', 'PHP', 'SQL', 'C++', 'C'...",Linux,It pays the bills


In [53]:
# Replace missing gender values with the placeholder label 'unknown' so that
# every entry in the column is a string before normalization.
master_data["gender"] = master_data["gender"].fillna(value='unknown')

In [63]:
def normalize_gender(row):
    """Collapse a raw gender response into one canonical lowercase label.

    The rules are checked in order and the first match wins, so a value that
    mentions several categories is classified by the earliest matching rule.

    Parameters
    ----------
    row : str
        A single raw value taken from the ``gender`` column.

    Returns
    -------
    str
        ``"non_conforming"``, ``"unknown"``, ``"transgender"``, ``"male"`` or
        ``"female"`` when one of the rules matches; otherwise the input value
        lowercased. Non-string input (e.g. NaN) yields ``"unknown"``.
    """
    # Missing values such as NaN are floats and cannot be substring-searched.
    if not isinstance(row, str):
        return "unknown"

    if "non-conforming" in row:
        gender = "non_conforming"
    elif "Other" in row:
        gender = "unknown"
    elif "Transgender" in row:
        gender = "Transgender"
    elif row in ["Male; Female", "Female;Male", "Woman;Man", "Man;Woman", "Man;Woman;Or, in your own words:"]:
        # Responses that selected both binary options are grouped with the
        # transgender category.
        gender = "Transgender"
    elif "Man" in row:
        # The match is case-sensitive, so "Woman" does not land here.
        gender = "Male"
    elif "Woman" in row:
        gender = "Female"
    elif ("Or, in your own words:" in row) or ("Prefer not to disclose" in row) or ("Prefer not to say" in row):
        gender = "unknown"
    else:
        # Already-canonical values ("male", "Female", ...) pass through.
        gender = row

    # Return the mapped label; returning ``row.lower()`` here would throw away
    # every mapping decided above.
    return gender.lower()

In [64]:
# Run every gender value through normalize_gender, element by element.
master_data["gender"] = master_data["gender"].map(normalize_gender)

In [66]:
# List the distinct gender labels that remain after normalization.
master_data["gender"].unique().tolist()

['female', 'male', 'unknown', 'non_conforming', 'transgender']

In [71]:
# Number of rows per gender label.
(
    master_data
    .groupby("gender")["gender"]
    .count()
)

gender
female             31339
male              385116
non_conforming      3712
transgender          782
unknown            74638
Name: gender, dtype: int64

In [73]:
# Persist the frame with the normalized gender column for data analysis.
# The row index is not written. Note that this is the same path the notebook
# loads its input from, so the input file is overwritten.
master_data.to_csv(
    "../../Data/filtered/master-data.csv",
    index=False,
)

In [84]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

In [85]:
# One-hot encode ``gender``; every other column is passed through unchanged.
transformer = make_column_transformer(
    (OneHotEncoder(), ['gender']),
    remainder='passthrough',
    verbose_feature_names_out=False)

transformed = transformer.fit_transform(master_data)

# ``ColumnTransformer.get_feature_names()`` was deprecated in scikit-learn 1.0
# and removed in 1.2, so the column labels are built explicitly instead.
# The output layout is the encoder's columns (one per fitted category, in
# ``categories_`` order) followed by the passthrough columns in their original
# order. The ``onehotencoder__x0_`` prefix reproduces the labels the removed
# method generated, which the rename cell further down keys on.
gender_encoder = transformer.named_transformers_["onehotencoder"]
encoded_columns = [
    "onehotencoder__x0_" + str(category)
    for category in gender_encoder.categories_[0]
]
passthrough_columns = [
    column for column in master_data.columns if column != "gender"
]

transformed_df = pd.DataFrame(
    transformed,
    columns=encoded_columns + passthrough_columns,
)



In [86]:
# Preview the first five rows of the encoded frame.
transformed_df.head(n=5)

Unnamed: 0,onehotencoder__x0_female,onehotencoder__x0_male,onehotencoder__x0_non_conforming,onehotencoder__x0_transgender,onehotencoder__x0_unknown,compensation,year,country,age,IT_experience_in_years,company_size,occupation,proficient_languages,desktop_OS,job_satisfaction
0,1.0,0.0,0.0,0.0,0.0,Student / Unemployed,2011,Africa,< 20,<2,Start Up (1-25),Web Application Developer,['JavaScript'],Linux,FML
1,0.0,1.0,0.0,0.0,0.0,,2011,Other Europe,25-29,41310,Mature Small Business (25-100),Server Programmer,"['SQL', 'C']",Windows 7,So happy it hurts
2,1.0,0.0,0.0,0.0,0.0,,2011,India,25-29,41435,Mid Sized (100-999),Server Programmer,"['JavaScript', 'SQL']",Linux,
3,1.0,0.0,0.0,0.0,0.0,Student / Unemployed,2011,Germany,< 20,41310,Student,Student,['Haskell'],Linux,I enjoy going to work
4,0.0,1.0,0.0,0.0,0.0,"$80,000 - $100,000",2011,Other Asia,35-39,11,Start Up (1-25),"Executive (VP of Eng, CTO, CIO, etc.)","['JavaScript', 'CSS', 'PHP', 'SQL', 'C++', 'C'...",Linux,It pays the bills


In [87]:
# Give the one-hot gender indicator columns readable names. ``rename`` returns
# a new frame that is bound back to the same name, instead of mutating the
# existing frame with ``inplace=True``.
# NOTE(review): 'is_gender_non_confirming' looks like a misspelling of
# "non_conforming"; it is kept as-is because the name ends up in the encoded
# CSV -- confirm with downstream consumers before changing it.
transformed_df = transformed_df.rename(
    columns={
        'onehotencoder__x0_female': 'is_female',
        'onehotencoder__x0_male': 'is_male',
        'onehotencoder__x0_non_conforming': 'is_gender_non_confirming',
        'onehotencoder__x0_transgender': 'is_transgender',
        'onehotencoder__x0_unknown': 'is_gender_unknown',
    }
)

In [90]:
# Persist the one-hot encoded frame for model building; the row index is not
# written to the file.
transformed_df.to_csv(
    "../../Data/filtered/model-encoded-data.csv",
    index=False,
)