# Data Preprocessing - 1

In [1]:
import pandas as pd
import numpy as np

Data will be preprocessed column by column.

In [2]:
# Read the combined survey data from the shared datasets folder
raw_data_path = './../../../datasets/data.csv'
df = pd.read_csv(raw_data_path)

After manually checking the data, some inconsistencies were found that need to be addressed before exploring the data.

In [3]:
# Some utility functions

def get_value_counts(series: pd.Series, dropna: bool = True) -> pd.DataFrame:
    """Tabulate how often each distinct value occurs in ``series``.

    Parameters
    ----------
    series : pd.Series
        Column whose values are counted.
    dropna : bool, default True
        Forwarded to ``Series.value_counts``. The default keeps the original
        behaviour (missing values are left out of the table); pass ``False``
        to get a row for missing values as well.

    Returns
    -------
    pd.DataFrame
        Two columns, ``'Values'`` and ``'Counts'``, ordered from the most to
        the least frequent value.
    """
    counts = series.value_counts(dropna=dropna)
    # Name the index and the counts explicitly instead of overwriting the
    # column labels positionally after reset_index().
    return counts.rename_axis('Values').reset_index(name='Counts')

## Columns with true or false values

Some columns contain 1.0 and 0.0 as well as true and false as values. It is better to convert all of them to numerical values.

In [4]:
# Turn textual flags into floats: first the 'True'/'False' strings, then the
# '1.0'/'0.0' strings, in the same order as two separate passes.
bool_text_to_float = {'True': 1.0, 'False': 0.0}
numeric_text_to_float = {'1.0': 1.0, '0.0': 0.0}
df = df.replace(bool_text_to_float).replace(numeric_text_to_float)

## are you self-employed?

In [5]:
# Frequency table for the self-employment flag
self_employed_col = 'are you self-employed?'
get_value_counts(df[self_employed_col])

Unnamed: 0,Values,Counts
0,0,1308
1,1,217


There is no discrepancy in this column.

## how many employees does your company or organization have?

In [6]:
# Frequency table for the company-size buckets
company_size_col = 'how many employees does your company or organization have?'
get_value_counts(df[company_size_col])

Unnamed: 0,Values,Counts
0,More than 1000,374
1,100-500,364
2,26-100,243
3,Jun-25,155
4,500-1000,106
5,6-25,34
6,01-May,25
7,1-5,7


Following are the issues:
1. '6-25' is also represented by 'Jun-25'
2. '1-5' is also represented by '01-May'

In [7]:
# Map the two date-looking labels back to the size buckets they stand for.
# Series.replace with a dict matches whole cell values only, so it cannot
# rewrite a substring of some other label and does not depend on the regex
# default of .str.replace, which differs between pandas versions.
company_size_col = 'how many employees does your company or organization have?'
df[company_size_col] = df[company_size_col].replace({'Jun-25': '6-25', '01-May': '1-5'})

## what is your gender?

In [8]:
# Lower-case the free-text answers, then trim surrounding whitespace
gender_col = 'what is your gender?'
df[gender_col] = df[gender_col].str.lower().str.strip()

In [9]:
# Frequency table for the lower-cased, stripped gender answers
gender_col = 'what is your gender?'
get_value_counts(df[gender_col])

Unnamed: 0,Values,Counts
0,male,859
1,female,350
2,m,102
3,f,55
4,woman,16
...,...,...
76,cisgender male,1
77,"male (or female, or both)",1
78,contextual,1
79,genderqueer demigirl,1


There is a lot of noise in this column.

In [10]:
# List every distinct answer so each spelling can be assigned to a category
gender_answers = df['what is your gender?']
gender_answers.unique()

array(['female', 'male', 'f', 'm', 'man', nan, 'genderfluid', 'nonbinary',
       'cis-male', 'mail', 'woman', 'male/androgynous', 'cis hetero male',
       'femalw', 'femail', 'male (cis)', 'uhhhhhhhhh fem genderqueer?',
       "male (hey this is the tech industry you're talking about)",
       'female (cis)', 'god king of the valajar', 'cis male',
       'my sex is female.', 'non-binary', 'male, cis',
       'agender/genderfluid', 'male-ish', 'sometimes',
       'female (cisgender)', 'woman-identified', 'contextual',
       'non binary', 'genderqueer demigirl', 'genderqueer/non-binary',
       'cis-female', 'cis female', 'f, cisgender', 'female-ish', '\\-',
       'trans woman', 'transfeminine', 'none', 'dude', 'ostensibly male',
       'agender', 'male, born with xy chromosoms', 'malel',
       'i identify as female', '*shrug emoji* (f)',
       'male (or female, or both)', 'trans man', 'transgender',
       'female/gender non-binary.', 'cis woman', 'cisgendered woman',
       'gend

In [11]:
# Lookup lists of gender answers, written in lower case without surrounding
# whitespace. Membership is tested with `in`, so entries must match exactly.

# Answers that are treated as 'male'
male_list = [
    'male', 'm', 'man', 'mail',
    'male/androgynous', 'cis-male', 'male (cis)',
    "male (hey this is the tech industry you're talking about)",
    'god king of the valajar', 'cis male', 'male, cis', 'male-ish',
    'dude', 'male, born with xy chromosoms', 'malel', 'demiguy',
    'cisgender male',
    "let's keep it simple and say \"male\"",
    'identify as male', 'masculine', 'make', 'cishet male',
    'i have a penis', 'masculino',
]

# Answers that are treated as 'female'
female_list = [
    'female', 'f', 'woman', 'femalw', 'femail',
    'female (cis)', 'my sex is female.', 'female (cisgender)',
    'woman-identified', 'cis-female', 'cis female', 'f, cisgender',
    'female-ish', 'i identify as female', '*shrug emoji* (f)',
    'cis woman', 'cisgendered woman', 'she/her/they/them',
    'cisgender female', 'female-identified', 'femmina', 'femile',
]

# Answers that belong to neither list above.
# NOTE(review): 'cis hetero male' is listed here while 'cishet male' is in
# male_list - confirm this placement is intended.
other_list = [
    'genderfluid', 'nonbinary', 'cis hetero male',
    'uhhhhhhhhh fem genderqueer?', 'non-binary', 'agender/genderfluid',
    'sometimes', 'contextual', 'non binary', 'genderqueer demigirl',
    'genderqueer/non-binary', '\\-', 'transwoman', 'transfeminine',
    'none', 'ostensibly male', 'agender', 'male (or female, or both)',
    'trans man', 'transgender', 'female/gender non-binary.',
    'genderqueer', 'trans female', 'other', 'swm', 'nb',
    'nonbinary/femme', 'gender non-conforming woman', 'i am a wookie',
    'questioning', 'rr', 'agender trans woman', '43',
    'trans non-binary/genderfluid', 'non-binary and gender fluid',
]

In [12]:
def _normalize_gender(answer):
    """Collapse one gender answer to 'male', 'female' or 'other'."""
    if answer in male_list:
        return 'male'
    if answer in female_list:
        return 'female'
    # Everything not in the two lists lands here, including missing values.
    # NOTE(review): confirm that treating missing answers as 'other' is intended.
    return 'other'


gender_col = 'what is your gender?'
df[gender_col] = df[gender_col].apply(_normalize_gender)

In [13]:
# Frequency table for the gender column in its current state
gender_col = 'what is your gender?'
get_value_counts(df[gender_col])

Unnamed: 0,Values,Counts
0,male,1002
1,female,446
2,other,77


In [14]:
# Save the current frame; index=False keeps the row index out of the CSV
consistent_data_path = './../../../datasets/consistent_data.csv'
df.to_csv(consistent_data_path, index=False)

## Segregation based on employment type

In [15]:
# Split the rows on the self-employment flag: 0 goes to ce_df, 1 goes to se_df
self_employed_flag = df['are you self-employed?']
ce_df = df[self_employed_flag == 0]
se_df = df[self_employed_flag == 1]

In [16]:
# Write each subset to its own CSV, leaving the row index out of the file
for subset, csv_path in (
    (ce_df, './../../../datasets/consistent_ce_data.csv'),
    (se_df, './../../../datasets/consistent_se_data.csv'),
):
    subset.to_csv(csv_path, index=False)