In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# [Stack Overflow survey response data](https://insights.stackoverflow.com/survey/2018/#overview)

In [4]:
# Read the combined survey extract from the working directory
so_survey_df = pd.read_csv('./Combined_DS_v10.csv')

# Report (rows, columns) before previewing the data
print(so_survey_df.shape)

# Preview the first five records
so_survey_df.head(n=5)

(999, 11)


Unnamed: 0,SurveyDate,FormalEducation,ConvertedSalary,Hobby,Country,StackOverflowJobsRecommend,VersionControl,Age,Years Experience,Gender,RawSalary
0,2/28/18 20:20,Bachelor's degree (BA. BS. B.Eng.. etc.),,Yes,South Africa,,Git,21,13,Male,
1,6/28/18 13:26,Bachelor's degree (BA. BS. B.Eng.. etc.),70841.0,Yes,Sweeden,7.0,Git;Subversion,38,9,Male,70841.00
2,6/6/18 3:37,Bachelor's degree (BA. BS. B.Eng.. etc.),,No,Sweeden,8.0,Git,45,11,,
3,5/9/18 1:06,Some college/university study without earning ...,21426.0,Yes,Sweeden,,Zip file back-ups,46,12,Male,21426.00
4,4/12/18 22:41,Bachelor's degree (BA. BS. B.Eng.. etc.),41671.0,Yes,UK,8.0,Git,39,7,Male,"£41,671.00"


In [5]:
# Summarise every column: non-null count and dtype (shows which columns have gaps)
so_survey_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   SurveyDate                  999 non-null    object 
 1   FormalEducation             999 non-null    object 
 2   ConvertedSalary             665 non-null    float64
 3   Hobby                       999 non-null    object 
 4   Country                     999 non-null    object 
 5   StackOverflowJobsRecommend  487 non-null    float64
 6   VersionControl              999 non-null    object 
 7   Age                         999 non-null    int64  
 8   Years Experience            999 non-null    int64  
 9   Gender                      693 non-null    object 
 10  RawSalary                   665 non-null    object 
dtypes: float64(2), int64(2), object(7)
memory usage: 86.0+ KB


## How sparse is my data?

In [3]:
# Keep only the Age and Gender columns so their completeness can be compared
sub_df = so_survey_df.loc[:, ['Age', 'Gender']]

# Non-null counts per column reveal how sparse each one is
sub_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Age     999 non-null    int64 
 1   Gender  693 non-null    object
dtypes: int64(1), object(1)
memory usage: 15.7+ KB


## Finding the missing values

In [6]:
# Show the first ten records as they are stored
print(sub_df.head(n=10))

# Boolean mask for the same records: True where the value is missing
print(sub_df.head(n=10).isna())

# Complementary mask: True where a value is present
print(sub_df.head(n=10).notna())

   Age  Gender
0   21    Male
1   38    Male
2   45     NaN
3   46    Male
4   39    Male
5   39    Male
6   34    Male
7   24  Female
8   23    Male
9   36     NaN
     Age  Gender
0  False   False
1  False   False
2  False    True
3  False   False
4  False   False
5  False   False
6  False   False
7  False   False
8  False   False
9  False    True
    Age  Gender
0  True    True
1  True    True
2  True   False
3  True    True
4  True    True
5  True    True
6  True    True
7  True    True
8  True    True
9  True   False


## Listwise deletion

In [7]:
# Baseline (rows, columns) to compare against the results of the deletions below
so_survey_df.shape

(999, 11)

In [8]:
# Listwise deletion: build a new frame without any row that has at least one
# missing value (the original frame is left untouched)
no_missing_values_rows = so_survey_df.dropna(axis='index', how='any')

# How many complete rows remain
no_missing_values_rows.shape

(264, 11)

In [9]:
# Column-wise deletion: build a new frame without any column that has at
# least one missing value (the original frame is left untouched)
no_missing_values_cols = so_survey_df.dropna(axis='columns', how='any')

# How many fully populated columns remain
no_missing_values_cols.shape

(999, 7)

In [10]:
# Discard only the rows whose Gender is missing; gaps in other columns are
# tolerated. Note: despite its name, `no_gender` holds the rows that DO have
# a Gender value.
no_gender = so_survey_df.dropna(axis='index', subset=['Gender'])

# Rows left after the targeted deletion
no_gender.shape

(693, 11)

## Replacing missing values with constants

In [11]:
# Print the number of occurrences of each Gender response.
# dropna=True is the default, spelled out here: missing responses are not counted.
print(so_survey_df['Gender'].value_counts(dropna=True))

Male                                                                         632
Female                                                                        53
Female;Male                                                                    2
Transgender                                                                    2
Female;Male;Transgender;Non-binary. genderqueer. or gender non-conforming      1
Male;Non-binary. genderqueer. or gender non-conforming                         1
Female;Transgender                                                             1
Non-binary. genderqueer. or gender non-conforming                              1
Name: Gender, dtype: int64


In [13]:
# Replace missing Gender responses with an explicit 'Not Given' category.
# The filled Series is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: that form is chained in-place
# mutation, which pandas 2.x warns about and which leaves the DataFrame
# unchanged once Copy-on-Write is enabled.
so_survey_df['Gender'] = so_survey_df['Gender'].fillna('Not Given')

# The former gaps are now counted under 'Not Given'
print(so_survey_df['Gender'].value_counts())

Male                                                                         632
Not Given                                                                    306
Female                                                                        53
Female;Male                                                                    2
Transgender                                                                    2
Female;Transgender                                                             1
Female;Male;Transgender;Non-binary. genderqueer. or gender non-conforming      1
Male;Non-binary. genderqueer. or gender non-conforming                         1
Non-binary. genderqueer. or gender non-conforming                              1
Name: Gender, dtype: int64


## Filling continuous missing values

In [14]:
# Preview the first five StackOverflowJobsRecommend values; the double
# brackets keep the result a one-column DataFrame for rich display
so_survey_df[['StackOverflowJobsRecommend']].head(n=5)

Unnamed: 0,StackOverflowJobsRecommend
0,
1,7.0
2,8.0
3,
4,8.0


In [15]:
# Impute missing ratings with the mean of the observed ratings
# (Series.mean() skips NaN by default).
# The filled Series is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: that form is chained in-place
# mutation, which pandas 2.x warns about and which leaves the DataFrame
# unchanged once Copy-on-Write is enabled.
so_survey_df['StackOverflowJobsRecommend'] = so_survey_df['StackOverflowJobsRecommend'].fillna(
    so_survey_df['StackOverflowJobsRecommend'].mean())

# Confirm the gaps in the first rows have been filled
so_survey_df[['StackOverflowJobsRecommend']].head()

Unnamed: 0,StackOverflowJobsRecommend
0,7.061602
1,7.0
2,8.0
3,7.061602
4,8.0


In [16]:
# Round the ratings to zero decimal places so the mean-imputed values sit on
# the same whole-number scale as the observed ones
so_survey_df['StackOverflowJobsRecommend'] = (
    so_survey_df['StackOverflowJobsRecommend'].round()
)

# Preview the rounded column
so_survey_df[['StackOverflowJobsRecommend']].head(n=5)

Unnamed: 0,StackOverflowJobsRecommend
0,7.0
1,7.0
2,8.0
3,7.0
4,8.0


## Dealing with stray characters

In [19]:
# Strip the thousands separators from the raw salary strings.
# regex=False makes every replacement a literal substring match, independent
# of the pandas version's default for `regex`.
so_survey_df['RawSalary'] = so_survey_df['RawSalary'].str.replace(',', '', regex=False)

# Strip the dollar signs. '$' is a regex metacharacter (end-of-string anchor),
# so the literal match must be requested explicitly; this also avoids the
# FutureWarning that pandas 1.x raises for this pattern.
so_survey_df['RawSalary'] = so_survey_df['RawSalary'].str.replace('$', '', regex=False)

# Preview the partially cleaned column (still dtype object at this point)
so_survey_df['RawSalary'].head()

0          NaN
1     70841.00
2          NaN
3     21426.00
4    £41671.00
Name: RawSalary, dtype: object

In [20]:
# Trial conversion to numbers: errors='coerce' turns every entry that cannot
# be parsed into NaN instead of raising
numeric_vals = pd.to_numeric(so_survey_df['RawSalary'], errors='coerce')

# Boolean mask of the entries that came back as NaN (originally missing or
# not parseable)
idx = numeric_vals.isna()

# Show the raw values behind those entries to spot remaining stray characters
print(so_survey_df.loc[idx, 'RawSalary'])

0            NaN
2            NaN
4      £41671.00
6            NaN
8            NaN
         ...    
989          NaN
990          NaN
992          NaN
994          NaN
997          NaN
Name: RawSalary, Length: 401, dtype: object


In [21]:
# Drop the pound signs uncovered by the trial conversion
so_survey_df['RawSalary'] = so_survey_df['RawSalary'].str.replace(pat='£', repl='')

# With the stray characters gone, parse the column as numbers
# (no errors='coerce' here, so any remaining bad value raises)
so_survey_df['RawSalary'] = pd.to_numeric(so_survey_df['RawSalary'])

# Check the first values and the resulting dtype
print(so_survey_df['RawSalary'].head(n=5))

0        NaN
1    70841.0
2        NaN
3    21426.0
4    41671.0
Name: RawSalary, dtype: float64


## Method chaining

In [23]:
# Single-statement equivalent of the clean-up above: the three character
# replacements chained together, followed by a cast to float.
# NOTE(review): presumably left commented out because RawSalary has already
# been converted to a numeric dtype by the previous cell, so the .str
# accessor would no longer apply on a straight top-to-bottom run — confirm.
# so_survey_df['RawSalary'] = so_survey_df['RawSalary'].str.replace(
#     ',', '').str.replace('$', '').str.replace('£', '').astype(float)