In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# [Stackoverflow survey response data](https://insights.stackoverflow.com/survey/2018/#overview)

In [2]:
so_survey_df = pd.read_csv('./Combined_DS_v10.csv')
print(so_survey_df.shape)
so_survey_df.head()

(999, 11)


Unnamed: 0,SurveyDate,FormalEducation,ConvertedSalary,Hobby,Country,StackOverflowJobsRecommend,VersionControl,Age,Years Experience,Gender,RawSalary
0,2/28/18 20:20,Bachelor's degree (BA. BS. B.Eng.. etc.),,Yes,South Africa,,Git,21,13,Male,
1,6/28/18 13:26,Bachelor's degree (BA. BS. B.Eng.. etc.),70841.0,Yes,Sweeden,7.0,Git;Subversion,38,9,Male,70841.00
2,6/6/18 3:37,Bachelor's degree (BA. BS. B.Eng.. etc.),,No,Sweeden,8.0,Git,45,11,,
3,5/9/18 1:06,Some college/university study without earning ...,21426.0,Yes,Sweeden,,Zip file back-ups,46,12,Male,21426.00
4,4/12/18 22:41,Bachelor's degree (BA. BS. B.Eng.. etc.),41671.0,Yes,UK,8.0,Git,39,7,Male,"£41,671.00"


In [3]:
so_survey_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   SurveyDate                  999 non-null    object 
 1   FormalEducation             999 non-null    object 
 2   ConvertedSalary             665 non-null    float64
 3   Hobby                       999 non-null    object 
 4   Country                     999 non-null    object 
 5   StackOverflowJobsRecommend  487 non-null    float64
 6   VersionControl              999 non-null    object 
 7   Age                         999 non-null    int64  
 8   Years Experience            999 non-null    int64  
 9   Gender                      693 non-null    object 
 10  RawSalary                   665 non-null    object 
dtypes: float64(2), int64(2), object(7)
memory usage: 86.0+ KB


## Selecting specific data types

In [4]:
# create a subset of only the numeric columns
so_numeric_df = so_survey_df.select_dtypes(include=['number'])

# print the column names contained in the subset
print(so_numeric_df.columns)

Index(['ConvertedSalary', 'StackOverflowJobsRecommend', 'Age',
       'Years Experience'],
      dtype='object')


## One-hot encoding and dummy variables

In [6]:
# convert the Country column to a one hot encoded Data Frame
one_hot_encoded = pd.get_dummies(so_survey_df,
                                 columns=['Country'],
                                 prefix='OH')

# print col names
print(one_hot_encoded.columns)

Index(['SurveyDate', 'FormalEducation', 'ConvertedSalary', 'Hobby',
       'StackOverflowJobsRecommend', 'VersionControl', 'Age',
       'Years Experience', 'Gender', 'RawSalary', 'OH_France', 'OH_India',
       'OH_Ireland', 'OH_Russia', 'OH_South Africa', 'OH_Spain', 'OH_Sweeden',
       'OH_UK', 'OH_USA', 'OH_Ukraine'],
      dtype='object')


In [7]:
# create dummy variables for the Country columns
dummy = pd.get_dummies(so_survey_df,
                       columns=['Country'],
                       drop_first=True,
                       prefix='DM')

# print col names
print(dummy.columns)

Index(['SurveyDate', 'FormalEducation', 'ConvertedSalary', 'Hobby',
       'StackOverflowJobsRecommend', 'VersionControl', 'Age',
       'Years Experience', 'Gender', 'RawSalary', 'DM_India', 'DM_Ireland',
       'DM_Russia', 'DM_South Africa', 'DM_Spain', 'DM_Sweeden', 'DM_UK',
       'DM_USA', 'DM_Ukraine'],
      dtype='object')


## Dealing with uncommon categories

In [13]:
# create a series out of the country column
countries = so_survey_df.loc[:, 'Country']

# get the counts of each category
country_counts = countries.value_counts()
print(f'raw:\n{country_counts}') 
    
# create a mask for only categories that occur less than 10 times
mask = countries.isin(country_counts[country_counts < 10].index)

# label all other categories as Other
countries[mask] = 'Other'

# print the updated counts
print(f'processed:\n{countries.value_counts()}')

raw:
South Africa    166
USA             164
Spain           134
Sweeden         119
France          115
Russia           97
UK               95
India            95
Other            14
Name: Country, dtype: int64
processed:
South Africa    166
USA             164
Spain           134
Sweeden         119
France          115
Russia           97
UK               95
India            95
Other            14
Name: Country, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


## Binarizing columns

In [14]:
# impute ConvertedSalary column with 0's
so_survey_df['ConvertedSalary'].fillna(0, inplace=True)

In [16]:
# create a Paid_Job column filled with zeros
so_survey_df['Paid_Job'] = 0

# replace all the Paid_Job values where ConvertedSalary > 0
so_survey_df.loc[so_survey_df['ConvertedSalary'] > 0, 'Paid_Job'] = 1
so_survey_df[['Paid_Job', 'ConvertedSalary']].head()

Unnamed: 0,Paid_Job,ConvertedSalary
0,0,0.0
1,1,70841.0
2,0,0.0
3,1,21426.0
4,1,41671.0


## Binning values

In [20]:
# bin the continuous variable ConvertedSalary into 5 bins
so_survey_df['equal_binned'] = pd.cut(so_survey_df['ConvertedSalary'], 5)
so_survey_df[['equal_binned', 'ConvertedSalary']].head()

Unnamed: 0,equal_binned,ConvertedSalary
0,"(-2000.0, 400000.0]",0.0
1,"(-2000.0, 400000.0]",70841.0
2,"(-2000.0, 400000.0]",0.0
3,"(-2000.0, 400000.0]",21426.0
4,"(-2000.0, 400000.0]",41671.0


In [22]:
# this time specify the bins
bins = [-np.inf, 10000, 50000, 100000, 150000, np.inf]

# bin labels
labels = ['Very low', 'Low', 'Medium', 'High', 'Very high']

# bin the column
so_survey_df['boundary_binned'] = pd.cut(so_survey_df['ConvertedSalary'],
                                         bins=bins,
                                         labels=labels)
so_survey_df[['boundary_binned', 'ConvertedSalary']].head()

Unnamed: 0,boundary_binned,ConvertedSalary
0,Very low,0.0
1,Medium,70841.0
2,Very low,0.0
3,Low,21426.0
4,Low,41671.0
