In [1]:
import sys,os
sys.path.append(os.path.abspath(os.path.join('..', 'src')))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from utils import *


%matplotlib inline

# Designing the Perfect IT Company
## An exploratory analysis of Stack Overflow’s 2020 survey

Part 1 - Understanding and Preparing the Data

## Business Understanding

We are going to use Stack Overflow's 2020 survey dataset, which is available [here](https://insights.stackoverflow.com/survey/). This dataset contains questions on developer experience, from career satisfaction and job search to education and opinions on open source software. We are interested in job satisfaction and everything related to it.

With this data, we'd like to answer the following questions:
* What do developers look for in a company?
* Are salary or languages/technologies/frameworks effective enough to solve job satisfaction issues?
* What other hidden factors affect job satisfaction?

## Data Understanding

**Access and Explore Data and Schema**

In [2]:
# Load the survey schema (survey column name -> question text) and preview it.
schema_csv_path = "../data/raw/survey_results_schema.csv"
df_schema = pd.read_csv(schema_csv_path)
df_schema.head()

Unnamed: 0,Column,QuestionText
0,Respondent,Randomized respondent ID number (not in order ...
1,MainBranch,Which of the following options best describes ...
2,Hobbyist,Do you code as a hobby?
3,Age,What is your age (in years)? If you prefer not...
4,Age1stCode,At what age did you write your first line of c...


In [3]:
# Summarise the schema frame: number of entries, column names,
# non-null counts and dtypes.
df_schema.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61 entries, 0 to 60
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Column        61 non-null     object
 1   QuestionText  61 non-null     object
dtypes: object(2)
memory usage: 1.1+ KB


In [4]:
# Load the public survey responses (one row per respondent) and preview them.
survey_csv_path = "../data/raw/survey_results_public.csv"
df = pd.read_csv(survey_csv_path)
df.head()

Unnamed: 0,Respondent,MainBranch,Hobbyist,Age,Age1stCode,CompFreq,CompTotal,ConvertedComp,Country,CurrencyDesc,...,SurveyEase,SurveyLength,Trans,UndergradMajor,WebframeDesireNextYear,WebframeWorkedWith,WelcomeChange,WorkWeekHrs,YearsCode,YearsCodePro
0,1,I am a developer by profession,Yes,,13,Monthly,,,Germany,European Euro,...,Neither easy nor difficult,Appropriate in length,No,"Computer science, computer engineering, or sof...",ASP.NET Core,ASP.NET;ASP.NET Core,Just as welcome now as I felt last year,50.0,36,27.0
1,2,I am a developer by profession,No,,19,,,,United Kingdom,Pound sterling,...,,,,"Computer science, computer engineering, or sof...",,,Somewhat more welcome now than last year,,7,4.0
2,3,I code primarily as a hobby,Yes,,15,,,,Russian Federation,,...,Neither easy nor difficult,Appropriate in length,,,,,Somewhat more welcome now than last year,,4,
3,4,I am a developer by profession,Yes,25.0,18,,,,Albania,Albanian lek,...,,,No,"Computer science, computer engineering, or sof...",,,Somewhat less welcome now than last year,40.0,7,4.0
4,5,"I used to be a developer by profession, but no...",Yes,31.0,16,,,,United States,,...,Easy,Too short,No,"Computer science, computer engineering, or sof...",Django;Ruby on Rails,Ruby on Rails,Just as welcome now as I felt last year,,15,8.0


In [5]:
# Summarise the survey responses: per-column non-null counts and dtypes,
# which shows how many answers are missing for each question.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64461 entries, 0 to 64460
Data columns (total 61 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Respondent                    64461 non-null  int64  
 1   MainBranch                    64162 non-null  object 
 2   Hobbyist                      64416 non-null  object 
 3   Age                           45446 non-null  float64
 4   Age1stCode                    57900 non-null  object 
 5   CompFreq                      40069 non-null  object 
 6   CompTotal                     34826 non-null  float64
 7   ConvertedComp                 34756 non-null  float64
 8   Country                       64072 non-null  object 
 9   CurrencyDesc                  45472 non-null  object 
 10  CurrencySymbol                45472 non-null  object 
 11  DatabaseDesireNextYear        44070 non-null  object 
 12  DatabaseWorkedWith            49537 non-null  object 
 13  D

## Prepare Data

**Wrangle and Clean**

The survey data is already pre-cleaned by Stack Overflow. The remaining enhancements are to rename a few columns and convert the column names to snake case.

**Schema**

In [6]:
# Convert the schema's own header names to snake_case
# (e.g. "QuestionText" -> "question_text").
df_schema.columns = df_schema.columns.map(to_snake_case)
df_schema.head()

Unnamed: 0,column,question_text
0,Respondent,Randomized respondent ID number (not in order ...
1,MainBranch,Which of the following options best describes ...
2,Hobbyist,Do you code as a hobby?
3,Age,What is your age (in years)? If you prefer not...
4,Age1stCode,At what age did you write your first line of c...


In [7]:
# Add a "field" column holding the snake_case version of each survey column
# name, so the schema can be matched against the renamed data columns.
df_schema["field"] = df_schema["column"].apply(to_snake_case)
df_schema.head()

Unnamed: 0,column,question_text,field
0,Respondent,Randomized respondent ID number (not in order ...,respondent
1,MainBranch,Which of the following options best describes ...,main_branch
2,Hobbyist,Do you code as a hobby?,hobbyist
3,Age,What is your age (in years)? If you prefer not...,age
4,Age1stCode,At what age did you write your first line of c...,age1st_code


In [8]:
# Keep only the snake_case field name and its question text; the original
# "column" name is dropped.
schema_columns_to_keep = ["field", "question_text"]
df_schema = df_schema.loc[:, schema_columns_to_keep]
df_schema.head()

Unnamed: 0,field,question_text
0,respondent,Randomized respondent ID number (not in order ...
1,main_branch,Which of the following options best describes ...
2,hobbyist,Do you code as a hobby?
3,age,What is your age (in years)? If you prefer not...
4,age1st_code,At what age did you write your first line of c...


In [9]:
# Write the cleaned schema (field, question_text) to the processed-data folder
# without the index column. The output directory is created first because
# DataFrame.to_csv does not create missing parent directories and would raise
# OSError when the notebook is run on a fresh checkout.
os.makedirs("../data/processed", exist_ok=True)
df_schema.to_csv("../data/processed/so_schema.csv", index=False)

**Data**

In [10]:
# Rename every survey column to snake_case
# (e.g. "MainBranch" -> "main_branch").
df.columns = df.columns.map(to_snake_case)
df.head()

Unnamed: 0,respondent,main_branch,hobbyist,age,age1st_code,comp_freq,comp_total,converted_comp,country,currency_desc,...,survey_ease,survey_length,trans,undergrad_major,webframe_desire_next_year,webframe_worked_with,welcome_change,work_week_hrs,years_code,years_code_pro
0,1,I am a developer by profession,Yes,,13,Monthly,,,Germany,European Euro,...,Neither easy nor difficult,Appropriate in length,No,"Computer science, computer engineering, or sof...",ASP.NET Core,ASP.NET;ASP.NET Core,Just as welcome now as I felt last year,50.0,36,27.0
1,2,I am a developer by profession,No,,19,,,,United Kingdom,Pound sterling,...,,,,"Computer science, computer engineering, or sof...",,,Somewhat more welcome now than last year,,7,4.0
2,3,I code primarily as a hobby,Yes,,15,,,,Russian Federation,,...,Neither easy nor difficult,Appropriate in length,,,,,Somewhat more welcome now than last year,,4,
3,4,I am a developer by profession,Yes,25.0,18,,,,Albania,Albanian lek,...,,,No,"Computer science, computer engineering, or sof...",,,Somewhat less welcome now than last year,40.0,7,4.0
4,5,"I used to be a developer by profession, but no...",Yes,31.0,16,,,,United States,,...,Easy,Too short,No,"Computer science, computer engineering, or sof...",Django;Ruby on Rails,Ruby on Rails,Just as welcome now as I felt last year,,15,8.0


In [11]:
# Write the survey data with snake_case column names to the processed-data
# folder without the index column. The output directory is created first
# because DataFrame.to_csv does not create missing parent directories and
# would raise OSError when the notebook is run on a fresh checkout.
os.makedirs("../data/processed", exist_ok=True)
df.to_csv("../data/processed/so_data.csv", index=False)