This notebook extracts the data needed for salary prediction from the 2014 Stack Overflow Annual Developer Survey.

https://insights.stackoverflow.com/survey 

In [2]:
import pandas as pd
import numpy as np

NOTE: Make sure the **Data/source/** folder contains the survey data, downloaded by running the **Scripts/download_survey_data.py --all** script.

### 2014 Stack Overflow User Survey

In [3]:
# Load the raw 2014 survey export; the file is read as Latin-1 text.
survey_res = pd.read_csv(
    "../Data/source/2014.csv",
    sep=",",
    encoding='Latin-1',
)

In [4]:
# One row is excluded from the count: row 0 is dropped before any extraction
# (presumably it holds answer-option labels rather than a response — confirm).
response_count = survey_res.shape[0] - 1
print(f"Number of responses to the survey in 2014: {response_count}")

Number of responses to the survey in 2014: 7643


In [5]:
# Remove the row labelled 0, which is not counted as a survey response
# (presumably the export's second header line with answer-option labels — confirm).
# errors="ignore" keeps this cell re-runnable: because the result is assigned
# back to `survey_res`, a second run would otherwise raise KeyError on the
# already-missing label.
survey_res = survey_res.drop(index=0, errors="ignore")
survey_res.head()

Unnamed: 0,What Country do you live in?,Unnamed: 1,Which US State or Territory do you live in?,How old are you?,What is your gender?,How many years of IT/Programming experience do you have?,Which of the following best describes your occupation?,"Including bonus, what is your annual compensation in USD?",How would you best describe the industry you currently work in?,How many developers are employed at your company?,...,Did you participate in the Apptivate contest?,What advertisers do you remember seeing on Stack Overflow?,What is your current Stack Overflow reputation?,How do you use Stack Overflow?,Unnamed: 114,Unnamed: 115,Unnamed: 116,Unnamed: 117,Unnamed: 118,How often do you find solutions to your programming problems on Stack Overflow without asking a new question?
1,India,,,30-34,Female,6/10/2014,Back-End Web Developer,"$20,000 - $40,000",Finance / Banking,100,...,No,,500,Read other people's questions to solve my prob...,Ask questions to solve problems,,,,,Almost Always
2,Thailand,,,20-24,Male,<2,Back-End Web Developer,Student / Unemployed,Healthcare,,...,,,Don't have an account,Read other people's questions to solve my prob...,,,,,,
3,Iran,,,25-29,Male,6/10/2014,Desktop Software Developer,"<$20,000",Not Currently Employed,1/5/2014,...,No,Tehcodez,1,Read other people's questions to solve my prob...,,,,,,Almost Always
4,Ukraine,,,< 20,Male,<2,Student,Student / Unemployed,Student,,...,,,50,Read other people's questions to solve my prob...,Ask questions to solve problems,,,,,Almost Always
5,India,,,25-29,Male,2/5/2014,Full-Stack Web Developer,Rather not say,Manufacturing,1/5/2014,...,,,,,,,,,,


In [6]:
# Output column name -> survey question whose answers are copied into it.
# Dict insertion order determines the column order of the filtered frame.
question_by_column = {
    # Demographics
    "country": "What Country do you live in?",
    "age": "How old are you?",
    "gender": "What is your gender?",
    # Professional background
    "IT_experience_in_years": "How many years of IT/Programming experience do you have?",
    "occupation": "Which of the following best describes your occupation?",
    "compensation": "Including bonus, what is your annual compensation in USD?",
    "industry": "How would you best describe the industry you currently work in?",
    # Company size is represented by the number of developers employed
    "company_size": "How many developers are employed at your company?",
    # Remote work details
    "remotely_work_frequency": "Do you work remotely?",
    "remote_work_satisfaction_rating": "Do you enjoy working remotely?",
    "remote_work_location": "Where do you work remotely most of the time?",
    # Time spent on work
    "work_hours": "In an average week, how do you spend your time at work?",
}

# Start from an empty frame so re-running this cell rebuilds it from scratch.
survey_res_flt = pd.DataFrame()
for column_name, question in question_by_column.items():
    survey_res_flt[column_name] = survey_res[question]

In [7]:
# Extract programming language/technology
#
# The multi-select answers are spread over a run of adjacent columns that is
# located relative to the column carrying the question text.

language_col_start = survey_res.columns.get_loc("Which of the following languages or technologies have you used significantly in the past year?")

# NOTE(review): the slice starts one column AFTER the question's own column.
# Elsewhere in this export the question-named column itself holds the first
# option's answers (see "How do you use Stack Overflow?" in the preview), so
# the first language option may be skipped here — confirm the bounds.
language_answers = survey_res.iloc[:, language_col_start+1:language_col_start+12]

# Collect each respondent's non-missing answers as a list of strings.
# Missing cells are filtered directly instead of joining on "," and splitting
# again, which would break apart any answer that itself contains a comma and
# would discard an answer whose text is literally "nan".
survey_res_flt["proficient_languages"] = [
    [str(answer) for answer in row if pd.notna(answer)]
    for row in language_answers.itertuples(index=False, name=None)
]

In [8]:
# Extract the desktop operating system used
survey_res_flt["desktop_OS"] = survey_res["Which desktop operating system do you use the most?"]


# Extract Product technology
#
# The multi-select answers are spread over a run of adjacent columns that is
# located relative to the column carrying the question text.
product_tech_col_start = survey_res.columns.get_loc("Which technology products do you own? (You can choose more than one)")

# NOTE(review): the slice starts one column AFTER the question's own column.
# Elsewhere in this export the question-named column itself holds the first
# option's answers (see "How do you use Stack Overflow?" in the preview), so
# the first product option may be skipped here — confirm the bounds.
product_answers = survey_res.iloc[:, product_tech_col_start+1:product_tech_col_start+14]

# Collect each respondent's non-missing answers as a list of strings.
# Missing cells are filtered directly instead of joining on "," and splitting
# again, which would break apart any answer that itself contains a comma and
# would discard an answer whose text is literally "nan".
survey_res_flt["product_technology"] = [
    [str(answer) for answer in row if pd.notna(answer)]
    for row in product_answers.itertuples(index=False, name=None)
]


# Extract job switch frequency
# NOTE(review): the column name says "20_months" while the question asks about
# the last 12 months; the name is kept because it is part of the output schema.
survey_res_flt["job_switch_in_20_months"] = survey_res["Have you changed jobs in the last 12 months?"]

In [9]:
# Copy the answers to the Stack Overflow reputation question unchanged.
reputation_question = "What is your current Stack Overflow reputation?"
survey_res_flt["stackoverflow_reputation"] = survey_res[reputation_question]

In [10]:
# Preview the first five rows of the filtered frame.
survey_res_flt.head(n=5)

Unnamed: 0,country,age,gender,IT_experience_in_years,occupation,compensation,industry,company_size,remotely_work_frequency,remote_work_satisfaction_rating,remote_work_location,work_hours,proficient_languages,desktop_OS,product_technology,job_switch_in_20_months,stackoverflow_reputation
1,India,30-34,Female,6/10/2014,Back-End Web Developer,"$20,000 - $40,000",Finance / Banking,100,Occasionally,It's not my preference. I'd prefer to be in an...,A home office,5-10 hours,"[Java, JavaScript, PHP, Python]",Linux,"[Android Tablet, Kindle Fire]",No,500
2,Thailand,20-24,Male,<2,Back-End Web Developer,Student / Unemployed,Healthcare,,Never,,,2-5 hours,[PHP],Ubuntu,"[Android Phone, iPad, Android Tablet]",No,Don't have an account
3,Iran,25-29,Male,6/10/2014,Desktop Software Developer,"<$20,000",Not Currently Employed,1/5/2014,Occasionally,I'm neutral about it,A home office,2-5 hours,"[C#, JavaScript, SQL]",Windows 7,[],Yes,1
4,Ukraine,< 20,Male,<2,Student,Student / Unemployed,Student,,Never,,,,[C++],Windows 7,[Android Tablet],No,50
5,India,25-29,Male,2/5/2014,Full-Stack Web Developer,Rather not say,Manufacturing,1/5/2014,Never,,,1-2 hours,"[Java, JavaScript, SQL]",Windows 8,[Android Tablet],,


In [11]:
# Write the filtered frame to disk; the row index is not written to the file.
filtered_csv_path = "../Data/filtered/2014.csv"
survey_res_flt.to_csv(filtered_csv_path, index=False)