This notebook is used to extractor the necessary data for Salary Prediction from 2012 Stack Overflow Annual Developer Survey 

https://insights.stackoverflow.com/survey 

In [1]:
import pandas as pd
import numpy as np

NOTE: Make sure **Data/source/** folder has the survey data downloaded using **Scripts/download_survey_data.py --all** script

### 2012 Stack Overflow User Survey

In [8]:
survey_res = pd.read_csv("../Data/source/2012.csv", sep=",", encoding='Latin-1')

In [9]:
print(f"Number of responses to the survey in 2012: {survey_res.shape[0]-1}")

Number of responses to the survey in 2012: 6243


In [10]:
survey_res = survey_res.drop(0)
survey_res.head()

Unnamed: 0,What Country or Region do you live in?,Which US State or Territory do you live in?,How old are you?,How many years of IT/Programming experience do you have?,How would you best describe the industry you currently work in?,Which best describes the size of your company?,Which of the following best describes your occupation?,What is your involvement in purchasing products or services for the company you work for? (You can choose more than one),Unnamed: 8,Unnamed: 9,...,Please rate the advertising you've seen on Stack Overflow,Unnamed: 66,Unnamed: 67,Unnamed: 68,Unnamed: 69,Unnamed: 70,What advertisers do you remember seeing on Stack Overflow?,What is your current Stack Overflow reputation?,Which of our sites do you frequent most?,Unnamed: 74
1,India,,20-24,<2,Consulting,"Fortune 1000 (1,000+)",Server Programmer,Influencer,Recommender,,...,,,,,,,,,,
2,Germany,,25-29,<2,Other,Mature Small Business (25-100),Embedded Application Developer,,Recommender,,...,Neutral,Agree,Agree,Strongly Disagree,Strongly Disagree,Disagree,,Don't have an account,Stack Overflow,
3,United Kingdom,,20-24,41070,Finance / Banking,Mature Small Business (25-100),Web Application Developer,Influencer,Recommender,,...,Neutral,Neutral,Neutral,Neutral,Neutral,Strongly Agree,,1,Stack Overflow,
4,France,,20-24,40944,Software Products,Mature Small Business (25-100),Embedded Application Developer,,,,...,Agree,Strongly Disagree,Disagree,Disagree,Strongly Disagree,Disagree,None !,Don't have an account,Stack Overflow,
5,United States of America,Iowa,< 20,40944,Other,Student,Student,,,,...,Neutral,Neutral,Neutral,Neutral,Strongly Disagree,Strongly Agree,none,1,Stack Overflow,


In [11]:
# Extract the Country 

survey_res_flt = pd.DataFrame()
survey_res_flt["country"] = survey_res["What Country or Region do you live in?"]

# Extract Age
survey_res_flt["age"] = survey_res["How old are you?"]

# Extract years of IT/programing experience
survey_res_flt["IT_experience_in_years"] = survey_res["How many years of IT/Programming experience do you have?"]

# Extract industry 
survey_res_flt["industry"] = survey_res["How would you best describe the industry you currently work in?"]

# Extract company size 
survey_res_flt["company_size"] = survey_res["Which best describes the size of your company?"]

# Extract occupation 
survey_res_flt["occupation"] = survey_res["Which of the following best describes your occupation?"]

# Extract Current project details
survey_res_flt["current_development_project"] = survey_res["What type of project are you developing?"]

In [12]:
# Extract programming language 

language_col_start = survey_res.columns.get_loc("Which languages are you proficient in?")

survey_res_flt["proficient_languages"] = (
    survey_res
    .iloc[:, language_col_start+1:language_col_start+15]
    .apply(lambda x: ",".join(x.astype(str)), axis = 1)
)

survey_res_flt["proficient_languages"] = [[pl for pl in pl_list.split(',') if pl != 'nan'] for pl_list in survey_res_flt['proficient_languages'].values.tolist()]

In [13]:
# Extract the desktop operating system used 
survey_res_flt["desktop_OS"] = survey_res["Which desktop operating system do you use the most?"]

# Extract job satisfaction 
survey_res_flt["job_satisfaction"] = survey_res["What best describes your career / job satisfaction? "]

# Extract compensation
survey_res_flt["compensation"] = survey_res["Including bonus, what is your annual compensation in USD?"]

In [14]:
# Extract Product technology
product_tech_col_start = survey_res.columns.get_loc("Which technology products do you own? (You can choose more than one)")

survey_res_flt["product_technology"] = (
    survey_res
    .iloc[:, product_tech_col_start+1:product_tech_col_start+20]
    .apply(lambda x: ",".join(x.astype(str)), axis = 1)
)

survey_res_flt["product_technology"] = [[pt for pt in pt_list.split(',') if pt != 'nan'] for pt_list in survey_res_flt['product_technology'].values.tolist()]

In [15]:
# Extract stackoverflow reputation 
survey_res_flt["stackoverflow_reputation"] = survey_res["What is your current Stack Overflow reputation?"]

In [16]:
survey_res_flt.head()

Unnamed: 0,country,age,IT_experience_in_years,industry,company_size,occupation,current_development_project,proficient_languages,desktop_OS,job_satisfaction,compensation,product_technology,stackoverflow_reputation
1,India,20-24,<2,Consulting,"Fortune 1000 (1,000+)",Server Programmer,SaaS,"[JavaScript, SQL, C++, C]",Linux,Love my job,"<$20,000","[Android, Netbook]",
2,Germany,25-29,<2,Other,Mature Small Business (25-100),Embedded Application Developer,Mobile,"[JavaScript, CSS, PHP, Objective-C, C++, HTML5]",Windows 7,I enjoy going to work,"$20,000 - $40,000",[],Don't have an account
3,United Kingdom,20-24,41070,Finance / Banking,Mature Small Business (25-100),Web Application Developer,Web Platform,"[CSS, PHP, Objective-C, SQL, HTML5]",Mac OS X,I enjoy going to work,"$20,000 - $40,000","[Blackberry, Other media streaming device, Xbo...",1
4,France,20-24,40944,Software Products,Mature Small Business (25-100),Embedded Application Developer,Mobile,[C++],Linux,I'm not happy in my job,"$20,000 - $40,000","[Android, iPad]",Don't have an account
5,United States of America,< 20,40944,Other,Student,Student,Mobile,[],Mac OS X,I wish I had a job!,Student / Unemployed,"[Android, Kindle, Other gaming system]",1


In [17]:
survey_res_flt.to_csv("../Data/filtered/2012.csv", index=False)