In [1]:
import pandas as pd
import numpy as np

In [2]:
survey_df = pd.read_csv("Data/survey_results_public_2017.csv")

survey_df.columns

Index(['Respondent', 'Professional', 'ProgramHobby', 'Country', 'University',
       'EmploymentStatus', 'FormalEducation', 'MajorUndergrad', 'HomeRemote',
       'CompanySize',
       ...
       'StackOverflowMakeMoney', 'Gender', 'HighestEducationParents', 'Race',
       'SurveyLong', 'QuestionsInteresting', 'QuestionsConfusing',
       'InterestedAnswers', 'Salary', 'ExpectedSalary'],
      dtype='object', length=154)

In [15]:
survey_df['HaveWorkedLanguage'] = survey_df['HaveWorkedLanguage'].str.replace(" ", "")

In [16]:
def split_multicolumn(col_series):
    result_df = col_series.to_frame()
    options = []
    # Iterate over the column
    for idx, value  in col_series[col_series.notnull()].iteritems():
        # Break each value into list of options
        for option in value.split(';'):
            # Add the option as a column to result
            if not option in result_df.columns:
                options.append(option)
                result_df[option] = False
            # Mark the value in the option column as True
            result_df.at[idx, option] = True
    return result_df[options]

In [17]:
languages_worked_df = split_multicolumn(survey_df.HaveWorkedLanguage)

languages_worked_df

Unnamed: 0,Swift,JavaScript,Python,Ruby,SQL,Java,PHP,Matlab,R,Rust,...,Lua,VBA,Groovy,Go,Smalltalk,VisualBasic6,CommonLisp,Dart,Julia,Hack
0,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,True,True,True,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,True,False,False,True,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,True,False,True,False,False,True,True,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51387,False,True,True,False,True,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
51388,False,False,False,False,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
51389,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
51390,True,True,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [18]:
temp_df = survey_df[['Respondent','Country', 'Salary']]
clean_df = temp_df.join(languages_worked_df, how='outer')
clean_df.rename(columns={"Salary" : "ConvertedComp"}, inplace=True)
clean_df

Unnamed: 0,Respondent,Country,ConvertedComp,Swift,JavaScript,Python,Ruby,SQL,Java,PHP,...,Lua,VBA,Groovy,Go,Smalltalk,VisualBasic6,CommonLisp,Dart,Julia,Hack
0,1,United States,,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,2,United Kingdom,,False,True,True,True,True,False,False,...,False,False,False,False,False,False,False,False,False,False
2,3,United Kingdom,113750.0,False,False,True,False,False,True,True,...,False,False,False,False,False,False,False,False,False,False
3,4,United States,,False,False,True,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
4,5,Switzerland,,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51387,51388,United States,58000.0,False,True,True,False,True,False,False,...,False,True,False,False,False,False,False,False,False,False
51388,51389,Venezuela,,False,False,False,False,True,True,False,...,False,False,False,False,False,False,False,False,False,False
51389,51390,Canada,,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
51390,51391,United States,40000.0,True,True,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False


In [19]:
languages_worked_percentages = languages_worked_df.mean().sort_values(ascending=False) * 100
language_percent = languages_worked_percentages.reset_index()
language_percent.rename({"index": "language", 0: "percentage"}, axis=1, inplace=True)

language_percent['survey_year'] = 2017
language_percent


Unnamed: 0,language,percentage,survey_year
0,JavaScript,44.510819,2017
1,SQL,36.492061,2017
2,Java,28.261208,2017
3,C#,24.276152,2017
4,Python,22.773973,2017
5,PHP,20.022572,2017
6,C++,15.868229,2017
7,C,13.570205,2017
8,TypeScript,6.787049,2017
9,Ruby,6.467933,2017


In [12]:
l = language_percent.language
l = l.str.lstrip()
l.value_counts()

Matlab            2
JavaScript        2
Common Lisp       2
Hack              2
F#                2
Go                2
Julia             2
Dart              2
R                 2
Python            2
Objective-C       2
Groovy            2
PHP               2
CoffeeScript      2
SQL               2
C++               2
Visual Basic 6    2
TypeScript        2
Ruby              2
VBA               2
Smalltalk         2
Rust              2
Elixir            2
Java              2
C                 2
C#                2
Erlang            2
VB.NET            2
Clojure           2
Haskell           2
Swift             2
Perl              2
Lua               2
Scala             2
Assembly          1
Name: language, dtype: int64

In [20]:
language_percent.to_csv("clean_data/language_percent_2017.csv")

In [21]:
top_10_language = language_percent.head(10)
top_10_language

Unnamed: 0,language,percentage,survey_year
0,JavaScript,44.510819,2017
1,SQL,36.492061,2017
2,Java,28.261208,2017
3,C#,24.276152,2017
4,Python,22.773973,2017
5,PHP,20.022572,2017
6,C++,15.868229,2017
7,C,13.570205,2017
8,TypeScript,6.787049,2017
9,Ruby,6.467933,2017


In [22]:
languages = top_10_language['language']

In [23]:
average_salary = []

for language in languages:
    true_df = clean_df.loc[clean_df[f"{language}"] == True]
    avg_salary = true_df['ConvertedComp'].mean()
    average_salary.append(avg_salary)

In [24]:
avg_salary_language = pd.DataFrame({"language" : languages,
                                   "avg_salary" : average_salary})
avg_salary_language['survey_year'] = 2017
avg_salary_language

Unnamed: 0,language,avg_salary,survey_year
0,JavaScript,56396.922706,2017
1,SQL,55663.308207,2017
2,Java,55184.950409,2017
3,C#,58305.499895,2017
4,Python,61641.073324,2017
5,PHP,44780.664854,2017
6,C++,58690.262754,2017
7,C,56663.80077,2017
8,TypeScript,59623.008951,2017
9,Ruby,65581.261691,2017


In [25]:
avg_salary_language.to_csv("clean_data/2017_avg_salary_by_language.csv")