In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the public 2020 survey results into a DataFrame; the path is relative
# to the notebook's working directory.
survey_df = pd.read_csv("Data/survey_results_public_2020.csv")

# Show the available column names so later cells can pick the ones they need.
survey_df.columns

Index(['Respondent', 'MainBranch', 'Hobbyist', 'Age', 'Age1stCode', 'CompFreq',
       'CompTotal', 'ConvertedComp', 'Country', 'CurrencyDesc',
       'CurrencySymbol', 'DatabaseDesireNextYear', 'DatabaseWorkedWith',
       'DevType', 'EdLevel', 'Employment', 'Ethnicity', 'Gender', 'JobFactors',
       'JobSat', 'JobSeek', 'LanguageDesireNextYear', 'LanguageWorkedWith',
       'MiscTechDesireNextYear', 'MiscTechWorkedWith',
       'NEWCollabToolsDesireNextYear', 'NEWCollabToolsWorkedWith', 'NEWDevOps',
       'NEWDevOpsImpt', 'NEWEdImpt', 'NEWJobHunt', 'NEWJobHuntResearch',
       'NEWLearn', 'NEWOffTopic', 'NEWOnboardGood', 'NEWOtherComms',
       'NEWOvertime', 'NEWPurchaseResearch', 'NEWPurpleLink', 'NEWSOSites',
       'NEWStuck', 'OpSys', 'OrgSize', 'PlatformDesireNextYear',
       'PlatformWorkedWith', 'PurchaseWhat', 'Sexuality', 'SOAccount',
       'SOComm', 'SOPartFreq', 'SOVisitFreq', 'SurveyEase', 'SurveyLength',
       'Trans', 'UndergradMajor', 'WebframeDesireNextYear',
  

In [4]:
def split_multicolumn(col_series):
    """Expand a ';'-separated multi-choice column into one boolean column per option.

    Parameters
    ----------
    col_series : pandas.Series
        Series whose non-null values are strings of options joined by ';'
        (for example ``"Python;SQL"``). Null entries are treated as
        "no option selected".

    Returns
    -------
    pandas.DataFrame
        Frame sharing ``col_series``'s index, with one ``bool`` column per
        distinct option, ordered by first appearance. A cell is True when
        that row's value contained the option.
    """
    n_rows = len(col_series)
    present = col_series.notnull().to_numpy()
    # One boolean array per option; dict insertion order keeps the columns in
    # first-seen order. Tracking options here (rather than checking the
    # columns of a frame that also holds the source column) means an option
    # whose text equals the series name is no longer silently dropped.
    flags = {}
    # Iterate by position instead of Series.iteritems(), which was removed in
    # pandas 2.0; positions also keep rows distinct under duplicate labels.
    for pos, value in enumerate(col_series.to_numpy()):
        if not present[pos]:
            continue
        for option in value.split(';'):
            column = flags.get(option)
            if column is None:
                column = flags[option] = np.zeros(n_rows, dtype=bool)
            column[pos] = True
    # Build the frame once instead of inserting columns and cells one by one.
    return pd.DataFrame(flags, index=col_series.index)

In [5]:
# Split the LanguageWorkedWith answers into one boolean column per language.
languages_worked_df = split_multicolumn(survey_df["LanguageWorkedWith"])

languages_worked_df

Unnamed: 0,C#,HTML/CSS,JavaScript,Swift,Objective-C,Python,Ruby,SQL,Java,PHP,...,VBA,Perl,Scala,C++,Go,Haskell,Rust,Dart,Julia,Assembly
0,True,True,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,True,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,True,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,True,False,False,False,False,True,True,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64456,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
64457,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
64458,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
64459,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [6]:
# Put each respondent's id, country and ConvertedComp next to their
# per-language boolean flags. The two frames are joined on their index.
temp_df = survey_df[['Respondent', 'Country', 'ConvertedComp']]
clean_df = temp_df.join(languages_worked_df, how='outer')
# The former in-place rename mapped "ConvertedComp" to itself (a no-op), so it
# was removed; the column keeps the same name.
clean_df

Unnamed: 0,Respondent,Country,ConvertedComp,C#,HTML/CSS,JavaScript,Swift,Objective-C,Python,Ruby,...,VBA,Perl,Scala,C++,Go,Haskell,Rust,Dart,Julia,Assembly
0,1,Germany,,True,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,2,United Kingdom,,False,False,True,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,3,Russian Federation,,False,False,False,True,True,True,False,...,False,False,False,False,False,False,False,False,False,False
3,4,Albania,,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,5,United States,,False,True,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64456,64858,United States,,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
64457,64867,Morocco,,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
64458,64898,Viet Nam,,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
64459,64925,Poland,,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [7]:
# Share of rows (in percent) with each language flagged, highest first.
languages_worked_percentages = languages_worked_df.mean().sort_values(ascending=False) * 100

# Turn the Series into a tidy language / percentage table tagged with the
# survey year, built as a single chain rather than by mutating in place.
language_percent = (
    languages_worked_percentages
    .reset_index()
    .rename(columns={"index": "language", 0: "percentage"})
    .assign(survey_year=2020)
)
language_percent


Unnamed: 0,language,percentage,survey_year
0,JavaScript,60.225563,2020
1,HTML/CSS,56.128512,2020
2,SQL,48.731791,2020
3,Python,39.228371,2020
4,Java,35.795287,2020
5,Bash/Shell/PowerShell,29.44416,2020
6,C#,27.987465,2020
7,PHP,23.280743,2020
8,TypeScript,22.615225,2020
9,C++,21.26402,2020


In [8]:
# Save the language-usage percentages. No index argument is passed, so the
# DataFrame index is written as the first column of the file.
# NOTE(review): assumes the clean_data/ directory already exists — confirm.
language_percent.to_csv("clean_data/language_percent_2020.csv")

In [9]:
# Keep only the first ten rows of the language-percentage table.
top_10_language = language_percent.iloc[:10]
top_10_language

Unnamed: 0,language,percentage,survey_year
0,JavaScript,60.225563,2020
1,HTML/CSS,56.128512,2020
2,SQL,48.731791,2020
3,Python,39.228371,2020
4,Java,35.795287,2020
5,Bash/Shell/PowerShell,29.44416,2020
6,C#,27.987465,2020
7,PHP,23.280743,2020
8,TypeScript,22.615225,2020
9,C++,21.26402,2020


In [10]:
# Language names from the top-10 table, used to select columns of clean_df
# when averaging compensation per language.
languages = top_10_language['language']

In [11]:
# Mean ConvertedComp among respondents flagged for each language, collected
# in the same order as `languages`.
average_salary = []

for language in languages:
    uses_language = clean_df[language].eq(True)
    average_salary.append(clean_df.loc[uses_language, 'ConvertedComp'].mean())

In [12]:
# Pair each language with its mean compensation and tag the survey year.
avg_salary_language = pd.DataFrame(
    {"language": languages, "avg_salary": average_salary}
).assign(survey_year=2020)
avg_salary_language

Unnamed: 0,language,avg_salary,survey_year
0,JavaScript,102280.804691,2020
1,HTML/CSS,100018.777252,2020
2,SQL,103281.520372,2020
3,Python,113061.525519,2020
4,Java,105150.557027,2020
5,Bash/Shell/PowerShell,123621.343455,2020
6,C#,107592.112122,2020
7,PHP,81937.73262,2020
8,TypeScript,107735.75537,2020
9,C++,111340.135263,2020


In [13]:
# Save the per-language average compensation table. No index argument is
# passed, so the DataFrame index is written as the first column of the file.
# NOTE(review): assumes the clean_data/ directory already exists — confirm.
avg_salary_language.to_csv("clean_data/2020_avg_salary_by_language.csv")