In [1]:
import pickle
import pandas as pd

In [2]:
# Raw survey export; read with pd.read_csv in the "Load Data" cell.
FILE_PATH = '../data/raw/survey_results_public.csv'
# Directory prefix (trailing slash matters: file names are appended with +).
# Used both to load chosen_columns.pkl and to save the wrangled pickle.
LOAD_PATH = '../data/interim/'

# Load Data

In [3]:
# NOTE: pickle.load can execute arbitrary code, so chosen_columns.pkl must come
# from a trusted source (presumably an earlier notebook of this project).
with open(LOAD_PATH + 'chosen_columns.pkl', 'rb') as f:
    chosen_columns = pickle.load(f)

analysis_columns = list(chosen_columns['analysis'])

# Parse only the columns needed for the analysis instead of the whole CSV.
# usecols ignores the order it is given, so re-select afterwards to keep the
# column order stored in chosen_columns; .copy() leaves an independent frame
# for the column assignments made later in the wrangling loop.
survey = pd.read_csv(FILE_PATH, usecols=analysis_columns)[analysis_columns].copy()

In [4]:
# Overview of the selected columns: dtype and non-null count per column.
survey.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73268 entries, 0 to 73267
Data columns (total 29 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   MainBranch                    73268 non-null  object 
 1   Employment                    71709 non-null  object 
 2   RemoteWork                    58958 non-null  object 
 3   EdLevel                       71571 non-null  object 
 4   YearsCode                     71331 non-null  object 
 5   YearsCodePro                  51833 non-null  object 
 6   DevType                       61302 non-null  object 
 7   OrgSize                       51039 non-null  object 
 8   Country                       71771 non-null  object 
 9   CompTotal                     38422 non-null  float64
 10  CompFreq                      44425 non-null  object 
 11  LanguageHaveWorkedWith        70975 non-null  object 
 12  LanguageWantToWorkWith        67027 non-null  object 
 13  D

# Wrangle data
Most of the columns hold multiple-choice answers, with the selected answers joined by ';'. Each such column is split so that every cell holds a list of answers instead of a ';'-delimited string.

In [5]:
def wrangle_column(column_name, frame=None):
    """Turn a ';'-delimited multi-answer column into a column of lists.

    Parameters
    ----------
    column_name : str
        Name of the column to wrangle.
    frame : pandas.DataFrame, optional
        Frame to read the column from. Defaults to the notebook-level
        ``survey`` frame, so existing ``wrangle_column(name)`` calls keep
        working.

    Returns
    -------
    pandas.Series
        The column itself, unchanged, when none of its values contains ';'.
        Otherwise a new Series in which ', ' is replaced by '_', every value
        is split on ';' into a list, and missing values become empty lists.
        The input frame is not modified.
    """
    if frame is None:
        frame = survey
    target_column = frame[column_name]

    # Literal (non-regex) search; na=False makes missing answers count as
    # "no ';'" instead of propagating NaN into .any().
    splittable = target_column.str.contains(';', regex=False, na=False).any()
    if not splittable:
        return target_column

    # Replace ', ' with '_' so commas inside one answer are not confused with
    # the separators of the printed list, then split the answers apart.
    answers = (
        target_column
        .str.replace(', ', '_', regex=False)
        .str.split(';')
    )

    # After the split every value is either a list or missing; give each
    # missing value its own empty list so all cells have the same type.
    return answers.apply(lambda value: value if isinstance(value, list) else [])



In [6]:
# Only text (object-dtype) columns can hold ';'-joined answers; collect them
# first, then replace each one with its wrangled version.
object_columns = [name for name in survey.columns if survey[name].dtype == 'object']
for column in object_columns:
    survey[column] = wrangle_column(column)


In [7]:
# Inspect the wrangled frame; multi-answer columns now hold lists.
survey

Unnamed: 0,MainBranch,Employment,RemoteWork,EdLevel,YearsCode,YearsCodePro,DevType,OrgSize,Country,CompTotal,...,MiscTechHaveWorkedWith,MiscTechWantToWorkWith,ToolsTechHaveWorkedWith,ToolsTechWantToWorkWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsWantToWorkWith,OpSysProfessional use,VersionControlSystem,Age,Gender
0,None of these,[],,,,,[],,,,...,[],[],[],[],[],[],[],[],,[]
1,I am a developer by profession,[Employed_full-time],Fully remote,,,,[],,Canada,,...,[],[],[],[],[],[],[macOS],[Git],,[]
2,"I am not primarily a developer, but I write co...",[Employed_full-time],"Hybrid (some remote, some in-person)","Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",14,5,[Data scientist or machine learning specialist...,20 to 99 employees,United Kingdom of Great Britain and Northern I...,32000.0,...,[Pandas],[.NET],[],[],"[Notepad++, Visual Studio]","[Notepad++, Visual Studio]",[Windows],[Git],25-34 years old,[Man]
3,I am a developer by profession,[Employed_full-time],Fully remote,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",20,17,[Developer_full-stack],100 to 499 employees,Israel,60000.0,...,[.NET],[.NET],[],[],"[Notepad++, Visual Studio, Visual Studio Code]","[Notepad++, Visual Studio, Visual Studio Code]",[Windows],[Git],35-44 years old,[Man]
4,I am a developer by profession,[Employed_full-time],"Hybrid (some remote, some in-person)","Bachelor’s degree (B.A., B.S., B.Eng., etc.)",8,3,"[Developer_front-end, Developer_full-stack, De...",20 to 99 employees,United States of America,,...,[.NET],"[.NET, Apache Kafka]",[npm],"[Docker, Kubernetes]","[Notepad++, Visual Studio, Visual Studio Code,...","[Rider, Visual Studio, Visual Studio Code]",[Windows],"[Git, Other (please specify):]",25-34 years old,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73263,I am a developer by profession,[Employed_full-time],Fully remote,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",8,5,[Developer_back-end],100 to 499 employees,Nigeria,60000.0,...,[Flutter],[],"[Docker, Homebrew, Kubernetes, npm]","[Docker, Homebrew, Kubernetes, npm]","[IPython/Jupyter, Sublime Text, Vim, Visual St...","[Sublime Text, Vim, Visual Studio Code]",[macOS],[Git],25-34 years old,[Man]
73264,I am a developer by profession,[Employed_full-time],Full in-person,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",6,5,[Data scientist or machine learning specialist],I don’t know,United States of America,107000.0,...,"[Keras, NumPy, Pandas, Scikit-learn, TensorFlo...","[NumPy, Pandas, Torch/PyTorch, Hugging Face Tr...",[],[],"[IPython/Jupyter, Notepad++, Spyder, Vim, Visu...","[Notepad++, Spyder, Vim, Visual Studio Code]","[Linux-based, Windows]",[Git],25-34 years old,[Man]
73265,"I am not primarily a developer, but I write co...",[Employed_full-time],"Hybrid (some remote, some in-person)","Bachelor’s degree (B.A., B.S., B.Eng., etc.)",42,33,"[Developer_full-stack, Developer_desktop or en...",20 to 99 employees,United States of America,,...,"[.NET, Pandas, React Native]","[.NET, Cordova, Ionic, Pandas, React Native, X...",[npm],"[npm, Unreal Engine]","[Spyder, Visual Studio, Visual Studio Code]","[Spyder, Visual Studio, Visual Studio Code]",[Windows],[Git],55-64 years old,[Man]
73266,I am a developer by profession,[Employed_full-time],"Hybrid (some remote, some in-person)","Bachelor’s degree (B.A., B.S., B.Eng., etc.)",50,31,"[Developer_front-end, Developer_desktop or ent...",10 to 19 employees,United Kingdom of Great Britain and Northern I...,58500.0,...,[],[],[],[],"[RAD Studio (Delphi_C++ Builder), Visual Studio]","[RAD Studio (Delphi_C++ Builder), Visual Studio]",[Windows],[SVN],55-64 years old,[Man]


In [8]:
# Save the wrangled frame next to the other interim files under LOAD_PATH.
# NOTE(review): pickle presumably chosen so list-valued cells survive the round trip — confirm.
survey.to_pickle(LOAD_PATH + '2.0-wrangled-data.pkl')