In [1]:
import pandas as pd

In [2]:
# Locations of the pipeline artefacts. These stay plain strings because later
# cells concatenate LOAD_PATH directly with a file name.
FILE_PATH = "../data/raw/survey_results_public.csv"
LOAD_PATH = "../data/interim/"
LOAD_DF_NAME, SAVE_DF_NAME = "2.0-wrangled-data.pkl", "3.0-cleaned-data.pkl"

# Load Data

In [3]:
# Load the interim pickle named by LOAD_DF_NAME from the LOAD_PATH directory.
survey = pd.read_pickle(f"{LOAD_PATH}{LOAD_DF_NAME}")

In [4]:
# Print each column's dtype and non-null count to see where values are missing.
survey.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73268 entries, 0 to 73267
Data columns (total 29 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   MainBranch                    73268 non-null  object 
 1   Employment                    73268 non-null  object 
 2   RemoteWork                    58958 non-null  object 
 3   EdLevel                       71571 non-null  object 
 4   YearsCode                     71331 non-null  object 
 5   YearsCodePro                  51833 non-null  object 
 6   DevType                       73268 non-null  object 
 7   OrgSize                       51039 non-null  object 
 8   Country                       71771 non-null  object 
 9   CompTotal                     38422 non-null  float64
 10  CompFreq                      44425 non-null  object 
 11  LanguageHaveWorkedWith        73268 non-null  object 
 12  LanguageWantToWorkWith        73268 non-null  object 
 13  D

# Clean data
The `info()` summary above shows that the data contains many nulls and redundant values that we need to handle.

In [5]:
# Find the columns whose cells hold Python lists. Each column is checked with
# Series.map instead of DataFrame.applymap, which is deprecated since
# pandas 2.1 and emits a FutureWarning there.
holds_lists = pd.Series(
    {name: survey[name].map(lambda cell: type(cell) is list).any() for name in survey.columns},
    dtype=bool,
)
# Keep only the column names flagged True; the result is an Index, so
# `column in splited_columns` works as a membership test.
splited_columns = holds_lists[holds_lists].index

In [6]:
def indexing_nulls(column_name, splitted = True, df = None):
    """Return the index labels of the rows whose value in `column_name` is missing.

    Parameters
    ----------
    column_name : str
        Column to inspect.
    splitted : bool, default True
        True for list-valued columns, where any falsy cell (e.g. an empty
        list) counts as missing; False for scalar columns, where missing
        means NaN/None as reported by `isna()`.
    df : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level `survey` frame is
        used (looked up at call time), which keeps existing calls working.

    Returns
    -------
    pandas.Index
        Labels of the rows considered missing.
    """
    frame = survey if df is None else df
    if splitted:
        # astype(bool) keeps the mask boolean even when the column is empty,
        # where `apply` would otherwise hand back an object-dtype Series.
        missing = frame[column_name].apply(lambda cell: not cell).astype(bool)
    else:
        missing = frame[column_name].isna()
    return frame[missing].index


## 1. Remove [DevType, LanguageHaveWorkedWith, YearsCode] nulls
First, we delete every row that is missing DevType, LanguageHaveWorkedWith, or YearsCode: these are our target columns, and without them we cannot trust the rest of the respondent's answers.

In [7]:
imp_columns = ['DevType', 'LanguageHaveWorkedWith', 'YearsCode']
for column in imp_columns:
    # List-valued columns are checked for empty cells, scalar ones for NaN.
    is_list_column = column in splited_columns
    non_index = indexing_nulls(column, splitted=is_list_column)
    # Drop the offending rows in place before examining the next column.
    survey.drop(index=non_index, inplace=True)

### Counting nulls after this step

In [8]:
# Report, per column, how many rows still count as missing.
for column in survey.columns:
    if column not in splited_columns:
        # Scalar column: missing means NaN/None.
        print(column, survey[column].isna().sum())
    else:
        # List-valued column: missing means a falsy cell.
        print(column, ':', len(indexing_nulls(column)))

MainBranch 0
Employment : 0
RemoteWork 2482
EdLevel 51
YearsCode 0
YearsCodePro 9510
DevType : 0
OrgSize 10262
Country 0
CompTotal 22470
CompFreq 16638
LanguageHaveWorkedWith : 0
LanguageWantToWorkWith : 3352
DatabaseHaveWorkedWith : 7837
DatabaseWantToWorkWith : 15413
PlatformHaveWorkedWith : 16053
PlatformWantToWorkWith : 24155
WebframeHaveWorkedWith : 13921
WebframeWantToWorkWith : 20132
MiscTechHaveWorkedWith : 21870
MiscTechWantToWorkWith : 28713
ToolsTechHaveWorkedWith : 13227
ToolsTechWantToWorkWith : 19583
NEWCollabToolsHaveWorkedWith : 658
NEWCollabToolsWantToWorkWith : 5814
OpSysProfessional use : 2947
VersionControlSystem : 152
Age 457
Gender : 585


## 2. Replace null values in YearsCodePro with zero

In [9]:
# Rows with no YearsCodePro answer receive the string '0'; answered rows are left as they are.
years_pro = survey['YearsCodePro']
survey['YearsCodePro'] = years_pro.where(years_pro.notna(), '0')

## 3. Convert [YearsCode, YearsCodePro] to int64

In [10]:
# The two textual answers are mapped to numbers so both columns can be cast to int64.
replace_dict = dict([('Less than 1 year', 1), ('More than 50 years', 51)])
replace_column = ['YearsCodePro', 'YearsCode']

for column in replace_column:
    as_numbers = survey[column].replace(to_replace=replace_dict)
    survey[column] = as_numbers.astype('int64')

## 4. Remove rows where YearsCode equals zero

In [11]:
# Collect the labels of rows reporting zero years of coding, then drop them in place.
is_zero_years = survey['YearsCode'].eq(0)
zero_code_index = survey.loc[is_zero_years].index
survey.drop(index=zero_code_index, inplace=True)

In [12]:
# Display the cleaned frame; the rendering is truncated to the first and last rows.
survey

Unnamed: 0,MainBranch,Employment,RemoteWork,EdLevel,YearsCode,YearsCodePro,DevType,OrgSize,Country,CompTotal,...,MiscTechHaveWorkedWith,MiscTechWantToWorkWith,ToolsTechHaveWorkedWith,ToolsTechWantToWorkWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsWantToWorkWith,OpSysProfessional use,VersionControlSystem,Age,Gender
2,"I am not primarily a developer, but I write co...",[Employed_full-time],"Hybrid (some remote, some in-person)","Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",14,5,[Data scientist or machine learning specialist...,20 to 99 employees,United Kingdom of Great Britain and Northern I...,32000.0,...,[Pandas],[.NET],[],[],"[Notepad++, Visual Studio]","[Notepad++, Visual Studio]",[Windows],[Git],25-34 years old,[Man]
3,I am a developer by profession,[Employed_full-time],Fully remote,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",20,17,[Developer_full-stack],100 to 499 employees,Israel,60000.0,...,[.NET],[.NET],[],[],"[Notepad++, Visual Studio, Visual Studio Code]","[Notepad++, Visual Studio, Visual Studio Code]",[Windows],[Git],35-44 years old,[Man]
4,I am a developer by profession,[Employed_full-time],"Hybrid (some remote, some in-person)","Bachelor’s degree (B.A., B.S., B.Eng., etc.)",8,3,"[Developer_front-end, Developer_full-stack, De...",20 to 99 employees,United States of America,,...,[.NET],"[.NET, Apache Kafka]",[npm],"[Docker, Kubernetes]","[Notepad++, Visual Studio, Visual Studio Code,...","[Rider, Visual Studio, Visual Studio Code]",[Windows],"[Git, Other (please specify):]",25-34 years old,[]
7,I am a developer by profession,[Not employed_but looking for work],,Some college/university study without earning ...,1,0,"[Developer_full-stack, Student]",,India,,...,[],[],[npm],"[Unity 3D, Yarn]","[Atom, CLion, Eclipse, IntelliJ, Notepad++, Vi...","[Android Studio, IPython/Jupyter, Sublime Text...","[Linux-based, macOS]",[Git],18-24 years old,[Man]
9,I am a developer by profession,[Independent contractor_freelancer_or self-emp...,Fully remote,Some college/university study without earning ...,37,30,"[Developer_desktop or enterprise applications,...","Just me - I am a freelancer, sole proprietor, ...",Croatia,,...,[],[],[],[],"[Android Studio, RAD Studio (Delphi_C++ Builde...","[Android Studio, RAD Studio (Delphi_C++ Builde...",[Windows],[Git],45-54 years old,[Woman]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73263,I am a developer by profession,[Employed_full-time],Fully remote,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",8,5,[Developer_back-end],100 to 499 employees,Nigeria,60000.0,...,[Flutter],[],"[Docker, Homebrew, Kubernetes, npm]","[Docker, Homebrew, Kubernetes, npm]","[IPython/Jupyter, Sublime Text, Vim, Visual St...","[Sublime Text, Vim, Visual Studio Code]",[macOS],[Git],25-34 years old,[Man]
73264,I am a developer by profession,[Employed_full-time],Full in-person,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",6,5,[Data scientist or machine learning specialist],I don’t know,United States of America,107000.0,...,"[Keras, NumPy, Pandas, Scikit-learn, TensorFlo...","[NumPy, Pandas, Torch/PyTorch, Hugging Face Tr...",[],[],"[IPython/Jupyter, Notepad++, Spyder, Vim, Visu...","[Notepad++, Spyder, Vim, Visual Studio Code]","[Linux-based, Windows]",[Git],25-34 years old,[Man]
73265,"I am not primarily a developer, but I write co...",[Employed_full-time],"Hybrid (some remote, some in-person)","Bachelor’s degree (B.A., B.S., B.Eng., etc.)",42,33,"[Developer_full-stack, Developer_desktop or en...",20 to 99 employees,United States of America,,...,"[.NET, Pandas, React Native]","[.NET, Cordova, Ionic, Pandas, React Native, X...",[npm],"[npm, Unreal Engine]","[Spyder, Visual Studio, Visual Studio Code]","[Spyder, Visual Studio, Visual Studio Code]",[Windows],[Git],55-64 years old,[Man]
73266,I am a developer by profession,[Employed_full-time],"Hybrid (some remote, some in-person)","Bachelor’s degree (B.A., B.S., B.Eng., etc.)",50,31,"[Developer_front-end, Developer_desktop or ent...",10 to 19 employees,United Kingdom of Great Britain and Northern I...,58500.0,...,[],[],[],[],"[RAD Studio (Delphi_C++ Builder), Visual Studio]","[RAD Studio (Delphi_C++ Builder), Visual Studio]",[Windows],[SVN],55-64 years old,[Man]


In [13]:
# Save the cleaned frame under SAVE_DF_NAME in the same directory the input was read from.
survey.to_pickle(f"{LOAD_PATH}{SAVE_DF_NAME}")