In [51]:
import pandas as pd
import numpy as np

In [52]:
# Relative location of the raw survey results CSV that is read with pd.read_csv.
DATA_PATH = '../data/raw/survey_results_public.csv'

In [53]:
# To split answers in columns with semicolon (Copied from Deena's Project)
def split_answers(data_series, delimiter=";"):
    """
    Split delimiter-joined multi-answer strings into lists of single answers.

    Parameters:
    * data_series (pd.Series): Series of answer strings; may contain
                               missing values.
    * delimiter (string): Literal separator placed between answers
                          (never interpreted as a regular expression).
                          Defaults to ";"

    Returns: (pd.Series): The original series, unchanged, when no value
    contains the delimiter. Otherwise a new series with the same index and
    name in which every string is replaced by the list of its answers and
    every non-string value (e.g. NaN) is replaced by an empty list.
    """

    # Sub functions
    def is_splittable(value):
        """ Check if a single value holds several answers - Returns boolean """
        return isinstance(value, str) and delimiter in value

    def split_answer(value):
        """Split a single answer; anything that is not a string becomes []"""
        return value.split(delimiter) if isinstance(value, str) else []

    # --------------------

    # Check if multiple answers exist - if none: return original
    if not any(is_splittable(value) for value in data_series):
        return data_series

    # Else, split each value to a list (NAs end up as empty lists).
    # Plain str methods are used on purpose: the delimiter is matched
    # literally, so values such as "." , "|" or "(" are safe to pass.
    answers = [split_answer(value) for value in data_series]

    # dtype=object keeps every list as one cell of the resulting series.
    return pd.Series(
        answers,
        index=data_series.index,
        name=data_series.name,
        dtype=object,
    )

In [54]:
# Read the raw survey export; raw_df itself is never modified afterwards.
raw_df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Every preprocessing step is applied to this independent deep copy.
df_copy = raw_df.copy(deep=True)

In [55]:
# # df_copy['LanguageHaveWorkedWith'].split(';')
# series = df_copy['LanguageHaveWorkedWith']
# for i in range(len(series)):
#     series.iloc[i] = str(series.iloc[i]).split(';')

# series[0]

In [56]:
# Text answers of the "years" questions mapped to representative numbers:
# open-ended answers get a value beyond their bound, age brackets get a
# value inside the bracket (roughly its midpoint).
# Missing answers are intentionally not mapped: pd.read_csv already loads
# them as real NaN, so a 'nan' string key would never match, and if it did
# it would encode "no answer" as age 0.
REPLACE_DICT = {
    'YearsCodePro': {'Less than 1 year': 0, 'More than 50 years': 51},
    'YearsCode':    {'Less than 1 year': 0, 'More than 50 years': 51},
    'Age1stCode':   {'Older than 85': 86, 'Younger than 5 years': 4, '18 - 24 years': 21, '11 - 17 years': 14,
                     '5 - 10 years': 8, '25 - 34 years': 30, '35 - 44 years': 40, '45 - 54 years': 50,
                     '55 - 64 years': 60, 'Older than 64 years': 68}}

for col, replacement in REPLACE_DICT.items():
    # Exact-match recoding of the text answers; every other value
    # (digit strings, NaN, already converted numbers) passes through.
    recoded = df_copy[col].map(lambda answer: replacement.get(answer, answer))
    # Parse the remaining digit strings explicitly instead of relying on
    # implicit downcasting; an unmapped text answer raises ValueError here.
    df_copy[col] = pd.to_numeric(recoded).astype(np.float32)

In [57]:
# Names of all columns whose dtype is still object at this point.
object_cols = list(df_copy.select_dtypes(include='object').columns)

# Pass each of those columns through split_answers and store the result back.
for col in object_cols:
    df_copy[col] = split_answers(df_copy[col])

In [58]:
# Show column dtypes and non-null counts of the preprocessed frame.
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83439 entries, 0 to 83438
Data columns (total 48 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   ResponseId                    83439 non-null  int64  
 1   MainBranch                    83439 non-null  object 
 2   Employment                    83323 non-null  object 
 3   Country                       83439 non-null  object 
 4   US_State                      14920 non-null  object 
 5   UK_Country                    4418 non-null   object 
 6   EdLevel                       83126 non-null  object 
 7   Age1stCode                    83243 non-null  float32
 8   LearnCode                     83439 non-null  object 
 9   YearsCode                     81641 non-null  float32
 10  YearsCodePro                  61216 non-null  float32
 11  DevType                       83439 non-null  object 
 12  OrgSize                       60726 non-null  object 
 13  C

In [86]:
# Spot-check one random row: print the raw answer next to its preprocessed form.
# `i` is drawn as a row *position* (what .iloc expects) rather than an index
# label, so every `.iloc[i]` lookup stays correct even if the frames do not
# carry a default RangeIndex.
i = np.random.randint(len(df_copy))
print(raw_df['LanguageHaveWorkedWith'].iloc[i])
print(df_copy['LanguageHaveWorkedWith'].iloc[i])

Bash/Shell;C#;HTML/CSS;Java;JavaScript;Kotlin;Node.js;Python;Swift
['Bash/Shell', 'C#', 'HTML/CSS', 'Java', 'JavaScript', 'Kotlin', 'Node.js', 'Python', 'Swift']


In [87]:
# Raw answer first, recoded numeric value second, one per line.
print(
    raw_df['YearsCodePro'].iloc[i],
    df_copy['YearsCodePro'].iloc[i],
    sep='\n',
)

Less than 1 year
0.0


In [88]:
# Raw semicolon-joined answer first, its split list form second, one per line.
print(
    raw_df['DevType'].iloc[i],
    df_copy['DevType'].iloc[i],
    sep='\n',
)

Developer, desktop or enterprise applications;Developer, back-end
['Developer, desktop or enterprise applications', 'Developer, back-end']


In [89]:
# Destination of the preprocessed frame, written as a pickle file.
EXPORT_PATH = '../data/processed/1_preprocessed_df.pkl'

# Serialize df_copy to that location.
df_copy.to_pickle(path=EXPORT_PATH)