##### i) Setup environment

In [9]:
from pathlib import Path
import pandas as pd
import numpy as np
import re
DATA_DIR = Path("..") / "Datasets" / "Raw data"

##### 1a) Load data and clean column names

In [10]:
df_2020 = pd.read_csv(DATA_DIR / "2020_rws.csv", encoding="latin1")

def clean_columns(df_2020):
    df_2020.columns = (
        df_2020.columns
        .str.strip()          # remove leading/trailing spaces
        .str.replace(r"\s+", " ", regex=True)  # normalize internal spaces
    )
    return df_2020
df_2020 = clean_columns(df_2020)

In [11]:
def clean_column_text(col: str) -> str:
    col = col.encode("utf-8", "ignore").decode("utf-8")
    col = re.sub(r"[\x91\x92\x93\x94]", "'", col)  # smart quotes
    col = re.sub(r"\s+", " ", col).strip()
    return col

df_2020.columns = [clean_column_text(c) for c in df_2020.columns]


##### 2a) Standardising Column Names

In [12]:
rename_map = {

    # Identifiers & demographics
    "Response ID": "response_id",
    "What year were you born?": "birth_year",
    "What is your gender?": "gender",
    "Which of the following best describes your industry?": "industry",
    "Which of the following best describes your industry? (Detailed)": "industry_detailed",
    "Which of the following best describes your current occupation?": "occupation",
    "Which of the following best describes your current occupation? (Detailed)": "occupation_detailed",
    "How many people are currently employed by your organisation?": "org_size",
    "Do you manage people as part of your current occupation?": "manager",
    "Which of the following best describes your household?": "household",
    "How long have you been in your current job?": "jobtenure",
    "Metro / Regional": "location",

    # Remote work exposure
    "Thinking about your current job, how much of your time did you spend remote working last year?":
        "remote_work_pct_last_year",
    "How much of your time would you have preferred to work remotely last year?":
        "remote_work_pref_pct_last_year",

    "Thinking about your current job, how much of your time did you spend remote working in the last 3 months?":
        "remote_work_pct_last_3_months",
    "How much of your time would you have preferred to work remotely in the last 3 months?":
        "remote_work_pref_pct_last_3_months",

    "Imagine that COVID-19 is cured or eradicated. Going forward, how much of your time would you prefer to work remotely?":
        "remote_work_pref_pct_future",

    # Likert – last year
    "Thinking about remote working last year, how strongly do you agree or disagree with the following statements? - My organisation encouraged people to work remotely":
        "remote_last_year_org_encouraged_agreement",
    "Thinking about remote working last year, how strongly do you agree or disagree with the following statements? - My organisation was well prepared for me to work remotely":
        "remote_last_year_org_prepared_agreement",
    "Thinking about remote working last year, how strongly do you agree or disagree with the following statements? - It was common for people in my organisation to work remotely":
        "remote_last_year_common_practice_agreement",
    "Thinking about remote working last year, how strongly do you agree or disagree with the following statements? - It was easy to get permission to work remotely":
        "remote_last_year_permission_easy_agreement",
    "Thinking about remote working last year, how strongly do you agree or disagree with the following statements? - I could easily collaborate with colleagues when working remotely":
        "remote_last_year_collaboration_easy_agreement",
    "Thinking about remote working last year, how strongly do you agree or disagree with the following statements? - I would recommend remote working to others":
        "remote_last_year_recommend_agreement",

    # Likert – last 3 months
    "Thinking about remote working in the last 3 months, how strongly do you agree or disagree with the following statements? - My organisation encouraged people to work remotely":
        "remote_last_3_months_org_encouraged_agreement",
    "Thinking about remote working in the last 3 months, how strongly do you agree or disagree with the following statements? - My organisation was well prepared for me to work remotely":
        "remote_last_3_months_org_prepared_agreement",
    "Thinking about remote working in the last 3 months, how strongly do you agree or disagree with the following statements? - It was common for people in my organisation to work remotely":
        "remote_last_3_months_common_practice_agreement",
    "Thinking about remote working in the last 3 months, how strongly do you agree or disagree with the following statements? - It was easy to get permission to work remotely":
        "remote_last_3_months_permission_easy_agreement",
    "Thinking about remote working in the last 3 months, how strongly do you agree or disagree with the following statements? - I could easily collaborate with colleagues when working remotely":
        "remote_last_3_months_collaboration_easy_agreement",
    "Thinking about remote working in the last 3 months, how strongly do you agree or disagree with the following statements? - I would recommend remote working to others":
        "remote_last_3_months_recommend_agreement",

    # Future expectations
    "Imagine that COVID-19 is cured or eradicated. How likely would you consider the following statements? - My employer would encourage more remote working":
        "future_remote_org_encourage_likelihood",
    "Imagine that COVID-19 is cured or eradicated. How likely would you consider the following statements? - My employer would make changes to support remote working":
        "future_remote_org_support_changes_likelihood",
    "Imagine that COVID-19 is cured or eradicated. How likely would you consider the following statements? - I would have more choice about whether I work remotely":
        "future_remote_employee_choice_likelihood",

    # Productivity
    "This question is about your productivity. Productivity means what you produce for each hour that you work. It includes the amount of work you achieve each hour, and the quality of your work each hour. Please compare your productivity when you work remotely to when you work at your employer's workplace. Roughly how productive are you, each hour, when you work remotely?":
        "remote_productivity_relative",

    # Time use – onsite
    "On a day when you attend your employer's workplace, how many hours would you spend doing the following activities? - Preparing for work and commuting":
        "onsite_commute_hours",
    "On a day when you attend your employer's workplace, how many hours would you spend doing the following activities? - Working":
        "onsite_work_hours",
    "On a day when you attend your employer's workplace, how many hours would you spend doing the following activities? - Personal and family time":
        "onsite_personal_hours",
    "On a day when you attend your employer's workplace, how many hours would you spend doing the following activities? - Caring and domestic responsibilities":
        "onsite_caring_hours",

    # Time use – remote
    "On a day when you do remote work, how many hours would you spend doing the following activities? - Preparing for work and commuting":
        "remote_commute_hours",
    "On a day when you do remote work, how many hours would you spend doing the following activities? - Working":
        "remote_work_hours",
    "On a day when you do remote work, how many hours would you spend doing the following activities? - Personal and family time":
        "remote_personal_hours",
    "On a day when you do remote work, how many hours would you spend doing the following activities? - Caring and domestic responsibilities":
        "remote_caring_hours",

    # Best / worst aspects
    "Compare remote working to working at your employer's workplace. Select the best aspect of remote working for you":
        "remote_best_aspect",
    "Compare remote working to working at your employer's workplace. Select the worst aspect of remote working for you":
        "remote_worst_aspect",
}

df_2020 = df_2020.rename(columns=rename_map)


##### 2b) Converting to Likert's Scale

In [13]:
likert_map = {
    "strongly disagree": 1,
    "somewhat disagree": 2,
    "neither agree nor disagree": 3,
    "somewhat agree": 4,
    "strongly agree": 5
}

likert_cols = [
    "remote_last_year_org_encouraged_agreement",
    "remote_last_year_org_prepared_agreement",
    "remote_last_year_common_practice_agreement",
    "remote_last_year_permission_easy_agreement",
    "remote_last_year_collaboration_easy_agreement",
    "remote_last_year_recommend_agreement",
    "remote_last_3_months_org_encouraged_agreement",
    "remote_last_3_months_org_prepared_agreement",
    "remote_last_3_months_common_practice_agreement",
    "remote_last_3_months_collaboration_easy_agreement",
    "remote_last_3_months_permission_easy_agreement",
    "remote_last_3_months_recommend_agreement",
    "Imagine that COVID-19 is cured or eradicated. How likely would you consider the following statements? - My employer would make changes to support remote working",
    "Imagine that COVID-19 is cured or eradicated. How likely would you consider the following statements? - I would have more choice about whether I work remotely",
    "Imagine that COVID-19 is cured or eradicated. How likely would you consider the following statements? - My employer would encourage more remote working"
]

df_2020[likert_cols] = (
    df_2020[likert_cols]
    .apply(lambda col: col.astype(str).str.lower().map(likert_map))
)

df_2020[likert_cols].isna().sum()

KeyError: "['Imagine that COVID-19 is cured or eradicated. How likely would you consider the following statements? - My employer would make changes to support remote working', 'Imagine that COVID-19 is cured or eradicated. How likely would you consider the following statements? - I would have more choice about whether I work remotely', 'Imagine that COVID-19 is cured or eradicated. How likely would you consider the following statements? - My employer would encourage more remote working'] not in index"

##### 2c) Standardize columns with percentages

In [None]:
df_2020['location'] = np.where(df_2020['location'].str.contains('Regio', case=False, na=False),
                               'Regional',
                               df_2020['location'])

df_2020["remote_work_pct_last_year"] = (
    df_2020["remote_work_pct_last_year"]
    .astype(str)
    .str.lower()
    .apply(lambda x: 0 if "rarely" in x
           else 5 if "less than" in x
           else int(x.split("%")[0]) if "%" in x
           else np.nan)
)

df_2020["remote_work_pct_last_year"].value_counts(dropna=False)

remote_work_pct_last_year
5      268
0      250
20     208
50     156
10     147
30     122
100     98
40      83
60      56
80      43
70      43
90      33
Name: count, dtype: int64

In [None]:
cols = [
    "remote_work_pref_pct_last_year",
    "remote_work_pct_last_3_months",
    "remote_work_pref_pct_last_3_months",
    "remote_work_pref_pct_future"
]

def parse_remote_pct(x):
    x = str(x).lower()
    if "rarely" in x or "prefer not" in x:
        return 0
    if "less than" in x:
        return 5
    if "%" in x:
        return int(x.split("%")[0])
    return np.nan

df_2020.loc[:, cols] = df_2020[cols].applymap(parse_remote_pct)

  df_2020.loc[:, cols] = df_2020[cols].applymap(parse_remote_pct)


##### 3. Save Cleaned Dataset

In [None]:
base_path = Path("..") / "Datasets" / "Cleaned data"
df_2020.to_csv(base_path / "cleaned_2020.csv", index=False)