# Add working conditions data from EWCS

### Load libraries

In [1]:
import pandas as pd
import pyreadstat

### Load data

In [2]:
df, meta = pyreadstat.read_dta(
    "/Users/alexandralugova/Documents/GitHub/MH-old-workers/data/datasets/EWCS 1991-2015 UKDA ISCO.DTA"
)
variable_labels = meta.column_labels

### Choose only necessary data, tranform where needed

Identify and name the aggregated indexes for work conditions

In [3]:
labels_to_search = [
    "JQI Monthly earnings",
    "JQI Skills and discretion index",
    # "JQI social environment", - only for 2015
    "JQI Physical environment index",
    # "JQI Intensity index", - full only for 2015
    "JQI SLIM Intensity index",
    # "JQI Prospects index", - only for 2015
    # "JQI Working time quality index", - full only for 2015
    "JQI SLIM Working time quality index",
]

# Iterate through metadata to find labels and print corresponding variable names
for index, label in enumerate(meta.column_labels):
    if label in labels_to_search:
        print(label, ":", meta.column_names[index])

JQI Monthly earnings : adincome_mth
JQI Skills and discretion index : wq
JQI Physical environment index : envsec
JQI SLIM Intensity index : intens_slim
JQI SLIM Working time quality index : wlb_slim


In [4]:
df = df.rename(
    columns={
        "adincome_mth": "jqi_monthly_earnings",
        "wq": "jqi_skills_discretion",
        # "goodsoc": "jqi_social_environment",
        "envsec": "jqi_physical_environment",
        "intens_slim": "jqi_intensity_slim",
        # "prosp": "jqi_prospects",
        "wlb_slim": "jqi_working_time_quality_slim",
    }
)

Choose columns

In [5]:
df = df[
    [
        "countid",
        "year",
        "ISCO_08",
        "jqi_monthly_earnings",
        "jqi_skills_discretion",
        # "jqi_social_environment",
        "jqi_physical_environment",
        "jqi_intensity_slim",
        # "jqi_prospects",
        "jqi_working_time_quality_slim",
    ]
]

Define country names and choose the needed

In [6]:
countid_mapping = meta.value_labels["COUNTID"]
df["countid"] = df["countid"].map(countid_mapping)

In [7]:
countries = [
    "Austria",
    "Belgium",
    "Czech Republic",
    "Denmark",
    "Estonia",
    "France",
    "Germany",
    "Italy",
    "Slovenia",
    "Spain",
    "Switzerland",
]
df = df[df["countid"].isin(countries)].reset_index(drop=True)

Leave only waves 5 and 6 (2010 and 2015)

In [8]:
df = df[df.year >= 2010].reset_index(drop=True)

Drop lines with missing isco codes

In [9]:
df = df.dropna(subset="ISCO_08").reset_index(drop=True)

Adjust some isco codes

In [10]:
def modify_isco(value):
    if len(str(value)) == 1:
        return value * 1000
    elif len(str(value)) == 2:
        return value * 100
    elif len(str(value)) == 3:
        return value * 10
    else:
        return value


df["ISCO_08"] = df["ISCO_08"].apply(modify_isco)

Rename some variables

In [11]:
df = df.rename(columns={"countid": "country", "ISCO_08": "isco"})

Calculate social environment, job intensity, job prospects and working time quality indexes for 2010

In [12]:
# Still to do

In [13]:
df

Unnamed: 0,country,year,isco,jqi_monthly_earnings,jqi_skills_discretion,jqi_physical_environment,jqi_intensity_slim,jqi_working_time_quality_slim
0,Belgium,2015,8141,1530.153076,8.536879,66.666664,53.333336,88.500
1,Belgium,2015,5141,,64.617447,91.025642,24.444445,87.500
2,Belgium,2015,1323,1800.180054,76.372818,76.923080,52.222221,37.500
3,Belgium,2015,7115,1575.157593,42.832993,98.717949,18.888889,96.875
4,Belgium,2015,8322,1278.127930,30.789835,97.435898,53.333336,96.875
...,...,...,...,...,...,...,...,...
34701,Slovenia,2010,9111,99.932480,41.638947,84.615387,40.000000,100.000
34702,Slovenia,2010,5131,624.578003,48.814400,84.615387,40.000000,62.500
34703,Slovenia,2010,5141,437.204590,60.634476,80.769234,6.666667,87.500
34704,Slovenia,2010,3113,749.493591,89.690376,97.435898,18.888889,88.500


Aggregate on the level of isco, year and country

In [14]:
df = df.groupby(["country", "year", "isco"]).mean().reset_index(drop=False)

In [15]:
df = df.dropna().reset_index(drop=True)

Interpolate to obtain approximation of values for 2011 and 2013

In [16]:
# Convert 'year' column to datetime
df["year"] = pd.to_datetime(df["year"], format="%Y")

# Create DataFrames for years 2010 and 2015 separately
df_2010 = df[df["year"].dt.year == 2010]
df_2015 = df[df["year"].dt.year == 2015]

# Create an empty DataFrame for years 2011 and 2013
df_2011 = pd.DataFrame()
df_2013 = pd.DataFrame()

# Loop through unique combinations of 'country' and 'isco'
for country in df["country"].unique():
    for isco in df["isco"].unique():
        # Filter data for specific 'country' and 'isco' combination
        data_2010 = df_2010[(df_2010["country"] == country) & (df_2010["isco"] == isco)]
        data_2015 = df_2015[(df_2015["country"] == country) & (df_2015["isco"] == isco)]

        # Interpolate values for 'year' 2011 and 'year' 2013 if data for both 2010 and 2015 exists
        if not data_2010.empty and not data_2015.empty:
            # Calculate the interpolation for each column for 'year' 2011
            interpolated_values_2011 = data_2010.iloc[0, 3:].interpolate(
                method="linear", limit_area="inside", limit=1
            ) + (data_2015.iloc[0, 3:] - data_2010.iloc[0, 3:]).multiply(1 / 3)

            # Calculate the interpolation for each column for 'year' 2013
            interpolated_values_2013 = data_2010.iloc[0, 3:].interpolate(
                method="linear", limit_area="inside", limit=1
            ) + (data_2015.iloc[0, 3:] - data_2010.iloc[0, 3:]).multiply(3 / 5)

            # Give names to the interpolated values based on column names
            interpolated_values_2011.index = data_2010.columns[3:]
            interpolated_values_2013.index = data_2010.columns[3:]

            # Create DataFrame rows for 'year' 2011 and 'year' 2013 with interpolated values
            row_2011 = pd.DataFrame(
                {
                    "country": [country],
                    "year": [pd.Timestamp(year=2011, month=1, day=1)],
                    "isco": [isco],
                }
            ).join(interpolated_values_2011.to_frame().T.reset_index(drop=True))

            row_2013 = pd.DataFrame(
                {
                    "country": [country],
                    "year": [pd.Timestamp(year=2013, month=1, day=1)],
                    "isco": [isco],
                }
            ).join(interpolated_values_2013.to_frame().T.reset_index(drop=True))

            # Append the interpolated values for 'year' 2011 and 'year' 2013 to their respective DataFrames
            df_2011 = pd.concat([df_2011, row_2011], ignore_index=True)
            df_2013 = pd.concat([df_2013, row_2013], ignore_index=True)

# Append the interpolated 'year' 2011 and 'year' 2013 data to the original DataFrame
df = pd.concat([df, df_2011, df_2013], ignore_index=True)
df["year"] = pd.to_datetime(df["year"], format="%Y")
df["year"] = df["year"].dt.year

  interpolated_values_2011 = data_2010.iloc[0, 3:].interpolate(method='linear', limit_area='inside', limit=1) + \
  interpolated_values_2013 = data_2010.iloc[0, 3:].interpolate(method='linear', limit_area='inside', limit=1) + \
  interpolated_values_2011 = data_2010.iloc[0, 3:].interpolate(method='linear', limit_area='inside', limit=1) + \
  interpolated_values_2013 = data_2010.iloc[0, 3:].interpolate(method='linear', limit_area='inside', limit=1) + \
  interpolated_values_2011 = data_2010.iloc[0, 3:].interpolate(method='linear', limit_area='inside', limit=1) + \
  interpolated_values_2013 = data_2010.iloc[0, 3:].interpolate(method='linear', limit_area='inside', limit=1) + \
  interpolated_values_2011 = data_2010.iloc[0, 3:].interpolate(method='linear', limit_area='inside', limit=1) + \
  interpolated_values_2013 = data_2010.iloc[0, 3:].interpolate(method='linear', limit_area='inside', limit=1) + \
  interpolated_values_2011 = data_2010.iloc[0, 3:].interpolate(method='linear', limit_ar

In [17]:
df

Unnamed: 0,country,year,isco,jqi_monthly_earnings,jqi_skills_discretion,jqi_physical_environment,jqi_intensity_slim,jqi_working_time_quality_slim
0,Austria,2010,1114,2366.148112,77.290234,97.863248,45.925926,100.0
1,Austria,2010,1120,3143.596761,83.616787,85.042736,50.74074,53.125
2,Austria,2010,1219,1588.023401,64.30635,89.903848,25.694444,85.15625
3,Austria,2010,1221,3194.299927,86.929178,95.512821,57.77778,85.15625
4,Austria,2010,1321,2788.674561,66.906092,72.863248,48.611112,60.0625
...,...,...,...,...,...,...,...,...
8169,Spain,2013,9334,1046.085136,39.659487,81.911422,53.606061,82.545455
8170,Spain,2013,2100,2069.535937,66.338245,88.46154,49.999997,98.75
8171,Spain,2013,2143,2022.221924,68.003346,87.94872,66.222223,70.625
8172,Spain,2013,3258,1435.902368,67.310957,71.28205,40.222222,68.7


In [18]:
df.to_csv(
    "/Users/alexandralugova/Documents/GitHub/MH-old-workers/data/datasets/work_quality_indexes.csv",
    index=False,
)