# Add working conditions data from EWCS

### Load libraries

In [1]:
import subprocess

import pandas as pd
import pyreadstat

### Load data

In [2]:
df, meta = pyreadstat.read_dta(
    "/Users/alexandralugova/Documents/GitHub/MH-old-workers/data/datasets/EWCS 1991-2015 UKDA ISCO.DTA"
)
variable_labels = meta.column_labels

### Choose only necessary data, tranform where needed

Identify and name the aggregated indexes for work conditions

In [3]:
df = df.rename(
    columns={
        "adincome_mth": "jqi_monthly_earnings",
        "wq": "jqi_skills_discretion",
        # "goodsoc": "jqi_social_environment", - to calculate manually
        "envsec": "jqi_physical_environment",
        # "intens_slim": "jqi_intensity",
        # "prosp": "jqi_prospects", - to calculate manually
        "wlb_slim": "jqi_working_time_quality",
    }
)

Define country names and choose the needed

In [4]:
countid_mapping = meta.value_labels["COUNTID"]
df["countid"] = df["countid"].map(countid_mapping)

In [5]:
countries = [
    "Austria",
    "Belgium",
    "Czech Republic",
    "Denmark",
    "Estonia",
    "France",
    "Germany",
    "Italy",
    "Slovenia",
    "Spain",
    "Switzerland",
]
df = df[df["countid"].isin(countries)].reset_index(drop=True)

Leave only waves 5 and 6 (2010 and 2015)

In [6]:
df = df[df.year >= 2010].reset_index(drop=True)

Drop lines with missing isco codes

In [7]:
df = df.dropna(subset="ISCO_08").reset_index(drop=True)

Adjust some isco codes

In [8]:
def modify_isco(value):
    if len(str(value)) == 1:
        return value * 1000
    elif len(str(value)) == 2:
        return value * 100
    elif len(str(value)) == 3:
        return value * 10
    else:
        return value


df["ISCO_08"] = df["ISCO_08"].apply(modify_isco)

Rename some variables

In [9]:
df = df.rename(columns={"countid": "country", "ISCO_08": "isco"})

Format id column

In [10]:
df["id"] = df["id"].astype(str)
df["id"] = df["id"].str.replace(r"[^ -~]+", "", regex=True)
df["id"] = df["id"].str.strip()

In [11]:
df

Unnamed: 0,id,country,wave,year,y15_Q1,y15_Q2a,y15_Q2b,y15_Q2c,y15_Q2d,y15_Q3a_2,...,wq_slim,goodsoc,jqi_physical_environment,intens,intens_slim,prosp,wlb,jqi_working_time_quality,isco,ISCO_88
0,BE000964,Belgium,6,2015,1,1,40,1,2,,...,14.285715,94.444443,66.666664,31.759258,53.333336,25.000000,69.12500,88.500,8141,8231
1,BE000967,Belgium,6,2015,1,2,49,1,2,,...,85.714287,,91.025642,13.148149,24.444445,50.000000,84.50000,87.500,5141,5141
2,BE000968,Belgium,6,2015,2,1,54,1,2,2,...,85.714287,,76.923080,27.037039,52.222221,50.000000,56.31250,37.500,1323,1313
3,BE000970,Belgium,6,2015,3,1,52,1,2,2,...,28.571430,97.916664,98.717949,9.444445,18.888889,62.500000,86.84375,96.875,7115,7124
4,BE000972,Belgium,6,2015,3,1,28,1,2,1,...,42.857143,74.166664,97.435898,30.833334,53.333336,41.666664,74.28125,96.875,8322,8322
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34701,23000011509,Slovenia,5,2010,3,2,63,,,1,...,57.142860,,84.615387,,40.000000,,,100.000,9111,9131
34702,23000011511,Slovenia,5,2010,4,2,21,,,1,...,50.000000,,84.615387,,40.000000,,,62.500,5131,5123
34703,23000011513,Slovenia,5,2010,2,2,48,,,1,...,71.428574,,80.769234,,6.666667,,,87.500,5141,5141
34704,23000011516,Slovenia,5,2010,5,1,41,,,1,...,100.000000,,97.435898,,18.888889,,,88.500,3113,3113


### Calculate social environment index

Load data for 2010 and 2015

In [12]:
df10, meta10 = pyreadstat.read_dta(
    "/Users/alexandralugova/Documents/GitHub/MH-old-workers/data/datasets/UKDA-2010/stata/stata11/ewcs_2010_version_ukda_6_dec_2011.dta"
)

Calculate the index for 2010

In [13]:
soc10 = df10[
    [
        "id",
        "q70a",
        "q70b",
        "q70c",
        "q71a",
        "q71c",
        "q71b",
        "q58b",
        "q58a",
        "q51a",
        "q51b",
    ]
]
soc10 = soc10.dropna().reset_index(drop=True)
for col in soc10.columns:
    if col != "id":
        soc10[col] = soc10[col].astype("int")
        soc10 = soc10[soc10[col] < 7].reset_index(drop=True)
soc10[["q58a", "q58b"]] = soc10[["q58a", "q58b"]].replace({1: 2, 2: 1})
soc10[["q51a", "q51b"]] = soc10[["q51a", "q51b"]].replace({1: 5, 2: 4, 4: 2, 5: 1})

In [14]:
soc10.describe()

Unnamed: 0,id,q70a,q70b,q70c,q71a,q71c,q71b,q58b,q58a,q51a,q51b
count,30836.0,30836.0,30836.0,30836.0,30836.0,30836.0,30836.0,30836.0,30836.0,30836.0,30836.0
mean,15723800000.0,1.891069,1.982196,1.945875,1.980834,1.991017,1.953528,1.950123,1.805682,4.01722,3.759826
std,10177430000.0,0.311558,0.13224,0.226268,0.13711,0.094354,0.210508,0.217694,0.395681,1.012743,1.16407
min,1000001000.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,6000005000.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,3.0
50%,15038200000.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,4.0,4.0
75%,25000000000.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,5.0,5.0
max,34000180000.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,5.0,5.0


In [15]:
soc10

Unnamed: 0,id,q70a,q70b,q70c,q71a,q71c,q71b,q58b,q58a,q51a,q51b
0,1.000001e+09,2,2,2,2,2,2,1,2,4,1
1,1.000001e+09,2,2,2,2,2,2,2,2,4,4
2,1.000001e+09,2,2,2,2,2,2,2,1,4,5
3,1.000001e+09,2,2,2,2,2,2,2,1,4,3
4,1.000001e+09,2,2,2,2,2,2,2,2,3,3
...,...,...,...,...,...,...,...,...,...,...,...
30831,3.400017e+10,2,2,2,2,2,2,2,2,3,4
30832,3.400017e+10,2,2,2,2,2,2,2,2,4,4
30833,3.400017e+10,2,2,2,2,2,2,2,2,3,4
30834,3.400018e+10,2,2,2,2,2,2,2,2,3,3


In [16]:
soc10["jqi_social_environment"] = (
    soc10["q58a"]
    + soc10["q58b"]
    + soc10["q51a"]
    + soc10["q51b"]
    + soc10["q70a"]
    + soc10["q70b"]
    + soc10["q70c"]
    + soc10["q71a"]
    + soc10["q71b"]
    + soc10["q71c"]
)

old_min = 10
old_max = 26
new_min = 0
new_max = 100

soc10["jqi_social_environment"] = (
    (soc10["jqi_social_environment"] - old_min) / (old_max - old_min)
) * (new_max - new_min) + new_min

In [17]:
soc10["jqi_social_environment"].describe()

count    30836.000000
mean        82.983566
std         14.530474
min         12.500000
25%         75.000000
50%         87.500000
75%         93.750000
max        100.000000
Name: jqi_social_environment, dtype: float64

In [18]:
soc10 = soc10[["id", "jqi_social_environment"]]
soc10["year"] = 2010

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  soc10["year"] = 2010


Calculate the index for 2015

In [19]:
soc15 = df[
    [
        "id",
        "y15_Q80a",
        "y15_Q80b",
        "y15_Q80c",
        "y15_Q81a",
        "y15_Q81b",
        "y15_Q81c",
        "y15_Q63a",
        "y15_Q63e",
        "y15_Q61a",
        "y15_Q61b",
    ]
]
soc15 = soc15.dropna().reset_index(drop=True)
for col in soc15.columns:
    if col != "id":
        soc15[col] = soc15[col].astype("int")
        soc15 = soc15[soc15[col] < 7].reset_index(drop=True)
soc15[["y15_Q61a", "y15_Q61b", "y15_Q63a", "y15_Q63e"]] = soc15[
    ["y15_Q61a", "y15_Q61b", "y15_Q63a", "y15_Q63e"]
].replace({1: 5, 2: 4, 4: 2, 5: 1})

In [20]:
soc15.describe()

Unnamed: 0,y15_Q80a,y15_Q80b,y15_Q80c,y15_Q81a,y15_Q81b,y15_Q81c,y15_Q63a,y15_Q63e,y15_Q61a,y15_Q61b
count,13279.0,13279.0,13279.0,13279.0,13279.0,13279.0,13279.0,13279.0,13279.0,13279.0
mean,1.87243,1.980194,1.950072,1.98042,1.990737,1.941637,4.426538,3.817682,3.967543,3.6441
std,0.333623,0.139337,0.217805,0.138556,0.0958,0.234437,0.855332,1.16333,1.079077,1.223853
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,2.0,2.0,2.0,2.0,2.0,2.0,4.0,3.0,3.0,3.0
50%,2.0,2.0,2.0,2.0,2.0,2.0,5.0,4.0,4.0,4.0
75%,2.0,2.0,2.0,2.0,2.0,2.0,5.0,5.0,5.0,5.0
max,2.0,2.0,2.0,2.0,2.0,2.0,5.0,5.0,5.0,5.0


In [21]:
soc15["jqi_social_environment"] = (
    soc15["y15_Q80a"]
    + soc15["y15_Q80b"]
    + soc15["y15_Q80c"]
    + soc15["y15_Q81a"]
    + soc15["y15_Q81b"]
    + soc15["y15_Q81c"]
    + soc15["y15_Q63a"]
    + soc15["y15_Q63e"]
    + soc15["y15_Q61a"]
    + soc15["y15_Q61b"]
)

old_min = 10
old_max = 32
new_min = 0
new_max = 100

soc15["jqi_social_environment"] = (
    (soc15["jqi_social_environment"] - old_min) / (old_max - old_min)
) * (new_max - new_min) + new_min

In [22]:
soc15["jqi_social_environment"].describe()

count    13279.000000
mean        79.869788
std         15.768143
min          9.090909
25%         72.727273
50%         81.818182
75%         90.909091
max        100.000000
Name: jqi_social_environment, dtype: float64

In [23]:
soc15 = soc15[["id", "jqi_social_environment"]]
soc15["year"] = 2015

Add indexes to main df

In [24]:
soc = pd.concat([soc10, soc15], axis=0).reset_index(drop=True)

In [25]:
soc["id"] = soc["id"].astype(str)
soc["id"] = soc["id"].astype(str).apply(lambda x: x[:-2] if x.endswith(".0") else x)
soc["id"] = soc["id"].str.replace(r"[^ -~]+", "", regex=True)

In [26]:
df = df.merge(soc, on=["id", "year"], how="left")

In [27]:
unique_mergeid_df = set(df["id"].unique())
unique_mergeid_soc = set(soc["id"].unique())
intersection_ids = unique_mergeid_df.intersection(unique_mergeid_soc)
len(intersection_ids)

25750

### Calculate prospects index

Index for 2010

In [28]:
pro10 = df10[["id", "q77c", "q77a"]]
pro10 = pro10.dropna().reset_index(drop=True)
for col in pro10.columns:
    if col != "id":
        pro10[col] = pro10[col].astype("int")
        pro10 = pro10[pro10[col] < 7].reset_index(drop=True)
pro10["q77a"] = pro10["q77a"].replace({1: 5, 2: 4, 4: 2, 5: 1})

In [29]:
pro10.describe()

Unnamed: 0,id,q77c,q77a
count,38556.0,38556.0,38556.0
mean,16108120000.0,2.745746,3.70666
std,10320830000.0,1.19969,1.226394
min,1000001000.0,1.0,1.0
25%,7000102000.0,2.0,3.0
50%,15380900000.0,3.0,4.0
75%,26000200000.0,4.0,5.0
max,34000180000.0,5.0,5.0


In [30]:
pro10["jqi_prospects"] = pro10["q77c"] + pro10["q77a"]

old_min = 2
old_max = 10
new_min = 0
new_max = 100

pro10["jqi_prospects"] = ((pro10["jqi_prospects"] - old_min) / (old_max - old_min)) * (
    new_max - new_min
) + new_min

In [31]:
pro10["jqi_prospects"].describe()

count    38556.000000
mean        55.655086
std         23.024819
min          0.000000
25%         37.500000
50%         50.000000
75%         75.000000
max        100.000000
Name: jqi_prospects, dtype: float64

In [32]:
pro10 = pro10[["id", "jqi_prospects"]]
pro10["year"] = 2010

Index for 2015

In [33]:
pro15 = df[["id", "y15_Q89b", "y15_Q89g"]]
pro15 = pro15.dropna().reset_index(drop=True)
for col in pro15.columns:
    if col != "id":
        pro15[col] = pro15[col].astype("int")
        pro15 = pro15[pro15[col] < 7].reset_index(drop=True)
pro15["y15_Q89b"] = pro15["y15_Q89b"].replace({1: 5, 2: 4, 4: 2, 5: 1})

In [34]:
pro15.describe()

Unnamed: 0,y15_Q89b,y15_Q89g
count,30073.0,30073.0
mean,2.797626,3.840555
std,1.282885,1.27062
min,1.0,1.0
25%,2.0,3.0
50%,3.0,4.0
75%,4.0,5.0
max,5.0,5.0


In [35]:
pro15["jqi_prospects"] = pro15["y15_Q89g"] + pro15["y15_Q89b"]

old_min = 2
old_max = 10
new_min = 0
new_max = 100

pro15["jqi_prospects"] = ((pro15["jqi_prospects"] - old_min) / (old_max - old_min)) * (
    new_max - new_min
) + new_min

In [36]:
pro15["jqi_prospects"].describe()

count    30073.000000
mean        57.977255
std         24.018895
min          0.000000
25%         50.000000
50%         62.500000
75%         75.000000
max        100.000000
Name: jqi_prospects, dtype: float64

In [37]:
pro15 = pro15[["id", "jqi_prospects"]]
pro15["year"] = 2015

Add indexes to main df

In [38]:
pro = pd.concat([pro10, pro15], axis=0).reset_index(drop=True)

In [39]:
pro["id"] = pro["id"].astype(str)
pro["id"] = pro["id"].astype(str).apply(lambda x: x[:-2] if x.endswith(".0") else x)
pro["id"] = pro["id"].str.replace(r"[^ -~]+", "", regex=True)

In [40]:
df = df.merge(pro, on=["id", "year"], how="left")

In [41]:
unique_mergeid_df = set(df["id"].unique())
unique_mergeid_pro = set(pro["id"].unique())
intersection_ids = unique_mergeid_df.intersection(unique_mergeid_pro)
len(intersection_ids)

30073

### Calculate intensity index

Index for 2010

In [42]:
int10 = df10[
    [
        "id",
        "q45a",
        "q45b",
        "q51g",
        "q46a",
        "q46b",
        "q46c",
        "q46d",
        "q46e",
        "q51p",
        "q24g",
        "q47",
        "q48",
    ]
]
int10 = int10.dropna().reset_index(drop=True)
for col in int10.columns:
    if col != "id" and col != "q46e":
        int10[col] = int10[col].astype("int")
        int10 = int10[int10[col] < 8].reset_index(drop=True)
    elif col == "q46e":
        int10[col] = int10[col].astype("int")
        int10 = int10[int10[col] < 7].reset_index(drop=True)

int10["q4748"] = int10["q47"] * int10["q48"]

int10["q46"] = 2
for i in range(len(int10)):
    count = 0
    for col in ["q46a", "q46b", "q46c", "q46d", "q46e"]:
        if int10[col][i] == 1:
            count += 1
    if count >= 3:
        int10["q46"][i] = 1

int10["q51g"] = int10["q51g"].replace({1: 5, 2: 4, 4: 2, 5: 1})

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  int10["q46"][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  int10["q46"][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  int10["q46"][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  int10["q46"][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in th

In [43]:
int10.describe()

Unnamed: 0,id,q45a,q45b,q51g,q46a,q46b,q46c,q46d,q46e,q51p,q24g,q47,q48,q4748,q46
count,27063.0,27063.0,27063.0,27063.0,27063.0,27063.0,27063.0,27063.0,27063.0,27063.0,27063.0,27063.0,27063.0,27063.0,27063.0
mean,15562180000.0,4.323061,4.219857,3.915235,1.518826,1.304327,1.623841,1.818128,1.590548,3.319772,5.629827,2.429295,1.746296,4.34379,1.624986
std,10090790000.0,1.981461,2.030365,0.99792,0.499655,0.46013,0.48443,0.385746,0.491742,1.389524,1.606418,0.756038,0.597895,2.115829,0.484135
min,1000001000.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,7000022000.0,2.0,2.0,3.0,1.0,1.0,1.0,2.0,1.0,2.0,5.0,2.0,1.0,2.0,1.0
50%,15038110000.0,5.0,4.0,4.0,2.0,1.0,2.0,2.0,2.0,3.0,6.0,3.0,2.0,4.0,2.0
75%,24000160000.0,6.0,6.0,5.0,2.0,2.0,2.0,2.0,2.0,5.0,7.0,3.0,2.0,6.0,2.0
max,34000170000.0,7.0,7.0,5.0,2.0,2.0,2.0,2.0,2.0,5.0,7.0,3.0,3.0,9.0,2.0


In [44]:
int10["jqi_intensity"] = (
    int10["q45a"]
    + int10["q45b"]
    + int10["q51g"]
    + int10["q46a"]
    + int10["q46b"]
    + int10["q46c"]
    + int10["q46d"]
    + int10["q46e"]
    + int10["q46"]
    + int10["q51p"]
    + int10["q24g"]
    + int10["q4748"]
)

old_min = 11
old_max = 54
new_min = 0
new_max = 100

int10["jqi_intensity"] = ((int10["jqi_intensity"] - old_min) / (old_max - old_min)) * (
    new_max - new_min
) + new_min

In [45]:
int10["jqi_intensity"].describe()

count    27063.000000
mean        56.353951
std         15.846348
min          4.651163
25%         46.511628
50%         58.139535
75%         67.441860
max         95.348837
Name: jqi_intensity, dtype: float64

In [46]:
int10 = int10[["id", "jqi_intensity"]]
int10["year"] = 2010

Index for 2015

In [47]:
int15 = df[
    [
        "id",
        "y15_Q49a",
        "y15_Q49b",
        "y15_Q61g",
        "y15_Q50a",
        "y15_Q50b",
        "y15_Q50c",
        "y15_Q50d",
        "y15_Q50e",
        "y15_Q61o",
        "y15_Q30g",
        "y15_Q51",
        "y15_Q52",
    ]
]
int15 = int15.dropna().reset_index(drop=True)
for col in int15.columns:
    if col != "id":
        int15[col] = int15[col].astype("int")
        int15 = int15[int15[col] < 7].reset_index(drop=True)

int15["y15_Q5152"] = int15["y15_Q51"] * int15["y15_Q52"]

int15["y15_Q50"] = 0
for i in range(len(int15)):
    count = 0
    for col in ["y15_Q50a", "y15_Q50b", "y15_Q50c", "y15_Q50d", "y15_Q50e"]:
        if int15[col][i] == 1:
            count += 1
    if count >= 3:
        int15["y15_Q50"][i] = 1

int15["y15_Q61g"] = int15["y15_Q61g"].replace({1: 5, 2: 4, 4: 2, 5: 1})

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  int15["y15_Q50"][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  int15["y15_Q50"][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  int15["y15_Q50"][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  int15["y15_Q50"][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See t

In [48]:
int15.describe()

Unnamed: 0,y15_Q49a,y15_Q49b,y15_Q61g,y15_Q50a,y15_Q50b,y15_Q50c,y15_Q50d,y15_Q50e,y15_Q61o,y15_Q30g,y15_Q51,y15_Q52,y15_Q5152,y15_Q50
count,12947.0,12947.0,12947.0,12947.0,12947.0,12947.0,12947.0,12947.0,12947.0,12947.0,12947.0,12947.0,12947.0,12947.0
mean,3.700935,3.607786,3.692979,1.526377,1.173322,1.566386,1.83726,1.627018,3.125975,4.735923,2.350583,1.614737,3.8999,0.404264
std,1.682326,1.737535,0.979734,0.499323,0.37854,0.495592,0.369143,0.483616,1.341201,1.604203,0.764868,0.591948,2.044807,0.490768
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
25%,2.0,2.0,3.0,1.0,1.0,1.0,2.0,1.0,2.0,4.0,2.0,1.0,2.0,0.0
50%,4.0,4.0,4.0,2.0,1.0,2.0,2.0,2.0,3.0,5.0,3.0,2.0,3.0,0.0
75%,5.0,5.0,4.0,2.0,1.0,2.0,2.0,2.0,4.0,6.0,3.0,2.0,6.0,1.0
max,6.0,6.0,5.0,2.0,2.0,2.0,2.0,2.0,5.0,6.0,3.0,3.0,9.0,1.0


In [49]:
int15["jqi_intensity"] = (
    int15["y15_Q49a"]
    + int15["y15_Q49b"]
    + int15["y15_Q61g"]
    + int15["y15_Q50a"]
    + int15["y15_Q50b"]
    + int15["y15_Q50c"]
    + int15["y15_Q50d"]
    + int15["y15_Q50e"]
    + int15["y15_Q50"]
    + int15["y15_Q61o"]
    + int15["y15_Q30g"]
    + int15["y15_Q5152"]
)

old_min = 11
old_max = 54
new_min = 0
new_max = 100

int15["jqi_intensity"] = ((int15["jqi_intensity"] - old_min) / (old_max - old_min)) * (
    new_max - new_min
) + new_min

In [50]:
int15["jqi_intensity"].describe()

count    12947.000000
mean        46.274705
std         13.852596
min          2.325581
25%         37.209302
50%         46.511628
75%         55.813953
max         83.720930
Name: jqi_intensity, dtype: float64

In [51]:
int15 = int15[["id", "jqi_intensity"]]
int15["year"] = 2015

Add indexes to main df

In [52]:
int = pd.concat([int10, int15], axis=0).reset_index(drop=True)

In [53]:
int["id"] = int["id"].astype(str)
int["id"] = int["id"].astype(str).apply(lambda x: x[:-2] if x.endswith(".0") else x)
int["id"] = int["id"].str.replace(r"[^ -~]+", "", regex=True)

In [54]:
df = df.merge(int, on=["id", "year"], how="left")

In [55]:
unique_mergeid_df = set(df["id"].unique())
unique_mergeid_int = set(int["id"].unique())
intersection_ids = unique_mergeid_df.intersection(unique_mergeid_int)
len(intersection_ids)

18419

### Add country level post-stratification weights

In [56]:
columns_to_multiply = [
    "jqi_monthly_earnings",
    "jqi_skills_discretion",
    "jqi_social_environment",
    "jqi_physical_environment",
    "jqi_intensity",
    "jqi_prospects",
    "jqi_working_time_quality",
]

for column in columns_to_multiply:
    df[f"{column}_weighted"] = df[column] * df["w4"]

### Add sum of indexes

Normal

In [57]:
df["jqi_sum"] = (
    +df["jqi_skills_discretion"]
    + df["jqi_social_environment"]
    + df["jqi_physical_environment"]
    + df["jqi_intensity"]
    + df["jqi_prospects"]
    + df["jqi_working_time_quality"]
)

In [58]:
df["jqi_sum"].describe()

count    13926.000000
mean       419.805953
std         59.874116
min        133.185597
25%        383.844714
50%        425.971467
75%        463.307343
max        575.941286
Name: jqi_sum, dtype: float64

Weighted

In [59]:
df["jqi_sum_weighted"] = (
    +df["jqi_skills_discretion_weighted"]
    + df["jqi_social_environment_weighted"]
    + df["jqi_physical_environment_weighted"]
    + df["jqi_intensity_weighted"]
    + df["jqi_prospects_weighted"]
    + df["jqi_working_time_quality_weighted"]
)

In [60]:
df["jqi_sum_weighted"].describe()

count    13926.000000
mean       431.165504
std        253.552717
min         40.274174
25%        246.390676
50%        368.014244
75%        544.066005
max       2656.738863
Name: jqi_sum_weighted, dtype: float64

In [61]:
df

Unnamed: 0,id,country,wave,year,y15_Q1,y15_Q2a,y15_Q2b,y15_Q2c,y15_Q2d,y15_Q3a_2,...,jqi_intensity,jqi_monthly_earnings_weighted,jqi_skills_discretion_weighted,jqi_social_environment_weighted,jqi_physical_environment_weighted,jqi_intensity_weighted,jqi_prospects_weighted,jqi_working_time_quality_weighted,jqi_sum,jqi_sum_weighted
0,BE000964,Belgium,6,2015,1,1,40,1,2,,...,,903.657300,5.041595,56.372266,39.371105,,0.000000,52.265144,,
1,BE000967,Belgium,6,2015,1,2,49,1,2,,...,,,20.449089,,28.806330,,,27.690591,,
2,BE000968,Belgium,6,2015,2,1,54,1,2,2,...,60.465116,770.161311,32.674170,,32.909586,25.868464,,16.043422,,
3,BE000970,Belgium,6,2015,3,1,52,1,2,2,...,,943.044961,25.644061,59.869880,59.102318,,44.902410,57.998946,,
4,BE000972,Belgium,6,2015,3,1,28,1,2,1,...,,754.819666,18.183448,42.950298,57.542387,,22.146248,57.211139,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34701,23000011509,Slovenia,5,2010,3,2,63,,,1,...,,113.467863,47.278745,,96.076142,,28.386132,113.544528,,
34702,23000011511,Slovenia,5,2010,4,2,21,,,1,...,65.116279,530.970737,41.498448,69.072833,71.933840,55.357119,,53.132949,,
34703,23000011513,Slovenia,5,2010,2,2,48,,,1,...,,165.145245,22.903454,,30.508954,,23.608119,33.051366,,
34704,23000011516,Slovenia,5,2010,5,1,41,,,1,...,79.069767,1065.008474,127.447402,142.097075,138.453561,112.355827,124.334941,125.755912,542.196042,770.444718


### Aggregate on the level of isco, year and country

In [63]:
# Choose columns
df1 = df[
    [
        "country",
        "year",
        "isco",
        "jqi_monthly_earnings",
        "jqi_skills_discretion",
        "jqi_social_environment",
        "jqi_physical_environment",
        "jqi_intensity",
        "jqi_prospects",
        "jqi_working_time_quality",
        "jqi_monthly_earnings_weighted",
        "jqi_skills_discretion_weighted",
        "jqi_social_environment_weighted",
        "jqi_physical_environment_weighted",
        "jqi_intensity_weighted",
        "jqi_prospects_weighted",
        "jqi_working_time_quality_weighted",
        "jqi_sum",
        "jqi_sum_weighted",
    ]
]

In [64]:
df1 = df1.groupby(["country", "year", "isco"]).mean().reset_index(drop=False)

In [65]:
df1 = df1.dropna().reset_index(drop=True)

Interpolate to obtain approximation of values for 2013

In [66]:
df1["year"] = pd.to_datetime(df1["year"], format="%Y")

df_2010 = df1[df1["year"].dt.year == 2010]
df_2015 = df1[df1["year"].dt.year == 2015]

df_2013 = pd.DataFrame()

for country in df1["country"].unique():
    for isco in df1["isco"].unique():
        data_2010 = df_2010[(df_2010["country"] == country) & (df_2010["isco"] == isco)]
        data_2015 = df_2015[(df_2015["country"] == country) & (df_2015["isco"] == isco)]

        if not data_2010.empty and not data_2015.empty:
            interpolated_values_2013 = data_2010.iloc[0, 3:].interpolate(
                method="linear", limit_area="inside", limit=1
            ) + (data_2015.iloc[0, 3:] - data_2010.iloc[0, 3:]).multiply(3 / 5)

            interpolated_values_2013.index = data_2010.columns[3:]

            row_2013 = pd.DataFrame(
                {
                    "country": [country],
                    "year": [pd.Timestamp(year=2013, month=1, day=1)],
                    "isco": [isco],
                }
            ).join(interpolated_values_2013.to_frame().T.reset_index(drop=True))

            df_2013 = pd.concat([df_2013, row_2013], ignore_index=True)

df1 = pd.concat([df1, df_2013], ignore_index=True)
df1["year"] = pd.to_datetime(df1["year"], format="%Y")
df1["year"] = df1["year"].dt.year

  interpolated_values_2013 = data_2010.iloc[0, 3:].interpolate(
  interpolated_values_2013 = data_2010.iloc[0, 3:].interpolate(
  interpolated_values_2013 = data_2010.iloc[0, 3:].interpolate(
  interpolated_values_2013 = data_2010.iloc[0, 3:].interpolate(
  interpolated_values_2013 = data_2010.iloc[0, 3:].interpolate(
  interpolated_values_2013 = data_2010.iloc[0, 3:].interpolate(
  interpolated_values_2013 = data_2010.iloc[0, 3:].interpolate(
  interpolated_values_2013 = data_2010.iloc[0, 3:].interpolate(
  interpolated_values_2013 = data_2010.iloc[0, 3:].interpolate(
  interpolated_values_2013 = data_2010.iloc[0, 3:].interpolate(
  interpolated_values_2013 = data_2010.iloc[0, 3:].interpolate(
  interpolated_values_2013 = data_2010.iloc[0, 3:].interpolate(
  interpolated_values_2013 = data_2010.iloc[0, 3:].interpolate(
  interpolated_values_2013 = data_2010.iloc[0, 3:].interpolate(
  interpolated_values_2013 = data_2010.iloc[0, 3:].interpolate(
  interpolated_values_2013 = data_2010.i

In [67]:
df1["year"] = df1["year"].replace({2010: 2011})

In [68]:
df1

Unnamed: 0,country,year,isco,jqi_monthly_earnings,jqi_skills_discretion,jqi_social_environment,jqi_physical_environment,jqi_intensity,jqi_prospects,jqi_working_time_quality,jqi_monthly_earnings_weighted,jqi_skills_discretion_weighted,jqi_social_environment_weighted,jqi_physical_environment_weighted,jqi_intensity_weighted,jqi_prospects_weighted,jqi_working_time_quality_weighted,jqi_sum,jqi_sum_weighted
0,Austria,2011,1114,2366.148112,77.290234,100.0,97.863248,52.325581,75.0,100.0,3034.577207,94.40666,60.31181,121.392346,47.096205,45.233857,123.996761,523.975161,316.018902
1,Austria,2011,1120,3143.596761,83.616787,87.5,85.042736,53.820598,78.571429,53.125,1638.952541,52.464895,47.019517,53.989173,34.3123,49.624036,32.531523,462.6496,250.798543
2,Austria,2011,1219,1588.023401,64.30635,93.75,89.903848,65.697674,59.375,85.15625,1166.245217,44.854093,82.620009,60.938954,54.299599,42.050841,59.592109,513.940069,452.925154
3,Austria,2011,1321,2788.674561,66.906092,81.25,72.863248,45.348837,75.0,60.0625,2632.194486,96.736029,97.872116,105.282028,57.58021,94.529457,101.218176,428.827792,521.19203
4,Austria,2011,1324,1799.962646,65.660399,87.5,87.948718,50.0,62.5,83.75,1804.782956,67.982449,43.819472,94.822467,52.969386,64.502054,85.356768,468.942539,234.843595
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4268,Spain,2013,9334,1046.085136,39.659487,84.363636,81.911422,46.678593,46.985294,82.545455,1092.568983,43.613822,88.254037,84.228471,38.838715,51.95402,86.14677,369.81613,310.287247
4269,Spain,2013,2100,2069.535937,66.338245,97.5,88.46154,59.069767,57.5,98.75,3654.135875,115.455341,170.467285,154.932298,84.230581,103.722052,171.49274,467.619552,800.300298
4270,Spain,2013,3258,1435.902368,67.310957,96.363636,71.28205,52.55814,57.5,68.7,1456.858411,72.508648,100.578868,76.626674,61.410752,68.131395,70.518447,409.30079,471.574849
4271,Spain,2013,4212,1261.978491,50.317439,93.522727,78.076923,41.860465,67.5,75.3875,782.891747,38.370388,74.793264,54.05148,28.959014,44.732223,52.370037,418.935822,309.266499


In [68]:
df1.to_csv(
    "/Users/alexandralugova/Documents/GitHub/MH-old-workers/data/datasets/results/work_quality_indexes_year_country.csv",
    index=False,
)

### Aggregate on the level of isco and year 

In [69]:
# Choose columns
df2 = df[
    [
        "year",
        "isco",
        "jqi_monthly_earnings",
        "jqi_skills_discretion",
        "jqi_social_environment",
        "jqi_physical_environment",
        "jqi_intensity",
        "jqi_prospects",
        "jqi_working_time_quality",
        "jqi_monthly_earnings_weighted",
        "jqi_skills_discretion_weighted",
        "jqi_social_environment_weighted",
        "jqi_physical_environment_weighted",
        "jqi_intensity_weighted",
        "jqi_prospects_weighted",
        "jqi_working_time_quality_weighted",
        "jqi_sum",
        "jqi_sum_weighted",
    ]
]

In [70]:
df2 = df2.groupby(["year", "isco"]).mean().reset_index(drop=False)

In [71]:
df2 = df2.dropna().reset_index(drop=True)

Interpolate to obtain approximation of values for 2013

In [72]:
df2["year"] = pd.to_datetime(df2["year"], format="%Y")

df_2010 = df2[df2["year"].dt.year == 2010]
df_2015 = df2[df2["year"].dt.year == 2015]

df_2013 = pd.DataFrame()

for isco in df2["isco"].unique():
    data_2010 = df_2010[df_2010["isco"] == isco]
    data_2015 = df_2015[df_2015["isco"] == isco]

    if not data_2010.empty and not data_2015.empty:
        interpolated_values_2013 = data_2010.iloc[0, 2:].interpolate(
            method="linear", limit_area="inside", limit=1
        ) + (data_2015.iloc[0, 2:] - data_2010.iloc[0, 2:]).multiply(3 / 5)

        interpolated_values_2013.index = data_2010.columns[2:]

        row_2013 = pd.DataFrame(
            {
                "year": [pd.Timestamp(year=2013, month=1, day=1)],
                "isco": [isco],
            }
        ).join(interpolated_values_2013.to_frame().T.reset_index(drop=True))

        df_2013 = pd.concat([df_2013, row_2013], ignore_index=True)

df2 = pd.concat([df2, df_2013], ignore_index=True)
df2["year"] = pd.to_datetime(df2["year"], format="%Y")
df2["year"] = df2["year"].dt.year

  interpolated_values_2013 = data_2010.iloc[0, 2:].interpolate(
  interpolated_values_2013 = data_2010.iloc[0, 2:].interpolate(
  interpolated_values_2013 = data_2010.iloc[0, 2:].interpolate(
  interpolated_values_2013 = data_2010.iloc[0, 2:].interpolate(
  interpolated_values_2013 = data_2010.iloc[0, 2:].interpolate(
  interpolated_values_2013 = data_2010.iloc[0, 2:].interpolate(
  interpolated_values_2013 = data_2010.iloc[0, 2:].interpolate(
  interpolated_values_2013 = data_2010.iloc[0, 2:].interpolate(
  interpolated_values_2013 = data_2010.iloc[0, 2:].interpolate(
  interpolated_values_2013 = data_2010.iloc[0, 2:].interpolate(
  interpolated_values_2013 = data_2010.iloc[0, 2:].interpolate(
  interpolated_values_2013 = data_2010.iloc[0, 2:].interpolate(
  interpolated_values_2013 = data_2010.iloc[0, 2:].interpolate(
  interpolated_values_2013 = data_2010.iloc[0, 2:].interpolate(
  interpolated_values_2013 = data_2010.iloc[0, 2:].interpolate(
  interpolated_values_2013 = data_2010.i

In [73]:
df2["year"] = df2["year"].replace({2010: 2011})

In [74]:
df2

Unnamed: 0,year,isco,jqi_monthly_earnings,jqi_skills_discretion,jqi_social_environment,jqi_physical_environment,jqi_intensity,jqi_prospects,jqi_working_time_quality,jqi_monthly_earnings_weighted,jqi_skills_discretion_weighted,jqi_social_environment_weighted,jqi_physical_environment_weighted,jqi_intensity_weighted,jqi_prospects_weighted,jqi_working_time_quality_weighted,jqi_sum,jqi_sum_weighted
0,2011,1100,2587.962799,69.381842,90.625,90.909092,57.293869,75.0,98.011364,2384.988548,63.621495,85.602383,83.055189,51.259031,71.443338,88.034729,480.587444,471.879209
1,2011,1111,2250.212012,72.508442,85.714286,91.025642,60.13289,70.3125,85.0,2453.299933,70.387276,71.683611,90.203566,70.144032,65.69619,76.230457,473.16208,368.829092
2,2011,1112,2321.469202,70.534725,79.583333,89.136303,47.227191,63.888889,81.190789,2426.633879,69.585232,89.177723,85.956303,49.152847,60.116236,83.48866,432.032596,465.711954
3,2011,1114,1989.939789,77.824947,93.75,94.172495,57.364341,56.944444,85.227273,2275.227127,88.627626,102.627818,108.335033,55.329127,54.364311,104.293829,464.170265,429.618263
4,2011,1120,2457.219682,80.784538,85.0,90.104732,54.99031,66.25,65.137911,3704.684725,139.517336,129.35272,153.578265,88.700373,112.70507,108.461096,464.934273,638.009317
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1111,2013,9613,890.461025,32.916559,74.325069,72.912467,58.161683,45.826923,91.558011,705.429415,27.113838,62.298783,59.24481,51.368231,39.875611,75.647279,366.415187,358.436017
1112,2013,9621,1035.468639,36.916921,74.17445,82.967113,43.70801,50.690586,83.485441,867.695985,31.468296,62.659457,72.162075,35.36361,44.962885,71.460542,361.023049,301.121436
1113,2013,9622,1013.696387,42.122602,76.221834,76.978023,60.099668,47.272727,96.189286,985.445679,37.204711,66.438476,67.253493,56.548384,40.447552,84.648333,384.158373,467.68368
1114,2013,9623,1876.918372,56.138174,84.431818,77.564103,48.372093,55.0,73.525,1841.243587,54.120743,51.463504,69.268286,32.721479,29.667498,56.135597,400.971932,263.547145


In [75]:
df2.to_csv(
    "/Users/alexandralugova/Documents/GitHub/MH-old-workers/data/datasets/results/work_quality_indexes_year.csv",
    index=False,
)