In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the working table. NOTE: this is the same CSV path that the last cell
# of the notebook writes back to, so the input file is overwritten on every
# full run.
df = pd.read_csv('../Health_cleaned_income_delta.csv')
df.head()

Unnamed: 0,hhidpn,rmstat,ragender,rahispan,raracem,riwbegy,ragey_b,sagey_b,rhltc,rhlthlm,...,siearn_delta,ripena_delta,sipena_delta,rgov_delta,sgov_delta,total_work_income_delta,total_pension_income_delta,total_gov_income_delta,total_income_delta,broken
0,22861040,1,2,0,2,1992-04-01,62,56,5,1,...,,,,,,,,,,False
1,25154020,1,2,0,2,1992-04-01,56,59,3,1,...,,,,,,,,,,True
2,25181010,1,2,0,1,1992-04-06,58,53,3,1,...,,,,,,,,,,False
3,11067020,1,2,0,1,1992-04-07,53,50,3,0,...,,,,,,,,,,False
4,22861010,1,1,0,2,1992-04-07,56,62,3,1,...,,,,,,,,,,False


In [3]:
# Convert the 'riwbegy' column from its CSV text form to datetime values.
df['riwbegy'] = df['riwbegy'].pipe(pd.to_datetime)

In [4]:
def count_jobs(row):
    """Return how many of the two job slots (0, 1 or 2) are filled for a row.

    A slot counts as filled unless its weeks value ('rjweeks' for the first
    slot, 'rjweek2' for the second) equals the string '.m' or the string '0'.

    NOTE(review): only the *strings* '.m' and '0' are recognised; a numeric 0
    or any other marker would be counted as a job - confirm the column dtypes.
    """
    no_job_markers = ('.m', '0')

    first_slot_filled = row['rjweeks'] not in no_job_markers
    second_slot_filled = row['rjweek2'] not in no_job_markers

    return int(first_slot_filled) + int(second_slot_filled)


# Evaluate count_jobs once per row and store the result as 'n_jobs'.
df['n_jobs'] = df.apply(count_jobs, axis='columns')

In [5]:
# Order the rows by 'riwbegy'; the per-respondent loops further down walk the
# rows in this order.
df = df.sort_values(by='riwbegy')
df.head()

Unnamed: 0,hhidpn,rmstat,ragender,rahispan,raracem,riwbegy,ragey_b,sagey_b,rhltc,rhlthlm,...,siearn_delta,ripena_delta,sipena_delta,rgov_delta,sgov_delta,total_work_income_delta,total_pension_income_delta,total_gov_income_delta,total_income_delta,broken
0,22861040,1,2,0,2,1992-04-01,62,56,5,1,...,,,,,,,,,,False
1,25154020,1,2,0,2,1992-04-01,56,59,3,1,...,,,,,,,,,,True
2,25181010,1,2,0,1,1992-04-06,58,53,3,1,...,,,,,,,,,,False
3,11067020,1,2,0,1,1992-04-07,53,50,3,0,...,,,,,,,,,,False
4,22861010,1,1,0,2,1992-04-07,56,62,3,1,...,,,,,,,,,,False


In [6]:
# Mean of the 'cpi' column for each distinct value of 'year'.
cpi_by_year = df.groupby(by='year')['cpi'].agg('mean')
cpi_by_year

year
1991    0.811126
1992    0.831578
1993    0.852219
1994    0.870152
1995    0.891173
1996    0.915049
1997    0.934777
1998    0.947672
1999    0.967805
2001    1.028097
2002    1.044544
2003    1.068196
2004    1.096848
2005    1.133913
2006    1.170722
2007    1.204051
2008    1.250222
2009    1.245748
2010    1.266227
2011    1.306293
2012    1.333220
Name: cpi, dtype: float64

In [7]:
# Distinct 'hhidpn' values, in order of first appearance.
unique_ids = pd.unique(df['hhidpn'])
unique_ids.shape

(31532,)

In [8]:
# df.rename(columns={'total_work_earn': 'total_work_income', 'total_pension_earn': 'total_pension_income'}, inplace=True)

In [8]:
df.columns

Index(['hhidpn', 'rmstat', 'ragender', 'rahispan', 'raracem', 'riwbegy',
       'ragey_b', 'sagey_b', 'rhltc', 'rhlthlm', 'rhibpe', 'rdiabe', 'rcancre',
       'rlunge', 'rhearte', 'rstroke', 'rpsyche', 'rarthre', 'rhosp',
       'rhspnit', 'oop_spend', 'riearn', 'ripena', 'siearn', 'sipena', 'rcovr',
       'rcovs', 'rlbrf', 'rjphys', 'rjlift', 'rjweeks', 'rjweek2', 'rjcten',
       'index_wave', 'insured_gov', 'uninsured', 'retired', 'collegeplus',
       'year', 'cpi', 'rgov', 'sgov', 'inter_year', 'year_of_birth',
       'total_work_income', 'total_pension_income', 'total_gov_income',
       'total_income', 'without_work', 'n_jobs', 'riearn_delta',
       'siearn_delta', 'ripena_delta', 'sipena_delta', 'rgov_delta',
       'sgov_delta', 'total_work_income_delta', 'total_pension_income_delta',
       'total_gov_income_delta', 'total_income_delta', 'broken'],
      dtype='object')

In [9]:
# Columns for which a '<name>_delta' column is computed further down.
income_columns = [
    'riearn',
    'siearn',
    'ripena',
    'sipena',
    'rgov',
    'sgov',
    'total_work_income',
    'total_pension_income',
    'total_gov_income',
    'total_income',
]

In [10]:
df.head()

Unnamed: 0,hhidpn,rmstat,ragender,rahispan,raracem,riwbegy,ragey_b,sagey_b,rhltc,rhlthlm,...,siearn_delta,ripena_delta,sipena_delta,rgov_delta,sgov_delta,total_work_income_delta,total_pension_income_delta,total_gov_income_delta,total_income_delta,broken
0,22861040,1,2,0,2,1992-04-01,62,56,5,1,...,,,,,,,,,,False
1,25154020,1,2,0,2,1992-04-01,56,59,3,1,...,,,,,,,,,,True
2,25181010,1,2,0,1,1992-04-06,58,53,3,1,...,,,,,,,,,,False
3,11067020,1,2,0,1,1992-04-07,53,50,3,0,...,,,,,,,,,,False
4,22861010,1,1,0,2,1992-04-07,56,62,3,1,...,,,,,,,,,,False


In [18]:
df['rmstat'].value_counts()

1     108070
7      30016
5      15665
8       5955
3       5748
4       2865
2       1264
6       1064
.m       114
Name: rmstat, dtype: int64

In [14]:
def count_deltas(rows, columns=None, cpi=None):
    """Compute row-over-row percentage changes of income columns.

    Each row is compared with the row immediately before it, in the order the
    rows are given.

    Parameters
    ----------
    rows : pandas.DataFrame
        Rows to compare; must contain a 'year' column and every column named
        in ``columns``.
    columns : list of str, optional
        Columns to compute deltas for. Defaults to the notebook-level
        ``income_columns``.
    cpi : pandas.Series, optional
        CPI values indexed by year. Defaults to the notebook-level
        ``cpi_by_year``.

    Returns
    -------
    dict
        Maps ``'<col>_delta'`` to a list with one entry per row of ``rows``.
        An entry is NaN for the first row, when the two rows are more than
        2 years apart, or when either of the two values is 0.

    Raises
    ------
    KeyError
        If a delta has to be computed for a year that is missing from ``cpi``.
    """
    if columns is None:
        columns = income_columns
    if cpi is None:
        cpi = cpi_by_year

    # Materialise the rows once so that "previous row" is purely positional;
    # a label lookup would return several rows if index labels repeat.
    records = [record for _, record in rows.iterrows()]

    if not records:
        # Keep the "one entry per row" contract for an empty frame as well.
        return {f'{col}_delta': [] for col in columns}

    # The first row has no predecessor to compare against.
    income_deltas = {f'{col}_delta': [np.nan] for col in columns}

    for prev_row, row in zip(records, records[1:]):
        current_year = row['year']
        valid_interval = True
        if current_year - prev_row['year'] > 2:
            valid_interval = False

        for col in columns:
            if not valid_interval or prev_row[col] == 0 or row[col] == 0:
                income_deltas[f'{col}_delta'].append(np.nan)
            else:
                # NOTE(review): the ratio is scaled by the CPI of the current
                # year only (not by cpi_prev / cpi_current) - confirm this is
                # the intended inflation adjustment.
                income_deltas[f'{col}_delta'].append(
                    row[col] / prev_row[col] * 100 * cpi.loc[current_year] - 100
                )

    return income_deltas

In [20]:
def check_if_broken(rows):
    """Return 1 if the 'rmstat' history is code 1 followed only by 2/5/7, else 0.

    The codes in ``rows['rmstat']`` are read in row order. The result is 1
    only when the history starts with code 1 (the "was married" state of this
    function), later switches to one of the codes 2, 5 or 7, and never returns
    to code 1. It is 0 in every other case, including:

    * any '.m' entry or any code outside {1, 2, 5, 7};
    * a 2/5/7 code appearing before the first code 1;
    * a history that never leaves code 1, or an empty history.
    """
    accepted_codes = (1, 2, 5, 7)

    seen_married = False
    seen_ended = False

    for raw_status in rows['rmstat']:
        if raw_status == '.m':
            return 0

        code = int(raw_status)
        if code not in accepted_codes:
            return 0

        if code == 1:
            if seen_ended:
                # Back to code 1 after a 2/5/7 code: not counted as broken.
                return 0
            seen_married = True
        else:
            # code is one of 2, 5, 7 here
            if not seen_married:
                return 0
            seen_ended = True

    return int(seen_ended)

In [21]:
# Flag every respondent with the result of check_if_broken on their rows.
# The status history of each 'hhidpn' is evaluated once through a groupby
# (row order inside a group is the frame's current order) instead of
# re-filtering the whole frame twice per id, and the column is assigned in a
# single step rather than through row-wise writes of ints into the
# CSV-loaded column.
broken_by_id = df.groupby('hhidpn', sort=False)[['rmstat']].apply(check_if_broken)
df['broken'] = df['hhidpn'].map(broken_by_id)

df.head()

Unnamed: 0,hhidpn,rmstat,ragender,rahispan,raracem,riwbegy,ragey_b,sagey_b,rhltc,rhlthlm,...,siearn_delta,ripena_delta,sipena_delta,rgov_delta,sgov_delta,total_work_income_delta,total_pension_income_delta,total_gov_income_delta,total_income_delta,broken
0,22861040,1,2,0,2,1992-04-01,62,56,5,1,...,,,,,,,,,,0
1,25154020,1,2,0,2,1992-04-01,56,59,3,1,...,,,,,,,,,,1
2,25181010,1,2,0,1,1992-04-06,58,53,3,1,...,,,,,,,,,,0
3,11067020,1,2,0,1,1992-04-07,53,50,3,0,...,,,,,,,,,,0
4,22861010,1,1,0,2,1992-04-07,56,62,3,1,...,,,,,,,,,,0


In [40]:
df[df['hhidpn'] == 15834010]

Unnamed: 0,hhidpn,rmstat,ragender,rahispan,raracem,riwbegy,ragey_b,sagey_b,rhltc,rhlthlm,...,siearn_delta,ripena_delta,sipena_delta,rgov_delta,sgov_delta,total_work_income_delta,total_pension_income_delta,total_gov_income_delta,total_income_delta,broken
2900,15834010,5,1,0,1,1992-07-13,55,.m,3,0,...,,,,,,,,,,False
17562,15834010,5,1,0,1,1994-08-20,57,.m,3,0,...,,,,,,,,,,False
21586,15834010,1,1,0,1,1996-06-15,58,46,3,0,...,,,,,,,,,,False
44675,15834010,1,1,0,1,1998-09-15,61,48,3,0,...,,,,,,,,,,False
62977,15834010,1,1,0,1,2000-08-15,63,50,4,1,...,,,,,,,,,,False
67558,15834010,1,1,0,1,2002-05-15,64,52,3,1,...,,,,,,,,,,False
94611,15834010,1,1,0,1,2004-08-15,67,54,3,.m,...,,,,,,59.012362,,,89.773028,False
109572,15834010,5,1,0,1,2006-07-15,69,.m,2,1,...,,,,2.821062,,-50.273609,,2.821062,-41.667394,False
127260,15834010,5,1,0,1,2008-07-15,71,.m,2,1,...,,,,16.730065,,-16.006433,,16.730065,-6.653148,False
138356,15834010,5,1,0,1,2010-08-15,73,.m,3,1,...,,,,28.332463,,-75.918982,,28.332463,-38.671485,False


In [24]:
df[df['broken'] == True].groupby('hhidpn')['rmstat'].unique()

hhidpn
10004040     [1, 7]
10075020     [1, 7]
10109030     [1, 7]
10210020     [1, 7]
10395020     [1, 7]
              ...  
918529020    [1, 2]
919258010    [1, 5]
919258020    [1, 5]
920538010    [1, 2]
920538020    [1, 2]
Name: rmstat, Length: 3480, dtype: object

In [31]:
# Recompute the '<col>_delta' columns for every respondent.
# The previous version hit an unconditional `break` on the first iteration,
# so nothing was computed and the deltas came from whatever the CSV already
# contained. The row labels of each 'hhidpn' are collected once up front
# instead of re-scanning the frame with a boolean mask for every id.
for person_labels in df.groupby('hhidpn', sort=False).groups.values():
    income_deltas = count_deltas(df.loc[person_labels])

    for col in income_columns:
        df.loc[person_labels, f'{col}_delta'] = income_deltas[f'{col}_delta']

In [23]:
df[df['hhidpn'] == 22861010][income_columns + [f'{col}_delta' for col in income_columns]]

Unnamed: 0,riearn,siearn,ripena,sipena,rgov,sgov,total_work_income,total_pension_income,total_gov_income,total_income,riearn_delta,siearn_delta,ripena_delta,sipena_delta,rgov_delta,sgov_delta,total_work_income_delta,total_pension_income_delta,total_gov_income_delta,total_income_delta
4,23485.564002,0.0,0.0,0.0,0.0,3522.8347,23485.564002,0.0,3522.8347,27008.398702,,,,,,,,,,
11013,25280.287153,0.0,0.0,0.0,0.0,3768.0933,25280.287153,0.0,3768.0933,29048.380453,-8.265564,,,,,-8.844943,-8.265564,,-8.844943,-8.341135
25453,23157.354808,0.0,0.0,0.0,0.0,5252.3936,23157.354808,0.0,5252.3936,28409.748408,-18.366358,,,,,24.221806,-18.366358,,24.221806,-12.841913
38762,25473.632823,0.0,0.0,0.0,0.0,10436.911,25473.632823,0.0,10436.911,35910.543823,2.827702,,,,,85.747472,2.827702,,85.747472,18.157907
54931,9841.727323,0.0,0.0,0.0,8843.4951,12653.649,9841.727323,0.0,21497.1441,31338.871423,-62.608905,,,,,17.336072,-62.608905,,99.340952,-15.540382
74948,19852.633829,0.0,0.0,0.0,8338.1064,7279.2988,19852.633829,0.0,15617.4052,35470.039029,107.386738,,,,-3.065657,-40.856375,107.386738,,-25.31003,16.362352
97054,0.0,0.0,0.0,0.0,8177.9585,7197.1133,0.0,0.0,15375.0718,15375.0718,,,,,4.767992,5.613624,,,5.162142,-53.697266
101881,0.0,0.0,0.0,0.0,8028.0,29991.352,0.0,0.0,38019.352,38019.352,,,,,11.312073,372.517084,,,180.39313,180.39313
121848,0.0,0.0,0.0,0.0,7955.8916,10566.418,0.0,0.0,18522.3096,18522.3096,,,,,19.323598,-57.579421,,,-41.340916,-41.340916
140704,0.0,0.0,0.0,0.0,0.0,8585.2588,0.0,0.0,8585.2588,8585.2588,,,,,,1.217546,,,-42.258448,-42.258448


In [27]:
# Write the frame back to the same CSV path it was loaded from in the first
# cell (the input file is overwritten); index=False leaves the row index out
# of the file.
df.to_csv('../Health_cleaned_income_delta.csv', index=False)