In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from scipy import stats
import warnings
from scipy.stats import pearsonr
from statsmodels.stats.contingency_tables import mcnemar

warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('../Health_cleaned_income_delta.csv', low_memory=False)
df.shape

(170761, 65)

In [3]:
columns_to_int = ['rmstat', 'ragender', 'rahispan', 'raracem', 'ragey_b', 'sagey_b', 'rhltc', 'rhlthlm', 'rhibpe',
        'rdiabe', 'rcancre', 'rlunge', 'rhearte', 'rstroke', 'rpsyche', 'rarthre', 'rhosp', 'rhspnit', 'oop_spend',
        'rlbrf', 'rjphys', 'rjlift', 'rjweeks', 'rjweek2', 'rjcten', 'index_wave', 'insured_gov', 'uninsured',
        'retired', 'collegeplus', 'year', 'inter_year', 'year_of_birth', 'without_work', 'n_jobs', 'broken']

In [4]:
def to_int(el):
    try:
        return int(el)
    except ValueError:
        return el

In [5]:
for col in columns_to_int:
    df[col] = df[col].apply(to_int)

In [6]:
columns_to_corr = ['rjcten', 'rjweeks', 'rjweek2',
                'sgov', 'rgov', 'riearn', 'siearn',
                'oop_spend', 'ragey_b', 'sagey_b']

In [7]:
blue_color = '#3862fa'
red_color = '#ed3535'

In [8]:
column_labels_y = ['Доход супруга<br>от пенсий', 'Доход от пенсий', 'Доход',
 'Доход супруга', 'Расходы<br>на медицину', 'Возраст']

column_labels_x = ['Доход супруга<br>от пенсий', 'Доход<br>от<br>пенсий', 'Доход',
 'Доход<br>супруга', 'Расходы<br>на<br>медицину', 'Возраст']

In [10]:
corr = df[columns_to_corr].corr()

fig = go.Figure()

fig.add_trace(go.Heatmap(
    z=corr.values,
    x=column_labels_x,
    y=column_labels_y,
    colorscale=[
        [0, blue_color],
        [0.5, '#f0f7fc'],
        [1, red_color]
    ],
    zmin=-1,
    zmax=1,
    colorbar=dict(
        title='Коэффициент<br>корреляции',
        titleside='top',
        tickmode='array',
        tickvals=[-1, -0.5, 0, 0.5, 1],
        ticktext=['-1', '-0.5', '0', '0.5', '1'],
        ticks='outside'
    ),
    text=corr.values,
    texttemplate='%{text:.2f}',
))

fig.update_layout(
    width=1147, height=691,
    font=dict(
        family='Cera Pro, regular',
        size=18,
        color='#000000'
    ),
)

fig.show()

In [11]:
amount_of_answers = df['hhidpn'].value_counts().value_counts().sort_index()
amount_of_answers

1     2837
2     7583
3     1897
4     1919
5     3728
6     1565
7     1481
8     4023
9      845
10     989
11    4665
Name: hhidpn, dtype: int64

In [12]:
# plotly bar plot of amount of answers per person
fig = go.Figure(data=[go.Bar(
    x=amount_of_answers.index,
    y=amount_of_answers.values,
    text=amount_of_answers.values,
    textposition='auto',
    marker_color=blue_color
)])

fig.update_layout(
    width=900, height=600,
    title='Количество ответов в опросе',
    xaxis_title='Количество ответов',
    yaxis_title='Количество людей',
    xaxis = dict(
        tickmode = 'linear',
        tick0 = 0,
        dtick = 1
    ),
    font=dict(
        family='Cera Pro, regular',
        size=18,
        color='#000000'
    )
)
fig.show()

In [13]:
# cumulative distribution function of amount of answers per person
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=amount_of_answers.index,
        y=amount_of_answers.values.cumsum() / amount_of_answers.values.sum(),
        mode='none',
        fill='tozeroy',
        opacity=0.5,
        fillcolor=blue_color,
))

fig.update_layout(
    width=900, height=600,
    title='Доля респондентов, которые дали не больше X ответов',
    xaxis_title='Количество ответов',
    yaxis_title='Доля респондентов',
    xaxis = dict(
        tickmode = 'linear',
        tick0 = 0,
        dtick = 1
    )
)

fig.show()

In [14]:
birth_year_by_id = df.groupby('hhidpn')['year_of_birth'].unique()
birth_year_by_id = birth_year_by_id.apply(lambda x: x[0])

In [15]:
# plotly histplot of birth year
fig = go.Figure(data=[go.Histogram(
    x=birth_year_by_id,
    marker_color=blue_color
)])

fig.update_traces(opacity=0.75)

fig.update_layout(
    width=900, height=600,
    title='Распределение года рождения',
    xaxis_title='Год рождения',
    yaxis_title='Количество людей',
    bargap=0.1,
    font=dict(
        family='Cera Pro, regular',
        size=18,
        color='#000000'
    )
)
fig.show()

In [16]:
birth_year_by_id_gender = df.groupby(['ragender', 'hhidpn'])['year_of_birth'].unique()
birth_year_by_id_gender = birth_year_by_id_gender.apply(lambda x: x[0])

In [17]:
birth_year_by_id_gender.loc[1].value_counts().sort_index()

1897    1
1898    2
1900    2
1901    3
1902    3
       ..
1982    1
1983    1
1984    1
1985    1
1993    1
Name: year_of_birth, Length: 87, dtype: int64

In [18]:
age_by_gender =df.groupby('ragender')['ragey_b'].value_counts()

In [22]:
# plotly histplot of birth year
fig = go.Figure()

men_age = age_by_gender.loc[1].sort_index()
fig.add_trace(go.Scatter(
    x=men_age.index,
    y=men_age.values,
    fill='tozeroy',
    name='Мужчины',
    marker_color=blue_color
))

women_age = age_by_gender.loc[2].sort_index()
fig.add_trace(go.Scatter(
    x=women_age.index,
    y=women_age.values,
    fill='tonexty',
    name='Женщины',
    marker_color=red_color
))

fig.update_traces(opacity=0.75)

fig.add_vline(x=65, line_dash='dash', line_color='gray')

fig.update_layout(
    width=1147, height=691,
    title='Распределение возрастов',
    xaxis_title='Возраст',
    yaxis_title='Количество ответов',
    bargap=0.1,
    font=dict(
        family='Cera Pro, regular',
        size=18,
        color='#000000'
    ),
    xaxis = dict(
        tickmode = 'array',
        tickvals = [20, 30, 40, 50, 60, 65, 70, 80, 90, 100],
        ticktext = ['20', '30', '40', '50', '60', '<b>65</b>', '70', '80', '90', '100'],
    )
)
fig.show()

In [13]:
rmstat_df = df[(df['marriage_group'] != '.m') & (df['rhltc'] != '.m')]

In [14]:
rmstat_df[rmstat_df['ragey_b'] <= 65]['hhidpn'].nunique()

21577

In [10]:
marriage_groups = ['together', 'absent spouse', 'separated',
                'divorced', 'dead spouse', 'never married']
age_groups = ['young', 'middle', 'old', 'dead outside']

In [11]:
rmstat_df['marriage_group'].unique()

array(['together', 'dead spouse', 'separated', 'divorced',
       'never married', 'absent spouse', '.m'], dtype=object)

In [12]:
def calculate_confidence_interval(df, group_name, groups, column, confidence=0.99):
    errors = []

    for group in groups:
        group_df = df[df[group_name] == group]
        n = len(group_df)
        mean, se = group_df[column].mean(), stats.sem(group_df[column])
        h = se * stats.t._ppf((1 + confidence) / 2., n-1)
        errors.append(h)
    
    return errors

In [16]:
fig = go.Figure()

groups = [x for x in range(1, 9)]
colors = ['#6edb85', '#940909', '#79f8fc', '#5d048a', '#273b61', '#043d15', '#303640', '#c1b8f2']

errors = calculate_confidence_interval(rmstat_df, 'rmstat', groups, 'rhltc')
tmp = rmstat_df[rmstat_df['ragey_b'] <= 65]

group_data = tmp.groupby('rmstat')['rhltc'].mean()
all_mean_rhltc = tmp['rhltc'].mean()

fig.add_trace(go.Bar(
    x=group_data.index,
    y=group_data.values,
    error_y=dict(type='data', array=errors, color='#160224', visible=True),
    width=[0.6 for _ in range(8)],
    # texttemplate=[f'{x:.2f}' for x in tmp.groupby('rmstat')['ragey_b'].mean().reindex(groups).values],
    textposition='outside',
    marker_color=colors
))

fig.update_layout(
    title='Среднее изменение здоровья в разных семейных статусах(до 65 лет)',
    xaxis_title='Семейный статус',
    yaxis_title='Среднее значение rhltc',
    width=900, height=600
)

fig.update_yaxes(range=[-0.5, 0.2])

fig.add_hline(
    y=all_mean_rhltc,
    line_dash='dash',
    line_color='black',
    line_width=1,
)

fig.add_annotation(x=7.5, y=-0.14,
    text=f'Среднее по всем группам: {all_mean_rhltc:.2f}',
    showarrow=True,
    arrowhead=1,
    ay=120)

fig.show()

~~todo: на график выше добавить средний возраст в каждый столбец, графически выделить плохие и хорошие группы~~

In [17]:
mean_together = pd.DataFrame(columns=['gender', 'mean'])
mean_absent_spouse = pd.DataFrame(columns=['gender', 'mean'])
mean_separated = pd.DataFrame(columns=['gender', 'mean'])
mean_divorced = pd.DataFrame(columns=['gender', 'mean'])
mean_dead_spouse = pd.DataFrame(columns=['gender', 'mean'])
mean_never_married = pd.DataFrame(columns=['gender', 'mean'])

In [18]:
for id in tmp[tmp['marriage_group'] == 'together']['hhidpn'].unique():
    rows = tmp[tmp['hhidpn'] == id]
    mean_val = rows['rhltc'].mean()
    mean_together.loc[id, 'gender'] = rows['ragender'].iloc[0]
    mean_together.loc[id, 'mean'] = mean_val

In [19]:
for id in tmp[tmp['marriage_group'] == 'absent spouse']['hhidpn'].unique():
    rows = tmp[tmp['hhidpn'] == id]
    mean_val = rows['rhltc'].mean()
    mean_absent_spouse.loc[id, 'gender'] = rows['ragender'].iloc[0]
    mean_absent_spouse.loc[id, 'mean'] = mean_val

In [20]:
for id in tmp[tmp['marriage_group'] == 'separated']['hhidpn'].unique():
    rows = tmp[tmp['hhidpn'] == id]
    mean_val = rows['rhltc'].mean()
    mean_separated.loc[id, 'gender'] = rows['ragender'].iloc[0]
    mean_separated.loc[id, 'mean'] = mean_val

In [21]:
for id in tmp[tmp['marriage_group'] == 'divorced']['hhidpn'].unique():
    rows = tmp[tmp['hhidpn'] == id]
    mean_val = rows['rhltc'].mean()
    mean_divorced.loc[id, 'gender'] = rows['ragender'].iloc[0]
    mean_divorced.loc[id, 'mean'] = mean_val

In [22]:
for id in tmp[tmp['marriage_group'] == 'dead spouse']['hhidpn'].unique():
    rows = tmp[tmp['hhidpn'] == id]
    mean_val = rows['rhltc'].mean()
    mean_dead_spouse.loc[id, 'gender'] = rows['ragender'].iloc[0]
    mean_dead_spouse.loc[id, 'mean'] = mean_val

In [23]:
for id in tmp[tmp['marriage_group'] == 'never married']['hhidpn'].unique():
    rows = tmp[tmp['hhidpn'] == id]
    mean_val = rows['rhltc'].mean()
    mean_never_married.loc[id, 'gender'] = rows['ragender'].iloc[0]
    mean_never_married.loc[id, 'mean'] = mean_val

In [24]:
translated_labels = ['Проживающие<br>вместе<br>или в браке', 'Супруг<br>отсутствует', 'Всё сложно',
                'Разведены', 'Умер супруг', 'Никогда не<br>был женат', ]

In [89]:
mean_together

Unnamed: 0,gender,mean
22861040,2,-2.0
208779020,2,-1.0
208289020,2,0.0
206198020,2,0.0
206663020,2,0.0
...,...,...
37893041,2,0.0
21721032,2,0.0
74483032,1,0.0
74368011,2,1.0


In [25]:
mean_together.groupby('gender')['mean'].mean().loc[1]

-0.043848794613728946

In [26]:
mean_together

Unnamed: 0,gender,mean
22861040,2,-2.0
15014010,2,1.0
83977010,2,0.0
77679030,2,0.0
84455020,1,0.0
...,...,...
71962011,2,0.0
81975042,1,-1.0
78384041,1,1.0
34667031,1,-1.0


In [27]:
mean_male = pd.DataFrame(columns=['value'])

mean_male.loc['together'] = mean_together[mean_together['gender'] == 1]['mean'].mean()
mean_male.loc['never married'] = mean_never_married[mean_never_married['gender'] == 1]['mean'].mean()
mean_male.loc['absent spouse'] = mean_absent_spouse[mean_absent_spouse['gender'] == 1]['mean'].mean()
mean_male.loc['divorced'] = mean_divorced[mean_divorced['gender'] == 1]['mean'].mean()
mean_male.loc['dead spouse'] = mean_dead_spouse[mean_dead_spouse['gender'] == 1]['mean'].mean()
mean_male.loc['separated'] = mean_separated[mean_separated['gender'] == 1]['mean'].mean()

mean_male['translated'] = translated_labels

In [28]:
mean_separated[mean_separated['gender'] == 2].shape

(600, 2)

In [29]:
mean_male

Unnamed: 0,value,translated
together,-0.043849,Проживающие<br>вместе<br>или в браке
never married,-0.065511,Супруг<br>отсутствует
absent spouse,-0.091602,Всё сложно
divorced,-0.068617,Разведены
dead spouse,-0.058212,Умер супруг
separated,-0.120153,Никогда не<br>был женат


In [30]:
mean_female = pd.DataFrame(columns=['value'])

mean_female.loc['together'] = mean_together[mean_together['gender'] == 2]['mean'].mean()
mean_female.loc['never married'] = mean_never_married[mean_never_married['gender'] == 2]['mean'].mean()
mean_female.loc['absent spouse'] = mean_absent_spouse[mean_absent_spouse['gender'] == 2]['mean'].mean()
mean_female.loc['divorced'] = mean_divorced[mean_divorced['gender'] == 2]['mean'].mean()
mean_female.loc['dead spouse'] = mean_dead_spouse[mean_dead_spouse['gender'] == 2]['mean'].mean()
mean_female.loc['separated'] = mean_separated[mean_separated['gender'] == 2]['mean'].mean()

mean_female['translated'] = translated_labels

In [69]:
mean_female.value

together        -0.041815
never married   -0.072535
absent spouse   -0.061290
divorced        -0.096563
dead spouse     -0.106195
separated       -0.157500
Name: value, dtype: float64

In [96]:
fig = go.Figure()

male_value = mean_male.value.median()
fig.add_trace(go.Bar(
    x=['Среднее'],
    y=[male_value],
    texttemplate=[f'{male_value:.3f}' ],
    name='Мужчины',
    width=0.1,
    marker_color=blue_color
))

female_value = mean_female.value.median()
fig.add_trace(go.Bar(
    x=['Среднее'],
    y=[female_value],
    texttemplate=[f'{female_value:.3f}' ],
    name='Женщины',
    width=0.1,
    marker_color=red_color
))

fig.update_yaxes(range=[-0.2, 0])

fig.update_layout(
    title='Среднее значение показателя здоровья в зависимости от пола',
    xaxis_tickfont_size=14,
    yaxis=dict(
        title='Медианное значение показателя здоровья',
        titlefont_size=16,
        tickfont_size=14,
    ),
    font=dict(
        family="Cera Pro, regular",
        size=14,
        color="#000000"
    )
)

fig.show()

In [116]:
fig = go.Figure()

colors = ['#6edb85', '#940909', '#5d048a', '#273b61', '#043d15', '#303640', '#c1b8f2']

fig.add_trace(go.Bar(
    x=mean_male.translated,
    y=mean_male.value,
    width=[0.3 for _ in range(mean_male.shape[0])],
    texttemplate=[f'{x:.2f}' for x in mean_male.value],
    textposition='outside',
    marker_color=blue_color,
    name='Мужчины'
))

fig.add_trace(go.Bar(
    x=mean_female.translated,
    y=mean_female.value,
    width=[0.3 for _ in range(mean_female.shape[0])],
    texttemplate=[f'{x:.2f}' for x in mean_female.value],
    textposition='outside',
    marker_color=red_color,
    name='Женщины'
))

fig.update_layout(
    title='Среднее изменение здоровья в разных семейных группах(до 65 лет)',
    xaxis_title='Группа',
    yaxis_title='Среднее изменение здоровья',
    width=1147, height=691,
    font=dict(
        family='Cera Pro, regular',
        size=18,
        color='#000000'
    )
)

fig.update_yaxes(range=[-0.5, 0])
fig.update_xaxes(tickangle=0)

fig.show()

In [128]:
fig = go.Figure()

group_column = 'marriage_group'
colors = ['#6edb85', '#940909', '#5d048a', '#273b61', '#043d15', '#303640', '#c1b8f2']


group_data = tmp.groupby(group_column)['rhltc'].mean().sort_values(ascending=False)

fig.add_trace(go.Bar(
    x=group_data.index,
    y=group_data.values,
    width=[0.6 for _ in range(len(marriage_groups))],
    texttemplate=[f'{x:.2f}' for x in tmp.groupby(group_column)['rhltc'].mean().sort_values(ascending=False).values],
    textposition='outside',
    marker_color=colors
))

fig.update_layout(
    title='Среднее изменение здоровья в разных семейных группах(до 65 лет)',
    xaxis_title='Группа',
    yaxis_title='Среднее изменение здоровья',
    width=1147, height=691,
    # xaxis=dict(
    #     tickmode='array',
    #     tickvals=marriage_groups,
    #     ticktext=translated_labels
    # ),
    font=dict(
        family='Cera Pro, regular',
        size=18,
        color='#000000'
    )
)

fig.update_yaxes(range=[-0.5, 0])
fig.update_xaxes(tickangle=0)

fig.show()

In [31]:
children_df = df[(df['rmstat'] != '.m') & (df['rhltc'] != '.m') & (df['ragey_b'] <= 65) & (df['child'] != '.m')]
children_df.shape

(2121, 65)

In [36]:
children_df['child'] = children_df['child'].astype(float).astype(int)

In [37]:
children_df.groupby(['ragender', 'child', 'marriage_group'])['hhidpn'].nunique().loc[1, 0].reindex(marriage_groups)

marriage_group
together         384
absent spouse      2
separated          6
divorced          39
dead spouse       17
never married     15
Name: hhidpn, dtype: int64

In [39]:
fig = go.Figure()

# colors = ['#6edb85', '#940909', '#5d048a', '#273b61', '#043d15', '#303640', '#c1b8f2']

fig.add_trace(go.Bar(
    x=children_df.groupby(['ragender', 'child', 'marriage_group'])['rhltc'].mean().loc[1, 0].reindex(marriage_groups).index,
    y=children_df.groupby(['ragender', 'child', 'marriage_group'])['rhltc'].mean().loc[1, 0].reindex(marriage_groups).values,
    width=[0.2 for _ in range(len(marriage_groups))],
    texttemplate=[f'{x:.2f}' for x in children_df.groupby(['ragender', 'child', 'marriage_group'])['rhltc'].mean().loc[1, 0].reindex(marriage_groups).values],
    name='Мужчины нет детей'
))

fig.add_trace(go.Bar(
    x=children_df.groupby(['ragender', 'child', 'marriage_group'])['rhltc'].mean().loc[1, 1].reindex(marriage_groups).index,
    y=children_df.groupby(['ragender', 'child', 'marriage_group'])['rhltc'].mean().loc[1, 1].reindex(marriage_groups).values,
    width=[0.2 for _ in range(len(marriage_groups))],
    texttemplate=[f'{x:.2f}' for x in children_df.groupby(['ragender', 'child', 'marriage_group'])['rhltc'].mean().loc[1, 1].reindex(marriage_groups).values],
    name='Мужчины есть дети'
))

fig.add_trace(go.Bar(
    x=children_df.groupby(['ragender', 'child', 'marriage_group'])['rhltc'].mean().loc[2, 0].reindex(marriage_groups).index,
    y=children_df.groupby(['ragender', 'child', 'marriage_group'])['rhltc'].mean().loc[2, 0].reindex(marriage_groups).values,
    width=[0.2 for _ in range(len(marriage_groups))],
    texttemplate=[f'{x:.2f}' for x in children_df.groupby(['ragender', 'child', 'marriage_group'])['rhltc'].mean().loc[2, 0].reindex(marriage_groups).values],
    name='Женщины нет детей'
))

fig.add_trace(go.Bar(
    x=children_df.groupby(['ragender', 'child', 'marriage_group'])['rhltc'].mean().loc[2, 1].reindex(marriage_groups).index,
    y=children_df.groupby(['ragender', 'child', 'marriage_group'])['rhltc'].mean().loc[2, 1].reindex(marriage_groups).values,
    width=[0.2 for _ in range(len(marriage_groups))],
    texttemplate=[f'{x:.2f}' for x in children_df.groupby(['ragender', 'child', 'marriage_group'])['rhltc'].mean().loc[2, 1].reindex(marriage_groups).values],
    name='Женщины есть дети'
))

fig.update_layout(
    title='Среднее изменение здоровья при наличии детей',
    xaxis_title='Наличие детей',
    yaxis_title='Среднее изменение здоровья',
    width=1147, height=691,
    font=dict(
        family='Cera Pro, regular',
        size=18,
        color='#000000'
    )
)

# fig.update_yaxes(range=[-0.5, 0])
# fig.update_xaxes(tickangle=0)

# fig.show()

In [58]:
children_df.groupby(['ragender', 'child'])['rhltc'].mean().loc[1, 0]

-0.03902439024390244

In [68]:
fig = go.Figure()

# colors = ['#6edb85', '#940909', '#5d048a', '#273b61', '#043d15', '#303640', '#c1b8f2']

fig.add_trace(go.Bar(
    x=['together'],
    y=[children_df.groupby(['ragender', 'child'])['rhltc'].mean().loc[1, 0]],
    width=[0.2 for _ in range(len(marriage_groups))],
    texttemplate=[f"{children_df.groupby(['ragender', 'child'])['rhltc'].mean().loc[1, 0]:.3f}"],
    name='Мужчины нет детей',
    textposition='outside'
))

fig.add_trace(go.Bar(
    x=['together'],
    y=[children_df.groupby(['ragender', 'child'])['rhltc'].mean().loc[1, 1]],
    width=[0.2 for _ in range(len(marriage_groups))],
    texttemplate=[f"{children_df.groupby(['ragender', 'child'])['rhltc'].mean().loc[1, 1]:.3f}"],
    name='Мужчины с детьми',
    textposition='outside'
))

fig.add_trace(go.Bar(
    x=['together'],
    y=[children_df.groupby(['ragender', 'child'])['rhltc'].mean().loc[2, 0]],
    width=[0.2 for _ in range(len(marriage_groups))],
    texttemplate=[f"{children_df.groupby(['ragender', 'child'])['rhltc'].mean().loc[2, 0]:.3f}"],
    name='Женщины нет детей',
    textposition='outside'
))

fig.add_trace(go.Bar(
    x=['together'],
    y=[children_df.groupby(['ragender', 'child'])['rhltc'].mean().loc[2, 1]],
    width=[0.2 for _ in range(len(marriage_groups))],
    texttemplate=[f"{children_df.groupby(['ragender', 'child'])['rhltc'].mean().loc[2, 1]:.3f}"],
    name='Женщины с детьми',
    textposition='outside'
))

fig.update_yaxes(range=[-0.2, 0])

fig.update_layout(
    title='Среднее изменение здоровья при наличии детей',
    xaxis_title='Семейный статус',
    yaxis_title='Среднее изменение здоровья',
    width=1147, height=691,
    font=dict(
        family='Cera Pro, regular',
        size=18,
        color='#000000'
    )
)

fig.show()

In [43]:
children_df = children_df[children_df['marriage_group'] == 'together']
children_df.shape

(1574, 65)

In [45]:
male_without_kids = children_df[(children_df['ragender'] == 1) & (children_df['child'] == 0)]
male_with_kids = children_df[(children_df['ragender'] == 1) & (children_df['child'] == 1)]
female_without_kids = children_df[(children_df['ragender'] == 2) & (children_df['child'] == 0)]
female_with_kids = children_df[(children_df['ragender'] == 2) & (children_df['child'] == 1)]

In [48]:
male_without_kids['rhltc'] = male_without_kids['rhltc'].astype(int)
male_with_kids['rhltc'] = male_with_kids['rhltc'].astype(int)
female_without_kids['rhltc'] = female_without_kids['rhltc'].astype(int)
female_with_kids['rhltc'] = female_with_kids['rhltc'].astype(int)

In [41]:
def hypothesis_check(t, p): 
    if (abs(t) > 2.59) and (p < 0.01): 
        print('Отвергаем Н0. Разница между средними статистически значима. На уровне значимости 99%') 
    elif (abs(t) > 1.9667) and (p < 0.05): 
        print('Отвергаем Н0. Разница между средними статистически значима. На уровне значимости 95%') 
    else: 
        print('Не удалось отвергнуть H0')

In [49]:
t, p = stats.ttest_ind(male_without_kids['rhltc'], male_with_kids['rhltc'], equal_var=False)
var_1 = male_without_kids['rhltc'].var() 
var_2 = male_with_kids['rhltc'].var() 
n_1 = len(male_without_kids['rhltc']) 
n_2 = len(male_with_kids['rhltc']) 
s_m1_m2 = np.sqrt(var_1/n_1 + var_2/n_2) 
df_welch = (var_1 + var_2)**2 / (var_1**2 / (n_1 - 1) + var_2**2 / (n_2 - 1)) 
if df_welch > 350: 
    hypothesis_check(t, p) 
else: 
    print('ddof =', df_welch) 
print(f'ddof = {int(df_welch)}') 
print(f't = {t:.3f}') 
print(f'p-value = {p:.3f}') 
print(f'Среднее среди мужчин без детей = {male_without_kids["rhltc"].mean():.3f}') 
print(f'Среднее среди мужчин с детьми = {male_with_kids["rhltc"].mean():.3f}')

Не удалось отвергнуть H0
ddof = 605
t = 1.031
p-value = 0.303
Среднее среди мужчин без детей = -0.039
Среднее среди мужчин с детьми = -0.093


In [50]:
t, p = stats.ttest_ind(female_without_kids['rhltc'], female_with_kids['rhltc'], equal_var=False)
var_1 = female_without_kids['rhltc'].var() 
var_2 = female_with_kids['rhltc'].var() 
n_1 = len(female_without_kids['rhltc']) 
n_2 = len(female_with_kids['rhltc']) 
s_m1_m2 = np.sqrt(var_1/n_1 + var_2/n_2) 
df_welch = (var_1 + var_2)**2 / (var_1**2 / (n_1 - 1) + var_2**2 / (n_2 - 1)) 
if df_welch > 350: 
    hypothesis_check(t, p) 
else: 
    print('ddof =', df_welch) 
print(f'ddof = {int(df_welch)}') 
print(f't = {t:.3f}') 
print(f'p-value = {p:.3f}') 
print(f'Среднее среди мужчин без детей = {female_without_kids["rhltc"].mean():.3f}') 
print(f'Среднее среди мужчин с детьми = {female_with_kids["rhltc"].mean():.3f}')

Не удалось отвергнуть H0
ddof = 867
t = -0.100
p-value = 0.920
Среднее среди мужчин без детей = -0.049
Среднее среди мужчин с детьми = -0.044


In [40]:
rmstat_df['ragender'].replace({1: 'male', 2: 'female'}, inplace=True)

In [42]:
def count_t_value(df1, df2, column):
    t, p = stats.ttest_ind(df1[column], df2[column], equal_var=False)
    var_1 = df1[column].var() 
    var_2 = df2[column].var() 
    n_1 = len(df1[column]) 
    n_2 = len(df2[column]) 
    s_m1_m2 = np.sqrt(var_1/n_1 + var_2/n_2) 
    df_welch = (var_1 + var_2)**2 / (var_1**2 / (n_1 - 1) + var_2**2 / (n_2 - 1)) 
    return t, p, df_welch, df1[column].mean(), df2[column].mean()


def interpretate_results(t, p, welch, mean1, mean2, group1, group2):
    if welch > 350: 
        hypothesis_check(t, p) 
    else: 
        print('ddof =', welch) 
    print('ddof =', int(welch)) 
    print('t =', t) 
    print('p-value =', p) 
    print(f'Среднее среди {group1} =', mean1) 
    print(f'Среднее среди {group2} =', mean2)


def make_t_test(df, column, compare_column, base_group):
    for val in df[column].unique():
        if val == base_group:
            continue

        df1 = df[df[column] == base_group]
        df2 = df[df[column] == val]
        print(df1.shape, df2.shape, val)

        t, p, welch, mean1, mean2 = count_t_value(df1, df2, compare_column)
        interpretate_results(t, p, welch, mean1, mean2)
        print('-' * 20)

In [34]:
df_together = rmstat_df[rmstat_df['marriage_group'] == 'together']
df_absent = rmstat_df[rmstat_df['marriage_group'] == 'absent spouse']
df_separated = rmstat_df[rmstat_df['marriage_group'] == 'separated']
df_divorced = rmstat_df[rmstat_df['marriage_group'] == 'divorced']
df_dead = rmstat_df[rmstat_df['marriage_group'] == 'dead spouse']
df_never = rmstat_df[rmstat_df['marriage_group'] == 'never married']

In [64]:
df_together['rhltc'] = df_together['rhltc'].astype(int)
df_absent['rhltc'] = df_absent['rhltc'].astype(int)
df_separated['rhltc'] = df_separated['rhltc'].astype(int)
df_divorced['rhltc'] = df_divorced['rhltc'].astype(int)
df_dead['rhltc'] = df_dead['rhltc'].astype(int)
df_never['rhltc'] = df_never['rhltc'].astype(int)

In [28]:
rmstat_df[rmstat_df['marriage_group'] == 'together']['rhltc'].unique()

array([-2, 0, -1, 1, 2], dtype=object)

In [None]:
make_t_test(rmstat_df, 'marriage_group', 'rhltc', 'together')

In [73]:
rmstat_df[rmstat_df['marriage_group'] == 'separated'].shape

(2430, 64)

In [59]:
t, p = stats.ttest_ind(df_together['rhltc'], df_separated['rhltc'], equal_var=False)
var_1 = df_together['rhltc'].var() 
var_2 = df_separated['rhltc'].var() 
n_1 = len(df_together['rhltc']) 
n_2 = len(df_separated['rhltc']) 
s_m1_m2 = np.sqrt(var_1/n_1 + var_2/n_2) 
df_welch = (var_1 + var_2)**2 / (var_1**2 / (n_1 - 1) + var_2**2 / (n_2 - 1)) 
if df_welch > 350: 
    hypothesis_check(t, p) 
else: 
    print('ddof =', df_welch) 
print(f'ddof = {int(df_welch)}') 
print(f't = {t:.3f}') 
print(f'p-value = {p:.3f}') 
print(f'Среднее среди совместно проживающих или женатых = {df_together["rhltc"].mean():.3f}') 
print(f'Среднее среди живущих отдельно = {df_separated["rhltc"].mean():.2f}')

Отвергаем Н0. Разница между средними статистически значима. На уровне значимости 99%
ddof = 7322
t = 3.477
p-value = 0.001
Среднее среди совместно проживающих или женатых = -0.107
Среднее среди живущих отдельно = -0.16


In [58]:
t, p = stats.ttest_ind(df_together['rhltc'], df_absent['rhltc'], equal_var=False)
var_1 = df_together['rhltc'].var() 
var_2 = df_absent['rhltc'].var() 
n_1 = len(df_together['rhltc']) 
n_2 = len(df_absent['rhltc']) 
s_m1_m2 = np.sqrt(var_1/n_1 + var_2/n_2) 
df_welch = (var_1 + var_2)**2 / (var_1**2 / (n_1 - 1) + var_2**2 / (n_2 - 1)) 
if df_welch > 350: 
    hypothesis_check(t, p) 
else: 
    print('ddof =', df_welch) 
print(f'ddof = {int(df_welch)}') 
print(f't = {t:.3f}') 
print(f'p-value = {p:.3f}') 
print(f'Среднее среди совместно проживающих или женатых = {df_together["rhltc"].mean():.3f}') 
print(f'Среднее среди тех, у кого отсутствует супруг = {df_absent["rhltc"].mean():.2f}')

Отвергаем Н0. Разница между средними статистически значима. На уровне значимости 99%
ddof = 3211
t = 10.866
p-value = 0.000
Среднее среди совместно проживающих или женатых = -0.107
Среднее среди тех, у кого отсутствует супруг = -0.36


In [60]:
t, p = stats.ttest_ind(df_together['rhltc'], df_dead['rhltc'], equal_var=False)
var_1 = df_together['rhltc'].var() 
var_2 = df_dead['rhltc'].var() 
n_1 = len(df_together['rhltc']) 
n_2 = len(df_dead['rhltc']) 
s_m1_m2 = np.sqrt(var_1/n_1 + var_2/n_2) 
df_welch = (var_1 + var_2)**2 / (var_1**2 / (n_1 - 1) + var_2**2 / (n_2 - 1)) 
if df_welch > 350: 
    hypothesis_check(t, p) 
else: 
    print('ddof =', df_welch) 
print('ddof =', int(df_welch)) 
print(f't = {t:.3f}') 
print(f'p-value = {p:.3f}') 
print(f'Среднее среди совместно проживающих или женатых = {df_together["rhltc"].mean():.3f}') 
print(f'Среднее среди вдоцов/вдов = {df_dead["rhltc"].mean():.2f}')

Отвергаем Н0. Разница между средними статистически значима. На уровне значимости 99%
ddof = 82847
t = 32.373
p-value = 0.000
Среднее среди совместно проживающих или женатых = -0.107
Среднее среди вдоцов/вдов = -0.26


In [61]:
t, p = stats.ttest_ind(df_together['rhltc'], df_never['rhltc'], equal_var=False)
var_1 = df_together['rhltc'].var() 
var_2 = df_never['rhltc'].var() 
n_1 = len(df_together['rhltc']) 
n_2 = len(df_never['rhltc']) 
s_m1_m2 = np.sqrt(var_1/n_1 + var_2/n_2) 
df_welch = (var_1 + var_2)**2 / (var_1**2 / (n_1 - 1) + var_2**2 / (n_2 - 1)) 
if df_welch > 350: 
    hypothesis_check(t, p) 
else: 
    print('ddof =', df_welch) 
print('ddof =', int(df_welch)) 
print(f't = {t:.3f}') 
print(f'p-value = {p:.3f}') 
print(f'Среднее среди совместно проживающих или женатых = {df_together["rhltc"].mean():.3f}') 
print(f'Среднее среди никогда не женатых = {df_never["rhltc"].mean():.2f}')

Отвергаем Н0. Разница между средними статистически значима. На уровне значимости 95%
ddof = 17643
t = 2.570
p-value = 0.010
Среднее среди совместно проживающих или женатых = -0.107
Среднее среди никогда не женатых = -0.13


In [62]:
t, p = stats.ttest_ind(df_together['rhltc'], df_divorced['rhltc'], equal_var=False)
var_1 = df_together['rhltc'].var() 
var_2 = df_divorced['rhltc'].var() 
n_1 = len(df_together['rhltc']) 
n_2 = len(df_divorced['rhltc']) 
s_m1_m2 = np.sqrt(var_1/n_1 + var_2/n_2) 
df_welch = (var_1 + var_2)**2 / (var_1**2 / (n_1 - 1) + var_2**2 / (n_2 - 1)) 
if df_welch > 350: 
    hypothesis_check(t, p) 
else: 
    print('ddof =', df_welch) 
print('ddof =', int(df_welch)) 
print(f't = {t:.3f}') 
print(f'p-value = {p:.3f}') 
print(f'Среднее среди совместно проживающих или женатых = {df_together["rhltc"].mean():.3f}') 
print(f'Среднее среди разведенных = {df_divorced["rhltc"].mean():.2f}')

Отвергаем Н0. Разница между средними статистически значима. На уровне значимости 99%
ddof = 41241
t = 6.214
p-value = 0.000
Среднее среди совместно проживающих или женатых = -0.107
Среднее среди разведенных = -0.15


In [106]:
y_ticks = [-1, -0.9, -0.8, -0.7, -0.6, -0.5, -0.4, -0.3, -0.2, -0.1, 0, 0.1, 0.2]

mean_in_group = rmstat_df['rhltc'].mean()

fig = go.Figure()

tmp = rmstat_df[rmstat_df['marriage_group'] == 'married']
fig.add_trace(go.Bar(name='Married',
    x=age_groups,
    y=tmp.groupby('age_group')['rhltc'].mean().reindex(age_groups),
    # marker_color='rgb(85, 177, 242)'
    texttemplate=[f'{x:.0f}' for x in tmp.groupby('age_group')['ragey_b'].mean().reindex(age_groups).values],
    # texttemplate=['text1', 'text2', 'text3', 'text4', 'text5'],
    textposition='outside',
))

tmp = rmstat_df[rmstat_df['marriage_group'] == 'separated']
fig.add_trace(go.Bar(name='Separated',
    x=age_groups,
    y=tmp.groupby('age_group')['rhltc'].mean().reindex(age_groups), 
    # marker_color='rgb(242, 111, 85)'
    texttemplate=[f'{x:.0f}' for x in tmp.groupby('age_group')['ragey_b'].mean().reindex(age_groups).values],
    textposition='outside',
))

tmp = rmstat_df[rmstat_df['marriage_group'] == 'dead spouse']
fig.add_trace(go.Bar(name='Dead Spouse',
    x=age_groups,
    y=tmp.groupby('age_group')['rhltc'].mean().reindex(age_groups), 
    # marker_color='rgb(242, 111, 85)'
    texttemplate=[f'{x:.0f}' for x in tmp.groupby('age_group')['ragey_b'].mean().reindex(age_groups).values],
    textposition='outside',
))

tmp = rmstat_df[rmstat_df['marriage_group'] == 'never married']
fig.add_trace(go.Bar(name='Never Married',
    x=age_groups,
    y=tmp.groupby('age_group')['rhltc'].mean().reindex(age_groups), 
    # marker_color='rgb(242, 111, 85)'
    texttemplate=[f'{x:.0f}' for x in tmp.groupby('age_group')['ragey_b'].mean().reindex(age_groups).values],
    textposition='outside',
))

tmp = rmstat_df[rmstat_df['marriage_group'] == 'divorced']
fig.add_trace(go.Bar(name='Divorced',
    x=age_groups,
    y=tmp.groupby('age_group')['rhltc'].mean().reindex(age_groups), 
    # marker_color='rgb(242, 111, 85)'
    texttemplate=[f'{x:.0f}' for x in tmp.groupby('age_group')['ragey_b'].mean().reindex(age_groups).values],
    textposition='outside',
))

fig.update_layout(
    barmode='group',
    width=900, height=600,
    title=f'Среднее изменение здоровья по полу и группе возраста в каждой группе брака',
    xaxis_title='Группа возраста',
    yaxis_title='Среднее изменение здоровья',
    yaxis = dict(
        tickmode = 'array',
        tickvals = y_ticks,
        ticktext = y_ticks
    )
)
fig.update_yaxes(range=[-0.5, 0.4])

fig.add_annotation(text="*Над столбцами: средний возраст в группе",
    xref="paper", yref="paper",
    x=1, y=1, showarrow=False
)

fig.show()

In [109]:
def drop_single(df):
    correct_indexes = (df['hhidpn'].value_counts() == 2).where(lambda x: x == True).dropna().index
    return df[df['hhidpn'].isin(correct_indexes)]

In [110]:
def get_pivot(df, column):
    pivot = df.groupby(['rmstat', column])[column].count().unstack(column)
    pivot.iloc[1, 0] = pivot.iloc[1:, 0].sum()
    pivot.iloc[1, 1] = pivot.iloc[1:, 1].sum()

    try:
        pivot.drop([5], axis=0, inplace=True)
    except KeyError:
        pass

    try:
        pivot.drop([7], axis=0, inplace=True)
    except KeyError:
        pass
    return pivot

In [111]:
def get_mcnemar_correlation(df, print=False):
    mcn = mcnemar(df, exact=False)
    correlation = np.sqrt(mcn.statistic / (mcn.statistic + df.sum().sum()))

    if print:
        return f'P-value: {mcn.pvalue:.5f}, Statistic: {mcn.statistic:.3f}, Correlation: {correlation:.5f}'

    return np.round(mcn.pvalue, 5), np.round(mcn.statistic, 3), np.round(correlation, 5)

In [112]:
broken_df = df[(df['broken'] == 1) & (df['rmstat'] != '.m') & (df['rpsyche'] != '.m')]
broken_df.shape

(26416, 64)

In [118]:
for id in df['hhidpn'].unique():
    rows = df[df['hhidpn'] == id]
    rows.sort_values('index_wave', inplace=True)

    found = False

    for el in rows['rpsyche'].values:
        if found and (el == '.m' or int(el) == 0):
            print('ААААААА НЕГРЫЫЫЫЫ!!!!')
            print(id)
            break
        elif el != '.m' and int(el) == 1:
            found = True

In [113]:
def get_broken_period(rows):
    indexes = rows.index

    statuses = rows['rmstat']
    pos = 0
    

    for i, st in enumerate(statuses):
        if i == 0:
            continue

        if st != 1:
            pos = i
            break

    return rows.iloc[pos-1:pos+1]

In [114]:
def get_period_df(df):
    period_df = pd.DataFrame()


    for uid in df['hhidpn'].unique():
        period_df = pd.concat([period_df, get_broken_period(df[df['hhidpn'] == uid])])

    return period_df

In [22]:
broken_period_df = get_period_df(broken_df)

In [25]:
broken_period_df.shape

(6952, 64)

In [27]:
broken_period_df = broken_period_df[broken_period_df['age_group'] == 'middle']
broken_period_df.shape

(2072, 64)

In [28]:
broken_period_df = drop_single(broken_period_df)
broken_period_df.shape

(1824, 64)

In [30]:
broken_pivot = get_pivot(broken_period_df, 'rpsyche')
broken_pivot

rpsyche,0,1
rmstat,Unnamed: 1_level_1,Unnamed: 2_level_1
1,779,133
2,737,175


In [36]:
broken_pivot[0]

rmstat
1    779
2    737
Name: 0, dtype: int64

In [48]:
fig = go.Figure()

fig.add_trace(go.Bar(name='заебись',
    x=broken_pivot.index,
    y=broken_pivot[1],
    text=broken_pivot[1],
    width=0.5
))

fig.add_trace(go.Bar(name='пизда',
    x=broken_pivot.index,
    y=broken_pivot[0],
    text=broken_pivot[0],
    textposition='auto',
    width=0.5
))

fig.update_layout(
    height=900,
    barmode='stack',
    uniformtext_minsize=70, uniformtext_mode='hide',
)

fig.show()

In [10]:
rmstat_df.head()

Unnamed: 0,hhidpn,rmstat,ragender,rahispan,raracem,riwbegy,ragey_b,sagey_b,rhltc,rhlthlm,...,rgov_delta,sgov_delta,total_work_income_delta,total_pension_income_delta,total_gov_income_delta,total_income_delta,broken,mariage_group,age_group,marriage_group
0,22861040,1,2,0,2,1992-04-01,62,56,-2,1,...,,,,,,,0,married,middle,together
2053,205915010,7,2,0,1,1998-02-15,78,.m,0,0,...,,,,,,,0,,old,dead spouse
2054,206015010,1,2,0,1,1998-02-15,84,82,0,0,...,,,,,,,1,,dead outside,together
2055,206717010,7,2,0,1,1998-02-15,76,.m,0,0,...,,,,,,,0,,old,dead spouse
2056,205615020,7,1,0,1,1998-02-15,80,.m,2,0,...,,,,,,,0,,old,dead spouse


In [54]:
tmp = rmstat_df[(rmstat_df['rpsyche'] != '.m') & (rmstat_df['broken'] == 1)]
tmp.shape

(25503, 64)

In [86]:
single = (rmstat_df.groupby('hhidpn')['rmstat'].nunique() == 1).where(lambda x: x == True).dropna().index
single.shape

(23104,)

In [84]:
fig = go.Figure()

tmp = rmstat_df[(rmstat_df['hhidpn'].isin(single)) & (rmstat_df['ragey_b'] <= 65)]
group_column = 'marriage_group'
colors = ['#6edb85', '#940909', '#5d048a', '#273b61', '#043d15', '#303640', '#c1b8f2']
group_data = tmp.groupby(group_column)['rhltc'].mean().reindex(marriage_groups)

fig.add_trace(go.Bar(
    x=group_data.index,
    y=group_data.values,
    width=[0.6 for _ in range(len(marriage_groups))],
    texttemplate=[f'{x:.3f}' for x in tmp.groupby(group_column)['rhltc'].mean().reindex(marriage_groups).values],
    textposition='outside',
    marker_color=colors
))

fig.update_layout(
    title='Среднее изменение здоровья в разных семейных группах(до 65 лет) только один ответ',
    xaxis_title='Группа',
    yaxis_title='Среднее значение rhltc',
    width=900, height=600
)

fig.update_yaxes(range=[-0.5, 0])

fig.show()

In [53]:
df[df['hhidpn'] == 500684010][['hhidpn', 'rmstat', 'rpsyche']]

Unnamed: 0,hhidpn,rmstat,rpsyche
481,500684010,1,0
41483,500684010,1,0
69082,500684010,1,0
75166,500684010,2,0
107494,500684010,5,0


In [29]:
psycho_df = rmstat_df[(rmstat_df['rpsyche'] != '.m') & (rmstat_df['broken'] == 1)]
psycho_df.sort_values('index_wave', inplace=True)
psycho_df.shape

(25503, 64)

In [57]:
broken_psycho_id = psycho_df[psycho_df['rmstat'] == 5]['hhidpn'].unique()
broken_psycho_id.shape

(245,)

In [60]:
psycho_df = psycho_df[psycho_df['hhidpn'].isin(broken_psycho_id)]
psycho_df.shape

(1357, 64)

In [73]:
cnt = 0
divorces = 0

for id in psycho_df['hhidpn'].unique():
    rows = psycho_df[psycho_df['hhidpn'] == id]

    for i in range(1, rows.shape[0]):
        if rows.iloc[i - 1]['rmstat'] == 1 and rows.iloc[i]['rmstat'] == 5:
            divorces += 1

            if rows.iloc[i - 1]['rpsyche'] == 0:
                if rows.iloc[i]['rpsyche'] == 1:
                    cnt += 1
                elif i < rows.shape[0] - 1 and rows.iloc[i + 1]['rpsyche'] == 1:
                        cnt += 1
            break

cnt / divorces * 100

5.921052631578947

In [74]:
cnt, divorces

(9, 152)