In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import plotly
import plotly.graph_objects as go

# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole session.
# NOTE(review): this also hides genuine copy-vs-view mistakes; prefer an explicit
# .copy() wherever a subset of a frame is modified.
pd.options.mode.chained_assignment = None

In [2]:
# Location of the categorised diary-entry table, relative to this notebook.
DNEVNIK_CATS_CSV = '../hdbscan/data/072420_dnevnikcats.csv'

# The first CSV column is used as the row index.
df_dnevnik = pd.read_csv(DNEVNIK_CATS_CSV, index_col=0)

In [3]:
# Peek at the first rows to check that the file loaded with the expected columns.
df_dnevnik.head()

Unnamed: 0,X,index,entry_id,entry_year,firstName,info,lastName,pub,au_clust,diarynum,...,entry_type,is_au,gender,au_clust_2,isNotes,isRoutine,isPersonal,isLit,isSpirit,isForm
1,0,0,1506,1919,,,,0,8,2,...,35,0,-1,7,0,0,0,0,1,0
2,1,1,1504,1919,,,,0,8,2,...,21,0,-1,7,0,0,0,0,1,0
3,2,3,1497,1919,,,,0,8,2,...,34,0,-1,7,0,0,0,0,1,0
4,3,4,1488,1919,,,,0,8,2,...,32,0,-1,7,0,0,0,1,0,0
5,4,6,1488,1919,,,,0,8,2,...,23,0,-1,7,0,0,0,0,1,0


In [4]:

# Keep only the columns used by the models below.
# .copy() makes this an independent frame, so columns can be added to it later
# without writing into (or being confused with) a view of df_dnevnik.
df_dnev_cat = df_dnevnik[
    ['entry_year', 'diarynum', 'isNotes', 'is_au', 'isRoutine',
     'isPersonal', 'isLit', 'isSpirit', 'isForm']
].copy()

In [5]:
# Exclusive (after, before) year bounds for each decade indicator column.
# `None` as the lower bound means "no lower limit" (everything before 1911).
DECADE_BOUNDS = {
    'is_hun': (None, 1911),
    'is_ten': (1910, 1921),
    'is_twe': (1920, 1931),
    'is_thi': (1930, 1941),
    'is_for': (1940, 1951),
    'is_fif': (1950, 1961),
    'is_six': (1960, 1971),
    'is_sev': (1970, 1981),
    'is_eig': (1980, 1991),
}

entry_year = df_dnev_cat['entry_year']
for flag, (after, before) in DECADE_BOUNDS.items():
    # 1 when after < entry_year < before, otherwise 0.
    in_range = entry_year < before
    if after is not None:
        in_range = in_range & (entry_year > after)
    df_dnev_cat[flag] = in_range.astype('int64')

In [6]:
def mlm(str_x, str_y, str_group, df, nested=None):
    '''
    Fit a mixed effects / multilevel linear model from column names.

    INPUT:
    str_x     -- right-hand side of the model formula (e.g. 'a*b')
    str_y     -- name of the dependent variable; the formula is "str_y ~ str_x"
    str_group -- name of the column in df whose values define the groups
    df        -- data frame hosting all of the columns above
    nested    -- optional name of a nested grouping column. When it is a string,
                 a variance component "0 +C(<nested>)" is added under that name;
                 any other value (default None) fits the model without one.

    OUTPUT:
    the fitted statsmodels result returned by smf.mixedlm(...).fit()
    '''
    # Only a string is treated as a nested group; everything else means "none".
    if isinstance(nested, str):
        vcf = {nested: "0 +C(" + nested + ")"}
    else:
        vcf = None

    formula = str_y + " ~ " + str_x
    model = smf.mixedlm(formula, df, groups=df[str_group], vc_formula=vcf)
    return model.fit()

In [7]:
# One model per decade flag: the flag is the outcome, predicted by
# isLit, is_au and their interaction, with diaries ('diarynum') as groups.
decade_ls = ['is_hun', 'is_ten', 'is_twe', 'is_thi', 'is_for', 'is_fif', 'is_six', 'is_sev', 'is_eig']
multilevel_models = []
for decade_flag in decade_ls:
    fitted = mlm(
        str_x='isLit*is_au',
        str_y=str(decade_flag),
        str_group='diarynum',
        df=df_dnev_cat,
        nested=None,
    )
    multilevel_models.append(fitted)
    # Print right after fitting so each summary follows its own fit warnings.
    print(fitted.summary())

         Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: is_hun    
No. Observations: 14513   Method:             REML      
No. Groups:       885     Scale:              0.0113    
Min. group size:  1       Log-Likelihood:     10375.6586
Max. group size:  594     Converged:          Yes       
Mean group size:  16.4                                  
--------------------------------------------------------
              Coef.  Std.Err.   z    P>|z| [0.025 0.975]
--------------------------------------------------------
Intercept      0.063    0.009  6.740 0.000  0.045  0.082
isLit         -0.006    0.003 -1.600 0.110 -0.012  0.001
is_au          0.034    0.020  1.666 0.096 -0.006  0.074
isLit:is_au    0.009    0.005  1.687 0.092 -0.001  0.019
Group Var      0.057    0.028                           

         Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: is_ten   
No. Observations: 14513   Method:             REML 

In [8]:
# Summarise each fitted model by the fixed-effect term at position 1
# (position 0 is the Intercept; for these models position 1 is `isLit`).
# .iloc is used because the result Series/DataFrame are labelled by term name:
# integer keys through plain [] rely on pandas' deprecated positional fallback.
TERM_POS = 1

# conf_int() has one row per term and two columns: lower (0) and upper (1) bound.
conf_ints = [fit.conf_int() for fit in multilevel_models]

mlm_results = pd.DataFrame({
    'features': decade_ls,
    'pvalues': [fit.pvalues.iloc[TERM_POS] for fit in multilevel_models],
    'coef': [fit.params.iloc[TERM_POS] for fit in multilevel_models],
    'conf_int_025': [ci.iloc[TERM_POS, 0] for ci in conf_ints],
    'conf_int_975': [ci.iloc[TERM_POS, 1] for ci in conf_ints],
})

In [9]:
# Show the p-value per decade flag, shaded with Styler.background_gradient's default colour map.
mlm_results[['features', 'pvalues']].style.background_gradient()

Unnamed: 0,features,pvalues
0,is_hun,0.109695
1,is_ten,0.295406
2,is_twe,0.238565
3,is_thi,0.223913
4,is_for,0.548398
5,is_fif,0.378711
6,is_six,0.455841
7,is_sev,0.039285
8,is_eig,0.125196


In [16]:

def plot_sig_results(sig_df):
    '''
    Bar chart of model coefficients with their confidence intervals.

    INPUT:
    sig_df -- df of results with the columns 'features' (bar labels),
              'coef' (bar heights), 'pvalues' (bar colour) and
              'conf_int_025' / 'conf_int_975' (lower / upper interval bounds)

    OUTPUT:
    a plotly Figure holding a single bar trace
    '''
    coef = sig_df['coef']

    # Error bars are measured from the bar top, so each side is the distance
    # between the coefficient and the matching bound. Using the full interval
    # width (upper - lower) on both sides would draw bars twice as long as the
    # actual confidence interval.
    upper_err = list(sig_df['conf_int_975'] - coef)
    lower_err = list(coef - sig_df['conf_int_025'])

    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=sig_df['features'],
        y=coef,
        name='Ratings_Score_Top_Coef',
        marker=dict(
            color=sig_df['pvalues'],
            colorscale='cividis',   # choose a colorscale
            opacity=0.8,
            showscale=True,
        ),
        error_y=dict(
            type='data',
            symmetric=False,
            array=upper_err,
            arrayminus=lower_err,
            visible=True,
        ),
    ))
    return fig

In [17]:
# Bar chart of the coefficients collected in mlm_results, one bar per decade flag, coloured by p-value.
plot_sig_results(mlm_results)

In [18]:
# Display the full results table (rendered through a pandas Styler).
mlm_results.style

Unnamed: 0,features,pvalues,coef,conf_int_025,conf_int_975
0,is_hun,0.109695,-0.005583,-0.012425,0.001258
1,is_ten,0.295406,-0.006307,-0.01812,0.005507
2,is_twe,0.238565,0.007332,-0.004861,0.019526
3,is_thi,0.223913,-0.0073,-0.019065,0.004464
4,is_for,0.548398,0.003692,-0.008365,0.01575
5,is_fif,0.378711,-0.003912,-0.012623,0.004799
6,is_six,0.455841,0.004335,-0.007059,0.015729
7,is_sev,0.039285,0.010849,0.000533,0.021166
8,is_eig,0.125196,0.008504,-0.002366,0.019375


In [19]:
# Reverse of the earlier models: isLit is now the outcome, predicted by each
# decade flag, is_au and their interaction, with diaries ('diarynum') as groups.
decade_ls = ['is_hun', 'is_ten', 'is_twe', 'is_thi', 'is_for', 'is_fif', 'is_six', 'is_sev', 'is_eig']
multilevel_models2 = []
for decade_flag in decade_ls:
    fitted = mlm(
        str_x=str(decade_flag) + '*is_au',
        str_y='isLit',
        str_group='diarynum',
        df=df_dnev_cat,
        nested=None,
    )
    multilevel_models2.append(fitted)
    # Print right after fitting so each summary follows its own fit warnings.
    print(fitted.summary())


The MLE may be on the boundary of the parameter space.



         Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: isLit     
No. Observations: 14513   Method:             REML      
No. Groups:       885     Scale:              0.1196    
Min. group size:  1       Log-Likelihood:     -5387.7448
Max. group size:  594     Converged:          Yes       
Mean group size:  16.4                                  
--------------------------------------------------------
              Coef.  Std.Err.   z    P>|z| [0.025 0.975]
--------------------------------------------------------
Intercept      0.125    0.006 21.066 0.000  0.113  0.136
is_hun        -0.012    0.020 -0.588 0.556 -0.051  0.028
is_au          0.064    0.012  5.517 0.000  0.041  0.087
is_hun:is_au   0.007    0.033  0.200 0.842 -0.058  0.071
Group Var      0.006    0.002                           




The MLE may be on the boundary of the parameter space.



         Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: isLit     
No. Observations: 14513   Method:             REML      
No. Groups:       885     Scale:              0.1196    
Min. group size:  1       Log-Likelihood:     -5385.7959
Max. group size:  594     Converged:          Yes       
Mean group size:  16.4                                  
--------------------------------------------------------
              Coef.  Std.Err.   z    P>|z| [0.025 0.975]
--------------------------------------------------------
Intercept      0.125    0.006 20.449 0.000  0.113  0.137
is_ten        -0.005    0.013 -0.411 0.681 -0.031  0.020
is_au          0.070    0.012  5.945 0.000  0.047  0.093
is_ten:is_au  -0.041    0.024 -1.738 0.082 -0.088  0.005
Group Var      0.006    0.002                           




The MLE may be on the boundary of the parameter space.



         Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: isLit     
No. Observations: 14513   Method:             REML      
No. Groups:       885     Scale:              0.1196    
Min. group size:  1       Log-Likelihood:     -5386.8272
Max. group size:  594     Converged:          Yes       
Mean group size:  16.4                                  
--------------------------------------------------------
              Coef.  Std.Err.   z    P>|z| [0.025 0.975]
--------------------------------------------------------
Intercept      0.122    0.006 20.611 0.000  0.110  0.133
is_twe         0.023    0.015  1.546 0.122 -0.006  0.051
is_au          0.069    0.012  5.981 0.000  0.047  0.092
is_twe:is_au  -0.043    0.026 -1.664 0.096 -0.093  0.008
Group Var      0.006    0.002                           




The MLE may be on the boundary of the parameter space.



         Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: isLit     
No. Observations: 14513   Method:             REML      
No. Groups:       885     Scale:              0.1196    
Min. group size:  1       Log-Likelihood:     -5387.7012
Max. group size:  594     Converged:          Yes       
Mean group size:  16.4                                  
--------------------------------------------------------
              Coef.  Std.Err.   z    P>|z| [0.025 0.975]
--------------------------------------------------------
Intercept      0.126    0.006 20.973 0.000  0.114  0.138
is_thi        -0.015    0.014 -1.119 0.263 -0.042  0.011
is_au          0.064    0.012  5.473 0.000  0.041  0.086
is_thi:is_au   0.005    0.027  0.172 0.863 -0.049  0.058
Group Var      0.006    0.002                           




The MLE may be on the boundary of the parameter space.



         Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: isLit     
No. Observations: 14513   Method:             REML      
No. Groups:       885     Scale:              0.1196    
Min. group size:  1       Log-Likelihood:     -5387.3699
Max. group size:  594     Converged:          Yes       
Mean group size:  16.4                                  
--------------------------------------------------------
              Coef.  Std.Err.   z    P>|z| [0.025 0.975]
--------------------------------------------------------
Intercept      0.129    0.006 19.899 0.000  0.116  0.142
is_for        -0.018    0.011 -1.659 0.097 -0.040  0.003
is_au          0.059    0.012  4.874 0.000  0.035  0.083
is_for:is_au   0.020    0.024  0.856 0.392 -0.026  0.067
Group Var      0.006    0.002                           




The MLE may be on the boundary of the parameter space.



         Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: isLit     
No. Observations: 14513   Method:             REML      
No. Groups:       885     Scale:              0.1196    
Min. group size:  1       Log-Likelihood:     -5386.8651
Max. group size:  594     Converged:          Yes       
Mean group size:  16.4                                  
--------------------------------------------------------
              Coef.  Std.Err.   z    P>|z| [0.025 0.975]
--------------------------------------------------------
Intercept      0.125    0.006 21.373 0.000  0.113  0.136
is_fif        -0.013    0.022 -0.583 0.560 -0.055  0.030
is_au          0.066    0.012  5.766 0.000  0.044  0.089
is_fif:is_au  -0.022    0.035 -0.622 0.534 -0.090  0.046
Group Var      0.006    0.002                           




The MLE may be on the boundary of the parameter space.



         Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: isLit     
No. Observations: 14513   Method:             REML      
No. Groups:       885     Scale:              0.1196    
Min. group size:  1       Log-Likelihood:     -5388.0230
Max. group size:  594     Converged:          Yes       
Mean group size:  16.4                                  
--------------------------------------------------------
              Coef.  Std.Err.   z    P>|z| [0.025 0.975]
--------------------------------------------------------
Intercept      0.123    0.006 21.262 0.000  0.112  0.135
is_six         0.010    0.018  0.566 0.571 -0.025  0.045
is_au          0.066    0.012  5.714 0.000  0.043  0.089
is_six:is_au  -0.018    0.028 -0.639 0.523 -0.073  0.037
Group Var      0.006    0.002                           




The MLE may be on the boundary of the parameter space.



         Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: isLit     
No. Observations: 14513   Method:             REML      
No. Groups:       885     Scale:              0.1196    
Min. group size:  1       Log-Likelihood:     -5385.2539
Max. group size:  594     Converged:          Yes       
Mean group size:  16.4                                  
--------------------------------------------------------
               Coef. Std.Err.   z    P>|z| [0.025 0.975]
--------------------------------------------------------
Intercept      0.123    0.006 21.374 0.000  0.112  0.134
is_sev         0.030    0.020  1.472 0.141 -0.010  0.070
is_au          0.062    0.011  5.449 0.000  0.040  0.085
is_sev:is_au   0.016    0.033  0.478 0.632 -0.048  0.079
Group Var      0.006    0.002                           




The MLE may be on the boundary of the parameter space.



         Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: isLit     
No. Observations: 14513   Method:             REML      
No. Groups:       885     Scale:              0.1196    
Min. group size:  1       Log-Likelihood:     -5385.6466
Max. group size:  594     Converged:          Yes       
Mean group size:  16.4                                  
--------------------------------------------------------
              Coef.  Std.Err.   z    P>|z| [0.025 0.975]
--------------------------------------------------------
Intercept      0.122    0.006 21.040 0.000  0.110  0.133
is_eig         0.040    0.018  2.212 0.027  0.005  0.075
is_au          0.066    0.012  5.719 0.000  0.043  0.089
is_eig:is_au  -0.030    0.029 -1.044 0.296 -0.087  0.027
Group Var      0.006    0.002                           



In [24]:
# Summarise each fitted model by the fixed-effect term at position 1
# (position 0 is the Intercept; for these models position 1 is the decade flag).
# .iloc is used because the result Series/DataFrame are labelled by term name:
# integer keys through plain [] rely on pandas' deprecated positional fallback.
TERM_POS2 = 1

# conf_int() has one row per term and two columns: lower (0) and upper (1) bound.
conf_ints2 = [fit.conf_int() for fit in multilevel_models2]

mlm_results2 = pd.DataFrame({
    'features': decade_ls,
    'pvalues': [fit.pvalues.iloc[TERM_POS2] for fit in multilevel_models2],
    'coef': [fit.params.iloc[TERM_POS2] for fit in multilevel_models2],
    'conf_int_025': [ci.iloc[TERM_POS2, 0] for ci in conf_ints2],
    'conf_int_975': [ci.iloc[TERM_POS2, 1] for ci in conf_ints2],
})

In [25]:
# Show the p-value per decade flag, shaded with Styler.background_gradient's default colour map.
mlm_results2[['features', 'pvalues']].style.background_gradient()

Unnamed: 0,features,pvalues
0,is_hun,0.556401
1,is_ten,0.680955
2,is_twe,0.122157
3,is_thi,0.263128
4,is_for,0.097063
5,is_fif,0.560052
6,is_six,0.571228
7,is_sev,0.141019
8,is_eig,0.026971


In [26]:
# Bar chart of the coefficients collected in mlm_results2, one bar per decade flag, coloured by p-value.
plot_sig_results(mlm_results2)

In [27]:
# Display the full results table (rendered through a pandas Styler).
mlm_results2.style

Unnamed: 0,features,pvalues,coef,conf_int_025,conf_int_975
0,is_hun,0.556401,-0.01187,-0.051424,0.027684
1,is_ten,0.680955,-0.00538,-0.031025,0.020266
2,is_twe,0.122157,0.022547,-0.006041,0.051135
3,is_thi,0.263128,-0.015246,-0.041949,0.011457
4,is_for,0.097063,-0.018242,-0.039789,0.003306
5,is_fif,0.560052,-0.012583,-0.054902,0.029736
6,is_six,0.571228,0.01018,-0.025056,0.045415
7,is_sev,0.141019,0.03,-0.009945,0.069944
8,is_eig,0.026971,0.039992,0.004556,0.075428
