In [None]:
from utils_project import *

from scipy.stats import chi2_contingency
import plotly.express as px
from sklearn.preprocessing import MinMaxScaler

def get_item(item):
    """Return the rows of `questionnaire_items` whose 'item' column equals `item`."""
    is_requested = questionnaire_items['item'] == item
    return questionnaire_items[is_requested]


# Validating the use of factor analysis: regressions with individual questionnaires

### OLS
- each questionnaire entered individually (one regression per score/item)
- all questionnaires entered together in a single regression
    - also: select questionnaires on the first dataset, then test whether the relationships replicate

In [None]:
def get_item_labels(ques_prefix):
    """List the item-level columns of `data` that belong to one questionnaire.

    A column is kept when the text before its first underscore equals
    `ques_prefix` and its name contains neither 'score' nor '_att'.
    Column order follows `data.columns`.
    """
    labels = []
    for col in data.columns:
        if col.split('_')[0] != ques_prefix:
            continue  # belongs to a different questionnaire
        if 'score' in col or '_att' in col:
            continue  # name marks it as something other than a plain item
        labels.append(col)
    return labels

# put together into a dictionary
behs = ['affil_mean_mean_z', 'power_mean_mean_z', 'pov_2d_dist_mean_mean_z']
ques = ['oci_score', 'sds_score', 'aes_score', 'sss_score', 'lsas_av_score', 'apdis_score', 'zbpd_score', 'bapq_score']
for q in ques:
    data[f'{q}_z'] = zscore_masked(data[q].values)  # zscore the questionnaire scores
sample_dict = reset_sample_dict(data)  # rebuilt after adding the *_z columns to `data`
ques_scores = [f'{q}_z' for q in ques]

# questionnaire prefix = text before the first underscore (e.g. 'lsas_av_score' -> 'lsas')
ques_prefix = [q.split('_')[0] for q in ques]
ques_dict = {
    'scores': ques_scores,
    'items': flatten_lists([get_item_labels(qp) for qp in ques_prefix]),
}

# run ols
# Result rows are collected in a list and converted to a DataFrame once at the end;
# appending with `ols_df.loc[len(ols_df), :] = ...` copies the frame on every row and
# leaves every column as object dtype.
ols_columns = ['sample', 'predictor_type', 'self-report', 'behavior', 'beta', 'p']
ols_rows = []
for sample in ['Initial', 'Replication', 'Combined']:
    df = sample_dict[sample]
    for b in behs:
        for qtype, qs in ques_dict.items():

            # behavior ~ single score/item (one model per predictor)
            for q in qs:
                # element [1] of run_ols' return exposes .params / .pvalues
                # (presumably a fitted model result -- confirm in utils_project)
                ols = run_ols([q], b, df, covariates=demo_controls)[1]
                ols_rows.append([sample, f'independent_{qtype}', q, b, ols.params[q], ols.pvalues[q]])

            # behavior ~ all scores/items (one model, one row per predictor)
            ols = run_ols(qs, b, df, covariates=demo_controls)[1]
            for q in qs:
                ols_rows.append([sample, f'combined_{qtype}', q, b, ols.params[q], ols.pvalues[q]])

ols_df = pd.DataFrame(ols_rows, columns=ols_columns)

In [None]:
# many items are reverse scored, so these should all be: higher value means more symptom
is_initial_item_fit = (
    (ols_df['sample'] == 'Initial')
    & (ols_df['predictor_type'] == 'combined_items')
    & (ols_df['behavior'] == 'pov_2d_dist_mean_mean_z')
)
# Chained (non-inplace) calls return a new frame, so the column added below is
# written to an independent object rather than to a slice of `ols_df`.
item_df = (
    ols_df[is_initial_item_fit]
    .sort_values(by=['sample', 'p'])
    .reset_index(drop=True)
)

# find top items (the 10 smallest p-values), grouped by questionnaire
item_df_ = item_df.iloc[:10]
# Match on the text before the first underscore, the same rule get_item_labels uses;
# a substring/regex match could attribute an item to the wrong questionnaire.
top_item_prefixes = item_df_['self-report'].str.split('_').str[0]
for qp in ques_prefix:
    items = item_df_.loc[top_item_prefixes == qp, 'self-report']
    print(f'{qp}: {items.values}')

# find the text
# One lookup table (first occurrence of each item wins) instead of re-filtering
# `questionnaire_items` per row; items without an entry are labelled 'missing'.
text_by_item = (
    questionnaire_items
    .drop_duplicates(subset='item', keep='first')
    .set_index('item')['text']
    .to_dict()
)
item_df['item_text'] = [text_by_item.get(item, 'missing') for item in item_df['self-report']]
item_df.head(20)

In [None]:
def plot_weights(weights, colors, title, ax):
    """Draw one bar per weight on `ax`, at x positions 0..len(weights)-1.

    Parameters
    ----------
    weights : array-like
        Values to plot; converted to a float array first (object-dtype input,
        e.g. a column taken from an object-typed DataFrame, is accepted).
        May be empty, in which case no bars are drawn.
    colors : color or sequence of colors
        Forwarded unchanged to `ax.bar` as `color`.
    title : str
        Title of the panel.
    ax : matplotlib Axes (anything exposing `bar` and `set_title`)
        Target axes.

    Returns
    -------
    None
    """
    weights = np.asarray(weights, dtype=float)
    ax.bar(np.arange(len(weights)), weights, color=colors, edgecolor='black', linewidth=0.5)
    ax.set_title(title, fontsize=10)

behs = ['pov_2d_dist_mean_mean_z']

# for plotting
colors = ['red', 'blue', 'purple', 'green', 'lavender', 'grey', 'fuchsia', 'orange', 'dodgerblue', 
          'yellow', 'orchid', 'indigo', 'aqua','palegreen', 'silver', 'plum', 'fuchsia', 'coral',
          'gold', 'pink','slategray', 'forestgreen','peachpuff','honeydew','brown','olivedrab',
          'darkturquoise', 'tan', 'springgreen', 'mintcream','navajowhite','chocolate','lightblue','chartreuse',
          'lime','yellowgreen','khaki','gold','teal','tomato']
# Each item gets the colour of its questionnaire (position of its prefix in `ques_prefix`).
# Items come from `ques_dict['items']`, the same list (and order) used to fit the models.
colors_items = [colors[ques_prefix.index(item.split('_')[0])] for item in ques_dict['items']]

# (panel title, `predictor_type` label as written into ols_df, bar colours)
panels = [
    ('Scores independent', 'independent_scores', colors),        # behavior ~ single score
    ('Scores covariates',  'combined_scores',    colors),        # behavior ~ all scores
    ('Items covariates',   'combined_items',     colors_items),  # behavior ~ all items
]

for sample in ['Combined']:
    ols_res = ols_df[ols_df['sample'] == sample]

    # squeeze=False keeps `axs` 2-D even for a single behavior, so axs[i, j] always works
    fig, axs = plt.subplots(len(behs), 3, figsize=(15, 3*len(behs)),
                            gridspec_kw={'width_ratios': [3, 3, 20]}, squeeze=False)
    fig.suptitle(f'{sample} sample', fontsize=18, y=1.01)

    for i, beh in enumerate(behs):
        beh_res = ols_res[ols_res['behavior'] == beh]
        axs[i, 0].set_ylabel(f'{beh}\nbeta', fontsize=8)  # identifies the row when several behaviors are plotted

        for ax, (title, predictor_type, bar_colors) in zip(axs[i], panels):
            behav = beh_res[beh_res['predictor_type'] == predictor_type]
            plot_weights(behav['beta'].values, bar_colors, title=title, ax=ax)

    # overall legend: one patch per questionnaire (zip stops at len(ques))
    patches = [mpatches.Patch(facecolor=color, edgecolor='black', label=label)
               for color, label in zip(colors, ques)]
    axs[0, -1].legend(title='', loc='upper right', handles=patches,
                      title_fontsize=13, fontsize=8,
                      frameon=False, bbox_to_anchor=(1.15, 1), borderaxespad=0)
    plt.tight_layout()
    plt.show()