# User-based LIWC analysis
Analyze the differences in LIWC attributes between the depression and control groups. Group means are aggregated over users, i.e., to compute the feature values for a single user, all of their posts were first concatenated. Similarly, we performed Welch’s t-test on each feature, with p-values adjusted using the Bonferroni correction. To assess effect size, we calculated Cohen’s d.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Apply seaborn's default theme and override the figure size to 20 x 6.
sns.set(rc={"figure.figsize":(20, 6)})

In [2]:
# Load the LIWC score table. Per the analysis description, each row holds the
# aggregated LIWC scores of a single user.
data = pd.read_csv("user_based_liwc_analysis.csv")
# Notebook display only: (number of rows, number of columns).
data.shape

(2632, 112)

In [3]:
# Preview the first two rows to inspect the loaded columns.
data.head(2)

Unnamed: 0,ids,WC,Analytic,Clout,Authentic,Tone,WPS,BigWords,Dic,Linguistic,...,time,focuspast,focuspresent,focusfuture,Conversation,netspeak,assent,nonflu,filler,users
0,1039189,4582,50.384579,33.040092,54.826279,53.647639,43.604103,12.876187,84.504505,69.555105,...,3.797228,3.448027,3.470041,1.156881,1.484337,1.113103,0.283693,0.130912,0.10899,clinical
1,1039462,4939,42.251541,40.928882,53.597016,33.458149,51.014578,13.990883,88.31784,73.03146,...,4.555793,3.7454,3.746048,0.992122,0.728769,0.303677,0.303725,0.101152,0.020267,clinical


In [4]:
# Columns kept for the analysis. According to the authors, these are the LIWC
# attributes that are free of multicollinearity and relevant to this research.
# Keep 'ids' and 'users' as the first two entries: they are the non-LIWC columns
# (user identifier and group label) that are removed before statistics are computed.
features = [
    'ids', 'users',
    'Analytic', 'Clout', 'Authentic', 'WC', 'Tone',
    'i', 'we', 'you', 'shehe', 'they', 'ipron', 'auxverb', 'negate',
    'Drives',
    'insight', 'cause', 'discrep', 'tentat', 'certitude', 'differ', 'memory', 'allnone',
    'emo_anx', 'emo_anger', 'emo_sad', 'swear', 'emo_pos', 'emo_neg',
    'socbehav', 'polite', 'moral', 'comm', 'conflict',
    'family', 'friend', 'female', 'male',
    'Culture', 'relig',
    'Lifestyle',
    'illness', 'wellness', 'mental', 'substances', 'sexual', 'food', 'death',
    'need', 'want', 'acquire', 'lack', 'fulfill', 'fatigue',
    'reward', 'risk', 'curiosity', 'allure',
    'Perception', 'feeling',
    'focuspast', 'focuspresent', 'focusfuture',
    'Conversation',
]

In [5]:
# Restrict the table to the selected columns, in the order given by `features`.
data = data.loc[:, features]
# Notebook display only: preview the reduced table.
data.head()

Unnamed: 0,ids,users,Analytic,Clout,Authentic,WC,Tone,i,we,you,...,reward,risk,curiosity,allure,Perception,feeling,focuspast,focuspresent,focusfuture,Conversation
0,1039189,clinical,50.384579,33.040092,54.826279,4582,53.647639,6.459769,0.283536,2.007911,...,0.196384,0.305369,0.545491,8.271711,8.708136,0.654806,3.448027,3.470041,1.156881,1.484337
1,1039462,clinical,42.251541,40.928882,53.597016,4939,33.458149,6.49951,0.425286,2.894839,...,0.121462,0.263084,0.384606,6.985234,7.673003,0.303725,3.7454,3.746048,0.992122,0.728769
2,1041793,clinical,46.533477,39.13657,45.788556,2729,40.300971,4.653591,0.329802,2.564874,...,0.0,0.073298,0.586402,6.082638,8.317486,0.293166,3.078084,5.019967,1.28266,1.685519
3,1043248,clinical,34.719949,40.516832,44.769162,4151,38.394129,5.299566,0.433693,3.444924,...,0.144454,0.361455,0.36158,7.805408,6.046815,0.433508,2.626025,3.662339,1.204674,1.228316
4,1046051,clinical,46.953886,22.569067,68.181526,16253,40.866317,4.546724,0.356961,1.18136,...,0.116791,0.215299,0.38134,5.918822,8.711918,0.418395,3.070073,3.507167,0.984805,0.645884


In [6]:
# Split the users into the two comparison groups, keeping only the LIWC feature
# columns. The identifier ('ids') and group-label ('users') columns are dropped by
# name rather than by position, so the split does not silently depend on them
# being the first two columns of `data`; a missing column raises a KeyError.
clinical = data[data['users'] == 'clinical'].drop(columns=['ids', 'users'])
control = data[data['users'] == 'control'].drop(columns=['ids', 'users'])


In [13]:
def _mean_std_by_feature(group):
    """Return one row per column of *group* holding its mean and standard deviation.

    The result has the columns 'Feature', 'mean' and 'std', taken from
    ``DataFrame.describe()``.
    """
    summary = group.describe().loc[['mean', 'std']].T
    return summary.reset_index().rename(columns={'index': 'Feature'})


control_mean_std = _mean_std_by_feature(control)
clinical_mean_std = _mean_std_by_feature(clinical)
# Notebook display only: show the control-group summary.
control_mean_std

Unnamed: 0,Feature,mean,std
0,Analytic,48.761060,10.014593
1,Clout,40.472573,9.622541
2,Authentic,49.649457,9.748757
3,WC,6867.184650,5014.257634
4,Tone,44.178023,8.976000
...,...,...,...
58,feeling,0.438082,0.270790
59,focuspast,3.193214,0.987591
60,focuspresent,4.094123,0.964510
61,focusfuture,1.187488,0.416547


In [11]:
from scipy import stats


def _cohens_d(group1, group2):
    """Return Cohen's d for two samples, using the pooled sample standard deviation."""
    n1, n2 = len(group1), len(group2)
    pooled_var = (
        (n1 - 1) * np.var(group1, ddof=1) + (n2 - 1) * np.var(group2, ddof=1)
    ) / (n1 + n2 - 2)
    return (np.mean(group1) - np.mean(group2)) / np.sqrt(pooled_var)


# Every column of `clinical` is tested as one LIWC feature.
liwc_feature_columns = clinical.columns.to_list()

# Family-wise significance level (before correction).
alpha = 0.05

# One test is run per feature, so this is also the number of comparisons.
num_features = len(liwc_feature_columns)

# Bonferroni-corrected per-test threshold.
alpha_corrected = alpha / num_features

# One result record (dict) per feature.
t_test_results = []

for feature_column in liwc_feature_columns:
    liwc_feature_group1 = clinical[feature_column]
    liwc_feature_group2 = control[feature_column]

    # Welch's t-test: equal_var=False does not assume equal group variances.
    stat, p_value = stats.ttest_ind(liwc_feature_group1, liwc_feature_group2, equal_var=False)

    # Effect size (clinical minus control, in pooled standard deviations).
    cohen_d = _cohens_d(liwc_feature_group1, liwc_feature_group2)

    # Bonferroni-adjusted p-value, capped at 1 because it is a probability.
    # Comparing it with `alpha` is equivalent to comparing the raw p-value
    # with `alpha_corrected`.
    p_value_corrected = min(p_value * num_features, 1.0)

    t_test_results.append({
        'Feature': feature_column,
        'T-Statistic': stat,
        'P-Value': p_value,
        'Cohen_d': cohen_d,
        'P-Value-Bonferroni': p_value_corrected,
    })

# Collect the records in a DataFrame. Only the test statistic and the effect size
# are rounded: the p-value columns stay at full precision because 'P-Value' is
# later compared with `alpha_corrected`, and rounding first could flip the
# outcome for features whose p-value lies close to the threshold.
t_test_df = pd.DataFrame(t_test_results).round({'T-Statistic': 5, 'Cohen_d': 5})


In [12]:
# Show the per-test significance threshold stored in `alpha_corrected`.
print("corrected threshold value:",alpha_corrected)

corrected threshold value: 0.0007936507936507937


In [20]:
# Keep only the features whose p-value is below the Bonferroni-corrected threshold.
significant_results = t_test_df.loc[t_test_df['P-Value'] < alpha_corrected].reset_index(drop=True)

# Attach each group's mean/std to the significant features. The summary columns
# are renamed per group before merging, so the two tables never collide on
# 'mean' / 'std'.
combined_stats = (
    significant_results
    .merge(
        clinical_mean_std.rename(columns={'mean': 'Clinical Mean', 'std': 'Clinical Std'}),
        on='Feature',
        how='left',
    )
    .merge(
        control_mean_std.rename(columns={'mean': 'Control Mean', 'std': 'Control Std'}),
        on='Feature',
        how='left',
    )
)

# Report table: fixed column order, numeric values rounded to two decimals.
# NOTE(review): at two decimals very small p-values are displayed as 0.0.
final_df = combined_stats[
    ['Feature', 'Clinical Mean', 'Clinical Std', 'Control Mean', 'Control Std', 'P-Value', 'Cohen_d']
].round(2)

# Notebook display only: show the report table.
final_df

Unnamed: 0,Feature,Clinical Mean,Clinical Std,Control Mean,Control Std,P-Value,Cohen_d
0,Analytic,42.44,9.92,48.76,10.01,0.0,-0.63
1,Clout,37.45,10.78,40.47,9.62,0.0,-0.3
2,Authentic,55.97,9.67,49.65,9.75,0.0,0.65
3,WC,5259.4,3488.07,6867.18,5014.26,0.0,-0.37
4,i,6.06,1.8,4.69,1.76,0.0,0.77
5,shehe,1.19,0.89,0.95,0.7,0.0,0.29
6,ipron,5.76,0.92,5.53,1.19,0.0,0.22
7,insight,2.78,0.67,2.43,0.75,0.0,0.49
8,cause,1.66,0.36,1.6,0.46,0.0,0.15
9,tentat,3.08,0.68,2.88,0.76,0.0,0.28
