In [1]:
# Import libraries
import pandas as pd
import numpy as np
from scipy import stats

In [None]:
OUTPUT_LIWC_CSV = input("Enter the file path for LIWC Category Memorability scores: ")
OUTPUT_GI_CSV = input("Enter the file path for GI Category Memorability scores: ")

In [None]:
WORD_LEVEL_LIWC = input("Enter the file path for LIWC word level memorability scores: ")
WORD_LEVEL_GI = input("Enter the file path for GI word level memorability scores: ")

In [3]:
liwc = pd.read_csv(OUTPUT_LIWC_CSV)
gi = pd.read_csv(OUTPUT_GI_CSV)

In [15]:
liwc_word = pd.read_csv(WORD_LEVEL_LIWC)
gi_word = pd.read_csv(WORD_LEVEL_GI)

In [6]:
gi.columns

Index(['Category', 'Cue_Mean_Mem', 'Cue_Std_Mem', 'Target_Mean_Mem',
       'Target_Std_Mem', 'n_words'],
      dtype='object')

DICHOTOMY ANALYSIS

In [4]:
# Define dichotomies to compare
dichotomies = [
    ('Pleasur', 'Pain'),
    ('Virtue', 'Vice'),
    ('Strong', 'Weak'),
    ('Active', 'Passive'),
    ('Undrst', 'Ovrst'),
    ('Negativ', 'Positiv')
]

In [5]:
def calculate_ttest_from_summary(mean1, std1, n1, mean2, std2, n2):
    se = np.sqrt((std1**2/n1) + (std2**2/n2))
    t_stat = (mean1 - mean2) / se
    df = ((std1**2/n1 + std2**2/n2)**2) / ((std1**2/n1)**2/(n1-1) + (std2**2/n2)**2/(n2-1))
    p_value = 2 * (1 - stats.t.cdf(abs(t_stat), df))
    return t_stat, p_value

def cohens_d_from_summary(mean1, std1, n1, mean2, std2, n2):
    pooled_sd = np.sqrt(((n1-1)*std1**2 + (n2-1)*std2**2) / (n1+n2-2))
    return (mean1 - mean2) / pooled_sd

In [7]:
results = []

for cat1, cat2 in dichotomies:
    # Get rows for each category
    row1 = gi[gi['Category'] == cat1].iloc[0]
    row2 = gi[gi['Category'] == cat2].iloc[0]

    # Cue memorability stats
    cue_t, cue_p = calculate_ttest_from_summary(
        row1['Cue_Mean_Mem'], row1['Cue_Std_Mem'], row1['n_words'],
        row2['Cue_Mean_Mem'], row2['Cue_Std_Mem'], row2['n_words']
    )

    cue_d = cohens_d_from_summary(
        row1['Cue_Mean_Mem'], row1['Cue_Std_Mem'], row1['n_words'],
        row2['Cue_Mean_Mem'], row2['Cue_Std_Mem'], row2['n_words']
    )

    # Target memorability stats
    target_t, target_p = calculate_ttest_from_summary(
        row1['Target_Mean_Mem'], row1['Target_Std_Mem'], row1['n_words'],
        row2['Target_Mean_Mem'], row2['Target_Std_Mem'], row2['n_words']
    )

    target_d = cohens_d_from_summary(
        row1['Target_Mean_Mem'], row1['Target_Std_Mem'], row1['n_words'],
        row2['Target_Mean_Mem'], row2['Target_Std_Mem'], row2['n_words']
    )

    results.append({
        'Categories': f'{cat1} vs {cat2}',
        'Cue_t': cue_t,
        'Cue_p': cue_p,
        'Cue_d': cue_d,
        'Target_t': target_t,
        'Target_p': target_p,
        'Target_d': target_d
    })

# Create DataFrame with results
comparison_df = pd.DataFrame(results)

In [8]:
comparison_df

Unnamed: 0,Categories,Cue_t,Cue_p,Cue_d,Target_t,Target_p,Target_d
0,Pleasur vs Pain,1.903961,0.057752,0.197151,1.952584,0.05190816,0.215952
1,Virtue vs Vice,-0.706875,0.479772,-0.039393,-4.230654,2.495439e-05,-0.235799
2,Strong vs Weak,1.000436,0.317302,0.047853,-5.424022,7.035207e-08,-0.258228
3,Active vs Passive,2.531223,0.011479,0.115579,-5.589485,2.752761e-08,-0.255929
4,Undrst vs Ovrst,-1.662837,0.096886,-0.118525,-0.588048,0.5567294,-0.041932
5,Negativ vs Positiv,0.522278,0.60151,0.017396,2.539701,0.01113675,0.084026


In [10]:
# Get Posemo and Negemo rows
posemo = liwc[liwc['Category'] == 'Posemo'].iloc[0]
negemo = liwc[liwc['Category'] == 'Negemo'].iloc[0]

# Cue-based stats
cue_t, cue_p = calculate_ttest_from_summary(
    posemo['Cue_Mean_Mem'], posemo['Cue_Std_Mem'], posemo['n_words'],
    negemo['Cue_Mean_Mem'], negemo['Cue_Std_Mem'], negemo['n_words']
)

cue_d = cohens_d_from_summary(
    posemo['Cue_Mean_Mem'], posemo['Cue_Std_Mem'], posemo['n_words'],
    negemo['Cue_Mean_Mem'], negemo['Cue_Std_Mem'], negemo['n_words']
)

# Target-based stats
target_t, target_p = calculate_ttest_from_summary(
    posemo['Target_Mean_Mem'], posemo['Target_Std_Mem'], posemo['n_words'],
    negemo['Target_Mean_Mem'], negemo['Target_Std_Mem'], negemo['n_words']
)

target_d = cohens_d_from_summary(
    posemo['Target_Mean_Mem'], posemo['Target_Std_Mem'], posemo['n_words'],
    negemo['Target_Mean_Mem'], negemo['Target_Std_Mem'], negemo['n_words']
)

# Optional: Print or store results
posemo_negemo_results = {
    'Categories': 'Posemo vs Negemo',
    'Cue_t': cue_t,
    'Cue_p': cue_p,
    'Cue_d': cue_d,
    'Target_t': target_t,
    'Target_p': target_p,
    'Target_d': target_d
}

In [11]:
comparison_df = pd.concat([comparison_df, pd.DataFrame([posemo_negemo_results])], ignore_index=True)

In [12]:
comparison_df

Unnamed: 0,Categories,Cue_t,Cue_p,Cue_d,Target_t,Target_p,Target_d
0,Pleasur vs Pain,1.903961,0.057752,0.197151,1.952584,0.05190816,0.215952
1,Virtue vs Vice,-0.706875,0.479772,-0.039393,-4.230654,2.495439e-05,-0.235799
2,Strong vs Weak,1.000436,0.317302,0.047853,-5.424022,7.035207e-08,-0.258228
3,Active vs Passive,2.531223,0.011479,0.115579,-5.589485,2.752761e-08,-0.255929
4,Undrst vs Ovrst,-1.662837,0.096886,-0.118525,-0.588048,0.5567294,-0.041932
5,Negativ vs Positiv,0.522278,0.60151,0.017396,2.539701,0.01113675,0.084026
6,Posemo vs Negemo,-4.073985,5.1e-05,-0.281689,-0.181679,0.8558836,-0.01258


CATEGORY LEVEL ANALYSIS

In [13]:
from scipy.stats import spearmanr

# Spearman correlation across category-level means
r_cat, p_cat = spearmanr(liwc['Cue_Mean_Mem'], liwc['Target_Mean_Mem'])
n_cat = liwc.shape[0]  # number of categories

print(f"Category-level Spearman r = {r_cat:.3f}, p = {p_cat:.3g}, n = {n_cat}")

Category-level Spearman r = 0.805, p = 1.01e-15, n = 64


In [16]:
# Spearman correlation across individual words
r_word, p_word = spearmanr(liwc_word['Cue_Mem_Score'], liwc_word['Target_Mem_Score'])
# n_word = liwc_word.shape[0]  # number of words

print(f"Word-level Spearman r = {r_word:.3f}, p = {p_word:.3g}")


Word-level Spearman r = 0.474, p = 0


In [17]:
len(liwc_word)

10174

In [18]:
# Spearman correlation across category-level means
r_cat, p_cat = spearmanr(gi['Cue_Mean_Mem'], gi['Target_Mean_Mem'])
n_cat = gi.shape[0]  # number of categories

print(f"Category-level Spearman r for GI = {r_cat:.3f}, p = {p_cat:.3g}, n = {n_cat}")

Category-level Spearman r for GI = 0.143, p = 0.0534, n = 182


In [19]:
# Spearman correlation across individual words
r_word, p_word = spearmanr(gi_word['Cue_Mem_Score'], gi_word['Target_Mem_Score'])
# n_word = liwc_word.shape[0]  # number of words

print(f"Word-level Spearman r GI = {r_word:.3f}, p = {p_word:.3g}")
len(gi_word)


Word-level Spearman r GI = 0.246, p = 0


40934

HIGHER AND LOWER CATEGORY LEVEL

In [23]:
from scipy.stats import spearmanr

# Filter df_results to only keep the desired categories
categories_to_keep = [
    "Death", "Affect", "Bio", "Relig", "Percept", "Posemo", "Social",
    "Leisure", "Money", "Funct", "Home", "CogMech", "Work"
]

filtered_df = liwc[liwc['Category'].isin(categories_to_keep)]

# Compute Spearman correlation
r_cat, p_cat = spearmanr(filtered_df['Cue_Mean_Mem'], filtered_df['Target_Mean_Mem'])
n_cat = filtered_df.shape[0]

print(f"Category-level Spearman r higher level = {r_cat:.3f}, p = {p_cat:.3g}, n = {n_cat}")


Category-level Spearman r higher level = 0.824, p = 0.00053, n = 13


In [24]:
filtered_df

Unnamed: 0,Category,Cue_Mean_Mem,Cue_Std_Mem,Target_Mean_Mem,Target_Std_Mem,n_words
0,Funct,0.280542,0.067127,0.270953,0.057073,445
22,Social,0.313657,0.069906,0.291127,0.059798,444
26,Affect,0.281237,0.047623,0.281678,0.033644,885
27,Posemo,0.273794,0.050989,0.281132,0.035993,393
32,CogMech,0.25246,0.059763,0.26707,0.034087,710
41,Percept,0.308515,0.04799,0.276848,0.0309,261
45,Bio,0.334497,0.042682,0.294757,0.054095,558
54,Work,0.306264,0.043981,0.282419,0.046346,319
56,Leisure,0.339562,0.046103,0.300806,0.048598,225
57,Home,0.352686,0.049757,0.304885,0.063886,89


In [25]:
liwc['Category'].unique()

array(['Funct', 'Pronoun', 'Ppron', 'I', 'We', 'You', 'SheHe', 'They',
       'Ipron', 'Article', 'Verbs', 'AuxVb', 'Past', 'Present', 'Future',
       'Adverbs', 'Prep', 'Conj', 'Negate', 'Quant', 'Numbers', 'Swear',
       'Social', 'Family', 'Friends', 'Humans', 'Affect', 'Posemo',
       'Negemo', 'Anx', 'Anger', 'Sad', 'CogMech', 'Insight', 'Cause',
       'Discrep', 'Tentat', 'Certain', 'Inhib', 'Incl', 'Excl', 'Percept',
       'See', 'Hear', 'Feel', 'Bio', 'Body', 'Health', 'Sexual', 'Ingest',
       'Relativ', 'Motion', 'Space', 'Time', 'Work', 'Achiev', 'Leisure',
       'Home', 'Money', 'Relig', 'Death', 'Assent', 'Nonflu', 'Filler'],
      dtype=object)

In [37]:
lower_level_liwc = [
    "Nonflu", "Swear", "Netspeak", "Sexual", "Family", "Assent", "Filler",
    "Female", "Body", "Anger", "Negemo", "Anx", "Friend", "Posemo",
    "Health", "Future Focus", "Male", "Discrep", "Feel", "See", "Power",
    "Time", "Motion", "Tentative", "Achieve", "Space", "Hear", "Affiliation",
    "Certain", "Present Focus", "Reward", "Risk", "Sad", "Ingestion",
    "Past Focus", "Causation", "Insight", "CogProc", "Differentiation"
]

available = np.array(liwc['Category'].unique())

lower_level_mapped = [
    "Nonflu", "Swear", "Netspeak", "Sexual", "Family", "Assent", "Filler",
    "Female", "Body", "Anger", "Negemo", "Anx", "Friends", "Posemo",
    "Health", "Future", "Male", "Discrep", "Feel", "See", "Power",  # 'Power' not found in your data
    "Time", "Motion", "Tentat", "Achiev", "Space", "Hear", "Social",  # Social as proxy for Affiliation
    "Certain", "Present", "Reward", "Risk", "Sad", "Ingest", "Past",
    "Cause", "Insight", "CogMech", "Discrep", "Excl", "Incl", "Tentat", "Certain"
]


In [38]:
final_cats = list(set(lower_level_mapped) & set(available))

In [39]:
final_cats = set(final_cats) - set(categories_to_keep)

In [40]:
len(final_cats)

31

In [41]:
final_cats

{'Achiev',
 'Anger',
 'Anx',
 'Assent',
 'Body',
 'Cause',
 'Certain',
 'Discrep',
 'Excl',
 'Family',
 'Feel',
 'Filler',
 'Friends',
 'Future',
 'Health',
 'Hear',
 'Incl',
 'Ingest',
 'Insight',
 'Motion',
 'Negemo',
 'Nonflu',
 'Past',
 'Present',
 'Sad',
 'See',
 'Sexual',
 'Space',
 'Swear',
 'Tentat',
 'Time'}

In [42]:
missing_cats = [cat for cat in final_cats if cat not in liwc['Category'].unique()]
print("Missing categories:", missing_cats)


Missing categories: []


In [46]:
results = []

for cat in final_cats:
    sub_df = liwc[liwc['Category'] == cat]
    cue_vals = sub_df['Cue_Mean_Mem']
    target_vals = sub_df['Target_Mean_Mem']
    
    if len(sub_df) < 2:
        print(f"Category: {cat} skipped (only {len(sub_df)} word)")
        continue
    if cue_vals.nunique() <= 1 or target_vals.nunique() <= 1:
        print(f"Category: {cat} skipped (no variance in cue or target)")
        continue
    
    corr, pval = spearmanr(cue_vals, target_vals)
    print(f"Category: {cat}, Spearman r = {corr:.3f}, p = {pval:.3g}")

    results.append({
        'LIWC_Category': cat,
        'Spearman_r': corr,
        'p_value': pval,
        'n_words': len(sub_df)
    })

# If you want the full summary in a dataframe
correlation_df = pd.DataFrame(results)


Category: Hear skipped (only 1 word)
Category: Excl skipped (only 1 word)
Category: Feel skipped (only 1 word)
Category: Swear skipped (only 1 word)
Category: Sexual skipped (only 1 word)
Category: Present skipped (only 1 word)
Category: Insight skipped (only 1 word)
Category: Space skipped (only 1 word)
Category: Certain skipped (only 1 word)
Category: Nonflu skipped (only 1 word)
Category: Friends skipped (only 1 word)
Category: Assent skipped (only 1 word)
Category: Body skipped (only 1 word)
Category: Anx skipped (only 1 word)
Category: Anger skipped (only 1 word)
Category: Tentat skipped (only 1 word)
Category: Past skipped (only 1 word)
Category: Time skipped (only 1 word)
Category: Cause skipped (only 1 word)
Category: Filler skipped (only 1 word)
Category: Discrep skipped (only 1 word)
Category: Motion skipped (only 1 word)
Category: Health skipped (only 1 word)
Category: Future skipped (only 1 word)
Category: Negemo skipped (only 1 word)
Category: Achiev skipped (only 1 word)


In [44]:
results

[]