In [1]:
# Import libraries
import pandas as pd
import numpy as np
from scipy import stats

In [2]:
# # --- Configuration ---
# LIWC_CSV_PATH = '/home2/anuska.m/Cue_Target_Mem/Files/LIWC2007.csv' # Replace with your LIWC file path
# GI_CSV_PATH = '/home2/anuska.m/Cue_Target_Mem/Files/General_Inquirer_formatted.csv'     # Replace with your GI file path
# SVM_CUE_PATH = '/home2/anuska.m/Cue_Target_Mem/Files/ft_svm_model_cue.pkl' 
# SVM_TARGET_PATH = '/home2/anuska.m/Cue_Target_Mem/Files/ft_svm_model_target.pkl' # Replace with your SVM model path
# FASTTEXT_MODEL_PATH = '/home2/anuska.m/Cue_Target_Mem/Embeddings/cc.en.300.bin' # Replace with your FastText .bin model path
# COX_WORDS_FILE = '/home2/anuska.m/Cue_Target_Mem/Files/Cox Data Mem Combined.xlsx' # Optional: Path to a file with one Cox word per line
# OUTPUT_LIWC_CSV = '/home2/anuska.m/Cue_Target_Mem/Files/liwc_category_memorability.csv'
# OUTPUT_GI_CSV = '/home2/anuska.m/Cue_Target_Mem/Files/gi_category_memorability.csv'

In [None]:
# --- Configuration ---
LIWC_CSV_PATH = input("Enter the path to the LIWC file: ")
GI_CSV_PATH = input("Enter the path to the General Inquirer file: ")
SVM_CUE_PATH = input("Enter the path to the SVM cue model(stored as pkl file): ")
SVM_TARGET_PATH = input("Enter the path to the SVM target model(stored as pkl file): ")
FASTTEXT_MODEL_PATH = input("Enter the path to the FastText model(Common Crawl English bin file): ")
COX_WORDS_FILE = input("Enter the path to the Cox words file(the combined memorability file): ")
OUTPUT_LIWC_CSV = input("Enter the file path for LIWC Category Memorability scores: ")
OUTPUT_GI_CSV = input("Enter the file path for GI Category Memorability scores: ")


In [3]:
cox = pd.read_excel(COX_WORDS_FILE)

words = cox['Word'].tolist()

In [4]:
import fasttext
ft_model = fasttext.load_model(FASTTEXT_MODEL_PATH)

In [5]:
import pickle

svm_cue = pickle.load(open(SVM_CUE_PATH, 'rb'))
svm_target = pickle.load(open(SVM_TARGET_PATH, 'rb'))

In [6]:
gi = pd.read_csv(GI_CSV_PATH, index_col=0)
liwc = pd.read_csv(LIWC_CSV_PATH, encoding='ISO-8859-1')

In [7]:
liwc

Unnamed: 0,Funct,Pronoun,Ppron,I,We,You,SheHe,They,Ipron,Article,...,Work,Achiev,Leisure,Home,Money,Relig,Death,Assent,Nonflu,Filler
0,a,anybod*,hed,i,lets,thee,he,their*,anybod*,a,...,absent*,abilit*,actor*,address,account*,afterlife*,autops*,absolutely,er,blah
1,about,anyone*,he'd,Id,let's,thine,hed,them,anyone*,alot,...,academ*,able*,actress*,apartment*,atm,agnost*,alive,agree,hm*,idontknow
2,above,anything,her,I'd,our,thou,he'd,themselves,anything,an,...,accomplish*,accomplish*,aerobic*,backyard,atms,alla,bereave*,ah,sigh,imean
3,absolutely,everybod*,hers,I'll,ours,thoust,her,they,everybod*,the,...,achiev*,ace,amus*,bake*,auction*,allah*,burial*,alright*,uh,like
4,across,everyone*,herself,Im,ourselves,thy,hers,theyd,everyone*,,...,administrat*,achiev*,apartment*,baking,audit,altar*,buried,aok,um,ohwell
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,,,,,,,,,,,...,,,,,,,,,,
914,,,,,,,,,,,...,,,,,,,,,,
915,,,,,,,,,,,...,,,,,,,,,,
916,,,,,,,,,,,...,,,,,,,,,,


In [8]:
liwc_filtered = liwc[~liwc.isin(words)]
print(f"LIWC data after filtering Cox words: {liwc_filtered.shape}")

LIWC data after filtering Cox words: (918, 64)


In [9]:
gi_filtered = gi[~gi.isin(words)]

In [10]:
liwc_filtered

Unnamed: 0,Funct,Pronoun,Ppron,I,We,You,SheHe,They,Ipron,Article,...,Work,Achiev,Leisure,Home,Money,Relig,Death,Assent,Nonflu,Filler
0,a,anybod*,hed,i,lets,thee,he,their*,anybod*,a,...,absent*,abilit*,actor*,,account*,afterlife*,autops*,absolutely,er,blah
1,about,anyone*,he'd,Id,let's,thine,hed,them,anyone*,alot,...,academ*,able*,actress*,apartment*,atm,agnost*,alive,agree,hm*,idontknow
2,above,anything,her,I'd,our,thou,he'd,themselves,anything,an,...,accomplish*,accomplish*,aerobic*,backyard,atms,alla,bereave*,ah,sigh,imean
3,absolutely,everybod*,hers,I'll,ours,thoust,her,they,everybod*,the,...,achiev*,ace,amus*,bake*,auction*,allah*,burial*,alright*,uh,like
4,across,everyone*,herself,Im,ourselves,thy,hers,theyd,everyone*,,...,administrat*,achiev*,apartment*,baking,audit,altar*,,aok,um,ohwell
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,,,,,,,,,,,...,,,,,,,,,,
914,,,,,,,,,,,...,,,,,,,,,,
915,,,,,,,,,,,...,,,,,,,,,,
916,,,,,,,,,,,...,,,,,,,,,,


In [11]:
results = []
word_level_rows = []

for col in liwc_filtered.columns:
    words = liwc_filtered[col].dropna().astype(str).unique()
    cue_preds = []
    target_preds = []

    for word in words:
            vector = ft_model[word].reshape(1, -1)
            cue_score = svm_cue.predict(vector)[0]
            target_score = svm_target.predict(vector)[0]
            cue_preds.append(cue_score)
            target_preds.append(target_score)

            # Collect individual word-level data
            word_level_rows.append({
                'word': word,
                'category': col,
                'Cue_Mem_Score': cue_score,
                'Target_Mem_Score': target_score
            })
        
    if cue_preds:
        results.append({
            'Category': col,
            'Cue_Mean_Mem': np.mean(cue_preds),
            'Cue_Std_Mem': np.std(cue_preds),
            'Target_Mean_Mem': np.mean(target_preds),
            'Target_Std_Mem': np.std(target_preds),
            'n_words': len(target_preds),
        })

# Create both DataFrames
df_results = pd.DataFrame(results)
liwc_words = pd.DataFrame(word_level_rows)

In [12]:
df_results

Unnamed: 0,Category,Cue_Mean_Mem,Cue_Std_Mem,Target_Mean_Mem,Target_Std_Mem,n_words
0,Funct,0.280542,0.067127,0.270953,0.057073,445
1,Pronoun,0.315263,0.049431,0.290568,0.063977,115
2,Ppron,0.333187,0.041549,0.305317,0.075543,70
3,I,0.355941,0.024712,0.371684,0.129060,12
4,We,0.337286,0.039416,0.275107,0.035586,12
...,...,...,...,...,...,...
59,Relig,0.332493,0.048442,0.291681,0.040454,158
60,Death,0.329081,0.034693,0.296097,0.035422,61
61,Assent,0.325498,0.049373,0.354325,0.073486,30
62,Nonflu,0.323796,0.046054,0.328255,0.059252,8


In [13]:
results_gi = []
word_level_rows_gi = []

for col in gi_filtered.columns:
    words_gi = gi_filtered[col].dropna().astype(str).unique()
    cue_preds_gi = []
    target_preds_gi = []

    for word in words_gi:
            vector = ft_model[word].reshape(1, -1)
            cue_score = svm_cue.predict(vector)[0]
            target_score = svm_target.predict(vector)[0]
            cue_preds_gi.append(cue_score)
            target_preds_gi.append(target_score)
            
                # Collect individual word-level data
            word_level_rows_gi.append({
                'Word': word,
                'Category': col,
                'Cue_Mem_Score': cue_score,
                'Target_Mem_Score': target_score
            })

    if cue_preds_gi:  # If list not empty
        results_gi.append({
            'Category': col,
            'Cue_Mean_Mem': np.mean(cue_preds_gi),
            'Cue_Std_Mem': np.std(cue_preds_gi),
            'Target_Mean_Mem': np.mean(target_preds_gi),
            'Target_Std_Mem': np.std(target_preds_gi),
            'n_words': len(target_preds_gi),
        })
        
        
        

df_results_gi = pd.DataFrame(results_gi)
gi_words = pd.DataFrame(word_level_rows_gi)

In [14]:
df_results_gi

Unnamed: 0,Category,Cue_Mean_Mem,Cue_Std_Mem,Target_Mean_Mem,Target_Std_Mem,n_words
0,Positiv,0.316500,0.042922,0.283216,0.035580,1637
1,Negativ,0.317247,0.042876,0.286317,0.037967,2007
2,Pstv,0.318910,0.042413,0.285665,0.040357,765
3,Affil,0.325556,0.039717,0.279341,0.038331,474
4,Ngtv,0.320023,0.042705,0.288724,0.040860,875
...,...,...,...,...,...,...
177,SureLw,0.292050,0.048542,0.281552,0.025818,140
178,If,0.304263,0.047343,0.275205,0.033702,113
179,NotLw,0.305668,0.048056,0.288434,0.034148,20
180,TimeSpc,0.333576,0.037745,0.280042,0.048578,341


In [15]:
# df_results.to_csv(OUTPUT_LIWC_CSV, index=False)
# df_results_gi.to_csv(OUTPUT_GI_CSV, index=False)

In [16]:
liwc_words

Unnamed: 0,word,category,Cue_Mem_Score,Target_Mem_Score
0,a,Funct,0.363168,0.190747
1,about,Funct,0.215873,0.264323
2,above,Funct,0.212767,0.224790
3,absolutely,Funct,0.208515,0.282727
4,across,Funct,0.254220,0.285896
...,...,...,...,...
10169,ohwell,Filler,0.364052,0.370460
10170,rr*,Filler,0.337526,0.283186
10171,yakno*,Filler,0.307868,0.257554
10172,ykn*,Filler,0.349422,0.310536


In [17]:
gi_words

Unnamed: 0,Word,Category,Cue_Mem_Score,Target_Mem_Score
0,ABIDE,Positiv,0.348635,0.285280
1,ABILITY,Positiv,0.316033,0.317660
2,ABLE,Positiv,0.357881,0.355795
3,ABOUND,Positiv,0.319761,0.320242
4,ABSOLVE,Positiv,0.358308,0.288010
...,...,...,...,...
40929,WHOLESALE,FormLw,0.309750,0.251194
40930,WORD,FormLw,0.342604,0.264889
40931,WRITE,FormLw,0.338836,0.280342
40932,WRITTEN,FormLw,0.261002,0.245712


In [None]:
WORD_LEVEL_LIWC = input("Enter the file path for LIWC word level memorability scores: ")
WORD_LEVEL_GI = input("Enter the file path for GI word level memorability scores: ")

In [18]:
liwc_words.to_csv(WORD_LEVEL_LIWC, index=False)
gi_words.to_csv(WORD_LEVEL_GI, index=False)