# Summary

This notebook standardizes three moral psychology datasets (MFRC, MFTC, eMFD) for comparative LLM evaluation.

Key Processing Steps:
1. Label standardization to unified 5-foundation taxonomy
2. Text deduplication with unique text_id assignment
3. Multi-annotator format preservation
4. Upload to HuggingFace Hub for reproducible access

Output: three clean datasets (`maciejskorski/morality-{MFRC,MFTC,eMFD}`) ready for zero-shot moral classification evaluation across different text domains
(social media, news, forums).

In [None]:
from datasets import load_dataset
import pandas as pd
from itertools import islice
from dotenv import load_dotenv
from datasets import Dataset, DatasetDict
load_dotenv()

# Prepare Datasets

## MFRC

In [None]:
# Fetch the MFRC corpus from the Hugging Face Hub.
ds = load_dataset("USC-MOLA-Lab/MFRC")
# Convert the 'train' split with the dataset's own exporter rather than letting
# pandas iterate over it row by row; the statistics cell at the end of this
# notebook loads its frames the same way.
df = ds['train'].to_pandas()
df.head()

In [None]:
# Distinct raw labels in MFRC; a single annotation string can hold several
# comma-separated labels, hence the split + explode.
(
    df["annotation"]
    .str.split(",")
    .explode()
    .unique()
)

In [None]:
# Raw MFRC labels, grouped by the unified label each one collapses into.
raw_labels_by_target = {
    "none": ["Thin Morality", "Non-Moral"],
    "care": ["Care"],
    "sanctity": ["Purity"],
    "authority": ["Authority"],
    "loyalty": ["Loyalty"],
    "fairness": ["Proportionality", "Equality"],
}
# Flatten to the raw -> unified lookup used for the replacement below.
key_map = {
    raw: target
    for target, raws in raw_labels_by_target.items()
    for raw in raws
}

# regex=True makes each key match as a substring, so every label inside a
# comma-separated annotation string is rewritten, not only whole-cell matches.
standardized = df["annotation"].replace(key_map, regex=True)
df["label"] = standardized
df.head()

In [None]:
# Give every distinct text one integer id, shared by all of its annotation rows.
text_groups = df.groupby('text')
df['text_id'] = text_groups.ngroup()
# Sanity check: ids are contiguous, i.e. they run from 0 to (distinct ids - 1).
assert df['text_id'].nunique() == df['text_id'].max() + 1

In [None]:
from datasets import Dataset, DatasetDict

# Wrap the standardized MFRC frame as a Hugging Face dataset and publish it.
ds = Dataset.from_pandas(df)
ds.push_to_hub(repo_id="maciejskorski/morality-MFRC")

## MFTC

In [None]:
# Read the nested MFTC JSON file into a list of top-level records.
corpus_records = pd.read_json('../data/MFTC/MFTC_V4_text.json').to_dict('records')

# Flatten to one row per entry of Tweets -> annotations, carrying the corpus
# name and the tweet id/text along as metadata columns.
df = pd.json_normalize(
    data=corpus_records,
    record_path=['Tweets', 'annotations'],
    meta=[['Corpus'], ['Tweets', 'tweet_id'], ['Tweets', 'tweet_text']],
)
print(df.shape)
df.head()

In [None]:
# Distinct raw labels in MFTC (annotations are comma-separated label strings).
(
    df['annotation']
    .str.split(",")
    .explode()
    .unique()
)

In [None]:
# Raw MFTC labels, grouped by the unified foundation they collapse into.
raw_labels_by_target = {
    'care': ['care', 'harm'],
    'fairness': ['fairness', 'cheating'],
    'loyalty': ['loyalty', 'betrayal'],
    'authority': ['authority', 'subversion'],
    'sanctity': ['purity', 'degradation'],
}
# Flatten to the raw -> unified lookup used for the replacement below.
key_map = {
    raw: target
    for target, raws in raw_labels_by_target.items()
    for raw in raws
}

# Substring (regex) replacement so each label inside a comma-separated
# annotation string is rewritten; text matching no key is left as it is.
# NOTE(review): unlike the MFRC mapping, nothing is mapped to 'none' here --
# confirm whether this corpus has a non-moral label that should be unified too.
df['label'] = df['annotation'].replace(key_map, regex=True)
df = df.rename(columns={'Tweets.tweet_id': 'tweet_id', 'Tweets.tweet_text': 'text'})

In [None]:
# Distinct labels after mapping, to eyeball the result of the replacement.
(
    df['label']
    .str.split(",")
    .explode()
    .unique()
)

In [None]:
# Give every distinct tweet text one integer id, shared by all of its annotation rows.
text_groups = df.groupby('text')
df['text_id'] = text_groups.ngroup()
# Sanity check: ids are contiguous, i.e. they run from 0 to (distinct ids - 1).
assert df['text_id'].nunique() == df['text_id'].max() + 1

In [None]:
# Wrap the standardized MFTC frame as a Hugging Face dataset and publish it.
ds = Dataset.from_pandas(df)
ds.push_to_hub(repo_id="maciejskorski/morality-MFTC")

## eMFD

Source: <https://osf.io/vw85e/>

In [None]:
import pandas as pd

# Load the raw eMFD highlights and discard the 'Unnamed: 0' column
# (presumably a row index that was saved along with the CSV).
df = pd.read_csv('highlights_raw.csv').drop(columns='Unnamed: 0')
df.head()

In [None]:
# Distinct values of the domain assigned to each highlight.
pd.unique(df['assigned_domain'])

In [None]:
# Rename first and without `inplace`, so this cell can be re-run safely:
# `rename` ignores keys that are no longer present, whereas grouping by the
# old 'content' column would raise a KeyError on a second execution.
df = df.rename(columns={'content': 'text', 'coder_id': 'annotator'})

# Give every distinct text one integer id, shared by all of its annotation rows.
df['text_id'] = df.groupby('text').ngroup()
# Same contiguity check as for MFRC and MFTC: ids run from 0 to (distinct ids - 1).
assert df['text_id'].max() == df['text_id'].nunique() - 1

In [None]:
# The five unified foundations every eMFD annotation is expected to use.
moral_targets = ['care', 'fairness', 'loyalty', 'authority', 'sanctity']

# Rename without `inplace` and validate the renamed column, so re-running the
# cell does not fail on the already-removed 'assigned_domain' column.
df = df.rename(columns={'assigned_domain': 'label'})
observed_labels = set(df['label'].unique())
assert observed_labels == set(moral_targets), (
    f"unexpected label set: {sorted(map(str, observed_labels))}"
)

In [None]:
# Wrap the standardized eMFD frame as a Hugging Face dataset and publish it.
ds = Dataset.from_pandas(df)
ds.push_to_hub(repo_id="maciejskorski/morality-eMFD")

In [None]:
from datasets import load_dataset
import pandas as pd

# Published dataset repositories (under the maciejskorski/ namespace) and the
# five foundations to report on. The list is not called `datasets` to avoid
# reusing the name of the package the loaders are imported from.
dataset_names = ['morality-MFRC', 'morality-MFTC', 'morality-eMFD']
moral_foundations = ['care', 'fairness', 'loyalty', 'authority', 'sanctity']


def foundation_text_counts(annotations, foundations):
    """Count, per foundation, the texts with at least one annotation naming it.

    Args:
        annotations: DataFrame with one row per annotation and the columns
            ``text_id`` and ``label`` (a label string may name several
            foundations).
        foundations: Iterable of foundation names to look for.

    Returns:
        dict mapping each foundation name to the number of distinct
        ``text_id`` values whose combined labels contain that name.
    """
    # Concatenate all annotators' labels of a text into one searchable string.
    labels_per_text = annotations.groupby('text_id')['label'].apply(';'.join)
    return {
        # regex=False: foundation names are literal substrings, not patterns.
        foundation: int(labels_per_text.str.contains(foundation, regex=False).sum())
        for foundation in foundations
    }


print("Dataset Statistics Summary")
print("=" * 50)

for dataset_name in dataset_names:
    print(f"\n### {dataset_name.upper()}")

    # Load the published 'train' split back from the Hub.
    ds = load_dataset(f"maciejskorski/{dataset_name}")['train']
    df = ds.to_pandas()

    # Basic statistics: rows are annotations, text_id identifies distinct texts.
    n_annotations = len(df)
    n_unique_texts = df['text_id'].nunique()

    print(f"Total annotations: {n_annotations:,}")
    print(f"Unique texts: {n_unique_texts:,}")
    print(f"Avg annotations per text: {n_annotations/n_unique_texts:.1f}")

    # Share of texts for which at least one annotation names the foundation.
    print("\nMoral Foundation Prevalence:")
    prevalence = foundation_text_counts(df, moral_foundations)
    for foundation, texts_with_foundation in prevalence.items():
        pct = (texts_with_foundation / n_unique_texts) * 100
        print(f"  {foundation}: {texts_with_foundation:,} texts ({pct:.1f}%)")

    print("-" * 40)

Dataset Statistics Summary

### MORALITY-MFRC
Total annotations: 61,226
Unique texts: 17,886
Avg annotations per text: 3.4

Moral Foundation Prevalence:
  care: 4,740 texts (26.5%)
  fairness: 5,280 texts (29.5%)
  loyalty: 1,977 texts (11.1%)
  authority: 3,430 texts (19.2%)
  sanctity: 1,747 texts (9.8%)
----------------------------------------

### MORALITY-MFTC
Total annotations: 128,454
Unique texts: 33,687
Avg annotations per text: 3.8

Moral Foundation Prevalence:
  care: 13,716 texts (40.7%)
  fairness: 11,982 texts (35.6%)
  loyalty: 10,305 texts (30.6%)
  authority: 11,280 texts (33.5%)
  sanctity: 7,543 texts (22.4%)
----------------------------------------

### MORALITY-EMFD
Total annotations: 73,001
Unique texts: 54,868
Avg annotations per text: 1.3

Moral Foundation Prevalence:
  care: 13,438 texts (24.5%)
  fairness: 12,742 texts (23.2%)
  loyalty: 12,714 texts (23.2%)
  authority: 13,609 texts (24.8%)
  sanctity: 10,308 texts (18.8%)
------------------------------------