In [3]:
import json
import os
import re
import shutil

import pandas as pd

In [4]:
# Names used as the top-level keys of every parsed dictionary.
dimensions = [
    'care',
    'fairness',
    'loyalty',
    'authority',
    'sanctity',
]
# Names used as the second-level keys under each dimension.
polarities = [
    'virtue',
    'vice',
]

In [5]:
# save and move JSON data
def save_and_move_json(data, filename, destination):
    """Serialise ``data`` as JSON into ``filename``, then move that file to ``destination``.

    Parameters
    ----------
    data : object accepted by ``json.dump``
    filename : path of the file that is written first
    destination : path (or existing directory) the written file is moved to

    Any missing parent folder of ``destination`` is created before the move.
    """
    with open(filename, 'w', encoding='utf-8') as file:
        json.dump(data, file)

    # shutil.move raises when the target folder does not exist, which would
    # leave the freshly written file stranded at ``filename``.
    parent = os.path.dirname(destination)
    if parent:
        os.makedirs(parent, exist_ok=True)

    shutil.move(filename, destination)

In [4]:
# parse dic format
def parse_dic(path, dimensions, polarities):
    """Parse a ``.dic`` word list into ``{dimension: {polarity: [terms]}}``.

    Two kinds of lines are recognised:

    * category lines: start with a digit and consist of exactly two
      whitespace-separated tokens, the second shaped ``<dimension>.<polarity>``;
    * entry lines: a term followed by one or more numeric category ids.

    A term may span several whitespace-separated tokens; the trailing run of
    numeric tokens is read as its category ids and everything before it is the
    term. Leading/trailing ``*`` characters are stripped from the term.

    Parameters
    ----------
    path : path of the ``.dic`` file
    dimensions, polarities : key names used to pre-build the (possibly empty)
        result lists

    Returns
    -------
    dict mapping each dimension to a dict mapping each polarity to a list of terms.

    Raises
    ------
    KeyError if a referenced category names a dimension or polarity that is not
    in ``dimensions`` / ``polarities``.
    """
    output = {dim: {pol: [] for pol in polarities} for dim in dimensions}
    mapping = {}

    with open(path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # First pass: collect "<id> <dimension>.<polarity>" category definitions.
    for line in lines:
        if line.strip() and line[0].isdigit():
            parts = re.split(r'\s+', line.strip())
            if len(parts) == 2 and '.' in parts[1]:
                dimension, polarity = parts[1].split('.')
                mapping[parts[0]] = (dimension, polarity)

    # Second pass: assign every entry to the categories listed after it.
    for line in lines:
        parts = re.split(r'\s+', line.strip())
        if len(parts) < 2 or not parts[-1].isdigit():
            continue

        # Walk back over the trailing numeric ids, always leaving at least the
        # first token as the term, so multi-word terms are kept whole.
        split_at = len(parts) - 1
        while split_at > 1 and parts[split_at - 1].isdigit():
            split_at -= 1

        word = ' '.join(parts[:split_at]).strip('*')
        for category_number in parts[split_at:]:
            if category_number in mapping:
                dimension, polarity = mapping[category_number]
                output[dimension][polarity].append(word)

    return output

In [6]:
# Raw .dic files to convert, keyed by the short name used in the output file name.
dictionary_paths = {
    'mft': '/workspaces/fake_news_analysis/dictionaries/raw/mft_original.dic',
    'mfd2': '/workspaces/fake_news_analysis/dictionaries/raw/mfd2.0.dic'
}

# Parse each file and store the result as dictionaries/processed/<name>_dictionary.json.
# Requires the cells defining parse_dic, save_and_move_json, dimensions and
# polarities to have been executed first; otherwise this cell raises NameError.
for name, path in dictionary_paths.items():
    dictionary_data = parse_dic(path, dimensions, polarities)
    save_and_move_json(dictionary_data, f'{name}_dictionary.json', f'/workspaces/fake_news_analysis/dictionaries/processed/{name}_dictionary.json')

NameError: name 'parse_dic' is not defined

In [7]:
# parse txt format
def parse_txt(file_path, dimensions, polarities):
    """Parse a '|'-delimited lexicon file into ``{dimension: {polarity: [tokens]}}``.

    Every line is split on '|'; lines with fewer than six fields are ignored.
    The token is the text after ' = ' in the first field and the category
    description is the text after ' = ' in the fifth field. Descriptions
    mentioning 'ingroup' or 'purity' are extended with 'loyalty' / 'sanctity'
    so they match the dimension names.

    For each dimension whose name occurs in the description, the token is
    stored under the first polarity (in ``polarities`` order) that also occurs
    in it; a token can therefore be stored under several dimensions.
    """
    txt_output = {}
    for dimension in dimensions:
        txt_output[dimension] = {polarity: [] for polarity in polarities}

    with open(file_path, 'r') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split('|')
            if len(fields) < 6:
                continue

            token = fields[0].split(' = ')[1]
            description = fields[4].split(' = ')[1]

            # adjust naming
            if 'ingroup' in description.lower():
                description = description + ' loyalty'
            if 'purity' in description.lower():
                description = description + ' sanctity'

            lowered = description.lower()
            for dimension in dimensions:
                if dimension.lower() not in lowered:
                    continue
                # Only the first matching polarity is recorded per dimension.
                first_match = next(
                    (polarity for polarity in polarities if polarity.lower() in lowered),
                    None,
                )
                if first_match is not None:
                    txt_output[dimension][first_match].append(token)

    return txt_output

In [8]:
# Raw '|'-delimited lexicon file handled by parse_txt.
mfd1_path = '/workspaces/fake_news_analysis/dictionaries/raw/Enhanced_Morality_Lexicon_V1.1.txt'

# Parse the lexicon and store it as dictionaries/processed/mfd1_dictionary.json.
# NOTE: dictionary_data is reassigned by several cells, so it only holds this
# result until another of those cells is run.
dictionary_data = parse_txt(mfd1_path, dimensions, polarities)
save_and_move_json(dictionary_data, 'mfd1_dictionary.json', '/workspaces/fake_news_analysis/dictionaries/processed/mfd1_dictionary.json')

In [12]:
# Total number of entries across every dimension/polarity list of the
# dictionary currently held in dictionary_data.
word_count = sum(
    len(words)
    for subcategories in dictionary_data.values()
    for words in subcategories.values()
)
word_count

4331

In [8]:
# parse csv format
def parse_csv(file_path, dimensions, polarities):
    """Parse a CSV lexicon into ``{dimension: {polarity: [(word, probability)]}}``.

    The CSV must provide a 'word' column and one '<dimension>_p' column per
    dimension. For every row and dimension with a probability above zero, the
    pair ``(word, probability)`` is stored under each polarity whose
    '<dimension>.<polarity>' column exists and holds a value above zero.
    """
    frame = pd.read_csv(file_path)

    csv_output = {}
    for dimension in dimensions:
        csv_output[dimension] = {polarity: [] for polarity in polarities}

    for _, record in frame.iterrows():
        for dimension in dimensions:
            probability = float(record[f'{dimension}_p'])
            # "not > 0" (rather than "<= 0") so NaN probabilities are skipped too.
            if not probability > 0:
                continue

            for polarity in polarities:
                column = f'{dimension}.{polarity}'
                if column not in record:
                    continue
                if float(record[column]) > 0:
                    csv_output[dimension][polarity].append((record['word'], probability))

    return csv_output

In [13]:
# Raw CSV lexicon handled by parse_csv.
emfd_path = '/workspaces/fake_news_analysis/dictionaries/raw/emfd_amp.csv'

# Parse the CSV and store it as dictionaries/processed/emfd_dictionary.json.
# NOTE: dictionary_data is reassigned by several cells, so it only holds this
# result until another of those cells is run.
dictionary_data = parse_csv(emfd_path, dimensions, polarities)
save_and_move_json(dictionary_data, 'emfd_dictionary.json', '/workspaces/fake_news_analysis/dictionaries/processed/emfd_dictionary.json')

In [10]:
# Function to parse TSV data and return combined DataFrame
def parse_tsv_to_df(paths):
    """Read several tab-separated files and stack them into one DataFrame.

    ``paths`` maps a dimension label to the TSV file holding its rows. Each
    file's rows get a 'dimension' column set to that label, and the frames are
    concatenated in mapping order with a fresh 0..n-1 index.
    """
    frames = [
        pd.read_csv(tsv_path, sep='\t').assign(dimension=label)
        for label, tsv_path in paths.items()
    ]
    return pd.concat(frames, ignore_index=True)

In [11]:
# classify terms
def classify_terms(df, dimensions, polarities, threshold=4.5):
    """Group scored terms into ``{dimension: {polarity: [(lemma, score)]}}``.

    Parameters
    ----------
    df : DataFrame with 'LEMMA', 'EXPRESSED_MORAL' and 'dimension' columns
    dimensions, polarities : key names used to pre-build the result lists;
        ``polarities`` must contain 'vice' and 'virtue' because those two
        labels are assigned directly
    threshold : scores strictly below this value are labelled 'vice', all
        others 'virtue' (default 4.5)

    Rows without a score are skipped: a missing value fails the ``<``
    comparison and would otherwise be labelled 'virtue' and written out as NaN.

    Raises
    ------
    KeyError if a row's dimension is not in ``dimensions``.
    """
    output = {dimension: {polarity: [] for polarity in polarities} for dimension in dimensions}

    for _, row in df.iterrows():
        score = row['EXPRESSED_MORAL']
        if pd.isna(score):
            continue

        polarity = 'vice' if score < threshold else 'virtue'
        output[row['dimension']][polarity].append((row['LEMMA'], score))

    return output

In [12]:
# TSV file paths, keyed by the dimension label attached to each file's rows.
# Note the 'sanctity' label is read from purity.tsv.
tsv_paths = {
    'care': '/workspaces/fake_news_analysis/dictionaries/raw/care.tsv',
    'fairness': '/workspaces/fake_news_analysis/dictionaries/raw/fairness.tsv',
    'loyalty': '/workspaces/fake_news_analysis/dictionaries/raw/loyalty.tsv',
    'authority': '/workspaces/fake_news_analysis/dictionaries/raw/authority.tsv',
    'sanctity': '/workspaces/fake_news_analysis/dictionaries/raw/purity.tsv'
}

# Load all TSV files into one frame, classify every term by its score, and
# store the result as dictionaries/processed/ms_dictionary.json.
# Requires parse_tsv_to_df, classify_terms and save_and_move_json to be defined.
final_data = parse_tsv_to_df(tsv_paths)
ms_dictionary = classify_terms(final_data, dimensions, polarities)
save_and_move_json(ms_dictionary, 'ms_dictionary.json', '/workspaces/fake_news_analysis/dictionaries/processed/ms_dictionary.json')