In [None]:
pip install pandas matplotlib wordcloud

In [12]:
import re, fnmatch
import pandas as pd
import requests
from statistics import mean
import csv
import numpy as np
import glob
import os
import scipy.stats as stats
from scipy.stats import mannwhitneyu
import math
import json
import ast
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [2]:
# Read the two preprocessed article tables.
real = pd.read_csv('/workspaces/fake_news_analysis/data/preprocessed/real_preprocessed.csv')
fake = pd.read_csv('/workspaces/fake_news_analysis/data/preprocessed/fake_preprocessed.csv')

# Stack the fake rows above the real rows. concat does not renumber rows, so
# each frame keeps its own row labels and labels repeat between the two parts.
df = pd.concat(objs=[fake, real])

In [13]:
def string_to_list(text):
    """Turn the string form of a Python literal back into the object it spells.

    Parameters
    ----------
    text : str or list
        Usually the text of a list literal such as "['a', 'b']". A value that
        is already a list is passed through untouched.

    Returns
    -------
    object or None
        The evaluated literal, the input itself when it is already a list, or
        None when the value cannot be parsed as a literal.
    """
    # Already parsed: pass it through so that re-running the parsing cell does
    # not replace every row with None.
    if isinstance(text, list):
        return text
    try:
        return ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError):
        # literal_eval reports malformed input with any of these; a truncated
        # or otherwise invalid string raises SyntaxError, not ValueError.
        return None

# Replace each stored string in 'processed_text' with the value parsed by string_to_list.
df['processed_text'] = df['processed_text'].map(string_to_list)

In [19]:
# Dimension and polarity names; score_range walks them in this order to lay
# out its output vector.
dimensions = ['care', 'fairness', 'loyalty', 'authority', 'sanctity']
polarities = ['virtue', 'vice']

# dictionaries: short key -> path of the processed JSON file
dictionary_paths = {
    'mft': '/workspaces/fake_news_analysis/dictionaries/processed/mft_dictionary.json',
    'mfd2': '/workspaces/fake_news_analysis/dictionaries/processed/mfd2_dictionary.json',
    'mfd1': '/workspaces/fake_news_analysis/dictionaries/processed/mfd1_dictionary.json',
    'emfd': '/workspaces/fake_news_analysis/dictionaries/processed/emfd_dictionary.json',
    'ms': '/workspaces/fake_news_analysis/dictionaries/processed/ms_dictionary.json'
}


# Parse every dictionary file once; later cells look them up by key.
loaded_dictionaries = {}
for key, path in dictionary_paths.items():
    # Read as UTF-8 explicitly instead of relying on the platform's default
    # text encoding, which differs between machines.
    with open(path, 'r', encoding='utf-8') as file:
        loaded_dictionaries[key] = json.load(file)

### Vectorization

In [21]:
# TF normalized

def score_tf(tokens, dictionary):
    """Score a document against a word-list dictionary with augmented term frequency.

    Every distinct token that appears in a (dimension, polarity) word
    collection adds ``0.5 + 0.5 * tf / max_tf`` to that cell, where ``tf`` is
    the token's count in the document and ``max_tf`` is the largest count of
    any token in the document.

    Parameters
    ----------
    tokens : list or None
        Hashable tokens of one document. None (e.g. a row whose text could
        not be parsed) is scored like an empty document.
    dictionary : dict
        ``{dimension: {polarity: words}}`` where ``words`` supports ``in``.

    Returns
    -------
    list
        One score per (dimension, polarity) pair, ordered by the dictionary's
        own dimension order and, within a dimension, its polarity order.
        Cells with no matching token hold 0.
    """
    if tokens is None:
        tokens = []

    # Count every token in a single pass (insertion order = first occurrence,
    # which also makes the summation order below reproducible between runs).
    tf = {}
    for token in tokens:
        tf[token] = tf.get(token, 0) + 1

    # Largest count in the document; 1 avoids dividing by zero when it is empty.
    max_tf = max(tf.values()) if tf else 1

    vector = []
    for dimension, polarity_words in dictionary.items():
        for polarity, words in polarity_words.items():
            score = 0
            for token, count in tf.items():
                if token in words:
                    # Each distinct matching token contributes between 0.5 and 1.
                    score += 0.5 + 0.5 * (count / max_tf)
            vector.append(score)

    return vector

In [22]:
keys = ['mft', 'mfd2', 'mfd1']

# Add one column of score_tf vectors per listed dictionary; a key that was
# not loaded is skipped without error.
for key in keys:
    if key not in loaded_dictionaries:
        continue
    df[key] = df['processed_text'].apply(
        lambda tokens: score_tf(tokens, loaded_dictionaries[key])
    )

In [23]:
def score_prob(tokens, dictionary):
    """Average the dictionary values of the tokens matched in each (dimension, polarity) cell.

    Parameters
    ----------
    tokens : iterable or None
        Hashable tokens of one document. Repeated tokens count once per
        occurrence. None is scored like an empty document.
    dictionary : dict
        ``{dimension: {polarity: entries}}`` where ``entries`` is a list of
        ``[word, value]`` pairs. A polarity whose value is not a list is
        ignored and scores 0.

    Returns
    -------
    list
        Exactly 10 numbers: the per-cell averages in dictionary order
        (dimension, then polarity), padded with 0 or truncated to length 10.
        A cell with no matching token holds 0.
    """
    # Materialise once: the tokens are walked once per dictionary cell below.
    tokens = [] if tokens is None else list(tokens)

    vector = []
    for dimension, polarity_entries in dictionary.items():
        for polarity, words_probs in polarity_entries.items():
            # Index this cell's entries by word so each token costs one lookup
            # instead of a scan over the whole list. A word listed more than
            # once keeps all of its values, in list order.
            values_by_word = {}
            if tokens and isinstance(words_probs, list):
                for word_prob in words_probs:
                    values_by_word.setdefault(word_prob[0], []).append(word_prob[1])

            total = 0
            matches = 0
            for token in tokens:
                for value in values_by_word.get(token, ()):
                    total += value
                    matches += 1

            vector.append(total / matches if matches > 0 else 0)

    # Force a fixed length of 10: cut extra cells, pad missing ones with 0.
    vector = vector[:10] if len(vector) > 10 else vector + [0] * (10 - len(vector))

    return vector

In [25]:
# Store the score_prob vector of every document, scored against the 'emfd' dictionary.
df['emfd'] = df['processed_text'].apply(
    lambda tokens: score_prob(tokens, loaded_dictionaries['emfd'])
)

In [27]:
def score_range(tokens, dictionary):
    """Average the dictionary scores of the tokens matched in each dimension/polarity cell.

    The cells are taken from the module-level ``dimensions`` and
    ``polarities`` lists, not from the dictionary's own keys.

    Parameters
    ----------
    tokens : iterable or None
        Hashable tokens of one document. Repeated tokens count once per
        occurrence. None is scored like an empty document.
    dictionary : dict
        ``{dimension: {polarity: entries}}`` where ``entries`` is an iterable
        of ``(word, score)`` pairs. Missing dimensions/polarities score 0.

    Returns
    -------
    list
        One average per (dimension, polarity) pair, dimension-major, i.e.
        ``len(dimensions) * len(polarities)`` values. A cell with no matching
        token holds 0.
    """
    # Materialise once: the tokens are walked once per cell below.
    tokens = [] if tokens is None else list(tokens)

    averages = []
    for dimension in dimensions:
        for polarity in polarities:
            matched = []
            if tokens and dimension in dictionary and polarity in dictionary[dimension]:
                # Index the cell's entries by word so each token costs one
                # lookup instead of a scan over every entry. A word listed
                # more than once keeps all of its scores, in list order.
                scores_by_word = {}
                for word, score in dictionary[dimension][polarity]:
                    scores_by_word.setdefault(word, []).append(score)
                for token in tokens:
                    matched.extend(scores_by_word.get(token, ()))

            # Default to 0 when nothing matched.
            averages.append(sum(matched) / len(matched) if matched else 0)

    return averages

In [28]:
# Store the score_range vector of every document, scored against the 'ms' dictionary.
df['ms'] = df['processed_text'].apply(
    lambda tokens: score_range(tokens, loaded_dictionaries['ms'])
)

In [29]:
# Write the frame, including the new score columns, to CSV.
# The row index is written as well (to_csv default).
df.to_csv(path_or_buf='/workspaces/fake_news_analysis/data/preprocessed/data_vectorized.csv')

### Word cloud

# Flatten the nested dictionary into a single word -> top-level category lookup
# (the sub-category name is not kept). If a word is listed under more than one
# category, the category visited last wins.
# NOTE(review): `enh_mfd1_dictionary` is not defined in the visible part of
# this notebook — confirm it is created before this cell on a fresh kernel.
flattened_morality_words = {}
for category, subcats in enh_mfd1_dictionary.items():
    for subcat, words in subcats.items():
        for word in words:
            flattened_morality_words[word] = category