# Signal analysis

## Setup

In [1]:
from scipy.stats import spearmanr
import os
import pandas as pd

# Dataset files, grouped by the kind of rating they carry.
likert_datasets = ["credcoalition.csv", "microsoft.csv", "reconcile.csv"]
binary_datasets = ["fnn-gossip.csv", "fnn-politifact.csv"]

# Every file lives in ./datasets_evaluated (relative to the working directory),
# is semicolon-separated and is indexed by its "url" column.
data_dir = os.path.join(os.getcwd(), "datasets_evaluated")
likert_per_file = (pd.read_csv(os.path.join(data_dir, name), sep=";", index_col="url")
                   for name in likert_datasets)
binary_per_file = (pd.read_csv(os.path.join(data_dir, name), sep=";", index_col="url")
                   for name in binary_datasets)

# One frame per rating kind, built by stacking the individual files.
likert_df = pd.concat(likert_per_file)
binary_df = pd.concat(binary_per_file)

# Invert the 0/1 fake news label so that, like the credibility ratings,
# a higher "rating" value means better.
binary_df["rating"] = 1 - binary_df["rating"]

# Combined emotion intensity: the eight per-emotion columns added up,
# left to right, in the order listed here.
emotion_columns = ["anger_intensity", "anticipation_intensity",
                   "disgust_intensity", "fear_intensity",
                   "sadness_intensity", "joy_intensity",
                   "surprise_intensity", "trust_intensity"]
for frame in (likert_df, binary_df):
    total = frame[emotion_columns[0]]
    for column in emotion_columns[1:]:
        total = total + frame[column]
    frame["emotion_intensity"] = total

# Copies that keep only the rows whose "clickbait" value is above -10.
# NOTE(review): presumably values at or below -10 mark webpages without a
# headline, so the *_clean frames exclude those pages - confirm.
likert_clean = likert_df.copy().loc[lambda rows: rows["clickbait"] > -10]
binary_clean = binary_df.copy().loc[lambda rows: rows["clickbait"] > -10]

# Row/column counts before and after the filter.
for frame in (likert_df, likert_clean, binary_df, binary_clean):
    print(frame.shape)

(242, 59)
(241, 59)
(200, 59)
(200, 59)


## Signal scores & credibility correlation

In [2]:
# Signal scores computed on the article as a whole / its body text.
signals = [
    "score_author",
    "score_url_domain_ending",
    "score_errors",
    "score_tonality_questions_text",
    "score_tonality_exclamations_text",
    "score_tonality_all_caps_text",
    "score_readability",
    "score_ls_word_count_text",
    "score_ls_sentence_count",
    "score_ls_type_token_ratio",
    "score_ls_word_length_text",
    "score_vocabulary_profanity",
    "score_vocabulary_emotional_words",
    "score_links_external",
    "score_sentiment_polarity_text",
    "score_sentiment_subjectivity",
]

# Signal scores that are evaluated on the filtered (*_clean) frames.
headline_signals = [
    "score_tonality_questions_title",
    "score_tonality_exclamations_title",
    "score_tonality_all_caps_title",
    "score_ls_word_count_title",
    "score_ls_word_length_title",
    "score_clickbait",
    "score_sentiment_polarity_title",
]

# signal name -> [likert rho, likert p, binary rho, binary p]
correlations = {}

# Spearman correlation of every signal score with the rating, once per
# dataset group. The first signal list is evaluated on the full frames,
# the headline list on the filtered ones.
frame_plan = ((signals, likert_df, binary_df),
              (headline_signals, likert_clean, binary_clean))
for names, likert_frame, binary_frame in frame_plan:
    for name in names:
        correlations[name] = [
            *spearmanr(likert_frame[name], likert_frame["rating"]),
            *spearmanr(binary_frame[name], binary_frame["rating"]),
        ]

# Print one row per signal, ordered by the sum of both coefficients (largest first).
row_format = "{:<35} {:.10f}  {:.5f}  {:.10f}  {:.5f}"
ranked_names = sorted(correlations,
                      key=lambda name: correlations[name][0] + correlations[name][2],
                      reverse=True)
for name in ranked_names:
    print(row_format.format(name, *correlations[name]))

score_url_domain_ending             0.1746552141  0.00645  0.2154101092  0.00219
score_ls_sentence_count             0.0217943134  0.73587  0.3637267966  0.00000
score_vocabulary_emotional_words    0.1478136029  0.02144  0.2373630623  0.00071
score_tonality_exclamations_text    0.2814921193  0.00001  0.0587699187  0.40844
score_ls_word_count_text            -0.0090645035  0.88844  0.3246437488  0.00000
score_tonality_all_caps_title       0.1171977047  0.06934  0.1959654504  0.00542
score_ls_word_count_title           0.1692409064  0.00847  0.1300712538  0.06639
score_errors                        0.1569176505  0.01454  0.1184542035  0.09480
score_clickbait                     0.1362540513  0.03451  0.1307937102  0.06489
score_sentiment_polarity_text       0.0340834574  0.59776  0.2285869206  0.00113
score_sentiment_polarity_title      0.1525822066  0.01777  0.0754660009  0.28820
score_ls_word_length_text           0.2265133571  0.00038  -0.0092198317  0.89690
score_sentiment_subjectivi

## Inter-signal correlation

In [3]:
# Raw signal columns that are correlated on the full frames.
signals = ["errors_grammar_spelling",
           "questions_text_per_sentence",
           "exclamations_text_per_sentence",
           "all_caps_text",
           "all_caps_title",
           "readability_coleman_liau",
           "word_count_text",
           "sentence_count",
           "ttr",
           "word_length_text",
           "profanity",
           "links_count",
           "sentiment_text_vader",
           "subjectivity",
           "score_author",
           "emotion_intensity"]

# Raw signal columns that are correlated on the filtered (*_clean) frames.
headline_signals = ["negativity_title_vader",
                    "word_length_title",
                    "word_count_title",
                    "questions_title",
                    "exclamations_title",
                    "clickbait",
                    "domain_ending"]

# Significance level a pair has to meet in BOTH dataset groups to be reported.
ALPHA = 0.05


def _consistent_correlation(likert_frame, binary_frame, first, second, alpha=ALPHA):
    """Spearman-correlate two columns in both frames and keep consistent results.

    Parameters:
        likert_frame: frame holding the Likert-rated datasets.
        binary_frame: frame holding the binary-rated datasets.
        first, second: names of the two columns to correlate.
        alpha: p-value threshold that both correlations must stay below.

    Returns:
        [likert_rho, likert_p, binary_rho, binary_p] if both p-values are below
        alpha and the two coefficients do not have opposite signs, else None.
    """
    likert_rho, likert_p = spearmanr(likert_frame[first], likert_frame[second])
    binary_rho, binary_p = spearmanr(binary_frame[first], binary_frame[second])

    opposite_signs = likert_rho < 0 < binary_rho or binary_rho < 0 < likert_rho
    if likert_p < alpha and binary_p < alpha and not opposite_signs:
        return [likert_rho, likert_p, binary_rho, binary_p]
    return None


# "signal & signal2" -> [likert rho, likert p, binary rho, binary p]
correlations = {}

# check all possible correlations for signals in the signals list
for index, signal in enumerate(signals):
    for signal2 in signals[index + 1:]:
        result = _consistent_correlation(likert_df, binary_df, signal, signal2)
        if result is not None:
            correlations[signal + " & " + signal2] = result

# check all possible correlations for a signal in the signals and the other in the headline_signals list
for signal in signals:
    for signal2 in headline_signals:
        result = _consistent_correlation(likert_clean, binary_clean, signal, signal2)
        if result is not None:
            correlations[signal + " & " + signal2] = result

# check all possible correlations for signals in the headline_signals list
# (this pass previously iterated `signals` on the unfiltered frames again, so
# headline-vs-headline pairs were never tested)
for index, signal in enumerate(headline_signals):
    for signal2 in headline_signals[index + 1:]:
        result = _consistent_correlation(likert_clean, binary_clean, signal, signal2)
        if result is not None:
            correlations[signal + " & " + signal2] = result

# Strongest shared correlations first: order by |likert rho + binary rho|.
for signal_results in sorted(correlations.items(), key=lambda tpl: abs(tpl[1][0] + tpl[1][2]), reverse=True):
    print("{:<65} {:.10f}  {:.5f}  {:.10f}  {:.5f}".format(signal_results[0],
                                                            signal_results[1][0], signal_results[1][1],
                                                            signal_results[1][2], signal_results[1][3]))

readability_coleman_liau & word_length_text                       0.9634364864  0.00000  0.9520573014  0.00000
word_count_text & sentence_count                                  0.9454530845  0.00000  0.9446779729  0.00000
word_count_text & ttr                                             -0.8559305835  0.00000  -0.9082166511  0.00000
sentence_count & ttr                                              -0.8069241993  0.00000  -0.8689474696  0.00000
sentence_count & sentiment_text_vader                             0.2912136118  0.00000  0.4283807038  0.00000
word_count_text & sentiment_text_vader                            0.2677966195  0.00002  0.4097673730  0.00000
sentiment_text_vader & negativity_title_vader                     -0.3741740397  0.00000  -0.2836326293  0.00005
questions_text_per_sentence & sentence_count                      0.2210630394  0.00053  0.4028962869  0.00000
ttr & sentiment_text_vader                                        -0.2651964823  0.00003  -0.3470119076  0