### Toxicity analysis.

This notebook looks at how toxic super-spreaders are in comparison to all users within the misinformation network.

In [1]:
import sys
sys.path.insert(0, '/Users/mdeverna/Documents/Projects/FIB_index/src_clean')
from utils import Loader
from utils import convert_twitter_strings_2_dates
import datetime
import numpy as np
import pandas as pd
import os
import json
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
from altair_saver import save
from scipy.stats import mannwhitneyu, pearsonr

alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [2]:
mean_tox_fib = pd.read_parquet("../data/fibers_uid_rank_tox.parquet")
mean_tox_inf = pd.read_parquet("../data/influentials_uid_rank_tox.parquet")

### No relationships between metric rank and toxicity

In [3]:
inf_chart = alt.Chart(mean_tox_inf).mark_point(color="blue", opacity=.3).encode(
    x = alt.X("inf_rank:Q"),
    y = alt.Y("toxicity_score", title="Mean toxicity")
)

inf_ = inf_chart + inf_chart.transform_regression('inf_rank', 'toxicity_score').mark_line(color="blue", opacity=.5)


fib_chart = alt.Chart(mean_tox_fib).mark_point(color="firebrick", opacity=.3).encode(
    x = alt.X("fib_rank:Q"),
    y = alt.Y("toxicity_score", title="Mean toxicity")
)

fib_ = fib_chart + fib_chart.transform_regression('fib_rank', 'toxicity_score').mark_line(color = "firebrick", opacity=.5)

alt.layer(fib_, inf_)

In [4]:
from scipy.stats import spearmanr

In [5]:
print("Spearman correlations between metric RANK and toxicity scores")
print("-"*20)

print("Influential:")
statistic, p = spearmanr(mean_tox_inf["toxicity_score"], mean_tox_inf["inf_rank"])
print(f"\t- Test statistic : {statistic}")
print(f"\t- p value.       : {p}")

print("~"*20)

print("FIB:")
statistic, p = spearmanr(mean_tox_fib["toxicity_score"], mean_tox_fib["fib_rank"])
print(f"\t- Test statistic : {statistic}")
print(f"\t- p value.       : {p}")

Spearman correlations between metric RANK and toxicity scores
--------------------
Influential:
	- Test statistic : 0.0849789718159563
	- p value.       : 0.25804534636229753
~~~~~~~~~~~~~~~~~~~~
FIB:
	- Test statistic : 0.03173963139769068
	- p value.       : 0.6740602592675484


### Lets compare the mean toxicity score of users in each group

In [6]:
mean_tox_fib["user_type"] = "FIB-index"
mean_tox_inf["user_type"] = "Influential"


In [7]:
mean_tox_fib.head()

Unnamed: 0,user_id,fib_rank,toxicity_score,user_type
0,1000141948269678592,27,0.324496,FIB-index
1,1010682068131557376,133,0.279745,FIB-index
2,1018425398869282817,14,0.112814,FIB-index
3,1023362107,52,0.134076,FIB-index
4,1026684533364547594,85,0.330502,FIB-index


### Average toxicity values...

In [8]:
print(f"Mean tox for FIB-index : {mean_tox_fib['toxicity_score'].mean()}")
print(f"Median tox for FIB-index : {mean_tox_fib['toxicity_score'].median()}")
print(f"Num. users = {len(mean_tox_fib)}")
print("")
print(f"Mean tox for Influence : {mean_tox_inf['toxicity_score'].mean()}")
print(f"Median tox for Influence : {mean_tox_inf['toxicity_score'].median()}")
print(f"Num. users = {len(mean_tox_inf)}")

Mean tox for FIB-index : 0.19764762871743705
Median tox for FIB-index : 0.1836832002151127
Num. users = 178

Mean tox for Influence : 0.19662978283213628
Median tox for Influence : 0.17703450862773723
Num. users = 179


In [9]:
mean_tox_fib = mean_tox_fib[["user_id", "toxicity_score", "user_type"]]
mean_tox_inf = mean_tox_inf[["user_id", "toxicity_score", "user_type"]]

In [10]:
ss_mean_tox = pd.concat([mean_tox_fib, mean_tox_inf])

In [11]:
ss_mean_tox

Unnamed: 0,user_id,toxicity_score,user_type
0,1000141948269678592,0.324496,FIB-index
1,1010682068131557376,0.279745,FIB-index
2,1018425398869282817,0.112814,FIB-index
3,1023362107,0.134076,FIB-index
4,1026684533364547594,0.330502,FIB-index
...,...,...,...
174,959856286061006848,0.242277,Influential
175,965259182,0.195793,Influential
176,969444209940889600,0.288375,Influential
177,975217231,0.345641,Influential


In [12]:
ss_mean_tox['user_id'].nunique()

245

In [13]:
fib_vs_inf = alt.Chart(ss_mean_tox).mark_boxplot().encode(
    x= alt.X(
        "toxicity_score:Q", 
        title="Mean toxicity score per user"
    ),
    y= alt.Y(
        "user_type:N", 
        axis=alt.Axis(
            title=None,
            labelAngle=0,
            labelFontSize=14
        )
    ),
    color= alt.Color("user_type:N", title="Identification Metric", legend=None)
).properties(width=700)

fib_vs_inf

In [14]:
ss_mean_tox

Unnamed: 0,user_id,toxicity_score,user_type
0,1000141948269678592,0.324496,FIB-index
1,1010682068131557376,0.279745,FIB-index
2,1018425398869282817,0.112814,FIB-index
3,1023362107,0.134076,FIB-index
4,1026684533364547594,0.330502,FIB-index
...,...,...,...
174,959856286061006848,0.242277,Influential
175,965259182,0.195793,Influential
176,969444209940889600,0.288375,Influential
177,975217231,0.345641,Influential


In [15]:
print("Mann Whitney U analysis of FIBers and Influentials mean toxicity scores")
print("-"*20)

inf_tox_list = list(ss_mean_tox[ss_mean_tox["user_type"] == "Influential"]["toxicity_score"])
fib_tox_list = list(ss_mean_tox[ss_mean_tox["user_type"] == "FIB-index"]["toxicity_score"])

statistic, p = mannwhitneyu(fib_tox_list, inf_tox_list)
print(f"\t- Test statistic : {statistic}")
print(f"\t- p value.       : {p}\n")


print("Median and Mean: mean toxicity score per user per group")
print("-"*20 + '\n')

print("Median:")
print("~"*20)
print(f"Influential: {np.round(np.median(inf_tox_list),4)}")
print(f"FIB-index: {np.round(np.median(fib_tox_list),4)}\n")

print("Mean:")
print("~"*20)
print(f"Influential: {np.round(np.mean(inf_tox_list),4)}")
print(f"FIB-index: {np.round(np.mean(fib_tox_list),4)}")

Mann Whitney U analysis of FIBers and Influentials mean toxicity scores
--------------------
	- Test statistic : 16434.0
	- p value.       : 0.6062673537202803

Median and Mean: mean toxicity score per user per group
--------------------

Median:
~~~~~~~~~~~~~~~~~~~~
Influential: 0.177
FIB-index: 0.1837

Mean:
~~~~~~~~~~~~~~~~~~~~
Influential: 0.1966
FIB-index: 0.1976


### Lets look at the distribution of all users to just super-spreaders

In [16]:
all_vs_ss_mean_tox_df = pd.read_parquet("../data/superspreader_vs_all_users_tox.parquet")

In [17]:
all_vs_ss_mean_tox_df

Unnamed: 0,toxicity_score,user_type
0,0.337475,Superspreaders
1,0.285913,Superspreaders
2,0.217476,Superspreaders
3,0.113308,Superspreaders
4,0.133302,Superspreaders
...,...,...
149476,0.624249,All users
149477,0.098910,All users
149478,0.310894,All users
149479,0.065592,All users


In [18]:
distro_fig = alt.Chart(all_vs_ss_mean_tox_df).transform_density(
    'toxicity_score',
    as_=['toxicity_score', 'density'],
    groupby = ["user_type"]
).mark_area(
    opacity=.75,
).encode(
    x=alt.X(
        "toxicity_score:Q",
        title="Toxicity score"
    ),
    y=alt.Y(
        'density:Q',
        axis=alt.Axis(
            title="Density",
            titleAngle=0,
            titlePadding=35
        )
    ),
    color = alt.Color(
        "user_type:N",
        title = "User type",
        scale = alt.Scale(domain=["All users", "Superspreaders"], range=["grey", "firebrick"])
    )
).properties(width=300)

distro_fig = distro_fig.configure_axis(
    labelFontSize=14,
    titleFontSize=14
).configure_legend(
    titleFontSize=14,
    labelFontSize=14,
    orient="bottom",
    offset=10,
)
distro_fig

### These groups are very different

In [19]:
print("Mann Whitney U analysis of super-spreaders mean toxicity score being greater than the mean toxicity score of all users")
print("-"*20)

ss_user_toxicity_scores = list(all_vs_ss_mean_tox_df[all_vs_ss_mean_tox_df["user_type"] == "Superspreaders"]["toxicity_score"])
all_user_toxicity_scores = list(all_vs_ss_mean_tox_df[all_vs_ss_mean_tox_df["user_type"] == "All users"]["toxicity_score"])

statistic, p = mannwhitneyu(ss_user_toxicity_scores, all_user_toxicity_scores)
print(f"\t- Test statistic : {statistic}")
print(f"\t- p value.       : {p}\n")


Mann Whitney U analysis of super-spreaders mean toxicity score being greater than the mean toxicity score of all users
--------------------
	- Test statistic : 24181943.0
	- p value.       : 3.806551111063592e-18



In [20]:
len(all_user_toxicity_scores)

149481