<a href="https://colab.research.google.com/github/meg-huggingface/bias-testing/blob/main/BiasResults-Religion-Fineweb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fineweb TF-IDF Religion Bias Analysis: Results

This displays the outputs from processing Fineweb sample 10BT for religion bias.

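The columns named after each religion term (`muslim`, `christian`, `jewish`, `hindu`, `buddhist`, `atheist`) show the top tf-idf values of each word (one word per row) with respect to that term.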

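The columns with `+` in them show these same values, minus the mean tf-idf of that word across all of the religion terms it co-occurs with. For example, `dating` has a mean tf-idf of about 0.143 across the six terms, so its `muslim+` value is 0.164 − 0.143 ≈ 0.021. This provides a measure of how much the association is skewed towards one term with respect to the entire dataset.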

Throughout, we see skews towards words that suggest intimacy (`online`, `singles`, `sex`, `mature`, `girls`), which indicates that religions in this dataset are disproportionately represented in the context of sexuality.

As can be seen, `jewish` is *particularly* associated with `dating` and `singles`.

`muslim`, `jewish`, `hindu` and `buddhist` are slightly skewed to co-occur with `women`, while `sex` is skewed with `muslim`, `christian`, `jewish`; and `girl` with `muslim`, `jewish`, `hindu`.


In [5]:
import pandas as pd

In [6]:
# Read the tf-idf results table; the CSV's first column becomes the row index.
df = pd.read_csv(
    'fineweb-sample-10BT-religion-tfidf.csv',
    index_col=0,
)
# List the column labels so the plain and `+` columns are visible up front.
print(df.columns)

Index(['word', 'muslim', 'christian', 'jewish', 'hindu', 'buddhist', 'atheist',
       'muslim+', 'christian+', 'jewish+', 'hindu+', 'buddhist+', 'atheist+',
       'variance', 'total'],
      dtype='object')


In [7]:
# Order the words by `muslim+`, largest first, then shade the table.
# axis=None applies one gradient across the whole table (not per column),
# pinned to a fixed 0-0.2 colour scale; values are shown to 3 decimals.
(
    df.sort_values(by='muslim+', ascending=False)
    .style
    .background_gradient(cmap="YlGnBu", axis=None, vmin=0, vmax=0.2)
    .format(precision=3)
)

Unnamed: 0,word,muslim,christian,jewish,hindu,buddhist,atheist,muslim+,christian+,jewish+,hindu+,buddhist+,atheist+,variance,total
6,muslim,0.115,0.011,0.018,0.027,0.015,0.009,0.083,-0.021,-0.015,-0.006,-0.017,-0.024,0.407,0.194
0,dating,0.164,0.192,0.212,0.146,0.133,0.009,0.021,0.049,0.069,0.004,-0.009,-0.134,1.755,0.856
2,women,0.048,0.037,0.05,0.053,0.046,0.01,0.007,-0.004,0.009,0.012,0.006,-0.03,0.499,0.244
27,sex,0.02,0.015,0.015,0.014,0.011,0.005,0.007,0.001,0.002,0.0,-0.002,-0.008,0.163,0.08
5,online,0.043,0.047,0.057,0.038,0.032,0.004,0.006,0.01,0.02,0.001,-0.005,-0.033,0.454,0.222
48,girl,0.015,0.009,0.011,0.01,0.007,0.003,0.005,-0.0,0.002,0.001,-0.002,-0.006,0.112,0.055
32,girls,0.016,0.012,0.016,0.014,0.01,0.003,0.005,0.0,0.004,0.002,-0.002,-0.009,0.145,0.071
19,sites,0.019,0.023,0.022,0.013,0.009,0.002,0.004,0.009,0.008,-0.002,-0.005,-0.013,0.182,0.089
42,mature,0.014,0.01,0.02,0.007,0.006,0.001,0.004,0.001,0.01,-0.003,-0.003,-0.009,0.118,0.057
13,meet,0.028,0.026,0.034,0.027,0.028,0.003,0.003,0.001,0.01,0.003,0.004,-0.021,0.298,0.146


In [8]:
# Order the words by `christian+`, largest first, then shade the table.
# axis=None applies one gradient across the whole table (not per column),
# pinned to a fixed 0-0.2 colour scale; values are shown to 3 decimals.
(
    df.sort_values(by='christian+', ascending=False)
    .style
    .background_gradient(cmap="YlGnBu", axis=None, vmin=0, vmax=0.2)
    .format(precision=3)
)

Unnamed: 0,word,muslim,christian,jewish,hindu,buddhist,atheist,muslim+,christian+,jewish+,hindu+,buddhist+,atheist+,variance,total
0,dating,0.164,0.192,0.212,0.146,0.133,0.009,0.021,0.049,0.069,0.004,-0.009,-0.134,1.755,0.856
12,christian,0.016,0.077,0.018,0.013,0.011,0.031,-0.012,0.049,-0.009,-0.015,-0.016,0.003,0.344,0.166
5,online,0.043,0.047,0.057,0.038,0.032,0.004,0.006,0.01,0.02,0.001,-0.005,-0.033,0.454,0.222
19,sites,0.019,0.023,0.022,0.013,0.009,0.002,0.004,0.009,0.008,-0.002,-0.005,-0.013,0.182,0.089
1,singles,0.065,0.076,0.11,0.079,0.085,0.002,-0.005,0.006,0.041,0.01,0.015,-0.068,0.854,0.417
38,church,0.004,0.016,0.005,0.003,0.005,0.027,-0.006,0.006,-0.005,-0.007,-0.005,0.017,0.125,0.06
8,free,0.034,0.038,0.042,0.038,0.035,0.008,0.002,0.005,0.009,0.005,0.002,-0.024,0.398,0.194
37,catholic,0.006,0.015,0.014,0.006,0.01,0.012,-0.004,0.005,0.003,-0.004,-0.001,0.002,0.131,0.064
9,site,0.033,0.035,0.043,0.035,0.038,0.004,0.002,0.004,0.012,0.003,0.007,-0.027,0.386,0.188
20,love,0.013,0.017,0.014,0.016,0.014,0.013,-0.001,0.003,-0.0,0.001,-0.001,-0.002,0.179,0.087


In [9]:
# Order the words by `jewish+`, largest first, then shade the table.
# axis=None applies one gradient across the whole table (not per column),
# pinned to a fixed 0-0.2 colour scale; values are shown to 3 decimals.
(
    df.sort_values(by='jewish+', ascending=False)
    .style
    .background_gradient(cmap="YlGnBu", axis=None, vmin=0, vmax=0.2)
    .format(precision=3)
)

Unnamed: 0,word,muslim,christian,jewish,hindu,buddhist,atheist,muslim+,christian+,jewish+,hindu+,buddhist+,atheist+,variance,total
7,jewish,0.012,0.012,0.128,0.018,0.013,0.007,-0.02,-0.02,0.097,-0.014,-0.019,-0.024,0.402,0.19
0,dating,0.164,0.192,0.212,0.146,0.133,0.009,0.021,0.049,0.069,0.004,-0.009,-0.134,1.755,0.856
1,singles,0.065,0.076,0.11,0.079,0.085,0.002,-0.005,0.006,0.041,0.01,0.015,-0.068,0.854,0.417
5,online,0.043,0.047,0.057,0.038,0.032,0.004,0.006,0.01,0.02,0.001,-0.005,-0.033,0.454,0.222
9,site,0.033,0.035,0.043,0.035,0.038,0.004,0.002,0.004,0.012,0.003,0.007,-0.027,0.386,0.188
42,mature,0.014,0.01,0.02,0.007,0.006,0.001,0.004,0.001,0.01,-0.003,-0.003,-0.009,0.118,0.057
13,meet,0.028,0.026,0.034,0.027,0.028,0.003,0.003,0.001,0.01,0.003,0.004,-0.021,0.298,0.146
15,personals,0.018,0.018,0.032,0.031,0.034,0.001,-0.004,-0.004,0.009,0.009,0.012,-0.021,0.272,0.132
8,free,0.034,0.038,0.042,0.038,0.035,0.008,0.002,0.005,0.009,0.005,0.002,-0.024,0.398,0.194
2,women,0.048,0.037,0.05,0.053,0.046,0.01,0.007,-0.004,0.009,0.012,0.006,-0.03,0.499,0.244


In [10]:
# Order the words by `hindu+`, largest first, then shade the table.
# axis=None applies one gradient across the whole table (not per column),
# pinned to a fixed 0-0.2 colour scale; values are shown to 3 decimals.
(
    df.sort_values(by='hindu+', ascending=False)
    .style
    .background_gradient(cmap="YlGnBu", axis=None, vmin=0, vmax=0.2)
    .format(precision=3)
)

Unnamed: 0,word,muslim,christian,jewish,hindu,buddhist,atheist,muslim+,christian+,jewish+,hindu+,buddhist+,atheist+,variance,total
10,hindu,0.015,0.004,0.006,0.129,0.02,0.002,-0.015,-0.026,-0.023,0.1,-0.009,-0.027,0.378,0.177
34,indian,0.011,0.004,0.004,0.037,0.008,0.001,0.0,-0.007,-0.007,0.026,-0.003,-0.009,0.137,0.066
3,single,0.034,0.033,0.045,0.054,0.055,0.003,-0.003,-0.004,0.007,0.017,0.018,-0.034,0.463,0.226
2,women,0.048,0.037,0.05,0.053,0.046,0.01,0.007,-0.004,0.009,0.012,0.006,-0.03,0.499,0.244
11,men,0.03,0.028,0.034,0.04,0.036,0.009,0.001,-0.002,0.004,0.011,0.007,-0.021,0.362,0.177
1,singles,0.065,0.076,0.11,0.079,0.085,0.002,-0.005,0.006,0.041,0.01,0.015,-0.068,0.854,0.417
15,personals,0.018,0.018,0.032,0.031,0.034,0.001,-0.004,-0.004,0.009,0.009,0.012,-0.021,0.272,0.132
39,essay,0.007,0.009,0.01,0.017,0.013,0.003,-0.003,-0.001,0.001,0.007,0.003,-0.007,0.121,0.059
8,free,0.034,0.038,0.042,0.038,0.035,0.008,0.002,0.005,0.009,0.005,0.002,-0.024,0.398,0.194
44,asian,0.011,0.009,0.012,0.014,0.011,0.001,0.002,-0.001,0.002,0.004,0.001,-0.009,0.116,0.057


In [11]:
# Order the words by `buddhist+`, largest first, then shade the table.
# axis=None applies one gradient across the whole table (not per column),
# pinned to a fixed 0-0.2 colour scale; values are shown to 3 decimals.
(
    df.sort_values(by='buddhist+', ascending=False)
    .style
    .background_gradient(cmap="YlGnBu", axis=None, vmin=0, vmax=0.2)
    .format(precision=3)
)

Unnamed: 0,word,muslim,christian,jewish,hindu,buddhist,atheist,muslim+,christian+,jewish+,hindu+,buddhist+,atheist+,variance,total
4,buddhist,0.006,0.005,0.007,0.022,0.169,0.003,-0.029,-0.03,-0.029,-0.013,0.134,-0.033,0.455,0.211
3,single,0.034,0.033,0.045,0.054,0.055,0.003,-0.003,-0.004,0.007,0.017,0.018,-0.034,0.463,0.226
1,singles,0.065,0.076,0.11,0.079,0.085,0.002,-0.005,0.006,0.041,0.01,0.015,-0.068,0.854,0.417
15,personals,0.018,0.018,0.032,0.031,0.034,0.001,-0.004,-0.004,0.009,0.009,0.012,-0.021,0.272,0.132
9,site,0.033,0.035,0.043,0.035,0.038,0.004,0.002,0.004,0.012,0.003,0.007,-0.027,0.386,0.188
11,men,0.03,0.028,0.034,0.04,0.036,0.009,0.001,-0.002,0.004,0.011,0.007,-0.021,0.362,0.177
2,women,0.048,0.037,0.05,0.053,0.046,0.01,0.007,-0.004,0.009,0.012,0.006,-0.03,0.499,0.244
21,chat,0.014,0.016,0.016,0.017,0.019,0.001,0.0,0.002,0.002,0.003,0.005,-0.013,0.173,0.085
13,meet,0.028,0.026,0.034,0.027,0.028,0.003,0.003,0.001,0.01,0.003,0.004,-0.021,0.298,0.146
47,100,0.008,0.01,0.011,0.011,0.013,0.002,-0.001,0.0,0.002,0.002,0.003,-0.007,0.113,0.055
