# Racist Content Detection Experiment (Final Model)

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
import nomic

from nomic import embed
from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Load only the two columns this experiment needs. read_csv returns the
# usecols columns in file order, not in the order of the list, so rename them
# by name and select them explicitly instead of assigning df.columns
# positionally (which would silently swap text and label if the CSV stored
# 'Tag' before 'Cleaned tweet').
df = (
    pd.read_csv('./data/tweets.csv', usecols=['Cleaned tweet', 'Tag'])
    .rename(columns={'Cleaned tweet': 'text', 'Tag': 'label'})
    [['text', 'label']]
)

# Shuffle every row with a fixed seed and renumber the index from 0.
df_sample = df.sample(frac=1, random_state=42).reset_index(drop=True)
df_sample.head()

Unnamed: 0,text,label
0,People have the right to be wherever we want a...,0
1,"And meanwhile, in the Madrid neighborhood of E...",1
2,I hear my mom say “fucking black man” “no soul...,0
3,"You are wrong, bastard, you are just another i...",1
4,It's just that you see a black man with luxuri...,0


### Data Preprocessing

In [5]:
def remove_non_ascii(text):
    """Return ``text`` with every non-ASCII character removed.

    Runs of characters outside the 7-bit ASCII range are deleted, not
    replaced, so the text on either side of a removed run is joined directly.

    Args:
        text: The string to clean. Any non-string value (for example the
            float NaN pandas uses for an empty CSV field) is returned as is.

    Returns:
        The cleaned string, or the original object when ``text`` is not a
        ``str``.
    """
    # Let missing values through untouched so a later null check on the frame
    # can report them, instead of re.sub raising TypeError here.
    if not isinstance(text, str):
        return text
    return re.sub(r'[^\x00-\x7F]+', '', text)

# Strip non-ASCII characters from every tweet, then preview the result.
df_sample['text'] = df_sample['text'].map(remove_non_ascii)
df_sample.head()

Unnamed: 0,text,label
0,People have the right to be wherever we want a...,0
1,"And meanwhile, in the Madrid neighborhood of E...",1
2,I hear my mom say fucking black man no soul sk...,0
3,"You are wrong, bastard, you are just another i...",1
4,It's just that you see a black man with luxuri...,0


In [6]:
# Per-column count of missing cells, and the share of rows in each class.
missing_values = df_sample.isna().sum()
label_dist = df_sample['label'].value_counts(normalize=True)

for heading, summary in (
    ("Missing Values:\n", missing_values),
    ("Label Distribution:\n", label_dist),
):
    print(heading, summary)

Missing Values:
 text     0
label    0
dtype: int64
Label Distribution:
 label
0    0.5135
1    0.4865
Name: proportion, dtype: float64


### Model

In [7]:
# Embed every cleaned tweet with the nomic-embed-text-v1 model.
# NOTE(review): task_type is not passed, so the library default applies. The
# ad-hoc prediction cells further down call embed.text the same way, so any
# change to the arguments here has to be mirrored there.
tweet_texts = df_sample['text'].tolist()
output = embed.text(texts=tweet_texts, model='nomic-embed-text-v1')

# Token accounting reported by the embedding call.
print(output['usage'])

# Stack the returned vectors into a single 2-D array (one row per tweet).
embeddings = np.array(output['embeddings'])
print(embeddings.shape)

{'prompt_tokens': 75934, 'total_tokens': 75934}
(2000, 768)


In [10]:
# Hold out 20% of the embedded tweets for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    embeddings,
    df_sample['label'],
    test_size=0.2,
    random_state=42,
)

# Logistic regression fitted directly on the embedding vectors.
lr_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = lr_model.predict(X_test)

# accuracy_score on the stored predictions equals lr_model.score(X_test, y_test).
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.82
              precision    recall  f1-score   support

           0       0.82      0.84      0.83       208
           1       0.82      0.80      0.81       192

    accuracy                           0.82       400
   macro avg       0.82      0.82      0.82       400
weighted avg       0.82      0.82      0.82       400



In [11]:
# Random-forest hyperparameter grid: 3 * 4 * 4 * 3 = 144 combinations.
param_grid = dict(
    n_estimators=[200, 400, 600],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10, 15],
    min_samples_leaf=[1, 2, 4],
)

rf_model = RandomForestClassifier(random_state=42)

# Exhaustive 5-fold search over the grid on the training split.
# n_jobs=-1 runs fits in parallel; verbose=2 logs a line per completed fit.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

print("Best Params:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   9.5s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   9.5s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   9.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   9.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   9.8s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   9.5s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time=  19.8s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=  10.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=  10.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estim

In [12]:
# Score the best forest found by the grid search on the held-out split.
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)

rf_report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(rf_report)

Accuracy: 0.815
              precision    recall  f1-score   support

           0       0.80      0.87      0.83       208
           1       0.84      0.76      0.80       192

    accuracy                           0.81       400
   macro avg       0.82      0.81      0.81       400
weighted avg       0.82      0.81      0.81       400



### Examples

#### Racist Examples
**Note**: [source of racist tweets](https://humanrights.gov.au/our-work/publications/examples-racist-material-internet)

In [14]:
# Hand-picked example from the "Racist Examples" section.
test = """If we do not stand now and perform our god given duty to keep OUR country 
        clean of all the Blacks, Jews and Yellow scum from Asia, WE are just as bad as the enemy, 
        if not worse. We are trading our race for that of an inferior form of trash"""

# Embed the single example with the same model used for the training data,
# then classify it with the tuned random forest.
test_response = embed.text(texts=[test], model='nomic-embed-text-v1')
test_embedding = np.array(test_response['embeddings'])
prediction = best_rf.predict(test_embedding)

# Label 1 is reported as racist, anything else as not racist.
print("Racist" if prediction[0] == 1 else "Not racist")


Racist


In [15]:
# Hand-picked example from the "Racist Examples" section.
test = """Arabs… have little or no knowledge of personal hygiene products 
        such as deodorant or even soap…nothing has ever turned my stomach like the 
        fetid stench of unwashed wogs!"""

# The training texts were passed through remove_non_ascii before embedding,
# so apply the same cleaning here; otherwise the ellipsis characters in this
# example reach the embedder in a form the classifier never saw in training.
test_embedding = embed.text(texts=[remove_non_ascii(test)], model='nomic-embed-text-v1')
test_embedding = np.array(test_embedding['embeddings'])
prediction = best_rf.predict(test_embedding)

# Label 1 is reported as racist, anything else as not racist.
print("Racist" if prediction[0] == 1 else "Not racist")

Racist


In [16]:
# Hand-picked example from the "Racist Examples" section.
test = """…it is a world run by the Zionist Jewish Influence and 
        Race Tainting Paedophiles that are only here to rape our heritage and 
        destroy the qualities that make us White People great…"""

# The training texts were passed through remove_non_ascii before embedding,
# so apply the same cleaning here; otherwise the ellipsis characters in this
# example reach the embedder in a form the classifier never saw in training.
test_embedding = embed.text(texts=[remove_non_ascii(test)], model='nomic-embed-text-v1')
test_embedding = np.array(test_embedding['embeddings'])
prediction = best_rf.predict(test_embedding)

# Label 1 is reported as racist, anything else as not racist.
print("Racist" if prediction[0] == 1 else "Not racist")

Racist


#### Not Racist Examples


In [24]:
# Hand-picked example from the "Not Racist Examples" section.
test = """Immigration policies should prioritize our own citizens' needs first. 
        Why should taxpayers bear the burden of supporting fucking migrants when there 
        are pressing issues at home?"""

# Embed the single example with the same model used for the training data,
# then classify it with the tuned random forest.
test_response = embed.text(texts=[test], model='nomic-embed-text-v1')
test_embedding = np.array(test_response['embeddings'])
prediction = best_rf.predict(test_embedding)

# Label 1 is reported as racist, anything else as not racist.
print("Racist" if prediction[0] == 1 else "Not racist")

Not racist


In [40]:
# Hand-picked example from the "Not Racist Examples" section.
test = """The corrupt politician is not interested in knowing 
        that his broom is dirtier than what he is trying to sweep."""

# Embed the single example with the same model used for the training data,
# then classify it with the tuned random forest.
test_response = embed.text(texts=[test], model='nomic-embed-text-v1')
test_embedding = np.array(test_response['embeddings'])
prediction = best_rf.predict(test_embedding)

# Label 1 is reported as racist, anything else as not racist.
print("Racist" if prediction[0] == 1 else "Not racist")

Not racist
