<a href="https://colab.research.google.com/github/natesheehan/-30daymapchallenge-/blob/main/weird.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
from nltk.corpus import brown
from collections import Counter

import string

# Make sure the Brown corpus is available locally before reading it
nltk.download('brown')

# Use the Brown corpus as the general (reference) corpus, case-folded
general_corpus_words = brown.words()
general_counter = Counter(word.lower() for word in general_corpus_words)

# Define your specialist corpus here
specialist_corpus = """
Attention Colored Voters.
According to the decision of the Supreme Court persons with less than one quarter African blood in their veins are white men and women. Let such persons be sure to register and vote
"""

# Whitespace splitting leaves punctuation glued to words ("voters." instead of
# "voters"), which makes them look absent from the general corpus and gives
# them an infinite score. Strip punctuation from both ends of every token and
# drop tokens that were punctuation only.
specialist_tokens = (
    token.strip(string.punctuation) for token in specialist_corpus.lower().split()
)
specialist_counter = Counter(token for token in specialist_tokens if token)

# Calculate total word counts for each corpus
t_s = sum(specialist_counter.values())
t_g = sum(general_counter.values())

# Calculate Weirdness Index for each word in the specialist corpus:
# relative frequency in the specialist text divided by relative frequency in
# the general corpus.
weirdness_indices = {}
for word in specialist_counter:
    w_s = specialist_counter[word]
    w_g = general_counter.get(word, 0)  # Default to 0 if word is not found in the general corpus
    # A word never seen in the general corpus is treated as infinitely weird,
    # which also avoids a division by zero.
    wi = (w_s / t_s) / (w_g / t_g) if w_g > 0 else float('inf')
    weirdness_indices[word] = wi

# Sort the words by their Weirdness Index in descending order
sorted_weirdness_indices = sorted(weirdness_indices.items(), key=lambda item: item[1], reverse=True)

# Display the top words with the highest Weirdness Index
for word, index in sorted_weirdness_indices[:10]:
    print(f"Word: {word}, Weirdness Index: {index}")


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


Word: voters., Weirdness Index: inf
Word: women., Weirdness Index: inf
Word: vote., Weirdness Index: inf
Word: veins, Weirdness Index: 5375.888888888889
Word: register, Weirdness Index: 1240.5897435897436
Word: african, Weirdness Index: 1151.9761904761904
Word: colored, Weirdness Index: 1040.494623655914
Word: quarter, Weirdness Index: 948.6862745098039
Word: supreme, Weirdness Index: 632.4575163398692
Word: persons, Weirdness Index: 533.1460055096419


In [None]:
import pandas as pd
import numpy as np
import torch
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from torch.cuda.amp import GradScaler, autocast
import spacy

# Function to create synthetic data
def create_synthetic_data():
    """Return a four-row toy DataFrame with ``text`` and ``label`` columns.

    Rows whose text mentions hate speech carry label 1; the others carry
    label 0.
    """
    texts = [
        "This is a historical document.",
        "This document contains hate speech.",
        "An example of a neutral historical text.",
        "Explicit hate speech example.",
    ]
    labels = [0, 1, 0, 1]
    return pd.DataFrame({"text": texts, "label": labels})

# Advanced Preprocessing
_SPACY_MODEL = None


def _get_spacy_model():
    """Load the spaCy English pipeline on first use and reuse it afterwards."""
    global _SPACY_MODEL
    if _SPACY_MODEL is None:
        _SPACY_MODEL = spacy.load("en_core_web_sm")
    return _SPACY_MODEL


def advanced_preprocess(text):
    """Normalise *text* and return its non-stop-word lemmas joined by spaces.

    Steps, in order: lowercase; drop ``[...]`` spans, URLs and ``<...>`` tags;
    drop punctuation; turn newlines into spaces; drop any word containing a
    digit; lemmatise with spaCy and discard stop words.

    Args:
        text: The raw string to clean.

    Returns:
        A single string of lemmas separated by single spaces.
    """
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    # Replace newlines with a space (not the empty string) so the last word of
    # one line is not fused with the first word of the next.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # The pipeline is loaded once and cached; loading it on every call is very
    # slow when this function is applied row by row.
    doc = _get_spacy_model()(text)
    return " ".join([token.lemma_ for token in doc if not token.is_stop])

# Create synthetic data and add a cleaned-up copy of each text
df = create_synthetic_data()
df['processed_text'] = df['text'].apply(advanced_preprocess)

# Splitting data: hold out 20% of the rows for validation
train_texts, val_texts, train_labels, val_labels = train_test_split(df['processed_text'], df['label'], test_size=.2)

# Tokenization (padding/truncation so every sequence in a split has equal length)
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True)

# Convert to torch tensors
train_seq = torch.tensor(train_encodings['input_ids'])
train_mask = torch.tensor(train_encodings['attention_mask'])
train_y = torch.tensor(train_labels.tolist())

val_seq = torch.tensor(val_encodings['input_ids'])
val_mask = torch.tensor(val_encodings['attention_mask'])
val_y = torch.tensor(val_labels.tolist())

# DataLoaders
batch_size = 32
train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = RandomSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Model: RoBERTa with a two-class classification head, moved to GPU when available
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
optimizer = AdamW(model.parameters(), lr=1e-5)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training
epochs = 4
scaler = GradScaler()
total_steps = len(train_dataloader) * epochs
# The keyword is `num_training_steps`; the previous `num_total_steps` is not a
# parameter of get_linear_schedule_with_warmup and raised a TypeError.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for step, batch in enumerate(train_dataloader):
        # Each batch is (input_ids, attention_mask, labels)
        batch = [r.to(device) for r in batch]
        sent_id, mask, labels = batch

        model.zero_grad()
        # Forward pass under mixed precision
        with autocast():
            outputs = model(sent_id, attention_mask=mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

        # Scale the loss before backward, then let the scaler drive the
        # optimizer step and refresh its scale factor
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Average training loss: {avg_train_loss}")

# Evaluation
def evaluate_model(model, dataloader):
    """Run *model* over *dataloader* and collect predicted and true labels.

    The model is switched to eval mode and gradients are disabled. Batches are
    moved to the module-level ``device`` before the forward pass.

    Args:
        model: A sequence-classification model whose output exposes ``.logits``.
        dataloader: Iterable of ``(input_ids, attention_mask, labels)`` batches.

    Returns:
        A ``(predictions, true_labels)`` tuple of flat Python lists, where each
        prediction is the index of the largest logit for that example.
    """
    model.eval()
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            sent_id, mask, labels = [t.to(device) for t in batch]

            outputs = model(sent_id, attention_mask=mask)
            logits = outputs.logits.detach()

            # Previously the logits were computed and then thrown away; keep
            # the arg-max class and the matching gold label for each example.
            predictions.extend(logits.argmax(dim=1).cpu().tolist())
            true_labels.extend(labels.cpu().tolist())

    return predictions, true_labels


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: ignored

In [None]:
import nltk
from nltk.corpus import stopwords, brown
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from collections import Counter
import re

# Fetch every NLTK resource this cell relies on, in a fixed order
for _resource in ('punkt', 'vader_lexicon', 'stopwords', 'brown'):
    nltk.download(_resource)

# VADER-based sentiment scorer used on the specialist text further down
sia = SentimentIntensityAnalyzer()

# Preprocessing function
def preprocess_text(text):
    """Lowercase, strip non-letters, tokenize and drop English stop words.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list of lowercase tokens with NLTK English stop words removed.
    """
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and numbers (everything except a-z and whitespace).
    # NOTE: apostrophes and hyphens are deleted, not replaced by a space, so
    # "haven't" becomes "havent" and "pale-face" becomes "paleface".
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Build the stop-word set once: calling stopwords.words() inside the
    # comprehension re-evaluated it for every token and scanned a list.
    stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word not in stop_words]

# Weirdness Index Calculation
def calculate_weirdness(specialist_counter, general_counter, top_n=10):
    """Rank specialist-corpus words by their Weirdness Index.

    The index of a word is its relative frequency in the specialist corpus
    divided by its relative frequency in the general corpus. A word with no
    occurrences in the general corpus gets ``float('inf')``.

    Args:
        specialist_counter: Mapping of word -> count for the specialist corpus.
        general_counter: Mapping of word -> count for the general corpus.
        top_n: How many of the highest-scoring words to return.

    Returns:
        A list of ``(word, index)`` tuples, highest index first, truncated to
        ``top_n`` entries. Words with equal indices keep their original order.
    """
    specialist_total = sum(specialist_counter.values())
    general_total = sum(general_counter.values())

    scored = []
    for term, specialist_hits in specialist_counter.items():
        general_hits = general_counter.get(term, 0)
        if general_hits > 0:
            specialist_share = specialist_hits / specialist_total
            general_share = general_hits / general_total
            score = specialist_share / general_share
        else:
            # Unseen in the general corpus: maximal weirdness, no division.
            score = float('inf')
        scored.append((term, score))

    # list.sort is stable, so ties stay in insertion order
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:top_n]

# Hate Speech Classifier
def classify_hate_speech(weirdness_indices, sentiment, *,
                         weirdness_threshold=1, sentiment_threshold=-0.5):
    """Label a text from its weirdness scores and sentiment.

    The text is labelled "Hate speech" only when both conditions hold: at
    least one word's Weirdness Index is strictly above ``weirdness_threshold``
    and the compound sentiment is strictly below ``sentiment_threshold``.

    Args:
        weirdness_indices: Iterable of ``(word, index)`` pairs.
        sentiment: Mapping with a ``'compound'`` score.
        weirdness_threshold: Index a word must exceed to count as weird.
        sentiment_threshold: Compound score the text must fall below to count
            as negative.

    Returns:
        ``"Hate speech"`` or ``"Non-hate speech"``.
    """
    # Check if any word exceeds the weirdness threshold
    high_weirdness = any(index > weirdness_threshold for _, index in weirdness_indices)

    # Check sentiment score
    negative_sentiment = sentiment['compound'] < sentiment_threshold

    # Both signals are required for a positive classification
    return "Hate speech" if high_weirdness and negative_sentiment else "Non-hate speech"

# Define your specialist corpus here: the raw text that is tokenized, scored
# for sentiment and compared against the general corpus below.
# NOTE(review): this sample appears to be OCR output from a historical
# newspaper; artefacts such as "xmore", "handsof" and "fsoup" are kept
# verbatim, so they are treated as words missing from the general corpus.
specialist_corpus = """
The Manchester Alpha Sigma re- cently debated the question, "Re- -solved, that the Indian has received xmore cruel treatment from the handsof the white man, than the negro."
Decided that the negro had been the worst used. Of course he has. Time and again have the whites fur- nished the Indian with whisky and scalps.
Whenever he wanted brain fsoup, has he not been allowed to killa pale-face to furnish it? If he desired to hang up white children on pine knots, haven't we furnished the children?
'Course we have. What have we done for the negro? Made him a slave for 200 years and spelled his name with a small "n" -the Indian with a big "I."
For two weeks prior to an election he is dignified with the title of "brother" by republicans, after which they let him relapse into a "nigger" for the next two yeears.
"""

# Tokenize the specialist text and count how often each token occurs
specialist_tokens = preprocess_text(specialist_corpus)
specialist_counter = Counter(specialist_tokens)

# Score the raw, unprocessed text with the sentiment analyzer
sentiment = sia.polarity_scores(specialist_corpus)

# Reference frequencies: lowercase Brown tokens, alphabetic ones only
general_corpus_words = brown.words()
general_counter = Counter(
    token.lower() for token in general_corpus_words if token.isalpha()
)

# Highest-weirdness words, then the combined verdict
weirdness_indices = calculate_weirdness(specialist_counter, general_counter)
classification = classify_hate_speech(weirdness_indices, sentiment)

# Report: verdict, sentiment scores, then one line per ranked word
print(f"Hate Speech Classification: {classification}")
print(f"Sentiment Analysis: {sentiment}")
print("Top Words by Weirdness Index:")
for word, index in weirdness_indices:
    print(f"Word: {word}, Weirdness Index: {index}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


Hate Speech Classification: Hate speech
Sentiment Analysis: {'neg': 0.055, 'neu': 0.911, 'pos': 0.034, 'compound': -0.6298}
Top Words by Weirdness Index:
Word: cently, Weirdness Index: inf
Word: xmore, Weirdness Index: inf
Word: handsof, Weirdness Index: inf
Word: nished, Weirdness Index: inf
Word: scalps, Weirdness Index: inf
Word: fsoup, Weirdness Index: inf
Word: killa, Weirdness Index: inf
Word: paleface, Weirdness Index: inf
Word: havent, Weirdness Index: inf
Word: relapse, Weirdness Index: inf
