In [1]:
import getpass
import os
import re
import torch
import transformers
import datasets
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#load a dataset form the HuggingFace Hub
dataset = datasets.load_dataset('ucberkeley-dlab/measuring-hate-speech')

# check the dataset structure
print(dataset)

# check the size of the dataset
print(dataset.shape)

# take a look at an example
dataset["train"][0]

DatasetDict({
    train: Dataset({
        features: ['comment_id', 'annotator_id', 'platform', 'sentiment', 'respect', 'insult', 'humiliate', 'status', 'dehumanize', 'violence', 'genocide', 'attack_defend', 'hatespeech', 'hate_speech_score', 'text', 'infitms', 'outfitms', 'annotator_severity', 'std_err', 'annotator_infitms', 'annotator_outfitms', 'hypothesis', 'target_race_asian', 'target_race_black', 'target_race_latinx', 'target_race_middle_eastern', 'target_race_native_american', 'target_race_pacific_islander', 'target_race_white', 'target_race_other', 'target_race', 'target_religion_atheist', 'target_religion_buddhist', 'target_religion_christian', 'target_religion_hindu', 'target_religion_jewish', 'target_religion_mormon', 'target_religion_muslim', 'target_religion_other', 'target_religion', 'target_origin_immigrant', 'target_origin_migrant_worker', 'target_origin_specific_country', 'target_origin_undocumented', 'target_origin_other', 'target_origin', 'target_gender_men', 'target

{'comment_id': 47777,
 'annotator_id': 10873,
 'platform': 3,
 'sentiment': 0.0,
 'respect': 0.0,
 'insult': 0.0,
 'humiliate': 0.0,
 'status': 2.0,
 'dehumanize': 0.0,
 'violence': 0.0,
 'genocide': 0.0,
 'attack_defend': 0.0,
 'hatespeech': 0.0,
 'hate_speech_score': -3.9,
 'text': 'Yes indeed. She sort of reminds me of the elder lady that played the part in the movie "Titanic" who was telling her story!!! And I wouldn\'t have wanted to cover who I really am!! I would be proud!!!! WE should be proud of our race no matter what it is!!',
 'infitms': 0.81,
 'outfitms': 1.88,
 'annotator_severity': 0.36,
 'std_err': 0.34,
 'annotator_infitms': 1.35,
 'annotator_outfitms': 1.23,
 'hypothesis': -1.1301777576839678,
 'target_race_asian': True,
 'target_race_black': True,
 'target_race_latinx': True,
 'target_race_middle_eastern': True,
 'target_race_native_american': True,
 'target_race_pacific_islander': True,
 'target_race_white': True,
 'target_race_other': False,
 'target_race': True,
 

In [3]:
# take a look at an example
print("review: ", dataset["train"][0]["text"])
print("hate_speech_score: ", dataset["train"][0]["hate_speech_score"])

print("review: ", dataset["train"][-1]["text"])
print("hate_speech_score: ", dataset["train"][-1]["hate_speech_score"])

print(len(dataset["train"]["hate_speech_score"]), dataset["train"]["hate_speech_score"][:10],dataset["train"]["hate_speech_score"][-10:] )

review:  Yes indeed. She sort of reminds me of the elder lady that played the part in the movie "Titanic" who was telling her story!!! And I wouldn't have wanted to cover who I really am!! I would be proud!!!! WE should be proud of our race no matter what it is!!
hate_speech_score:  -3.9
review:  لا تتشمت الرجال مسكين يعاني كس امه 😂. يقول ياليتهم كانوا عرب.  OP, I really hope that you commit suicide one day or die of ass cancer. Just because you "feel sorry" does not justify the action.  It is your fault of leaving your country to fight in Afghanistan. These poor boys were defending their country from foreign invaders.  الله يلعن البجاحه الي عايشين فيها الامريكان.
hate_speech_score:  -0.2
135556 [-3.9, -6.52, 0.36, 0.26, 1.54, -4.93, 0.17, 2.08, -0.66, -0.38] [-3.1, -0.2, -2.89, -1.69, -2.49, -4.88, -4.4, -2.49, -4.4, -0.2]


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Convert dataset["train"] into a pandas DataFrame
train_df = pd.DataFrame(dataset["train"])

# Split the DataFrame into training and test sets
train_data, test_data = train_test_split(train_df, test_size=0.2, random_state=42)

# Split the training set into training and validation sets
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

# Print the sizes of the datasets
print("Training set size:", len(train_data))
print("Validation set size:", len(val_data))
print("Test set size:", len(test_data))


Training set size: 86755
Validation set size: 21689
Test set size: 27112


In [7]:
# Save the training set
train_data.to_csv('train_data.csv', index=False)

# Save the test set
test_data.to_csv('test_data.csv', index=False)

# Save the validation set
val_data.to_csv('val_data.csv', index=False)
