In [1]:
import pandas as pd
import numpy as np
import re

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# Load the data
data = pd.read_csv('train.csv')

In [3]:
# Define a function to preprocess the input string
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove usernames
    text = re.sub(r'@\S+', '', text)
    # Remove hashtags
    text = re.sub(r'#\S+', '', text)
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)
    # Remove leading/trailing whitespace
    text = text.strip()
    return text

In [4]:
# Preprocess the data
data['text'] = data['text'].apply(preprocess_text)

In [5]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['label'], test_size=0.2, random_state=42)

In [6]:
# Define the classifiers
lr = LogisticRegression(random_state=42)
nb = MultinomialNB()
rf = RandomForestClassifier(random_state=42)

In [7]:
# Define the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

In [8]:
# Define the voting classifier
voting_clf = VotingClassifier(
    estimators=[('lr', lr), ('nb', nb), ('rf', rf)],
    voting='soft')

In [9]:
# Define the pipeline
pipeline = Pipeline([
    ('tfidf', tfidf_vectorizer),
    ('voting', voting_clf)
])

In [10]:
# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

In [11]:
# Predict on the test data
y_pred = pipeline.predict(X_test)

In [12]:
# Calculate the accuracy score
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.9402471453151885


In [13]:
# Define a function to predict the label for a given string
def predict_hate_speech(text):
    text = preprocess_text(text)
    label = pipeline.predict([text])[0]
    return label

In [14]:
# Test the function on a sample string
text = "I hate people like you"
label = predict_hate_speech(text)
print('Text:', text)
print('Label:', label)

Text: I hate people like you
Label: 0


In [15]:
text = "I love you"
label = predict_hate_speech(text)
print('Text:', text)
print('Label:', label)

Text: I love you
Label: 0
