In [1]:
import pandas as pd
import numpy as np
import re

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# Load the data
data = pd.read_csv('train.csv')

In [3]:
# Define a function to preprocess the input string
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove usernames
    text = re.sub(r'@\S+', '', text)
    # Remove hashtags
    text = re.sub(r'#\S+', '', text)
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)
    # Remove leading/trailing whitespace
    text = text.strip()
    return text

In [4]:
data.head()

Unnamed: 0,label,text
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation


In [5]:
data.label.value_counts()

0    29720
1     2242
Name: label, dtype: int64

In [14]:
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler

# Load the dataset
df = pd.read_csv('train.csv')

# Split the dataframe into feature matrix and target array
X = df['text']
y = df['label']

# Reshape the feature matrix X
X = X.values.reshape(-1, 1)

# Initialize the RandomUnderSampler
rus = RandomUnderSampler()

# Fit and transform X and y
X_resampled, y_resampled = rus.fit_resample(X, y)

# Convert X_resampled back to a 1D array
X_resampled = X_resampled.reshape(-1)

# Update the dataframe with the resampled data
df_resampled = pd.DataFrame({'text': X_resampled, 'label': y_resampled})
df_resampled.head()

Unnamed: 0,text,label
0,but first....let me take a #selfie #weekend #l...,0
1,"@user it was an attack on human kind, but also...",0
2,#linkinmybio renemo0dy: my opc3 finally came!!...,0
3,when sisters love their friends more than thei...,0
4,@user '' x'mas &amp; my bihday disney ! '' ...,0


In [15]:
df_resampled.label.value_counts()

0    2242
1    2242
Name: label, dtype: int64

In [16]:
data=df_resampled

In [17]:
# Preprocess the data
data['text'] = data['text'].apply(preprocess_text)

In [18]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['label'], test_size=0.2, random_state=42)

In [19]:
# Define the classifiers
lr = LogisticRegression(random_state=42)
nb = MultinomialNB()
rf = RandomForestClassifier(random_state=42)

In [20]:
# Define the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

In [21]:
# Define the voting classifier
voting_clf = VotingClassifier(
    estimators=[('lr', lr), ('nb', nb), ('rf', rf)],
    voting='soft')

In [22]:
# Define the pipeline
pipeline = Pipeline([
    ('tfidf', tfidf_vectorizer),
    ('voting', voting_clf)
])

In [23]:
# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

In [24]:
# Predict on the test data
y_pred = pipeline.predict(X_test)

In [25]:
# Calculate the accuracy score
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.8294314381270903


In [26]:
# Define a function to predict the label for a given string
def predict_hate_speech(text):
    text = preprocess_text(text)
    label = pipeline.predict([text])[0]
    return label

In [27]:
# Test the function on a sample string
text = "I hate people like you"
label = predict_hate_speech(text)
print('Text:', text)
print('Label:', label)

Text: I hate people like you
Label: 1


In [28]:
text = "I love you"
label = predict_hate_speech(text)
print('Text:', text)
print('Label:', label)

Text: I love you
Label: 0


In [29]:
import joblib
joblib.dump(pipeline, 'hate_speech_model.joblib')

['hate_speech_model.joblib']