In [None]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

# Load data
data = pd.read_csv('/content/drive/My Drive/train1.csv')  # Replace 'your_data.csv' with your dataset path
X = data['comment_text']
y = data['identity_hate']

# data = pd.read_csv('/kaggle/input/dataset101/HateSpeechDataset.csv')  # Replace 'your_data.csv' with your dataset path
# X = data['Content']
# y = data['Label']


#three labels # hatespeech,ofensive,neither
# trial with other dataset with 2 labels 1 for offensive and 0 for non offensive

In [None]:
len(data)

223549

In [None]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Assuming you have NLTK installed, if not, install it using:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

# Function to preprocess text data
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove special characters and punctuation
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    # Join the words back into a string
    text = ' '.join(filtered_text)
    return text



# Model training and evaluation steps...

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
X_train_preprocessed = X_train.apply(preprocess_text)
X_test_preprocessed = X_test.apply(preprocess_text)

In [None]:
# Data cleaning and preprocessing



# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_preprocessed)
X_test_tfidf = tfidf_vectorizer.transform(X_test_preprocessed)

In [None]:
# # Train Decision Tree
dt_classifier = DecisionTreeClassifier()
dt_classifier.fit(X_train_tfidf, y_train)
dt_preds = dt_classifier.predict(X_test_tfidf)
dt_accuracy = accuracy_score(y_test, dt_preds)

In [None]:
dt_accuracy

0.9896891075821964

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Calculate Accuracy
dt_accuracy = accuracy_score(y_test, dt_preds)

# Calculate Precision
dt_precision = precision_score(y_test, dt_preds, average='weighted')

# Calculate Recall
dt_recall = recall_score(y_test, dt_preds, average='weighted')

# Calculate F1 Score
dt_f1_score = f1_score(y_test, dt_preds, average='weighted')

# Print the metrics
print("Decision Tree Classifier Metrics:")
print("Accuracy:", dt_accuracy)
print("Precision:", dt_precision)
print("Recall:", dt_recall)
print("F1 Score:", dt_f1_score)

Decision Tree Classifier Metrics:
Accuracy: 0.9896891075821964
Precision: 0.9882971149615107
Recall: 0.9896891075821964
F1 Score: 0.9889172825319892


In [None]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss

# Calibrate the classifier to get probability estimates
calibrated_classifier = CalibratedClassifierCV(dt_classifier, method='sigmoid', cv='prefit')
calibrated_classifier.fit(X_train_tfidf, y_train)

# Get the predicted probabilities for the test set
probabilities = calibrated_classifier.predict_proba(X_test_tfidf)

# Calculate log loss
dt_log_loss = log_loss(y_test, probabilities)

# Print the log loss
print("Log Loss for Decision Tree Classifier:", dt_log_loss)

Log Loss for Decision Tree Classifier: 0.08962400806489855


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Assume these are the best parameters found from grid search
best_params = {
    'n_estimators': 100,
    'max_depth': 10,
    'min_samples_split': 2,
    'min_samples_leaf': 1
}

# Create a Random Forest classifier with the best parameters
best_rf_model = RandomForestClassifier(
    n_estimators=best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    min_samples_split=best_params['min_samples_split'],
    min_samples_leaf=best_params['min_samples_leaf']
)

# Fit the model to the training data
best_rf_model.fit(X_train_tfidf, y_train)

# Predict on the test data
best_rf_model_preds = best_rf_model.predict(X_test_tfidf)

# Calculate the accuracy
best_rf_model_accuracy = accuracy_score(y_test, best_rf_model_preds)
print(f"Best Random Forest Model Accuracy: {best_rf_model_accuracy}")


Best Random Forest Model Accuracy: 0.990874524714829


In [None]:
best_rf_model_accuracy

0.990874524714829

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Assuming the RandomForestClassifier with the best parameters has already been trained and tested as in the provided code
# The model's predictions are stored in best_rf_model_preds and the true labels in y_test

# Generate confusion matrix
conf_matrix = confusion_matrix(y_test, best_rf_model_preds)

# Calculate accuracy, precision, recall, and f1 score
accuracy = accuracy_score(y_test, best_rf_model_preds)
precision = precision_score(y_test, best_rf_model_preds, average='weighted')
recall = recall_score(y_test, best_rf_model_preds, average='weighted')
f1score = f1_score(y_test, best_rf_model_preds, average='weighted')

# Print results
print("Confusion Matrix:")
print(conf_matrix)

print("\nAccuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1score)


Confusion Matrix:
[[44302     0]
 [  408     0]]

Accuracy: 0.990874524714829
Precision: 0.9818323237288381
Recall: 0.990874524714829
F1 Score: 0.986332701072133


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from sklearn.metrics import log_loss

# Predict probabilities for each class
y_pred_proba = best_rf_model.predict_proba(X_test_tfidf)

# Assuming y_test contains the true labels for the test data

# Calculate log loss
logloss = log_loss(y_test, y_pred_proba)

print("Log Loss:", logloss)


Log Loss: 0.03860382137994724


In [None]:
# Train Logistic Regression
lr_classifier = LogisticRegression(max_iter=1000)
lr_classifier.fit(X_train_tfidf, y_train)
lr_preds = lr_classifier.predict(X_test_tfidf)
lr_accuracy = accuracy_score(y_test, lr_preds)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Define the logistic regression classifier
lr_classifier = LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs')

# Define the hyperparameters grid
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization parameter
    'penalty': ['l2'],                     # L2 regularization only for multinomial
    'class_weight': [None, 'balanced'],    # Weights associated with classes
    'fit_intercept': [True, False],        # Whether to calculate the intercept
}

# Perform grid search with 5-fold cross-validation
grid_search = GridSearchCV(estimator=lr_classifier, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_tfidf, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Train a new logistic regression classifier using the best hyperparameters
best_lr_classifier = LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs', **best_params)
best_lr_classifier.fit(X_train_tfidf, y_train)

# Make predictions on the test set using the best classifier
best_lr_preds = best_lr_classifier.predict(X_test_tfidf)

# Calculate accuracy
best_lr_accuracy = accuracy_score(y_test, best_lr_preds)

print("Best Hyperparameters:", best_params)
print("Accuracy with Best Hyperparameters:", best_lr_accuracy)

Best Hyperparameters: {'C': 1, 'class_weight': None, 'fit_intercept': True, 'penalty': 'l2'}
Accuracy with Best Hyperparameters: 0.9920152091254753


In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression

# Define the logistic regression classifier
lr_classifier = LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs')

# Fit the classifier with training data
lr_classifier.fit(X_train_tfidf, y_train)

# Make predictions on the test set using lr_classifier
lr_preds = lr_classifier.predict(X_test_tfidf)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, lr_preds)

# Calculate accuracy
lr_accuracy = accuracy_score(y_test, lr_preds)

# Calculate precision
lr_precision = precision_score(y_test, lr_preds, average='weighted')

# Calculate recall
lr_recall = recall_score(y_test, lr_preds, average='weighted')

# Calculate F1 score
lr_f1 = f1_score(y_test, lr_preds, average='weighted')

print("Confusion Matrix:\n", conf_matrix)
print("Accuracy:", lr_accuracy)
print("Precision:", lr_precision)
print("Recall:", lr_recall)
print("F1 Score:", lr_f1)



Confusion Matrix:
 [[44249    53]
 [  304   104]]
Accuracy: 0.9920152091254753
Precision: 0.9901583576998265
Recall: 0.9920152091254753
F1 Score: 0.990252874111492


In [None]:
from sklearn.metrics import log_loss

# Assuming you have already fitted the lr_classifier with training data and defined X_test_tfidf and y_test

# Predict probabilities on the test set
lr_probs = lr_classifier.predict_proba(X_test_tfidf)

# Calculate log loss
lr_log_loss = log_loss(y_test, lr_probs)

print("Log Loss:", lr_log_loss)


Log Loss: 0.026057940051724432


In [None]:
lr_accuracy

0.9920152091254753

In [None]:
from sklearn.svm import SVC

# Train SVM
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train_tfidf, y_train)
svm_preds = svm_classifier.predict(X_test_tfidf)
svm_accuracy = accuracy_score(y_test, svm_preds)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Train SVM
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train_tfidf, y_train)

# Make predictions on the test set using svm_classifier
svm_preds = svm_classifier.predict(X_test_tfidf)

# Calculate confusion matrix
svm_conf_matrix = confusion_matrix(y_test, svm_preds)

# Calculate accuracy
svm_accuracy = accuracy_score(y_test, svm_preds)

# Calculate precision
svm_precision = precision_score(y_test, svm_preds, average='weighted')

# Calculate recall
svm_recall = recall_score(y_test, svm_preds, average='weighted')

# Calculate F1 score
svm_f1 = f1_score(y_test, svm_preds, average='weighted')

print("SVM Confusion Matrix:\n", svm_conf_matrix)
print("SVM Accuracy:", svm_accuracy)
print("SVM Precision:", svm_precision)
print("SVM Recall:", svm_recall)
print("SVM F1 Score:", svm_f1)


SVM Confusion Matrix:
 [[44273    29]
 [  342    66]]
SVM Accuracy: 0.9917020800715723
SVM Precision: 0.9896186973643336
SVM Recall: 0.9917020800715723
SVM F1 Score: 0.9891349265907705


In [None]:
svm_accuracy

0.9917020800715723

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Create a pipeline with TruncatedSVD and KNN
pipeline = Pipeline([
    ('svd', TruncatedSVD(n_components=100)),  # Adjust n_components as needed
    ('scaler', StandardScaler()),  # Standardizing after SVD
    ('knn', KNeighborsClassifier(n_neighbors=2, n_jobs=-1))  # Use all available cores
])

# Train the pipeline
pipeline.fit(X_train_tfidf, y_train)

# Predict on test set
knn_preds = pipeline.predict(X_test_tfidf)

# Calculate accuracy
knn_accuracy = accuracy_score(y_test, knn_preds)
print("KNN Accuracy with SVD:", knn_accuracy)


KNN Accuracy with SVD: 0.9905166629389398


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

# Assuming you have trained your KNN model and made predictions (knn_preds) on the test set

# Calculate Precision
precision = precision_score(y_test, knn_preds, average='weighted')

# Calculate Recall
recall = recall_score(y_test, knn_preds, average='weighted')

# Calculate F1 Score
f1 = f1_score(y_test, knn_preds, average='weighted')

# Calculate Confusion Matrix
conf_matrix = confusion_matrix(y_test, knn_preds)

# Print the results
print("Confusion Matrix:")
print(conf_matrix)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)



Confusion Matrix:
[[44244    58]
 [  366    42]]
Precision: 0.9865776552683461
Recall: 0.9905166629389398
F1 Score: 0.9876582186532762


In [None]:
from sklearn.metrics import log_loss

# Predict probabilities instead of class labels
knn_probs = pipeline.predict_proba(X_test_tfidf)  # Note: KNN doesn't have predict_proba, so this will not work

# Calculate log loss
knn_log_loss = log_loss(y_test, knn_probs)
print("Log Loss for KNN:", knn_log_loss)


Log Loss for KNN: 0.27391018533064465


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Define the Multinomial Naive Bayes classifier
nb_classifier = MultinomialNB()

# Define the hyperparameter grid
param_grid = {
    'alpha': [0.1, 0.5, 1.0],  # Add more values if needed
    'fit_prior': [True, False]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=nb_classifier, param_grid=param_grid, cv=5, scoring='accuracy')

# Perform Grid Search
grid_search.fit(X_train_tfidf, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Initialize Multinomial Naive Bayes classifier with the best hyperparameters
best_nb_classifier = MultinomialNB(**best_params)

# Train the best classifier on the entire training set
best_nb_classifier.fit(X_train_tfidf, y_train)

# Make predictions on the test set
nb_preds = best_nb_classifier.predict(X_test_tfidf)

# Calculate accuracy
nb_accuracy = accuracy_score(y_test, nb_preds)

print("Best Hyperparameters:", best_params)
print("Accuracy with Best Hyperparameters:", nb_accuracy)

Best Hyperparameters: {'alpha': 0.5, 'fit_prior': True}
Accuracy with Best Hyperparameters: 0.9907179601878774


In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Assuming you have made predictions (nb_preds) using your classifier

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, nb_preds)

# Calculate accuracy
accuracy = accuracy_score(y_test, nb_preds)

# Calculate precision
precision = precision_score(y_test, nb_preds, average='weighted')

# Calculate recall
recall = recall_score(y_test, nb_preds, average='weighted')

# Calculate F1 score
f1 = f1_score(y_test, nb_preds, average='weighted')

print("Confusion Matrix:")
print(conf_matrix)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Confusion Matrix:
[[44232    70]
 [  345    63]]
Accuracy: 0.9907179601878774
Precision: 0.9875283269429994
Recall: 0.9907179601878774
F1 Score: 0.9883732063101116


In [None]:
from sklearn.metrics import log_loss

# Get probability estimates for each class
nb_probs = best_nb_classifier.predict_proba(X_test_tfidf)

# Calculate log loss
nb_log_loss = log_loss(y_test, nb_probs)

print("Log Loss with Best Hyperparameters:", nb_log_loss)


Log Loss with Best Hyperparameters: 0.02876192589276328


In [None]:
!pip install spacy
#python -m spacy download en_core_web_sm
import spacy

# Load English language model
nlp = spacy.load('en_core_web_sm')

# Define a function for preprocessing text with lemmatization
def preprocess_text_with_lemmatization(tweet):
    # Remove special characters, URLs, and mentions
    tweet = re.sub(r'http\S+|www\S+|pic.twitter\S+|@\S+', '', tweet)
    tweet = re.sub(r'[^a-zA-Z\s]', '', tweet)

    # Remove extra spaces and convert to lowercase
    tweet = ' '.join(tweet.lower().split())

    # Lemmatize the text
    lemmatized_tokens = []
    doc = nlp(tweet)
    for token in doc:
        lemmatized_tokens.append(token.lemma_)

    # Join lemmatized tokens back into a single string
    tweet = ' '.join(lemmatized_tokens)

    return tweet

