In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


# DATA SET
train_path = r"C:\Users\nh013\Desktop\Identify and classify toxic online comments\train.csv\train.csv"
test_path = r"C:\Users\nh013\Desktop\Identify and classify toxic online comments\test.csv\test.csv"
test_label_path = r"C:\Users\nh013\Desktop\Identify and classify toxic online comments\test_labels.csv\test_labels.csv"

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# FUNCTION TO PREPROCESS TEXT
def preprocess_text(text):
    
    # REMOVE URLS
    text = re.sub(r'http\S+', '', text)
    
    # REMOVE SPECIAL CHARECTER
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # CONVERT TO LOWER CASE
    text = text.lower()
    
    #TOKENIZE THE TEXT 
    tokens = nltk.word_tokenize(text)
    
    # REMOVE STOP WORDS
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    #LANCASTER STEMING ALTERNATIVE TO PORTER
    lancaster = LancasterStemmer()
    tokens = [lancaster.stem(word) for word in tokens]
    
    # LEMMATIZATION
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # JOIN BACK TO FORM THE PREPROCESS TEXT
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

# LET'S APPLY PREPROCESS TO  'comment_text'COL
train_df['comment_text'] = train_df['comment_text'].apply(preprocess_text)
test_df['comment_text'] = test_df['comment_text'].apply(preprocess_text)


print("Preprocessed Training Data:")
print(train_df.head())

print("\nPreprocessed Testing Data:")
print(test_df.head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Preprocessed Training Data:
                 id                                       comment_text  toxic  \
0  0000997932d777bf  expl edit mad usernam hardc metallic fan rever...      0   
1  000103f0d9cfb60f  daww match background colo im seem stuck thank...      0   
2  000113f07ec002fd  hey man im real try edit war guy const remov r...      0   
3  0001b41b1c6bb37e  cant mak real suggest improv wond sect stat la...      0   
4  0001d958c54c6e35                     sir hero chant rememb pag that      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  
1             0        0       0       0              0  
2             0        0       0       0              0  
3             0        0       0       0              0  
4             0        0       0       0              0  

Preprocessed Testing Data:
                 id                                       comment_text
0  00001cee341fdb12  yo bitch ja rul s

In [3]:
#PERFORM NAIVE BAYS MODEL 

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score


nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# DATA SET
train_path = r"C:\Users\nh013\Desktop\Identify and classify toxic online comments\train.csv\train.csv"
test_path = r"C:\Users\nh013\Desktop\Identify and classify toxic online comments\test.csv\test.csv"
test_label_path = r"C:\Users\nh013\Desktop\Identify and classify toxic online comments\test_labels.csv\test_labels.csv"

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# FUNCTION TO PREPROCESS TEXT
def preprocess_text(text):
    
    # REMOVE URLS
    text = re.sub(r'http\S+', '', text)
    
    # REMOVE SPECIAL CHARECTER
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # CONVERT TO LOWER CASE
    text = text.lower()
    
    #TOKENIZE THE TEXT 
    tokens = nltk.word_tokenize(text)
    
    # REMOVE STOP WORDS
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    #LANCASTER STEMING ALTERNATIVE TO PORTER
    lancaster = LancasterStemmer()
    tokens = [lancaster.stem(word) for word in tokens]
    
    # LEMMATIZATION
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # JOIN BACK TO FORM THE PREPROCESS TEXT
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

# LET'S APPLY PREPROCESS TO  'comment_text'COL
train_df['comment_text'] = train_df['comment_text'].apply(preprocess_text)
test_df['comment_text'] = test_df['comment_text'].apply(preprocess_text)


# SPLIT DATA INTO TRAIN AND VALIDATION SET
X_train, X_valid, y_train, y_valid = train_test_split(
    train_df['comment_text'], train_df['toxic'], test_size=0.2, random_state=42
)

# TF-IDF VECTORIZATION
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_valid_tfidf = tfidf_vectorizer.transform(X_valid)


#NAIVE BAYES CLASSIFIRE
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

#PREDICTION ON THE VALIDATION SET
y_pred = model.predict(X_valid_tfidf)

# EVALUATE
accuracy = accuracy_score(y_valid, y_pred)
print(f'Validation Accuracy: {accuracy}')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Validation Accuracy: 0.9469528434905217


In [6]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# DATA SET
train_path = r"C:\Users\nh013\Desktop\Identify and classify toxic online comments\train.csv\train.csv"
test_path = r"C:\Users\nh013\Desktop\Identify and classify toxic online comments\test.csv\test.csv"
test_label_path = r"C:\Users\nh013\Desktop\Identify and classify toxic online comments\test_labels.csv\test_labels.csv"

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# FUNCTION TO PREPROCESS TEXT
def preprocess_text(text):
    
    # REMOVE URLS
    text = re.sub(r'http\S+', '', text)
    
    # REMOVE SPECIAL CHARECTER
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # CONVERT TO LOWER CASE
    text = text.lower()
    
    #TOKENIZE THE TEXT 
    tokens = nltk.word_tokenize(text)
    
    # REMOVE STOP WORDS
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    #LANCASTER STEMING ALTERNATIVE TO PORTER
    lancaster = LancasterStemmer()
    tokens = [lancaster.stem(word) for word in tokens]
    
    # LEMMATIZATION
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # JOIN BACK TO FORM THE PREPROCESS TEXT
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

# LET'S APPLY PREPROCESS TO  'comment_text'COL
train_df['comment_text'] = train_df['comment_text'].apply(preprocess_text)
test_df['comment_text'] = test_df['comment_text'].apply(preprocess_text)


# SPLIT DATA INTO TRAIN AND VALIDATION SET
X_train, X_valid, y_train, y_valid = train_test_split(
    train_df['comment_text'], train_df['toxic'], test_size=0.2, random_state=42
)

# TF-IDF VECTORIZATION
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_valid_tfidf = tfidf_vectorizer.transform(X_valid)


# NAIVE BAYS MODEL
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

#PREDICTION ON THE VALIDATION SET
threshold = 0.3  
y_pred = (model.predict_proba(X_valid_tfidf)[:, 1] > threshold).astype(int)

# EVALUATEl
accuracy = accuracy_score(y_valid, y_pred)
precision = precision_score(y_valid, y_pred)
recall = recall_score(y_valid, y_pred)
f1 = f1_score(y_valid, y_pred)
roc_auc = roc_auc_score(y_valid, y_pred)


print(f'Validation Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'AUC-ROC: {roc_auc}')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Validation Accuracy: 0.9540654864483785
Precision: 0.8326359832635983
Recall: 0.6511780104712042
F1 Score: 0.7308116048475947
AUC-ROC: 0.8186587581722943


In [7]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# DATA SET
train_path = r"C:\Users\nh013\Desktop\Identify and classify toxic online comments\train.csv\train.csv"
test_path = r"C:\Users\nh013\Desktop\Identify and classify toxic online comments\test.csv\test.csv"
test_label_path = r"C:\Users\nh013\Desktop\Identify and classify toxic online comments\test_labels.csv\test_labels.csv"

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# FUNCTION TO PREPROCESS TEXT
def preprocess_text(text):
    
    # REMOVE URLS
    text = re.sub(r'http\S+', '', text)
    
    # REMOVE SPECIAL CHARECTER
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # CONVERT TO LOWER CASE
    text = text.lower()
    
    #TOKENIZE THE TEXT 
    tokens = nltk.word_tokenize(text)
    
    # REMOVE STOP WORDS
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    #LANCASTER STEMING ALTERNATIVE TO PORTER
    lancaster = LancasterStemmer()
    tokens = [lancaster.stem(word) for word in tokens]
    
    # LEMMATIZATION
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # JOIN BACK TO FORM THE PREPROCESS TEXT
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

# LET'S APPLY PREPROCESS TO  'comment_text'COL
train_df['comment_text'] = train_df['comment_text'].apply(preprocess_text)
test_df['comment_text'] = test_df['comment_text'].apply(preprocess_text)


# SPLIT DATA INTO TRAIN AND VALIDATION SET
X_train, X_valid, y_train, y_valid = train_test_split(
    train_df['comment_text'], train_df['toxic'], test_size=0.2, random_state=42
)

# TF-IDF VECTORIZATION
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_valid_tfidf = tfidf_vectorizer.transform(X_valid)

# NAIVE BAYS MODEL
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

#PREDICTION ON THE VALIDATION SET
threshold = 0.2  
y_pred = (model.predict_proba(X_valid_tfidf)[:, 1] > threshold).astype(int)


# EVALUATE THE MODEL
accuracy = accuracy_score(y_valid, y_pred)
precision = precision_score(y_valid, y_pred)
recall = recall_score(y_valid, y_pred)
f1 = f1_score(y_valid, y_pred)
roc_auc = roc_auc_score(y_valid, y_pred)


print(f'Validation Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'AUC-ROC: {roc_auc}')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Validation Accuracy: 0.9474228419238603
Precision: 0.710445937690898
Recall: 0.7611256544502618
F1 Score: 0.7349131121642971
AUC-ROC: 0.8641381416850914


In [10]:
#BUILD USER INTERECTION

import re
import nltk
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
import joblib

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# DATASET
train_path = r"C:\Users\nh013\Desktop\Identify and classify toxic online comments\train.csv\train.csv"
test_path = r"C:\Users\nh013\Desktop\Identify and classify toxic online comments\test.csv\test.csv"
test_label_path = r"C:\Users\nh013\Desktop\Identify and classify toxic online comments\test_labels.csv\test_labels.csv"

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# FUNCTION TO PREPROCESS TEXT
def preprocess_text(text):
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    lancaster = LancasterStemmer()
    tokens = [lancaster.stem(word) for word in tokens]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

# PREPROCESS TO THE 'comment_text' COL
train_df['comment_text'] = train_df['comment_text'].apply(preprocess_text)
test_df['comment_text'] = test_df['comment_text'].apply(preprocess_text)

# SPLIT THE DATA INTO TRAIN AND VALIDATION SET
X_train, X_valid, y_train, y_valid = train_test_split(
    train_df['comment_text'], train_df['toxic'], test_size=0.2, random_state=42
)

# TF-IDF VECTORIZATIONS
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_valid_tfidf = tfidf_vectorizer.transform(X_valid)

# TRAN THE MDOEL
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

# SAVE THE MODEL
model_path = 'toxic_comment_classifier_model.pkl'
joblib.dump(model, model_path)

# FUNCTION TO PREDICT TOXICITY
def predict_toxicity(text, threshold=0.2):
    preprocessed_text = preprocess_text(text)
    text_vectorized = tfidf_vectorizer.transform([preprocessed_text])
    prediction = (model.predict_proba(text_vectorized)[:, 1] > threshold).astype(int)
    return prediction

# USER INTERECTION
while True:
    user_input = input("Enter a comment (type 'exit' to stop): ")
    
    if user_input.lower() == 'exit':
        break
    
    prediction = predict_toxicity(user_input)
    
    if prediction == 1:
        print("Toxic Comment")
    else:
        print("Non-Toxic Comment")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Enter a comment (type 'exit' to stop): Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. Ja rule is about pride in da music man. dont diss that shit on him. and nothin is wrong bein like tupac he was a brother too...fuckin white boys get things right next time.,"
Toxic Comment
Enter a comment (type 'exit' to stop): "Thank you for understanding. I think very highly of you and would not revert without discussion."
Non-Toxic Comment
Enter a comment (type 'exit' to stop): exit
