In [22]:
#import required packages
#basics
import pandas as pd 
import numpy as np

#misc
import gc
import time
import warnings

#viz
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec 
import seaborn as sns
import plotly.express as px
from wordcloud import WordCloud ,STOPWORDS
from PIL import Image

#FeatureEngineering
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from scipy.stats import chi2_contingency

#confusion matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

In [2]:
import os
from pathlib import Path

# Folder that holds train.csv and test.csv. The default is the location the
# notebook was originally run from; set the TOXIC_DATA_DIR environment variable
# to run it on another machine without editing this cell.
DATA_DIR = Path(os.environ.get("TOXIC_DATA_DIR", "C:/Users/KIIT/OneDrive/Desktop"))

train_df = pd.read_csv(DATA_DIR / "train.csv")
test_df = pd.read_csv(DATA_DIR / "test.csv")

# Train and test rows stacked into a single frame with a fresh 0..n-1 index.
# NOTE(review): a later cell reassigns `df` to a nunique summary, so this
# combined frame is discarded at that point.
df = pd.concat([train_df, test_df], axis=0).reset_index(drop=True)

In [3]:
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
# Column dtypes, non-null counts and memory usage of the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [5]:
# Count the distinct values held by every column of the training frame and
# present the result as a two-column table (column name, distinct count).
df = (
    train_df.nunique()
    .rename("nunique")
    .rename_axis("col")
    .reset_index()
)
df

Unnamed: 0,col,nunique
0,id,159571
1,comment_text,159571
2,toxic,2
3,severe_toxic,2
4,obscene,2
5,threat,2
6,insult,2
7,identity_hate,2


In [6]:
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Columns carried over into the binary (good vs. toxic) dataset.
binary_columns = ['id', 'comment_text', 'toxic']

# "Good" comments: every one of the six labels is 0.
is_good = (train_df[labels] == 0).all(axis=1)
# "Toxic" comments: the `toxic` label is set (other labels may be set too).
is_toxic = train_df['toxic'] == 1

# Good rows first, toxic rows after, re-indexed from 0. Rows that are not
# `toxic` but carry another label match neither mask and are left out.
bin_df = pd.concat(
    [
        train_df.loc[is_good, binary_columns],
        train_df.loc[is_toxic, binary_columns],
    ]
).reset_index(drop=True)

In [7]:
# Display the binary frame (id, comment_text, toxic).
bin_df

Unnamed: 0,id,comment_text,toxic
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0
...,...,...,...
158635,fef4cf7ba0012866,"""\n\n our previous conversation \n\nyou fuckin...",1
158636,ff39a2895fc3b40e,YOU ARE A MISCHIEVIOUS PUBIC HAIR,1
158637,ffa33d3122b599d6,Your absurd edits \n\nYour absurd edits on gre...,1
158638,ffb47123b2d82762,"""\n\nHey listen don't you ever!!!! Delete my e...",1


In NLP, preprocessing raw text into tensors typically involves the following steps:<br> ***word tokenization***, ***text cleaning*** (lowercasing, removing stop words and punctuation, stemming), and ***text vectorization***.

In [9]:
import re
import string
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
# Porter stemmer instance used by the text-cleaning step.
stemmer = PorterStemmer()

# Fetch the NLTK resources this notebook relies on: the stop-word corpus and
# the punkt tokenizer models.
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)

# English stop words as a plain list of strings.
stopwords_english = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [11]:
import functools

# Translation table that deletes every ASCII punctuation character.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)

# Matches anything that is NOT an ASCII letter, digit, basic punctuation mark
# or whitespace. The original class was written `A-z`, which also let the six
# characters between 'Z' and 'a' ([ \ ] ^ _ `) through; the punctuation step
# removed them afterwards anyway, so the cleaned output is unchanged.
_SPECIAL_CHARS = re.compile(r'[^a-zA-Z0-9.,!?/:;\"\'\s]')


@functools.lru_cache(maxsize=1)
def _stopword_set():
    """Return the English stop words with punctuation stripped, as a frozenset.

    Built once on first use instead of on every `preprocess` call, and stored
    as a set so the per-token membership test is a hash lookup. Punctuation is
    stripped so that e.g. "don't" is stored as "dont", matching the tokens
    produced after punctuation has been removed from the comment.

    The result is cached: if `stopwords_english` is changed afterwards, call
    `_stopword_set.cache_clear()` to pick up the change.
    """
    return frozenset(word.translate(_PUNCT_TABLE) for word in stopwords_english)


def preprocess(comment):
    """Clean one raw comment and return its stemmed tokens.

    Steps, in order: lowercase, drop special characters, drop punctuation,
    shorten runs of a repeated character to length 3, tokenize, then drop
    stop words and single-character tokens and stem what remains.

    Parameters
    ----------
    comment : str
        Raw comment text.

    Returns
    -------
    list of str
        Stemmed tokens in their original order.
    """
    # normalize case
    comment = comment.lower()

    # remove special characters, then all remaining punctuation
    comment = _SPECIAL_CHARS.sub('', comment)
    comment = comment.translate(_PUNCT_TABLE)

    # replace repeated character sequences of length 3 or greater with sequences of length 3
    comment = nltk.tokenize.casual.reduce_lengthening(comment)

    # tokenize comment
    tokens = word_tokenize(comment)

    # remove stop words and one-character tokens, stem the rest
    stop_words = _stopword_set()
    return [
        stemmer.stem(token)
        for token in tokens
        if token not in stop_words and len(token) > 1
    ]

In [12]:
# Add a `clean_text` column holding the token list returned by `preprocess`
# for every comment.
# NOTE(review): the TF-IDF cells further down fit and transform the raw
# `comment_text` column, not `clean_text` - confirm whether this column is
# meant to feed the vectorizer.
bin_df['clean_text'] = bin_df['comment_text'].apply(preprocess)

In [None]:
# train_df['clean_text'] = train_df['comment_text'].apply(preprocess)

In [None]:
# train_df

In [13]:
# Share of each class (0 = good, 1 = toxic) in the binary dataset.
original_distribution = bin_df['toxic'].value_counts(normalize=True)
print("Original Class Distribution:", original_distribution, sep="\n")

Original Class Distribution:
toxic
0    0.903593
1    0.096407
Name: proportion, dtype: float64


## **Split data into train, validation sets**

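Since our classes are imbalanced (roughly 90% "good" vs. 10% "toxic" comments), it is important that both the training and validation sets have the same ratio of "good" to "bad" comments, so the split below is stratified on the `toxic` label. (The validation set is named `test` in the code.)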

In [14]:
# Hold out 20% of the rows; stratifying on the label keeps the good/toxic
# ratio the same in both parts, and the fixed seed makes the split repeatable.
train, test = train_test_split(
    bin_df,
    test_size=0.2,
    stratify=bin_df['toxic'],
    random_state=21,
)
print(train.shape, test.shape)

(126912, 4) (31728, 4)


In [15]:
# TF-IDF vectorizer whose vocabulary is capped at 100 terms.
vectorizer = TfidfVectorizer(max_features=100)

# Learn vocabulary and IDF weights from the training comments only, so nothing
# from the held-out rows leaks into the features.
# NOTE(review): this fits on the raw `comment_text` column, not on the
# `clean_text` tokens computed earlier - confirm that is intended.
vectorizer.fit(train['comment_text'])

Use the fitted vectorizer to transform the train and test comments:

In [16]:
# Turn the train and test comments into TF-IDF matrices with the vectorizer
# fitted above (train first, then test).
train_idf, test_idf = (
    vectorizer.transform(split['comment_text']) for split in (train, test)
)

In [30]:
# applying logistic regression
from sklearn.linear_model import LogisticRegression
# Both metric helpers are imported here: no earlier cell imports them, so on a
# fresh kernel this cell would otherwise stop with a NameError.
from sklearn.metrics import accuracy_score, classification_report

log_reg = LogisticRegression()

# Fit the model on the TF-IDF features of the training comments
log_reg.fit(train_idf, train['toxic'])

# Predict on test data
log_reg_preds = log_reg.predict(test_idf)

# Calculate accuracy
accuracy = accuracy_score(test['toxic'], log_reg_preds)
print("Accuracy:", accuracy)

# Print classification report (per-class precision / recall / f1)
print("\nClassification Report:")
print(classification_report(test['toxic'], log_reg_preds))

Accuracy: 0.9111825516893596

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.99      0.95     28669
           1       0.62      0.20      0.30      3059

    accuracy                           0.91     31728
   macro avg       0.77      0.59      0.63     31728
weighted avg       0.89      0.91      0.89     31728



In [32]:
# applying naive bayes
from sklearn.naive_bayes import MultinomialNB
# classification_report is imported here as well: no earlier cell imports it,
# so on a fresh kernel the report line would otherwise raise a NameError.
from sklearn.metrics import accuracy_score, classification_report

naive_bayes_model = MultinomialNB()

# Fit the model on the TF-IDF features of the training comments
naive_bayes_model.fit(train_idf, train['toxic'])

# Predict on test data
naive_bayes_preds = naive_bayes_model.predict(test_idf)

# Calculate accuracy
accuracy = accuracy_score(test['toxic'], naive_bayes_preds)
print("Accuracy:", accuracy)

# Print classification report.
# In the recorded run this model never predicts class 1, so precision for that
# class is undefined. zero_division=0 reports it as 0.00 (the value shown in
# the recorded report) without the repeated UndefinedMetricWarning lines.
print("\nClassification Report:")
print(classification_report(test['toxic'], naive_bayes_preds, zero_division=0))

Accuracy: 0.9035867372667675

Classification Report:
              precision    recall  f1-score   support

           0       0.90      1.00      0.95     28669
           1       0.00      0.00      0.00      3059

    accuracy                           0.90     31728
   macro avg       0.45      0.50      0.47     31728
weighted avg       0.82      0.90      0.86     31728



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [36]:
from sklearn.metrics import classification_report, accuracy_score
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier

# Initialize XGBoost and AdaBoost classifiers
xgb_classifier = xgb.XGBClassifier()
ada_classifier = AdaBoostClassifier()

# Train both classifiers on the same TF-IDF training features
xgb_classifier.fit(train_idf, train['toxic'])
ada_classifier.fit(train_idf, train['toxic'])

# Predict BEFORE scoring. The original cell passed xgb_predictions and
# ada_predictions to accuracy_score ahead of the lines that assign them, which
# raises a NameError on a fresh kernel (or silently scores stale predictions
# left over from an earlier run).
xgb_predictions = xgb_classifier.predict(test_idf)
ada_predictions = ada_classifier.predict(test_idf)

# Accuracy and per-class report for XGBoost
xgb_accuracy = accuracy_score(test['toxic'], xgb_predictions)
print("Accuracy:", xgb_accuracy)
print("Classification report:")
print(classification_report(test['toxic'], xgb_predictions))

# Accuracy and per-class report for AdaBoost
ada_accuracy = accuracy_score(test['toxic'], ada_predictions)
print("Accuracy:", ada_accuracy)
print("Classification report:")
print(classification_report(test['toxic'], ada_predictions))






Accuracy: 0.9099848714069592
Classification report:
              precision    recall  f1-score   support

           0       0.92      0.98      0.95     28669
           1       0.59      0.22      0.32      3059

    accuracy                           0.91     31728
   macro avg       0.75      0.60      0.64     31728
weighted avg       0.89      0.91      0.89     31728

Accuracy: 0.9075895108421583
Classification report:
              precision    recall  f1-score   support

           0       0.92      0.99      0.95     28669
           1       0.57      0.17      0.27      3059

    accuracy                           0.91     31728
   macro avg       0.74      0.58      0.61     31728
weighted avg       0.88      0.91      0.88     31728

