# **AVOTOX**

# Data Collection and Preprocessing

In [None]:
!pip install scikit-learn==1.2.2
!pip install pandas==2.0.3
!pip install contractions
!pip install xgboost
!pip install imbalanced-learn




In [None]:
import pandas as pd
import contractions
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from sklearn.multioutput import MultiOutputClassifier
import joblib

**Stopwords** are common words (e.g. "the", "is", "and") that are filtered out of natural-language data, during or after processing, because they are considered unimportant.

In [None]:
# Fetch the NLTK corpora used during preprocessing: the English stop-word list
# and the WordNet data required by WordNetLemmatizer.
# nltk.download() reports failure through its boolean return value, so check it
# here and fail fast instead of hitting a LookupError later during preprocessing.
for resource in ('stopwords', 'wordnet'):
    if not nltk.download(resource):
        raise RuntimeError(f"NLTK resource '{resource}' could not be downloaded")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Lazily-built NLTK resources shared by every preprocess_text call. Loading the
# stop-word list and creating the lemmatizer once avoids repeating that work for
# every row when the function is used with DataFrame.apply.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
  """Return the cached (stop-word set, lemmatizer) pair, building it on first use."""
  if not _PREPROCESS_RESOURCES:
    _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
  return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_text(text):
  """Normalize a raw comment into a cleaned, lemmatized, space-separated string.

  Steps, in order: lowercase, expand contractions, strip HTML tags, URLs,
  e-mail addresses, hashtags and mentions, drop every character that is not a
  letter, whitespace, '.' or ',', collapse whitespace, remove English stop
  words, and lemmatize the remaining tokens.

  Args:
    text: The comment to clean. ``None`` and NaN (e.g. an empty CSV cell) are
      treated as an empty comment; any other non-string value is converted
      with ``str()`` first.

  Returns:
    The cleaned text as a single string (possibly empty).

  Note:
    '.' and ',' are kept and tokens are produced by whitespace splitting, so a
    token with attached punctuation (e.g. "the,") is not matched by the
    stop-word filter.
  """
  # Missing values would otherwise crash on .lower(); NaN is the only float
  # that is not equal to itself.
  if text is None or (isinstance(text, float) and text != text):
    return ''
  if not isinstance(text, str):
    text = str(text)

  text = text.lower()
  # Expand contractions
  text = contractions.fix(text)
  # Remove HTML tags
  text = re.sub(r'<.*?>', '', text)
  # Remove URLs
  text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
  # Remove email addresses (must run before mention removal, which also keys on '@')
  text = re.sub(r'\S+@\S+', '', text)
  # Remove hashtags
  text = re.sub(r'#\w+', '', text)
  # Remove mentions
  text = re.sub(r'@\w+', '', text)
  # Remove special characters except punctuation ('.' and ',' are kept; digits are dropped)
  text = re.sub(r'[^a-zA-Z\s.,]', '', text)
  # Remove extra whitespace
  text = re.sub(r'\s+', ' ', text).strip()

  stop_words, lemmatizer = _get_preprocess_resources()
  # Tokenize on whitespace, drop stop words, then lemmatize what is left
  words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
  # Join words to a single string
  return ' '.join(words)

Download the CSV files via this link: [Kaggle Dataset](https://www.kaggle.com/competitions/jigsaw-toxic-comment-classification-challenge/data)

Then use Google Colab File Explorer:


*   On the left sidebar, click the Files tab.
*   Click the Upload button and select the files you want to upload.
*   Once uploaded, the files will appear in the file explorer.






In [None]:
# Read the three uploaded competition CSV files into pandas DataFrames
train_data = pd.read_csv('/content/train.csv')
test_data = pd.read_csv('/content/test.csv')
test_labels = pd.read_csv('/content/test_labels.csv')

# Display the first few rows of each dataset, preceded by its name
for dataset_name, dataset in (
    ("train_data", train_data),
    ("test_data", test_data),
    ("test_labels", test_labels),
):
    print(dataset_name)
    print(dataset.head())

train_data
                 id                                       comment_text  toxic  \
0  0000997932d777bf  Explanation\nWhy the edits made under my usern...      0   
1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   
2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   
3  0001b41b1c6bb37e  "\nMore\nI can't make any real suggestions on ...      0   
4  0001d958c54c6e35  You, sir, are my hero. Any chance you remember...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  
1             0        0       0       0              0  
2             0        0       0       0              0  
3             0        0       0       0              0  
4             0        0       0       0              0  
test_data
                 id                                       comment_text
0  00001cee341fdb12  Yo bitch Ja Rule is more succesful then you'll...
1 

# Model Building and Training

**Training data set:** This is the largest subset used to train the model by adjusting its parameters. It helps the model learn the underlying patterns in the data.


**Validation data set:** We use this set to provide an unbiased evaluation of the model during the training phase.
Credit: [Kili Technology – Training, validation and test sets: how to split machine learning data](https://kili-technology.com/training-data/training-validation-and-test-sets-how-to-split-machine-learning-data#:~:text=Training%20data%20set%3A%20This%20is,model%20during%20the%20training%20phase.)





[RandomForestClassifier scikit](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)

[XGBoost Python API](https://xgboost.readthedocs.io/en/latest/python/python_api.html)

[More](https://stackoverflow.com/questions/45251126/deprecation-warning-on-xgboost-sklearn)

In [None]:
# The six binary labels predicted for every comment (one classifier per label).
LABEL_COLUMNS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Apply preprocessing
# NOTE: this overwrites the raw `comment_text` column in place, so re-running
# this cell re-applies preprocess_text to text that was already cleaned.
train_data['comment_text'] = train_data['comment_text'].apply(preprocess_text)
test_data['comment_text'] = test_data['comment_text'].apply(preprocess_text)

# Split the data into training and validation sets BEFORE vectorizing, so the
# TF-IDF vocabulary and IDF weights are learned from training rows only and the
# validation scores are not inflated by information leaked from validation text.
train_text, val_text, y_train, y_val = train_test_split(
    train_data['comment_text'],
    train_data[LABEL_COLUMNS],
    test_size=0.2,
    random_state=42,
)

# Vectorize the text data: fit on the training split, then reuse the fitted
# vectorizer for the validation (and later the test) text.
vectorizer = TfidfVectorizer(max_features=10000)
X_train = vectorizer.fit_transform(train_text)
X_val = vectorizer.transform(val_text)

# Define the RandomForestClassifier and XGBoostClassifier models
rf_model = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=10, class_weight='balanced')
# `use_label_encoder` is not passed: XGBoost reports it as an unused parameter.
xgb_model = XGBClassifier(eval_metric='logloss')

# Combine the models in a VotingClassifier (soft voting averages predicted probabilities)
ensemble_model = VotingClassifier(estimators=[
    ('rf', rf_model),
    ('xgb', xgb_model)
], voting='soft')

# MultiOutputClassifier fits one copy of the ensemble per label column
multi_output_model = MultiOutputClassifier(ensemble_model)

# Fit the ensemble model
multi_output_model.fit(X_train, y_train)

# Evaluate the model on the validation set
y_val_pred = multi_output_model.predict(X_val)
print('Validation Set Results')
print(classification_report(y_val, y_val_pred, target_names=y_train.columns, zero_division=0))

# Vectorize the (already preprocessed) test data and predict
X_test = vectorizer.transform(test_data['comment_text'])
y_test_pred = multi_output_model.predict(X_test)

# y_test_pred has one row per test_data row, in order, so test_labels must be
# row-aligned with test_data for the comparison below to be meaningful.
if len(test_labels) != len(test_data):
    raise ValueError('test_labels and test_data have different numbers of rows')
if 'id' in test_labels.columns and 'id' in test_data.columns:
    if not (test_labels['id'].to_numpy() == test_data['id'].to_numpy()).all():
        raise ValueError('test_labels and test_data are not in the same row order')

# Filter out test rows where any label is -1 (presumably rows without a usable
# ground-truth label — confirm against the dataset description).
valid_mask = (test_labels[LABEL_COLUMNS] != -1).all(axis=1)
valid_indices = test_labels.index[valid_mask]
y_test_true = test_labels.loc[valid_mask, LABEL_COLUMNS].values
# Select predictions by position (boolean mask) rather than by index label, so
# the selection stays correct even if test_labels does not have a 0..n-1 index.
y_test_pred_filtered = y_test_pred[valid_mask.to_numpy()]

# Evaluate the model on the test set
print('Test Set Results')
print(classification_report(y_test_true, y_test_pred_filtered, target_names=LABEL_COLUMNS, zero_division=0))

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Validation Set Results
               precision    recall  f1-score   support

        toxic       0.91      0.61      0.73      3056
 severe_toxic       0.48      0.29      0.36       321
      obscene       0.88      0.72      0.79      1715
       threat       0.49      0.24      0.32        74
       insult       0.78      0.58      0.66      1614
identity_hate       0.68      0.32      0.43       294

    micro avg       0.84      0.60      0.70      7074
    macro avg       0.70      0.46      0.55      7074
 weighted avg       0.84      0.60      0.69      7074
  samples avg       0.05      0.05      0.05      7074

Test Set Results
               precision    recall  f1-score   support

        toxic       0.63      0.70      0.67      6090
 severe_toxic       0.39      0.43      0.41       367
      obscene       0.67      0.69      0.68      3691
       threat       0.51      0.36      0.42       211
       insult       0.68      0.58      0.63      3427
identity_hate       0

**Toxic and Obscene** classes have relatively high precision and recall, indicating good performance.


**Severe Toxic, Threat, and Identity Hate** classes have lower precision and recall, indicating that the model struggles with these categories, likely due to class imbalance or insufficient distinctive features.

# Wondering if we could use a separate file solely for training and then apply the trained model to the comments from the Instagram API

In [None]:
# Save the model and vectorizer
# Both fitted objects are written with joblib to the current working directory.
# The vectorizer was fitted on text that had already been passed through
# preprocess_text, so new comments need the same preprocessing before
# vectorizer.transform and multi_output_model.predict are applied to them.
joblib.dump(vectorizer, 'vectorizer.pkl')
joblib.dump(multi_output_model, 'multi_output_model.pkl')

['multi_output_model.pkl']