In [6]:
import pandas as pd

In [7]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [8]:
file_path = '/content/drive/MyDrive/SM technology task/datasets/'
train_df=pd.read_csv(file_path+'cleaned_train.csv')
val_df=pd.read_csv(file_path+'cleaned_validation.csv')

In [9]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23468 entries, 0 to 23467
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             23468 non-null  object
 1   feedback_text  23468 non-null  object
 2   toxic          23468 non-null  int64 
 3   abusive        23468 non-null  int64 
 4   vulgar         23468 non-null  int64 
 5   menace         23468 non-null  int64 
 6   offense        23468 non-null  int64 
 7   bigotry        23468 non-null  int64 
 8   cleaned_text   23468 non-null  object
dtypes: int64(6), object(3)
memory usage: 1.6+ MB


In [10]:
val_df.head()

Unnamed: 0,id,feedback_text,lang,toxic,text
0,1203,İyi tamam olabilir. Balkanlar maddesini gelişt...,tr,0,Well okay. We improve the Balkans substance an...
1,5871,"Por dios, y la canción de John Lennon: http://...",es,1,"By God, and John Lennon's song: http://www.you..."
2,3590,Selam. Öncelikle tebrik ederim... Bu arada ken...,tr,0,"Hi. First of all, I congratulate you ... In th..."
3,447,Leggiti tutte le discussioni. Magari cancellal...,it,1,Read all the discussions. Maybe delete and get...
4,6634,A LAS TOKITAS NOS VALE QUE LAS JONATICAS INSUL...,es,1,To the tokitas it is worth that the Jonaticas ...


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=10000)
X_train_tfidf = tfidf.fit_transform(train_df['cleaned_text'])
X_val_tfidf = tfidf.transform(val_df['text'])

# Shape of the resulting TF-IDF matrix
print("TF-IDF Matrix Shape of train:", X_train_tfidf.shape)
print("TF-IDF Matrix Shape of validation:", X_val_tfidf.shape)

TF-IDF Matrix Shape of train: (23468, 10000)
TF-IDF Matrix Shape of validation: (840, 10000)


## 1. Baseline Model: Logistic Regression or Random Forest


In [12]:
val_df.head()

Unnamed: 0,id,feedback_text,lang,toxic,text
0,1203,İyi tamam olabilir. Balkanlar maddesini gelişt...,tr,0,Well okay. We improve the Balkans substance an...
1,5871,"Por dios, y la canción de John Lennon: http://...",es,1,"By God, and John Lennon's song: http://www.you..."
2,3590,Selam. Öncelikle tebrik ederim... Bu arada ken...,tr,0,"Hi. First of all, I congratulate you ... In th..."
3,447,Leggiti tutte le discussioni. Magari cancellal...,it,1,Read all the discussions. Maybe delete and get...
4,6634,A LAS TOKITAS NOS VALE QUE LAS JONATICAS INSUL...,es,1,To the tokitas it is worth that the Jonaticas ...


In [13]:
y_train=train_df[['toxic','abusive','vulgar','menace','offense','bigotry']]
y_val=val_df['toxic']

Logistic Regression Model

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Initialize and train
logistic_model = OneVsRestClassifier(LogisticRegression(max_iter=1000, random_state=42))
logistic_model.fit(X_train_tfidf, y_train)


RandomForest Classifier Model

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

# Initialize and train
rf_model = MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=42))
rf_model.fit(X_train_tfidf, y_train)


In [16]:
# Predict
logistic_preds = logistic_model.predict(X_val_tfidf)
rf_preds = rf_model.predict(X_val_tfidf)

since validation has single-label data(toxic) but model trained for multi-label data,  
for evaluation,take only toxic result from predict output

In [17]:
lr_y_preds=logistic_preds[:,0]
rf_y_preds=rf_preds[:,0]

## Evaluation of Logistic Regression

In [18]:

from sklearn.metrics import classification_report, confusion_matrix, auc, roc_curve
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

lr_report = classification_report(y_val, lr_y_preds)
lr_cm = confusion_matrix(y_val, lr_y_preds)
lr_fpr, lr_tpr, thresholds = roc_curve(y_val, lr_y_preds)
lr_roc_auc = auc(lr_fpr, lr_tpr)

# Example: Save confusion matrix
fig, ax = plt.subplots()
disp = ConfusionMatrixDisplay(confusion_matrix=lr_cm)
disp.plot(ax=ax, cmap='Blues')
plt.title("Confusion Matrix of Logistic Regression")
plt.savefig("lr_confusion_matrix.png", bbox_inches='tight')
plt.close()

# Example: Save ROC curve
plt.figure()
plt.plot(lr_fpr, lr_tpr, label=f"AUC = {lr_roc_auc:.2f}")
plt.plot([0, 1], [0, 1], linestyle='--')
plt.title("ROC Curve of Logistic Regression")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.grid()
plt.savefig("lr_roc_curve.png", bbox_inches='tight')
plt.close()

print(lr_report)

              precision    recall  f1-score   support

           0       0.86      0.99      0.92       706
           1       0.79      0.14      0.24       134

    accuracy                           0.86       840
   macro avg       0.83      0.57      0.58       840
weighted avg       0.85      0.86      0.81       840



## Evaluation of Random Forest

In [27]:

rf_report = classification_report(y_val, rf_y_preds)
rf_cm = confusion_matrix(y_val, rf_y_preds)
rf_fpr, rf_tpr, thresholds = roc_curve(y_val, rf_y_preds)
rf_roc_auc = auc(rf_fpr, rf_tpr)

# Example: Save confusion matrix
fig, ax = plt.subplots()
disp = ConfusionMatrixDisplay(confusion_matrix=rf_cm)
disp.plot(ax=ax, cmap='Blues')
plt.title("Confusion Matrix of Random Forest")
plt.savefig("rf_confusion_matrix.png", bbox_inches='tight')
plt.close()

# Example: Save ROC curve
plt.figure()
plt.plot(rf_fpr, rf_tpr, label=f"AUC = {rf_roc_auc:.2f}")
plt.plot([0, 1], [0, 1], linestyle='--')
plt.title("ROC Curve of Random Forest")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.grid()
plt.savefig("rf_roc_curve.png", bbox_inches='tight')
plt.close()

print(rf_report)

              precision    recall  f1-score   support

           0       0.88      0.90      0.89       706
           1       0.39      0.35      0.37       134

    accuracy                           0.81       840
   macro avg       0.63      0.62      0.63       840
weighted avg       0.80      0.81      0.80       840



## Advanced Models: LSTM or GRU for capturing sequential nature of text
✅ Step-by-step Plan:

  - Preprocess text (you've already done this! ✅)

  - Tokenize & convert text to sequences

  - Pad sequences to equal length

  - Build LSTM / GRU model

  - Train the model

  - Evaluate performance

In [21]:
val_df['toxic'].value_counts()

Unnamed: 0_level_0,count
toxic,Unnamed: 1_level_1
0,706
1,134


Since validation data is imbalance ,

In [28]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Use cleaned text
train_texts = train_df['cleaned_text'].astype(str).values
val_texts = val_df['text'].astype(str).values
#train_labels = train_df[['toxic','abusive','vulgar','menace','offense','bigotry']].values  # You can loop for other labels later
train_labels = train_df['toxic'].values
val_labels = val_df['toxic'].values

# Fixing shape mismatch (only do this temporarily if needed)
# y_val = val_df[['toxic']]  # only 1 column
# for col in ['abusive', 'vulgar', 'menace', 'offense', 'bigotry']:
#     y_val[col] = 0  # or np.nan, if acceptable
# val_labels = y_val[['toxic','abusive','vulgar','menace','offense','bigotry']].values

# Tokenizer
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(train_texts)
tokenizer.fit_on_texts(val_texts)
train_sequences = tokenizer.texts_to_sequences(train_texts)
val_sequences = tokenizer.texts_to_sequences(val_texts)

# Pad sequences
max_len = 100  # max words per comment
X_train_padded = pad_sequences(train_sequences, maxlen=max_len, padding='post', truncating='post')
X_val_padded = pad_sequences(val_sequences, maxlen=max_len, padding='post', truncating='post')

# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(train_labels), y=train_labels)
class_weight_dict = {0: class_weights[0], 1: class_weights[1]}
print("Class Weights:", class_weight_dict)



Class Weights: {0: np.float64(0.5516171493042498), 1: np.float64(5.343351548269581)}


Build the LSTM/GRU Model

In [32]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, GRU, Dense, Dropout

model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=64, input_shape=(max_len,)))
model.add(LSTM(64, return_sequences=False))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



  super().__init__(**kwargs)


Train the model and Evaluation

In [33]:
from sklearn.metrics import classification_report
import numpy as np

# Train the model using class weights
model.fit(X_train_padded, train_labels, validation_data=(X_val_padded, val_labels),
          epochs=5, batch_size=64, class_weight=class_weight_dict)

y_pred = model.predict(X_val_padded)
y_pred_labels = (y_pred > 0.5).astype(int)
print(classification_report(val_labels, y_pred_labels))

Epoch 1/5
[1m367/367[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 105ms/step - accuracy: 0.5259 - loss: 0.7023 - val_accuracy: 0.8405 - val_loss: 0.6483
Epoch 2/5
[1m367/367[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 108ms/step - accuracy: 0.5603 - loss: 0.6803 - val_accuracy: 0.2405 - val_loss: 0.6985
Epoch 3/5
[1m367/367[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 103ms/step - accuracy: 0.4453 - loss: 0.6769 - val_accuracy: 0.8369 - val_loss: 0.6170
Epoch 4/5
[1m367/367[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 109ms/step - accuracy: 0.4565 - loss: 0.6599 - val_accuracy: 0.1726 - val_loss: 1.4087
Epoch 5/5
[1m367/367[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 104ms/step - accuracy: 0.5757 - loss: 0.5924 - val_accuracy: 0.5393 - val_loss: 0.8751
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 44ms/step
              precision    recall  f1-score   support

           0       0.97      0.46      0.63       7

## Evaluation of LSTM

In [34]:
train_df['toxic'].value_counts()

Unnamed: 0_level_0,count
toxic,Unnamed: 1_level_1
0,21272
1,2196


train data is imbalance data, so use oversampling to balancing data

In [35]:
lstm_report=classification_report(val_labels,y_pred_labels)
lstm_cm = confusion_matrix(val_labels, y_pred_labels)
lstm_fpr, lstm_tpr, thresholds = roc_curve(val_labels, y_pred_labels)
lstm_roc_auc = auc(lstm_fpr, lstm_tpr)

# Example: Save confusion matrix
fig, ax = plt.subplots()
disp = ConfusionMatrixDisplay(confusion_matrix=lstm_cm)
disp.plot(ax=ax, cmap='Blues')
plt.title("Confusion Matrix of LSTM")
plt.savefig("lstm_confusion_matrix.png", bbox_inches='tight')
plt.close()

# Example: Save ROC curve
plt.figure()
plt.plot(lstm_fpr, lstm_tpr, label=f"AUC = {lstm_roc_auc:.2f}")
plt.plot([0, 1], [0, 1], linestyle='--')
plt.title("ROC Curve of LSTM")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.grid()
plt.savefig("lstm_roc_curve.png", bbox_inches='tight')
plt.close()

print(lstm_report)

              precision    recall  f1-score   support

           0       0.97      0.46      0.63       706
           1       0.25      0.93      0.39       134

    accuracy                           0.54       840
   macro avg       0.61      0.70      0.51       840
weighted avg       0.86      0.54      0.59       840



## Create pdf report

In [1]:
!pip install reportlab


Collecting reportlab
  Downloading reportlab-4.3.1-py3-none-any.whl.metadata (1.7 kB)
Downloading reportlab-4.3.1-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: reportlab
Successfully installed reportlab-4.3.1


In [36]:
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import A4
import re

# PDF Setup
pdf_file = "Performance_Analysis_Report.pdf"
c = canvas.Canvas(pdf_file, pagesize=A4)
width, height = A4
c.setFont("Courier", 10)

# ─── Step 1: Extract metrics from classification report text ─── #
def extract_metrics(report_text):
    lines = report_text.splitlines()
    metrics = {}

    for line in lines:
        if re.match(r"\s*1\s+", line):  # class '1' row
            parts = line.strip().split()
            metrics['precision'] = float(parts[1])
            metrics['recall'] = float(parts[2])
            metrics['f1-score'] = float(parts[3])
        elif line.strip().startswith('accuracy'):
            metrics['accuracy'] = float(line.strip().split()[-2])
        elif line.strip().startswith('macro avg'):
            parts = line.strip().split()
            metrics['macro_f1'] = float(parts[3])
        elif line.strip().startswith('weighted avg'):
            parts = line.strip().split()
            metrics['weighted_f1'] = float(parts[3])

    return metrics

# ─── Step 2: Draw summary table ─── #
def add_summary_page(summary_dicts):
    c.setFont("Courier-Bold", 14)
    c.drawString(170, height - 40, "Model Performance Summary")

    c.setFont("Courier", 10)
    headers = ["Model", "Precision", "Recall", "F1", "Accuracy", "Macro F1", "Weighted F1"]
    y = height - 80

    c.drawString(40, y, "{:<22}{:>10}{:>10}{:>10}{:>12}{:>12}{:>14}".format(*headers))
    y -= 16
    c.line(40, y, width - 40, y)
    y -= 20

    for name, metrics in summary_dicts.items():
        line = "{:<22}{:>10.2f}{:>10.2f}{:>10.2f}{:>12.2f}{:>12.2f}{:>14.2f}".format(
            name,
            metrics.get("precision", 0),
            metrics.get("recall", 0),
            metrics.get("f1-score", 0),
            metrics.get("accuracy", 0),
            metrics.get("macro_f1", 0),
            metrics.get("weighted_f1", 0)
        )
        c.drawString(40, y, line)
        y -= 16

    c.showPage()

# ─── Step 3: Draw individual model report pages ─── #
def draw_text_block(text, x, y_start, min_y):
    lines = text.split('\n')
    y = y_start
    for line in lines:
        if y < min_y:
            break
        c.drawString(x, y, line)
        y -= 12
    return y

def draw_model_page(title, report_text, cm_image_path, roc_image_path):
    c.setFont("Courier-Bold", 12)
    c.drawString(40, height - 40, f"Model: {title}")

    c.setFont("Courier", 9)
    last_y = draw_text_block(report_text, 40, height - 70, 300)

    cm_x, cm_y = 40, last_y - 200
    roc_x, roc_y = 320, last_y - 200

    c.setFont("Courier-Bold", 10)
    c.drawString(cm_x, cm_y + 190, "Confusion Matrix")
    c.drawImage(cm_image_path, cm_x, cm_y, width=230, height=180, preserveAspectRatio=True)

    c.drawString(roc_x, roc_y + 190, "ROC Curve")
    c.drawImage(roc_image_path, roc_x, roc_y, width=230, height=180, preserveAspectRatio=True)

    c.showPage()

# ─── Step 4: Run everything ─── #
# Replace these with your actual reports (strings)
# e.g. lr_report = classification_report(y_true_lr, y_pred_lr)
summary = {
    "Logistic Regression": extract_metrics(lr_report),
    "Random Forest": extract_metrics(rf_report),
    "LSTM": extract_metrics(lstm_report)
}

# Add detailed pages for each model
draw_model_page("Logistic Regression", lr_report, "lr_confusion_matrix.png", "lr_roc_curve.png")
draw_model_page("Random Forest", rf_report, "rf_confusion_matrix.png", "rf_roc_curve.png")
draw_model_page("LSTM", lstm_report, "lstm_confusion_matrix.png", "lstm_roc_curve.png")

# Add summary table
add_summary_page(summary)

# Save the final PDF
c.save()


## For test dataset, generate output  
## i choose LSTM model because
- Best F1 score, Macro F1, and Recall

- In multi-label classification with imbalance, Macro F1 is a key metric

- High recall is important when missing a label is worse than a false alarm

In [37]:
test_df=pd.read_csv(file_path+'cleaned_test.csv')
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6700 entries, 0 to 6699
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       6700 non-null   int64 
 1   content  6700 non-null   object
 2   lang     6700 non-null   object
 3   text     6700 non-null   object
dtypes: int64(1), object(3)
memory usage: 209.5+ KB


In [41]:
test_texts= test_df['text'].astype(str).values
# tokenize, sequence, pad
tokenizer.fit_on_texts(test_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)
X_test_padded = pad_sequences(test_sequences, maxlen=max_len, padding='post', truncating='post')

In [42]:
y_test_pred= model.predict(X_test_padded)

[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 38ms/step
