In [146]:
import pandas as pd
import numpy as np
from IPython.display import display

# Data cleaning

In [147]:
import re

import nltk
from nltk.stem import WordNetLemmatizer

# Make sure the WordNet corpus is available locally before lemmatizing.
nltk.download('wordnet')

# Single shared lemmatizer instance, used by lemmatize_text.
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Normalize a raw document into a string of lemmatized alphabetic tokens.

    The text is lower-cased, every character that is neither a word
    character nor whitespace is replaced by a space, and the result is split
    on whitespace. Tokens that are not purely alphabetic (for example those
    containing digits or underscores) are dropped; the remaining tokens are
    passed through the module-level ``lemmatizer`` and joined with single
    spaces.

    Args:
        text: The raw document. Non-string values (``None``, or the float
            ``NaN`` that pandas produces for empty CSV cells) are treated as
            an empty document.

    Returns:
        str: Space-separated lemmas, or ``''`` when no token survives.
    """
    # Missing cells arrive as float NaN, which has no .lower(); map them to an
    # empty document instead of raising AttributeError in the middle of a run.
    if not isinstance(text, str):
        return ''
    text = re.sub(r'[^\w\s]', ' ', text.lower())
    words = text.split()
    return ' '.join(lemmatizer.lemmatize(w) for w in words if w.isalpha())

# Download the training and validation splits.
train_csv = pd.read_csv(
    "https://raw.githubusercontent.com/msaribekyan/Re-Code-1/refs/heads/main/data/dataset_C_train.csv"
)
val_csv = pd.read_csv(
    "https://raw.githubusercontent.com/msaribekyan/Re-Code-1/refs/heads/main/data/dataset_C_val.csv"
)

# Training split: the 'text' column is the input, every other column is a target.
X_raw_train = train_csv['text']
y_train = train_csv.drop(columns='text')
X_train = [lemmatize_text(document) for document in X_raw_train]

# Validation split, prepared exactly like the training split.
X_raw_val = val_csv['text']
y_val = val_csv.drop(columns='text')
X_val = [lemmatize_text(document) for document in X_raw_val]


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Vectorization

In [148]:
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

# Unigrams and bigrams, ignoring terms seen in fewer than 2 documents and
# capping the vocabulary at 30000 features.
tfidf_settings = {
    "ngram_range": (1, 2),
    "min_df": 2,
    "max_features": 30000,
}
tfidf = TfidfVectorizer(**tfidf_settings)

# NOTE: X_train / X_val are rebound from cleaned text to TF-IDF matrices here,
# so re-running this cell without re-running the cleaning cell first would
# feed the matrices back into the vectorizer.
# The vocabulary is learned on the training texts only and reused for validation.
X_train = tfidf.fit_transform(X_train)
X_val = tfidf.transform(X_val)

# Persist the fitted vectorizer to disk.
joblib.dump(tfidf, "tfidf.pkl")

['tfidf.pkl']

# Training the model

In [151]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# Wrap a default logistic regression so one classifier is fitted per target column.
base_estimator = LogisticRegression()
model = MultiOutputClassifier(estimator=base_estimator)
model.fit(X_train, y_train)

# Persist the fitted multi-output model to disk.
joblib.dump(model, "model.pkl")

['model.pkl']

# Metrics

In [150]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# One prediction column per target, in the order the model was trained on.
y_predict = model.predict(X_val)

# Report each of the four labels in turn; label N is read from column N - 1
# of the prediction matrix.
for label_number in range(1, 5):
    expected = y_val[f"label_{label_number}"]
    predicted = y_predict[:, label_number - 1]

    print(f"Metrics for Label {label_number}")
    print("Classification report:")
    print(classification_report(expected, predicted))
    print("Confusion matrix:")
    print(confusion_matrix(expected, predicted))


Metrics for Label 1
Classification report:
              precision    recall  f1-score   support

           0       0.88      0.88      0.88        78
           1       0.85      0.85      0.85        62

    accuracy                           0.87       140
   macro avg       0.87      0.87      0.87       140
weighted avg       0.87      0.87      0.87       140

Confusion matrix:
[[69  9]
 [ 9 53]]
Metrics for Label 2
Classification report:
              precision    recall  f1-score   support

           0       0.88      0.38      0.53        58
           1       0.69      0.96      0.80        82

    accuracy                           0.72       140
   macro avg       0.78      0.67      0.67       140
weighted avg       0.77      0.72      0.69       140

Confusion matrix:
[[22 36]
 [ 3 79]]
Metrics for Label 3
Classification report:
              precision    recall  f1-score   support

           0       0.76      0.87      0.81        63
           1       0.88      0.78 