In [2]:
import pandas as pd

file_path = '../Datasets/cleaned_dataset.csv'
df = pd.read_csv(file_path)

In [3]:
# Check the balance of the categories in the target variable
category_distribution = df['category'].value_counts(normalize=True)

category_distribution

category
biographies                             0.318044
philosophy                              0.279965
programming                             0.214409
artificial intelligence                 0.169624
movies about artificial intelligence    0.017958
Name: proportion, dtype: float64

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Preprocessing the 'has_entity' feature by encoding it as binary features
# Extracting whether the paragraph contains references to an organization, product, or person
df['has_org'] = df['has_entity'].apply(lambda x: 'ORG_YES' in x).astype(int)
df['has_product'] = df['has_entity'].apply(lambda x: 'PRODUCT_YES' in x).astype(int)
df['has_person'] = df['has_entity'].apply(lambda x: 'PERSON_YES' in x).astype(int)

# Vectorizing the 'paragraph' text with TF-IDF
tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
X_text = tfidf.fit_transform(df['paragraph'])

# Encoding the categorical target variable
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['category'])

# Combining the TF-IDF features with the 'has_entity' binary features
X_other_features = df[['has_org', 'has_product', 'has_person']].to_numpy()
X = np.hstack((X_text.toarray(), X_other_features))

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Output the shape of the training and testing sets to confirm the preprocessing steps
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((7395, 1003), (1849, 1003), (7395,), (1849,))

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn.model_selection import cross_val_score, StratifiedKFold

# Initialize the Logistic Regression model
lr_model = LogisticRegression(C=1, solver='saga', class_weight='balanced', random_state=42, max_iter=500)

# Train the model
lr_model.fit(X_train, y_train)

# Predictions
y_pred = lr_model.predict(X_test)

# Evaluation
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
macro_f1 = f1_score(y_test, y_pred, average='macro')

print("Confusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)
print("\nAccuracy:", accuracy)
print("Macro F1-Score:", macro_f1)

# Cross-validation for stability check
cv_scores = cross_val_score(lr_model, X, y, cv=StratifiedKFold(5), scoring='f1_macro')
print("\nCV Macro F1-Scores:", cv_scores)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn.model_selection import cross_val_score, StratifiedKFold

# Initialize the Logistic Regression model with optimized parameters
lr_model = LogisticRegression(C=1, solver='saga', class_weight='balanced', random_state=42, max_iter=5000)

# Train the model
lr_model.fit(X_train, y_train)

# Predictions
y_pred = lr_model.predict(X_test)

# Evaluation
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
macro_f1 = f1_score(y_test, y_pred, average='macro')

print("Confusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)
print("\nAccuracy:", accuracy)
print("Macro F1-Score:", macro_f1)

# Cross-validation for stability check, with parallel processing
cv_scores = cross_val_score(lr_model, X, y, cv=StratifiedKFold(5), scoring='f1_macro', n_jobs=-1)
print("\nCV Macro F1-Scores:", cv_scores)



Confusion Matrix:
 [[272  12   3   8  19]
 [ 11 534   4  35   4]
 [  3   3  27   0   0]
 [ 19  30   5 453  11]
 [ 17   7   1   6 365]]

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.87      0.86       314
           1       0.91      0.91      0.91       588
           2       0.68      0.82      0.74        33
           3       0.90      0.87      0.89       518
           4       0.91      0.92      0.92       396

    accuracy                           0.89      1849
   macro avg       0.85      0.88      0.86      1849
weighted avg       0.89      0.89      0.89      1849


Accuracy: 0.8929150892374257
Macro F1-Score: 0.8622513237996225

CV Macro F1-Scores: [0.85944666 0.86617209 0.83527784 0.85549689 0.85791021]
