**Import and preprocess the dataset**

In [None]:
import os

import pandas as pd

# Location of the prepared parquet file. The original author's absolute path is
# kept as the default so existing runs are unchanged; set DSCI521_DATA_PATH to
# point the notebook at a copy of the data on another machine.
DATA_PATH = os.environ.get(
    'DSCI521_DATA_PATH',
    '/Users/tomorrowcute/DSCI521/MSDS_2024_2026/Winter_2025/DSCI521/Project/data_final.parquet',
)
df = pd.read_parquet(DATA_PATH)
print(df.head(5))

                                               title  \
0  Is Physics Sick? [In Praise of Classical Physics]   
1    Modern Mathematical Physics: what it should be?   
2                                Topology in Physics   
3       Contents of Physics Related E-Print Archives   
4        Fundamental Dilemmas in Theoretical Physics   

                                             authors  \
0                                     Hisham Ghassib   
1                                     Ludwig Faddeev   
2                                          R. Jackiw   
3  E. R. Prakasan, Anil Kumar, Anil Sagar, Lalit ...   
4                                     Hisham Ghassib   

                                             summary             published  \
0  In this paper, it is argued that theoretical p...  2012-09-04T10:32:56Z   
1  Personal view of author on goals and content o...  2000-02-08T13:13:00Z   
2  The phenomenon of quantum number fractionaliza...  2005-03-15T16:00:59Z   
3  The frontie

In [None]:
# Preview the raw target column (comma-separated category names) for the first ten rows.
print(df[['target']].head(10))

              target
0             physic
1  math-stats,physic
2  math-stats,physic
3             physic
4             physic
5             physic
6             physic
7  math-stats,physic
8  math-stats,physic
9  math-stats,physic


In [None]:
# Build one text field per paper from its title and abstract.
# Missing values are replaced by '' first: string concatenation with NaN yields
# NaN, and a NaN document would break the text cleaning / vectorizing steps.
df['combined'] = df['title'].fillna('') + ' ' + df['summary'].fillna('')

print(df[['title', 'summary', 'combined']].head())

                                               title  \
0  Is Physics Sick? [In Praise of Classical Physics]   
1    Modern Mathematical Physics: what it should be?   
2                                Topology in Physics   
3       Contents of Physics Related E-Print Archives   
4        Fundamental Dilemmas in Theoretical Physics   

                                             summary  \
0  In this paper, it is argued that theoretical p...   
1  Personal view of author on goals and content o...   
2  The phenomenon of quantum number fractionaliza...   
3  The frontiers of physics related e-print archi...   
4  In this paper, we argue that there are foundat...   

                                            combined  
0  Is Physics Sick? [In Praise of Classical Physi...  
1  Modern Mathematical Physics: what it should be...  
2  Topology in Physics The phenomenon of quantum ...  
3  Contents of Physics Related E-Print Archives T...  
4  Fundamental Dilemmas in Theoretical Physics In..

In [None]:
# Number of rows (papers) in the loaded dataset.
df.shape[0]



150171

In [None]:
import re
import string
import nltk
from nltk.tokenize import WordPunctTokenizer
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords

# Fetch the NLTK corpora used below (a no-op when they are already installed).
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# Shared NLP helpers, built once and reused for every document.
wpt, lemmatizer, stemmer = WordPunctTokenizer(), WordNetLemmatizer(), PorterStemmer()

# English stop words held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

def normalized_text(text):
    """Normalize one raw document into a space-joined string of stemmed tokens.

    Steps, in order: lowercase; drop ``[...]`` spans, URLs and ``<...>`` tags;
    replace punctuation and newlines with spaces; drop any word containing a
    digit; tokenize; remove English stop words; lemmatize then stem each token.

    Parameters:
      text: the raw document. Anything that is not a ``str`` (e.g. ``None`` or
            a NaN coming from a missing DataFrame cell) is treated as empty.

    Returns:
      The cleaned text as a single string ('' for missing input).
    """
    # Missing cells arrive as None/NaN (float) and have no .lower(); treat them
    # as empty documents instead of raising AttributeError mid-apply.
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = re.sub(r'\[.*?\]', ' ', text)                 # bracketed spans
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)   # URLs (before punctuation is stripped)
    text = re.sub(r'<.*?>+', ' ', text)                  # markup-like tags
    text = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', ' ', text)                # any word containing a digit
    tokens = wpt.tokenize(text)

    filtered_tokens = [token for token in tokens if token not in stop_words]

    # Lemmatize first, then stem the lemma.
    lemma_stem_tokens = [stemmer.stem(lemmatizer.lemmatize(token)) for token in filtered_tokens]

    return ' '.join(lemma_stem_tokens)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tomorrowcute/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/tomorrowcute/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Clean every combined title+summary document.
# NOTE(review): the visible modelling cells vectorize df["combined"], not this
# column - confirm whether 'cleaned' was meant to be the model input.
df['cleaned'] = df['combined'].apply(normalized_text)

In [None]:
# Preview the first 20 cleaned documents.
print(df["cleaned"].head(20))

0     physic sick paper argu theoret physic akin org...
1     modern mathemat physic person view author goal...
2     topolog physic phenomenon quantum number fract...
3     content physic relat e print archiv frontier p...
4     fundament dilemma theoret physic paper argu fo...
5     physic heavi quark select problem heavi quark ...
6     linear collid prospect electroweak physic pros...
7     topolog aspect gaug theori appear encyclopedia...
8     p adic mathemat physic brief review select top...
9     invari physic group theori short review herita...
10    physic divers world spheric cow model physic t...
11    quest understand relativist quantum physic dis...
12    physic structur form physic field manifold pro...
13    quantum glove physic inform slogan inform phys...
14    b physic experi past year flavor physic made i...
15    classic physic agre quantum physic quantum phe...
16    space matter topolog old branch mathemat topol...
17    method analyz pathway physic major physic 

In [None]:
# List the columns now present, including the derived 'combined' and 'cleaned'.
print(df.columns)

Index(['title', 'authors', 'summary', 'published', 'updated', 'link',
       'pdf_url', 'categories', 'target', 'year', 'month', 'day', 'combined',
       'cleaned'],
      dtype='object')


In [None]:
# Distinct raw target strings; each is a comma-separated combination of categories.
df["target"].unique()

array(['physic', 'math-stats,physic', 'cs,physic', 'cs,eess', 'cs',
       'math-stats', 'bio,math-stats,physic', 'cs,math-stats,physic',
       'cs,math-stats', 'bio', 'econ-qfin,physic', 'bio,physic',
       'bio,cs,physic', 'cs,eess,physic', 'bio,cs',
       'econ-qfin,math-stats', 'econ-qfin', 'cs,eess,math-stats',
       'bio,math-stats', 'bio,cs,math-stats', 'bio,econ-qfin,math-stats',
       'bio,cs,math-stats,physic', 'bio,cs,eess,physic', 'bio,cs,eess',
       'eess', 'eess,physic', 'cs,econ-qfin,math-stats', 'bio,eess',
       'cs,econ-qfin,physic', 'bio,cs,eess,math-stats', 'eess,math-stats',
       'bio,econ-qfin,physic', 'bio,eess,math-stats',
       'eess,math-stats,physic', 'cs,econ-qfin,math-stats,physic',
       'bio,cs,eess,math-stats,physic', 'bio,econ-qfin,math-stats,physic',
       'bio,eess,physic', 'cs,econ-qfin', 'cs,eess,math-stats,physic',
       'bio,cs,econ-qfin,physic', 'cs,econ-qfin,eess', 'bio,cs,econ-qfin',
       'cs,econ-qfin,eess,math-stats', 'bio,eco

Perform multi-label encoding of the target column. Each paper may belong to several categories at once, so every comma-separated target string is split into its individual labels and each label is mapped to an integer code.

In [None]:
# Collect every individual label from the comma-separated target strings
# (rows with a missing target are skipped).
all_labels = [label.strip() for entry in df['target'].dropna() for label in entry.split(',')]
# pd.unique() on a plain Python list is deprecated and emits a FutureWarning;
# wrapping the list in an object-dtype Series keeps the same result
# (an ndarray in order of first appearance) without the warning.
unique_labels = pd.unique(pd.Series(all_labels, dtype=object))
print("Unique individual labels:", unique_labels)

# Create a mapping from each unique label to an integer.
# Labels are sorted first so the codes do not depend on row order.
mapping = {label: idx for idx, label in enumerate(sorted(unique_labels))}
print("Mapping:", mapping)

# Define a function to encode a row (split by comma and mapping each label)
def encode_labels(row):
    """Convert one comma-separated target string into a list of integer codes.

    Parameters:
      row: a target cell such as ``'math-stats,physic'``. A missing value
           (``None``/NaN, i.e. anything that is not a ``str``) yields ``[]``,
           matching the vocabulary step, which skips missing targets.

    Returns:
      List of integer codes looked up in the module-level ``mapping``; labels
      that are not in ``mapping`` are skipped.
    """
    # A missing cell has no .split(); encode it as "no labels" instead of crashing.
    if not isinstance(row, str):
        return []
    labels = (part.strip() for part in row.split(','))
    return [mapping[label] for label in labels if label in mapping]

# Encode every target string and store the result as "target_encoded".
encoded_targets = df['target'].apply(encode_labels)
df['target_encoded'] = encoded_targets

# Show the raw and encoded targets side by side for the first rows.
preview_columns = ['target', 'target_encoded']
print(df[preview_columns].head())

Unique individual labels: ['physic' 'math-stats' 'cs' 'eess' 'bio' 'econ-qfin']
Mapping: {'bio': 0, 'cs': 1, 'econ-qfin': 2, 'eess': 3, 'math-stats': 4, 'physic': 5}
              target target_encoded
0             physic            [5]
1  math-stats,physic         [4, 5]
2  math-stats,physic         [4, 5]
3             physic            [5]
4             physic            [5]


  unique_labels = pd.unique(all_labels)


In [None]:
# Preview the integer-encoded label lists for the first ten rows.
print(df["target_encoded"].head(10))

0       [5]
1    [4, 5]
2    [4, 5]
3       [5]
4       [5]
5       [5]
6       [5]
7    [4, 5]
8    [4, 5]
9    [4, 5]
Name: target_encoded, dtype: object


In [None]:
# See frequency of each label
import numpy as np

# Stack every row's list of integer codes into one flat array.
all_encoded = np.concatenate(df['target_encoded'].values)

# Count how often each code occurs.
unique_encoded, counts = np.unique(all_encoded, return_counts=True)
mapping_count = {code: total for code, total in zip(unique_encoded, counts)}
print("Frequency of each encoded label (numeric):", mapping_count)

# Report the same counts under the original label names by inverting
# the label -> code mapping.
inverse_mapping = {code: name for name, code in mapping.items()}
mapping_count_named = {inverse_mapping[code]: total for code, total in mapping_count.items()}
print("Frequency of each label (names):", mapping_count_named)

Frequency of each encoded label (numeric): {0: 10371, 1: 56996, 2: 12432, 3: 9866, 4: 55456, 5: 54223}
Frequency of each label (names): {'bio': 10371, 'cs': 56996, 'econ-qfin': 12432, 'eess': 9866, 'math-stats': 55456, 'physic': 54223}


**Prepare the data before training model**

In [None]:

from sklearn.model_selection import train_test_split

# NOTE(review): the features are the raw "combined" text; the "cleaned" column
# computed earlier is not used here - confirm this is intentional.
X, y = df["combined"], df["target_encoded"]

# 1. Split the data into 80% train / 10% validation / 10% test.
SPLIT_SEED = 42

# Hold out 10% of all rows for testing.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.1, random_state=SPLIT_SEED
)
# One ninth of the remaining 90% equals 10% of the full data (validation);
# what is left is the 80% training set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=1/9, random_state=SPLIT_SEED
)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# 2. TF-IDF Transformation:
# The vectorizer is fitted on the training texts only; validation and test
# texts are transformed with the fitted vectorizer.
tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf, X_test_tfidf = (tfidf.transform(texts) for texts in (X_val, X_test))

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
# 3. Dimensionality Reduction with Truncated SVD (LSA):
# Reduce the TF-IDF matrix to 300 components, then normalize each row.
svd = TruncatedSVD(random_state=42, n_components=300)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

# Fit on the training matrix only; reuse the fitted pipeline for the other splits.
X_train_lsa = lsa.fit_transform(X_train_tfidf)
X_val_lsa, X_test_lsa = (lsa.transform(matrix) for matrix in (X_val_tfidf, X_test_tfidf))

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
# 4. Convert multi-label target representation into binary matrices:
# one column per label code, 1 where the sample carries that label.
mlb = MultiLabelBinarizer()
y_train_bin = mlb.fit_transform(y_train)
y_val_bin, y_test_bin = (mlb.transform(targets) for targets in (y_val, y_test))

# Column order of the binary matrices.
print("Labels:", mlb.classes_)

Labels: [0 1 2 3 4 5]


**Train Logistic Regression Model**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# 5. Train a multi-label classifier using OneVsRestClassifier with Logistic Regression:
# one class-weight-balanced binary logistic regression per label column.
base_logreg = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
model = OneVsRestClassifier(base_logreg)
model.fit(X_train_lsa, y_train_bin)

In [None]:
from sklearn.metrics import classification_report
# 6. Predict on training, validation, and test data:
y_pred_train, y_pred_val, y_pred_test = (
    model.predict(features) for features in (X_train_lsa, X_val_lsa, X_test_lsa)
)

In [None]:
# 7. Print classification reports:
# Report rows are named after the integer label codes in mlb.classes_.
report_labels = [str(label) for label in mlb.classes_]
for heading, truth, predicted in (
    ("=== Training Data Classification Report ===", y_train_bin, y_pred_train),
    ("=== Test Data Classification Report ===", y_test_bin, y_pred_test),
):
    print(heading)
    print(classification_report(truth, predicted, target_names=report_labels))

=== Training Data Classification Report ===
              precision    recall  f1-score   support

           0       0.39      0.88      0.54      8326
           1       0.82      0.88      0.85     45400
           2       0.54      0.92      0.68      9907
           3       0.32      0.88      0.47      7911
           4       0.76      0.82      0.79     44445
           5       0.83      0.86      0.84     43432

   micro avg       0.69      0.86      0.76    159421
   macro avg       0.61      0.87      0.70    159421
weighted avg       0.74      0.86      0.78    159421
 samples avg       0.76      0.89      0.79    159421

=== Test Data Classification Report ===
              precision    recall  f1-score   support

           0       0.38      0.85      0.52      1027
           1       0.83      0.88      0.85      5819
           2       0.55      0.91      0.69      1284
           3       0.31      0.88      0.46       960
           4       0.76      0.81      0.78     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Same report for the validation split, using the baseline logistic regression predictions.
print("=== Validation Data Classification Report ===")
print(classification_report(y_val_bin, y_pred_val, target_names=[str(label) for label in mlb.classes_]))

=== Validation Data Classification Report ===
              precision    recall  f1-score   support

           0       0.38      0.87      0.53      1018
           1       0.83      0.88      0.85      5777
           2       0.52      0.91      0.66      1241
           3       0.32      0.88      0.47       995
           4       0.75      0.81      0.78      5553
           5       0.81      0.86      0.83      5323

   micro avg       0.68      0.85      0.76     19907
   macro avg       0.60      0.87      0.69     19907
weighted avg       0.74      0.85      0.78     19907
 samples avg       0.76      0.89      0.79     19907



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**Improve Logistic Regression model**

In [None]:
# Hyperparameter tuning with 3-fold cross-validation on the TRAINING split.
# The validation split is not used for model selection; it is only used at the
# end of this cell to evaluate the selected model.
from sklearn.model_selection import GridSearchCV

# Build a logistic regression wrapped in OneVsRestClassifier.
# max_iter=1000 matches the baseline model trained earlier, so the search is
# not limited by the solver's default iteration cap.
logreg_pipeline = OneVsRestClassifier(
    LogisticRegression(random_state=42, class_weight='balanced', max_iter=1000)
)

# Parameter grid for logistic regression
# (C is the inverse regularization strength; only the L2 penalty is searched).
param_grid = {
    'estimator__C': [0.01, 0.1, 1, 10],
    'estimator__penalty': ['l2'],
}

grid_search = GridSearchCV(
    logreg_pipeline,
    param_grid,
    scoring='f1_macro',  # macro F1 weights every label equally
    cv=3,                # 3-fold cross-validation
    verbose=2,
    n_jobs=-1
)

# Fit on training data
grid_search.fit(X_train_lsa, y_train_bin)

print("Best Params:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Evaluate the selected configuration on the validation set
y_pred_val_best = grid_search.predict(X_val_lsa)
print("Validation Classification Report:")
print(classification_report(y_val_bin, y_pred_val_best, target_names=[str(label) for label in mlb.classes_]))


Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] END ...........estimator__C=0.01, estimator__penalty=l2; total time=   4.0s
[CV] END ...........estimator__C=0.01, estimator__penalty=l2; total time=   4.4s
[CV] END ...........estimator__C=0.01, estimator__penalty=l2; total time=   4.4s
[CV] END ............estimator__C=0.1, estimator__penalty=l2; total time=   5.2s
[CV] END ............estimator__C=0.1, estimator__penalty=l2; total time=   5.4s
[CV] END ............estimator__C=0.1, estimator__penalty=l2; total time=   5.5s
[CV] END ..............estimator__C=1, estimator__penalty=l2; total time=   5.8s
[CV] END ..............estimator__C=1, estimator__penalty=l2; total time=   5.8s
[CV] END ..............estimator__C=1, estimator__penalty=l2; total time=   4.2s
[CV] END .............estimator__C=10, estimator__penalty=l2; total time=   4.2s
[CV] END .............estimator__C=10, estimator__penalty=l2; total time=   3.4s
[CV] END .............estimator__C=10, estimator_

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


`C` is the inverse regularization strength of logistic regression: it is the setting that controls how flexible the model is.

- The grid search tested different values of `C` (0.01, 0.1, 1, 10).
- For each value of `C`, it measured how good the model was using the macro F1 score (3-fold cross-validation on the training data).
- It found that `C=1` gave the best balance overall.
- Small `C` (stronger regularization): might lead to lower recall but higher precision.
- Large `C` (weaker regularization): can improve recall but could lead to overfitting and lower precision.

In [None]:
import numpy as np
from sklearn.metrics import classification_report

# Define a function to adjust per-class threshold
def apply_per_class_threshold(model, X, thresholds):
    """
    Turn a model's predicted probabilities into 0/1 predictions, using a
    separate decision threshold for each class column.

    Parameters:
      model: fitted multi-label model exposing predict_proba
      X: feature matrix passed straight to model.predict_proba
      thresholds: indexable of cut-offs; thresholds[i] is applied to column i

    Returns:
      Integer array shaped like the probability matrix, with 1 where the
      probability is greater than or equal to that column's threshold.
    """
    probabilities = model.predict_proba(X)
    decisions = np.zeros_like(probabilities, dtype=int)
    for class_idx in range(probabilities.shape[1]):
        # The boolean comparison is stored as 0/1 in the integer output array.
        decisions[:, class_idx] = probabilities[:, class_idx] >= thresholds[class_idx]
    return decisions

# Start every class at the default threshold of 0.5.
default_threshold = 0.5
n_classes = y_train_bin.shape[1]
thresholds = np.full(n_classes, default_threshold)

# Raise the cut-offs for classes 0, 3, and 2 to balance precision/recall.
# These values are hard-coded constants.
for class_idx, cutoff in ((0, 0.9), (3, 0.9), (2, 0.8)):
    thresholds[class_idx] = cutoff
print("Per-Class Thresholds:", thresholds)

# Apply the threshold adjustment to each dataset.
# NOTE(review): this uses the baseline `model`, not the grid-search result - confirm.
y_pred_train_adjusted, y_pred_val_adjusted, y_pred_test_adjusted = (
    apply_per_class_threshold(model, features, thresholds)
    for features in (X_train_lsa, X_val_lsa, X_test_lsa)
)

# Print classification reports:
adjusted_report_labels = [str(label) for label in mlb.classes_]
for heading, truth, predicted in (
    ("=== Training Data Classification Report with Per-Class Threshold Adjustment ===",
     y_train_bin, y_pred_train_adjusted),
    ("=== Test Data Classification Report with Per-Class Threshold Adjustment ===",
     y_test_bin, y_pred_test_adjusted),
):
    print(heading)
    print(classification_report(truth, predicted, target_names=adjusted_report_labels))


Per-Class Thresholds: [0.9 0.5 0.8 0.9 0.5 0.5]
=== Training Data Classification Report with Per-Class Threshold Adjustment ===
              precision    recall  f1-score   support

           0       0.69      0.60      0.64      8326
           1       0.82      0.88      0.85     45400
           2       0.73      0.80      0.76      9907
           3       0.59      0.54      0.57      7911
           4       0.76      0.82      0.79     44445
           5       0.83      0.86      0.84     43432

   micro avg       0.78      0.82      0.80    159421
   macro avg       0.74      0.75      0.74    159421
weighted avg       0.78      0.82      0.80    159421
 samples avg       0.83      0.86      0.82    159421

=== Test Data Classification Report with Per-Class Threshold Adjustment ===
              precision    recall  f1-score   support

           0       0.68      0.59      0.63      1027
           1       0.83      0.88      0.85      5819
           2       0.72      0.80   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Validation-split report for the threshold-adjusted logistic regression predictions.
print("=== Validation Data Classification Report with Per-Class Threshold Adjustment ===")
print(classification_report(y_val_bin, y_pred_val_adjusted, target_names=[str(label) for label in mlb.classes_]))

=== Validation Data Classification Report with Per-Class Threshold Adjustment ===
              precision    recall  f1-score   support

           0       0.69      0.60      0.64      1018
           1       0.83      0.88      0.85      5777
           2       0.71      0.78      0.74      1241
           3       0.59      0.54      0.56       995
           4       0.75      0.81      0.78      5553
           5       0.81      0.86      0.83      5323

   micro avg       0.78      0.82      0.80     19907
   macro avg       0.73      0.74      0.74     19907
weighted avg       0.78      0.82      0.80     19907
 samples avg       0.82      0.86      0.82     19907



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, jaccard_score, hamming_loss

def _macro_summary(y_true, y_pred):
    """Return (macro precision, macro recall, macro F1, samples Jaccard, Hamming loss)."""
    return (
        precision_score(y_true, y_pred, average='macro'),
        recall_score(y_true, y_pred, average='macro'),
        f1_score(y_true, y_pred, average='macro'),
        jaccard_score(y_true, y_pred, average='samples'),
        hamming_loss(y_true, y_pred),
    )


# For Training Data:
(avg_precision_train, avg_recall_train, avg_f1_train,
 jaccard_train, ham_loss_train) = _macro_summary(y_train_bin, y_pred_train_adjusted)

# For Test Data:
(avg_precision_test, avg_recall_test, avg_f1_test,
 jaccard_test, ham_loss_test) = _macro_summary(y_test_bin, y_pred_test_adjusted)

# Print both summaries with identical captions.
_metric_captions = (
    "Average Precision (macro):",
    "Average Recall (macro):",
    "Average F1 (macro):",
    "Jaccard Score (samples):",
    "Hamming Loss:",
)
for heading, values in (
    ("=== Training Metrics ===",
     (avg_precision_train, avg_recall_train, avg_f1_train, jaccard_train, ham_loss_train)),
    ("\n=== Test Metrics ===",
     (avg_precision_test, avg_recall_test, avg_f1_test, jaccard_test, ham_loss_test)),
):
    print(heading)
    for caption, value in zip(_metric_captions, values):
        print(caption, value)


=== Training Metrics ===
Average Precision (macro): 0.7365574775810018
Average Recall (macro): 0.7473292308290039
Average F1 (macro): 0.7405097761523338
Jaccard Score (samples): 0.7644667987392066
Hamming Loss: 0.09011592417482409

=== Test Metrics ===
Average Precision (macro): 0.7335743084026287
Average Recall (macro): 0.7474809299561169
Average F1 (macro): 0.7392220887537507
Jaccard Score (samples): 0.7651629156123763
Hamming Loss: 0.09040262795756204


In [None]:
# For Validation Data:
# Same five metrics as for the train/test splits, on the threshold-adjusted predictions.
val_pair = (y_val_bin, y_pred_val_adjusted)
avg_precision_val = precision_score(*val_pair, average='macro')
avg_recall_val = recall_score(*val_pair, average='macro')
avg_f1_val = f1_score(*val_pair, average='macro')
jaccard_val = jaccard_score(*val_pair, average='samples')
ham_loss_val = hamming_loss(*val_pair)

print("\n=== Validation Metrics ===")
for caption, value in (
    ("Average Precision (macro):", avg_precision_val),
    ("Average Recall (macro):", avg_recall_val),
    ("Average F1 (macro):", avg_f1_val),
    ("Jaccard Score (samples):", jaccard_val),
    ("Hamming Loss:", ham_loss_val),
):
    print(caption, value)


=== Validation Metrics ===
Average Precision (macro): 0.7301810985664448
Average Recall (macro): 0.7433841132675097
Average F1 (macro): 0.7354531664084093
Jaccard Score (samples): 0.7602783512019711
Hamming Loss: 0.09187365430290116


**Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier


# -------------------------------
# Train Random Forest Model
# -------------------------------
# Inputs: X_train_lsa, X_val_lsa, X_test_lsa, y_train_bin, y_val_bin, y_test_bin, and mlb
# (produced by the TF-IDF, LSA, and MultiLabelBinarizer steps above).

rf_settings = dict(
    n_estimators=100,
    max_depth=5,             # Limit tree depth to reduce overfitting
    min_samples_split=10,    # Require at least 10 samples to split a node
    min_samples_leaf=5,      # Each leaf must have at least 5 samples
    max_features='sqrt',     # Use square root of the total features at each split
    class_weight='balanced',
    random_state=42,
)
# One random forest per label column.
model_rf = OneVsRestClassifier(RandomForestClassifier(**rf_settings))
model_rf.fit(X_train_lsa, y_train_bin)

# Step 1: Predict on every data split
y_pred_train_rf, y_pred_val_rf, y_pred_test_rf = (
    model_rf.predict(features) for features in (X_train_lsa, X_val_lsa, X_test_lsa)
)

# Step 2: Print Classification Reports
rf_report_labels = [str(label) for label in mlb.classes_]
for heading, truth, predicted in (
    ("=== Random Forest: Training Data Classification Report ===", y_train_bin, y_pred_train_rf),
    ("=== Random Forest: Test Data Classification Report ===", y_test_bin, y_pred_test_rf),
):
    print(heading)
    print(classification_report(truth, predicted, target_names=rf_report_labels))


=== Random Forest: Training Data Classification Report ===
              precision    recall  f1-score   support

           0       0.33      0.80      0.46      8326
           1       0.77      0.83      0.80     45400
           2       0.46      0.85      0.60      9907
           3       0.28      0.85      0.42      7911
           4       0.77      0.69      0.73     44445
           5       0.79      0.73      0.76     43432

   micro avg       0.64      0.76      0.69    159421
   macro avg       0.57      0.79      0.63    159421
weighted avg       0.71      0.76      0.72    159421
 samples avg       0.71      0.81      0.73    159421

=== Random Forest: Test Data Classification Report ===
              precision    recall  f1-score   support

           0       0.30      0.75      0.43      1027
           1       0.77      0.82      0.80      5819
           2       0.46      0.84      0.59      1284
           3       0.26      0.83      0.40       960
           4      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:

# Validation-split report for the random forest's default predictions.
print("=== Random Forest: Validation Data Classification Report ===")
print(classification_report(y_val_bin, y_pred_val_rf, target_names=[str(label) for label in mlb.classes_]))

=== Random Forest: Validation Data Classification Report ===
              precision    recall  f1-score   support

           0       0.31      0.75      0.43      1018
           1       0.77      0.83      0.80      5777
           2       0.45      0.84      0.59      1241
           3       0.27      0.84      0.40       995
           4       0.77      0.68      0.72      5553
           5       0.78      0.73      0.75      5323

   micro avg       0.63      0.76      0.69     19907
   macro avg       0.56      0.78      0.62     19907
weighted avg       0.70      0.76      0.71     19907
 samples avg       0.71      0.80      0.72     19907



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**Improve model**

In [None]:
# Per-class probabilities from the random forest for every split.
y_proba_train_rf, y_proba_val_rf, y_proba_test_rf = (
    model_rf.predict_proba(features) for features in (X_train_lsa, X_val_lsa, X_test_lsa)
)

# Step 1: Set Per-Class Thresholds and Adjust Predictions
# The number of classes is the column count of the probability matrix.
n_classes = y_proba_train_rf.shape[1]

# Every class starts at the default threshold of 0.5 ...
default_threshold = 0.5
thresholds = np.full(n_classes, default_threshold)

# ... and classes 0, 3, and 2 get hard-coded higher cut-offs.
for class_idx, cutoff in ((0, 0.6), (3, 0.63), (2, 0.6)):
    thresholds[class_idx] = cutoff

print("Per-Class Thresholds:", thresholds)

def apply_thresholds(y_proba, thresholds):
    """
    Convert probability predictions to binary predictions using per-class thresholds.

    Parameters:
      y_proba: 2-D array-like of shape (n_samples, n_classes) with probabilities
      thresholds: either one scalar cut-off applied to every class, or a
                  sequence with at least n_classes entries, where entry i is
                  the cut-off for column i (extra entries are ignored)

    Returns:
      Integer array of shape (n_samples, n_classes): 1 where the probability is
      greater than or equal to the class threshold, else 0.

    Raises:
      ValueError: if y_proba is not 2-D
      IndexError: if fewer than n_classes thresholds are supplied
    """
    y_proba = np.asarray(y_proba)
    if y_proba.ndim != 2:
        raise ValueError(
            "y_proba must be 2-D (n_samples, n_classes); got %d-D input" % y_proba.ndim
        )
    n_classes = y_proba.shape[1]

    cutoffs = np.asarray(thresholds, dtype=float)
    if cutoffs.ndim == 0:
        # A single scalar threshold is applied to every class.
        cutoffs = np.full(n_classes, cutoffs)
    elif cutoffs.shape[0] < n_classes:
        raise IndexError(
            "expected at least %d thresholds, got %d" % (n_classes, cutoffs.shape[0])
        )

    # Broadcasting compares each column with its own cut-off in one step.
    return (y_proba >= cutoffs[:n_classes]).astype(int)

# Apply thresholds for each dataset: turn every split's probabilities into
# 0/1 predictions with the per-class cut-offs.
y_pred_train_rf_adj, y_pred_val_rf_adj, y_pred_test_rf_adj = (
    apply_thresholds(split_proba, thresholds)
    for split_proba in (y_proba_train_rf, y_proba_val_rf, y_proba_test_rf)
)

# Print Classification Reports for Adjusted Predictions
rf_adj_report_labels = [str(label) for label in mlb.classes_]
for heading, truth, predicted in (
    ("=== Random Forest with Adjusted Thresholds: Training Data Classification Report ===",
     y_train_bin, y_pred_train_rf_adj),
    ("=== Random Forest with Adjusted Thresholds: Test Data Classification Report ===",
     y_test_bin, y_pred_test_rf_adj),
):
    print(heading)
    print(classification_report(truth, predicted, target_names=rf_adj_report_labels))

Per-Class Thresholds: [0.6  0.5  0.6  0.63 0.5  0.5 ]
=== Random Forest with Adjusted Thresholds: Training Data Classification Report ===
              precision    recall  f1-score   support

           0       0.55      0.52      0.53      8326
           1       0.77      0.83      0.80     45400
           2       0.68      0.70      0.69      9907
           3       0.53      0.54      0.53      7911
           4       0.77      0.69      0.73     44445
           5       0.79      0.73      0.76     43432

   micro avg       0.75      0.73      0.74    159421
   macro avg       0.68      0.67      0.67    159421
weighted avg       0.75      0.73      0.74    159421
 samples avg       0.78      0.78      0.75    159421

=== Random Forest with Adjusted Thresholds: Test Data Classification Report ===
              precision    recall  f1-score   support

           0       0.53      0.51      0.52      1027
           1       0.77      0.82      0.80      5819
           2       0.6

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Validation-split report for the threshold-adjusted random forest predictions.
print("=== Random Forest with Adjusted Thresholds: Validation Data Classification Report ===")
print(classification_report(y_val_bin, y_pred_val_rf_adj, target_names=[str(label) for label in mlb.classes_]))

=== Random Forest with Adjusted Thresholds: Validation Data Classification Report ===
              precision    recall  f1-score   support

           0       0.52      0.49      0.51      1018
           1       0.77      0.83      0.80      5777
           2       0.66      0.67      0.67      1241
           3       0.50      0.52      0.51       995
           4       0.77      0.68      0.72      5553
           5       0.78      0.73      0.75      5323

   micro avg       0.74      0.72      0.73     19907
   macro avg       0.67      0.65      0.66     19907
weighted avg       0.74      0.72      0.73     19907
 samples avg       0.77      0.77      0.74     19907



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, jaccard_score, hamming_loss
# Shared keyword arguments for the averaged metrics.
macro_avg = {'average': 'macro'}
samples_avg = {'average': 'samples'}

# Training Metrics:
train_pair_rf = (y_train_bin, y_pred_train_rf_adj)
avg_precision_train_rf = precision_score(*train_pair_rf, **macro_avg)
avg_recall_train_rf = recall_score(*train_pair_rf, **macro_avg)
avg_f1_train_rf = f1_score(*train_pair_rf, **macro_avg)
jaccard_train_rf = jaccard_score(*train_pair_rf, **samples_avg)
ham_loss_train_rf = hamming_loss(*train_pair_rf)

# Test Metrics:
test_pair_rf = (y_test_bin, y_pred_test_rf_adj)
avg_precision_test_rf = precision_score(*test_pair_rf, **macro_avg)
avg_recall_test_rf = recall_score(*test_pair_rf, **macro_avg)
avg_f1_test_rf = f1_score(*test_pair_rf, **macro_avg)
jaccard_test_rf = jaccard_score(*test_pair_rf, **samples_avg)
ham_loss_test_rf = hamming_loss(*test_pair_rf)

print("\n=== Random Forest (Adjusted Thresholds) Metrics ===")
for heading, rows in (
    ("Training Metrics:", (
        ("  Average Precision (macro):", avg_precision_train_rf),
        ("  Average Recall (macro):", avg_recall_train_rf),
        ("  Average F1 (macro):", avg_f1_train_rf),
        ("  Jaccard Score (samples):", jaccard_train_rf),
        ("  Hamming Loss:", ham_loss_train_rf),
    )),
    ("\nTest Metrics:", (
        ("  Average Precision (macro):", avg_precision_test_rf),
        ("  Average Recall (macro):", avg_recall_test_rf),
        ("  Average F1 (macro):", avg_f1_test_rf),
        ("  Jaccard Score (samples):", jaccard_test_rf),
        ("  Hamming Loss:", ham_loss_test_rf),
    )),
):
    print(heading)
    for caption, value in rows:
        print(caption, value)


=== Random Forest (Adjusted Thresholds) Metrics ===
Training Metrics:
  Average Precision (macro): 0.6807101453797265
  Average Recall (macro): 0.6703791152714028
  Average F1 (macro): 0.6746272567916732
  Jaccard Score (samples): 0.6973356584759495
  Hamming Loss: 0.11527213602361767

Test Metrics:
  Average Precision (macro): 0.6723861803752863
  Average Recall (macro): 0.656220578686312
  Average F1 (macro): 0.6635174810802299
  Jaccard Score (samples): 0.690846539707906
  Hamming Loss: 0.11804723221023661
