In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from torch.utils.data import DataLoader, Dataset
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from common.functions import evaluate_model, train_model
from common.ffnn import FFNN
from common.custom_dataset import CustomDataset

In [2]:
train_data = pd.read_csv("../Data/arxiv_train.csv")
train_data

Unnamed: 0.1,Unnamed: 0,abstract,label
0,31716,Automatic meeting analysis is an essential f...,eess
1,89533,We propose a protocol to encode classical bi...,quant-ph
2,82700,A number of physically intuitive results for...,quant-ph
3,78830,In the last decade rare-earth hexaborides ha...,physics
4,94948,We introduce the weak barycenter of a family...,stat
...,...,...,...
79995,27913,"In this paper, the sum secure degrees of fre...",cs
79996,94441,"In areas of application, including actuarial...",stat
79997,33015,Failure detection is employed in the industr...,eess
79998,942,As part of the ongoing effort to characteriz...,astro-ph


In [3]:
# Load the train set
test_data = pd.read_csv("../Data/arxiv_test.csv")
test_data

Unnamed: 0.1,Unnamed: 0,abstract,label
0,64481,We describe a shape derivative approach to p...,math
1,48104,We study displaced signatures of sneutrino p...,hep-ph
2,48233,High precision studies of Beyond-Standard-Mo...,hep-ph
3,49026,We find that a class of models of MeV-GeV da...,hep-ph
4,37957,Knowledge of power grid's topology during ca...,eess
...,...,...,...
19995,50391,We explore the dynamics of a simple class of...,hep-th
19996,63534,In this paper one construction of compositio...,math
19997,16712,The Random-First-Order-Transition theory of ...,cond-mat
19998,6596,Accurate chemical abundance measurements of ...,astro-ph


In [4]:
count_vectorizer = CountVectorizer(max_features=5000)  # Adjust max_features as needed
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

In [5]:
X_train_count = count_vectorizer.fit_transform(train_data["abstract"]).toarray()
X_test_count = count_vectorizer.transform(test_data["abstract"]).toarray()

In [6]:
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data["abstract"]).toarray()
X_test_tfidf = tfidf_vectorizer.transform(test_data["abstract"]).toarray()

In [7]:
label_encoder = {label: idx for idx, label in enumerate(train_data["label"].unique())}
train_data["label_encoded"] = train_data["label"].map(label_encoder)
test_data["label_encoded"] = test_data["label"].map(label_encoder)

In [8]:
y_train = torch.tensor(train_data["label_encoded"].values)
y_test = torch.tensor(test_data["label_encoded"].values)

In [10]:
# Define train and test datasets
train_dataset_count = CustomDataset(X_train_count, y_train)
test_dataset_count = CustomDataset(X_test_count, y_test)

train_dataset_tfidf = CustomDataset(X_train_tfidf, y_train)
test_dataset_tfidf = CustomDataset(X_test_tfidf, y_test)

# Define data loaders
train_loader_count = DataLoader(train_dataset_count, batch_size=64, shuffle=True)
test_loader_count = DataLoader(test_dataset_count, batch_size=64, shuffle=False)

train_loader_tfidf = DataLoader(train_dataset_tfidf, batch_size=64, shuffle=True)
test_loader_tfidf = DataLoader(test_dataset_tfidf, batch_size=64, shuffle=False)

# Evaluating different number of layers for Count Vectorizer feature detection method


In [11]:
# Train and evaluate FFNN with CountVectorizer and different numbers of layers
input_size_count = X_train_count.shape[1]
output_size_count = len(label_encoder)
for num_layers in [1, 2, 3]:
    print(f"Training FFNN with CountVectorizer and {num_layers} layers\n")

    ffnn_count = FFNN(input_size_count, 256, output_size_count, num_layers)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(ffnn_count.parameters(), lr=0.001)

    train_model(ffnn_count, criterion, optimizer, train_loader_count, num_epochs=10)
    evaluate_model(ffnn_count, test_loader_count)

Training FFNN with CountVectorizer and 1 layers

Epoch [1/10], Loss: 1946.976118028164
Epoch [2/10], Loss: 1397.7542569637299
Epoch [3/10], Loss: 1292.314246147871
Epoch [4/10], Loss: 1231.597168892622
Epoch [5/10], Loss: 1189.9652605056763
Epoch [6/10], Loss: 1158.3046594560146
Epoch [7/10], Loss: 1134.1816556155682
Epoch [8/10], Loss: 1113.3856071829796
Epoch [9/10], Loss: 1096.8290793299675
Epoch [10/10], Loss: 1081.954888612032

Evaluations:
Accuracy: 72.21%
F1 Score: 62.38%
Precision: 60.06%
Recall: 65.56%


Training FFNN with CountVectorizer and 2 layers

Epoch [1/10], Loss: 1376.873632967472
Epoch [2/10], Loss: 1166.51555493474
Epoch [3/10], Loss: 1056.2928673923016
Epoch [4/10], Loss: 970.5347982347012
Epoch [5/10], Loss: 926.9555596113205
Epoch [6/10], Loss: 912.0390577316284
Epoch [7/10], Loss: 877.7392210066319
Epoch [8/10], Loss: 726.7501022815704
Epoch [9/10], Loss: 675.9775292426348
Epoch [10/10], Loss: 638.7597019895911

Evaluations:
Accuracy: 68.16%
F1 Score: 63.28%
Pre

# Evaluating Activation Function for Count Vectorizer feature detection method


In [12]:
# Train and evaluate FFNN with CountVectorizer and different activation function
input_size_count = X_train_count.shape[1]
output_size_count = len(label_encoder)
activation_functions = [nn.LeakyReLU(), nn.ELU(), nn.Sigmoid(), nn.Softmax(dim=1)]

for activation_function in activation_functions:
    print(
        f"Training FFNN with CountVectorizer and {activation_function.__class__.__name__} as an activation function\n"
    )

    ffnn_count = FFNN(
        input_size_count,
        256,
        output_size_count,
        num_layers=2,
        activation=activation_function,
    )
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(ffnn_count.parameters(), lr=0.001)

    train_model(ffnn_count, criterion, optimizer, train_loader_count, num_epochs=10)
    evaluate_model(ffnn_count, test_loader_count)

Training FFNN with CountVectorizer and LeakyReLU as an activation function

Epoch [1/10], Loss: 767.8481568694115
Epoch [2/10], Loss: 459.78384954482317
Epoch [3/10], Loss: 293.2984654158354
Epoch [4/10], Loss: 142.851327534765
Epoch [5/10], Loss: 56.92739152116701
Epoch [6/10], Loss: 24.90542499208823
Epoch [7/10], Loss: 15.26887806400191
Epoch [8/10], Loss: 18.091219988418743
Epoch [9/10], Loss: 19.20246203520219
Epoch [10/10], Loss: 9.432539400557289

Evaluations:
Accuracy: 80.81%
F1 Score: 80.68%
Precision: 80.83%
Recall: 80.77%


Training FFNN with CountVectorizer and ELU as an activation function

Epoch [1/10], Loss: 734.2853848040104
Epoch [2/10], Loss: 483.7553330808878
Epoch [3/10], Loss: 375.2395723313093
Epoch [4/10], Loss: 274.2511309199035
Epoch [5/10], Loss: 178.73587717488408
Epoch [6/10], Loss: 98.63105329032987
Epoch [7/10], Loss: 48.09541300870478
Epoch [8/10], Loss: 26.138406450510956
Epoch [9/10], Loss: 16.399672850966454
Epoch [10/10], Loss: 14.749973135156324

Eva

# Evaluating different number of layers for tfidf feature detection method


In [14]:
input_size_tfidf = X_train_tfidf.shape[1]
output_size_tfidf = len(label_encoder)
for num_layers in [1, 2, 3]:
    print(f"Training FFNN with TF-IDF and {num_layers} layers\n")

    ffnn_tfidf = FFNN(input_size_tfidf, 256, output_size_tfidf, num_layers)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(ffnn_tfidf.parameters(), lr=0.001)

    train_model(ffnn_tfidf, criterion, optimizer, train_loader_tfidf, num_epochs=10)
    evaluate_model(ffnn_tfidf, test_loader_tfidf)

Training FFNN with TF-IDF and 1 layers

Epoch [1/10], Loss: 5147.310200452805
Epoch [2/10], Loss: 3573.825830221176
Epoch [3/10], Loss: 3191.022933244705
Epoch [4/10], Loss: 3014.785920381546
Epoch [5/10], Loss: 2910.055863380432
Epoch [6/10], Loss: 2840.5273592472076
Epoch [7/10], Loss: 2790.981001138687
Epoch [8/10], Loss: 2753.766632795334
Epoch [9/10], Loss: 2724.5755944252014
Epoch [10/10], Loss: 2700.876099705696

Evaluations:
Accuracy: 59.41%
F1 Score: 50.83%
Precision: 45.43%
Recall: 59.18%


Training FFNN with TF-IDF and 2 layers

Epoch [1/10], Loss: 2374.7484006881714
Epoch [2/10], Loss: 2279.775706768036
Epoch [3/10], Loss: 2248.8546879291534
Epoch [4/10], Loss: 2215.6028389930725
Epoch [5/10], Loss: 2176.0430750846863
Epoch [6/10], Loss: 2137.171267747879
Epoch [7/10], Loss: 2105.2292317152023
Epoch [8/10], Loss: 2084.2411848306656
Epoch [9/10], Loss: 2072.4610480070114
Epoch [10/10], Loss: 1866.9293683767319

Evaluations:
Accuracy: 33.78%
F1 Score: 24.92%
Precision: 22.22%

# Evaluating Activation Function for tfidf feature detection method


In [22]:
input_size_tfidf = X_train_tfidf.shape[1]
output_size_tfidf = len(label_encoder)
activation_functions = [nn.LeakyReLU(), nn.ELU(), nn.Sigmoid(), nn.Softmax(dim=1)]

for activation_function in activation_functions:
    print(
        f"Training FFNN with tfidf and {activation_function.__class__.__name__} as an activation function\n"
    )

    ffnn_tfidf = FFNN(
        input_size_tfidf,
        256,
        output_size_tfidf,
        num_layers=2,
        activation=activation_function,
    )
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(ffnn_tfidf.parameters(), lr=0.001)

    train_model(ffnn_tfidf, criterion, optimizer, train_loader_tfidf, num_epochs=10)
    evaluate_model(ffnn_tfidf, test_loader_tfidf)

Training FFNN with tfidf and LeakyReLU as an activation function

Epoch [1/10], Loss: 836.8895402252674
Epoch [2/10], Loss: 492.3718654215336
Epoch [3/10], Loss: 415.8100789934397
Epoch [4/10], Loss: 357.06805112212896
Epoch [5/10], Loss: 301.2109507843852
Epoch [6/10], Loss: 245.48004936799407
Epoch [7/10], Loss: 190.95033019781113
Epoch [8/10], Loss: 139.5831142552197
Epoch [9/10], Loss: 95.46690713055432
Epoch [10/10], Loss: 61.44520836416632

Evaluations:
Accuracy: 79.90%
F1 Score: 79.87%
Precision: 79.91%
Recall: 79.86%


Training FFNN with tfidf and ELU as an activation function

Epoch [1/10], Loss: 788.2655977755785
Epoch [2/10], Loss: 495.04162622988224
Epoch [3/10], Loss: 434.3660581856966
Epoch [4/10], Loss: 395.89662363380194
Epoch [5/10], Loss: 368.8371285125613
Epoch [6/10], Loss: 345.68841253221035
Epoch [7/10], Loss: 329.3641115576029
Epoch [8/10], Loss: 313.5195683389902
Epoch [9/10], Loss: 301.0600880756974
Epoch [10/10], Loss: 288.79009548947215

Evaluations:
Accuracy

# Evaluating hidden size


In [16]:
class FFNN_hidden_size(nn.Module):
    def __init__(self, input_size, hidden_sizes, output_size, activation=nn.ReLU()):
        super(FFNN_hidden_size, self).__init__()  # Corrected super call
        self.num_layers = len(hidden_sizes) + 1
        self.hidden_layers = nn.ModuleList()
        for i in range(self.num_layers - 1):
            if i == 0:
                self.hidden_layers.append(nn.Linear(input_size, hidden_sizes[i]))
            else:
                self.hidden_layers.append(
                    nn.Linear(hidden_sizes[i - 1], hidden_sizes[i])
                )
            self.hidden_layers.append(activation)
        self.hidden_layers.append(nn.Linear(hidden_sizes[-1], output_size))

    def forward(self, x):
        for layer in self.hidden_layers:
            x = layer(x)
        return x

# Evaluating hidden size for Count Vectorizer


In [17]:
input_size_count = X_train_count.shape[1]
output_size_count = len(label_encoder)
hidden_sizes = [64, 128, 256]

for hidden_size in hidden_sizes:
    print(f"Training FFNN with CountVectorizer and {hidden_size} as hidden size\n")

    ffnn_count = FFNN_hidden_size(
        input_size_count, [hidden_size], output_size_count, activation=nn.Sigmoid()
    )
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(ffnn_count.parameters(), lr=0.001)

    train_model(ffnn_count, criterion, optimizer, train_loader_count, num_epochs=10)
    evaluate_model(ffnn_count, test_loader_count)

Training FFNN with CountVectorizer and 64 as hidden size

Epoch [1/10], Loss: 928.1101182699203
Epoch [2/10], Loss: 521.2363376766443
Epoch [3/10], Loss: 431.4762702435255
Epoch [4/10], Loss: 367.9116633683443
Epoch [5/10], Loss: 313.4780855104327
Epoch [6/10], Loss: 265.27464877814054
Epoch [7/10], Loss: 220.85574132576585
Epoch [8/10], Loss: 179.75636037439108
Epoch [9/10], Loss: 142.45119661837816
Epoch [10/10], Loss: 109.49029067344964

Evaluations:
Accuracy: 81.19%
F1 Score: 81.05%
Precision: 81.11%
Recall: 81.15%


Training FFNN with CountVectorizer and 128 as hidden size

Epoch [1/10], Loss: 853.1412792801857
Epoch [2/10], Loss: 506.3931464701891
Epoch [3/10], Loss: 415.78018732368946
Epoch [4/10], Loss: 347.91063272953033
Epoch [5/10], Loss: 285.9519297629595
Epoch [6/10], Loss: 230.24208783730865
Epoch [7/10], Loss: 176.87036663293839
Epoch [8/10], Loss: 130.54939570464194
Epoch [9/10], Loss: 90.7079890165478
Epoch [10/10], Loss: 59.17798920162022

Evaluations:
Accuracy: 80.69

# Model for evaluating batch size


In [18]:
num_epochs = 10
learning_rate = 0.001
batch_sizes = [32, 64, 128, 256]
results = {}
input_size_count = X_train_count.shape[1]
output_size_count = len(label_encoder)
for batch_size in batch_sizes:
    print(f"Evaluating with batch size: {batch_size}")

    model = FFNN(
        input_size_count, 256, output_size_count, num_layers=2, activation=nn.ReLU()
    )

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    for batch_size in batch_sizes:
        train_loader_count = DataLoader(train_dataset_count, batch_size, shuffle=True)
        test_loader_count = DataLoader(test_dataset_count, batch_size, shuffle=False)

    for epoch in range(num_epochs):
        model.train()
        for inputs, labels in train_loader_count:
            optimizer.zero_grad()
            inputs = inputs.float()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        accuracy = evaluate_model(model, test_loader_count)
    results[batch_size] = accuracy
    print(f"Accuracy with batch size {batch_size}: {accuracy}")

Evaluating with batch size: 32

Evaluations:
Accuracy: 69.58%
F1 Score: 63.04%
Precision: 59.33%
Recall: 69.50%



Evaluations:
Accuracy: 69.14%
F1 Score: 63.37%
Precision: 60.27%
Recall: 69.05%



Evaluations:
Accuracy: 68.45%
F1 Score: 62.82%
Precision: 59.68%
Recall: 68.36%



Evaluations:
Accuracy: 68.44%
F1 Score: 62.67%
Precision: 59.25%
Recall: 68.36%



Evaluations:
Accuracy: 67.90%
F1 Score: 62.61%
Precision: 60.48%
Recall: 67.84%



Evaluations:
Accuracy: 68.08%
F1 Score: 62.53%
Precision: 59.59%
Recall: 68.00%



Evaluations:
Accuracy: 67.51%
F1 Score: 62.35%
Precision: 59.82%
Recall: 67.43%



Evaluations:
Accuracy: 67.19%
F1 Score: 62.11%
Precision: 59.64%
Recall: 67.11%



Evaluations:
Accuracy: 67.50%
F1 Score: 62.37%
Precision: 59.92%
Recall: 67.42%



Evaluations:
Accuracy: 67.42%
F1 Score: 62.33%
Precision: 59.80%
Recall: 67.34%


Accuracy with batch size 256: None
Evaluating with batch size: 64

Evaluations:
Accuracy: 77.80%
F1 Score: 73.94%
Precision: 71.02%
Recall:

In [19]:
print("Evaluation results:")
for batch_size, accuracy in results.items():
    print(f"Batch size: {batch_size}, Accuracy: {accuracy}")

Evaluation results:
Batch size: 256, Accuracy: None


# Model for regularization


In [20]:
class FFNN_regularization(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dropout_rate=0.5):
        super(FFNN_regularization, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

In [21]:
input_size_count = X_train_count.shape[1]
output_size_count = len(label_encoder)
print(f"FFNN regularization\n")

ffnn_count = FFNN_regularization(input_size_count, 256, output_size_count)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(ffnn_count.parameters(), lr=0.001)

train_model(ffnn_count, criterion, optimizer, train_loader_count, num_epochs=10)
evaluate_model(ffnn_count, test_loader_count)

FFNN regularization

Epoch [1/10], Loss: 222.71560445427895
Epoch [2/10], Loss: 141.26893106102943
Epoch [3/10], Loss: 114.34151920676231
Epoch [4/10], Loss: 94.36796480417252
Epoch [5/10], Loss: 76.94665214419365
Epoch [6/10], Loss: 62.584375970065594
Epoch [7/10], Loss: 49.860425889492035
Epoch [8/10], Loss: 40.20941513776779
Epoch [9/10], Loss: 33.51778922602534
Epoch [10/10], Loss: 27.590626560151577

Evaluations:
Accuracy: 82.67%
F1 Score: 82.60%
Precision: 82.65%
Recall: 82.65%


