In [3]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from torch.utils.data import DataLoader, Dataset
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from common.functions import evaluate_model, train_model
from common.rnn import RNN
from common.custom_dataset import CustomDataset

In [4]:
train_data = pd.read_csv("../Data/arxiv_train.csv")
train_data

Unnamed: 0.1,Unnamed: 0,abstract,label
0,31716,Automatic meeting analysis is an essential f...,eess
1,89533,We propose a protocol to encode classical bi...,quant-ph
2,82700,A number of physically intuitive results for...,quant-ph
3,78830,In the last decade rare-earth hexaborides ha...,physics
4,94948,We introduce the weak barycenter of a family...,stat
...,...,...,...
79995,27913,"In this paper, the sum secure degrees of fre...",cs
79996,94441,"In areas of application, including actuarial...",stat
79997,33015,Failure detection is employed in the industr...,eess
79998,942,As part of the ongoing effort to characteriz...,astro-ph


In [5]:
# Load the train set
test_data = pd.read_csv("../Data/arxiv_test.csv")
test_data

Unnamed: 0.1,Unnamed: 0,abstract,label
0,64481,We describe a shape derivative approach to p...,math
1,48104,We study displaced signatures of sneutrino p...,hep-ph
2,48233,High precision studies of Beyond-Standard-Mo...,hep-ph
3,49026,We find that a class of models of MeV-GeV da...,hep-ph
4,37957,Knowledge of power grid's topology during ca...,eess
...,...,...,...
19995,50391,We explore the dynamics of a simple class of...,hep-th
19996,63534,In this paper one construction of compositio...,math
19997,16712,The Random-First-Order-Transition theory of ...,cond-mat
19998,6596,Accurate chemical abundance measurements of ...,astro-ph


In [6]:
count_vectorizer = CountVectorizer(max_features=5000)  # Adjust max_features as needed
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

In [7]:
X_train_count = count_vectorizer.fit_transform(train_data["abstract"]).toarray()
X_test_count = count_vectorizer.transform(test_data["abstract"]).toarray()

In [8]:
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data["abstract"]).toarray()
X_test_tfidf = tfidf_vectorizer.transform(test_data["abstract"]).toarray()

In [9]:
label_encoder = {label: idx for idx, label in enumerate(train_data["label"].unique())}
train_data["label_encoded"] = train_data["label"].map(label_encoder)
test_data["label_encoded"] = test_data["label"].map(label_encoder)

In [10]:
y_train = torch.tensor(train_data["label_encoded"].values)
y_test = torch.tensor(test_data["label_encoded"].values)

In [11]:
# Define train and test datasets
train_dataset_count = CustomDataset(X_train_count, y_train)
test_dataset_count = CustomDataset(X_test_count, y_test)

train_dataset_tfidf = CustomDataset(X_train_tfidf, y_train)
test_dataset_tfidf = CustomDataset(X_test_tfidf, y_test)

# Define data loaders
train_loader_count = DataLoader(train_dataset_count, batch_size=64, shuffle=True)
test_loader_count = DataLoader(test_dataset_count, batch_size=64, shuffle=False)

train_loader_tfidf = DataLoader(train_dataset_tfidf, batch_size=64, shuffle=True)
test_loader_tfidf = DataLoader(test_dataset_tfidf, batch_size=64, shuffle=False)

# Evaluating different number of layers for Count Vectorizer feature detection method

In [34]:
# Train and evaluate RNN with CountVectorizer and different numbers of layers
input_size_count = X_train_count.shape[1]
output_size_count = len(label_encoder)
for num_layers in [1, 2, 3]:
    print(f"Training RNN with CountVectorizer and {num_layers} layers\n")
    
    rnn_count = RNN(input_size_count, 256, output_size_count, num_layers)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(rnn_count.parameters(), lr=0.001)
    
    train_model(rnn_count, criterion, optimizer, train_loader_count, num_epochs=10)
    evaluate_model(rnn_count, test_loader_count)

Training RNN with CountVectorizer and 1 layers

Epoch [1/10], Loss: 763.3199734687805
Epoch [2/10], Loss: 506.98329558968544
Epoch [3/10], Loss: 400.7188519388437
Epoch [4/10], Loss: 309.13360740989447
Epoch [5/10], Loss: 218.14016295969486
Epoch [6/10], Loss: 144.04848780483007
Epoch [7/10], Loss: 90.89709524437785
Epoch [8/10], Loss: 59.36363659566268
Epoch [9/10], Loss: 41.46178376721218
Epoch [10/10], Loss: 35.75288056908175

Evaluations:
Accuracy: 80.40%
F1 Score: 80.28%
Precision: 80.27%
Recall: 80.35%


Training RNN with CountVectorizer and 2 layers

Epoch [1/10], Loss: 780.1336170881987
Epoch [2/10], Loss: 533.3758004158735
Epoch [3/10], Loss: 448.5421806126833
Epoch [4/10], Loss: 380.02794320881367
Epoch [5/10], Loss: 317.9964202865958
Epoch [6/10], Loss: 259.1879898197949
Epoch [7/10], Loss: 205.02648516744375
Epoch [8/10], Loss: 160.85056117735803
Epoch [9/10], Loss: 123.30341439787298
Epoch [10/10], Loss: 95.95782375568524

Evaluations:
Accuracy: 79.81%
F1 Score: 79.86%
Pre

# Evaluating Different Activation Functions for Count Vectorizer feature detection method

In [12]:
# Train and evaluate FFNN with CountVectorizer and different activation function
input_size_count = X_train_count.shape[1]
output_size_count = len(label_encoder)
activation_functions = [nn.LeakyReLU(), nn.ELU(), nn.Sigmoid(), nn.Softmax(dim=1)]

for activation_function in activation_functions:
    print(f"Training RNN with CountVectorizer and {activation_function.__class__.__name__} as an activation function\n")
    
    rnn_count = RNN(input_size_count, 256, output_size_count, num_layers=2, activation=activation_function)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(rnn_count.parameters(), lr=0.001)
    
    train_model(rnn_count, criterion, optimizer, train_loader_count, num_epochs=10)
    evaluate_model(rnn_count, test_loader_count)


Training RNN with CountVectorizer and LeakyReLU as an activation function

Epoch [1/10], Loss: 806.3141439259052
Epoch [2/10], Loss: 535.044398277998
Epoch [3/10], Loss: 451.13853193819523
Epoch [4/10], Loss: 383.09631638228893
Epoch [5/10], Loss: 322.45223496854305
Epoch [6/10], Loss: 265.23423055931926
Epoch [7/10], Loss: 211.11417189612985
Epoch [8/10], Loss: 161.5899314135313
Epoch [9/10], Loss: 128.69750709366053
Epoch [10/10], Loss: 98.5733313281089

Evaluations:
Accuracy: 80.27%
F1 Score: 80.25%
Precision: 80.51%
Recall: 80.26%


Training RNN with CountVectorizer and ELU as an activation function

Epoch [1/10], Loss: 785.3260654062033
Epoch [2/10], Loss: 536.790642619133
Epoch [3/10], Loss: 449.99404802173376
Epoch [4/10], Loss: 382.0542240664363
Epoch [5/10], Loss: 322.22879678756
Epoch [6/10], Loss: 263.0231476314366
Epoch [7/10], Loss: 208.42247082665563
Epoch [8/10], Loss: 161.14629240706563
Epoch [9/10], Loss: 126.28357488056645
Epoch [10/10], Loss: 99.38592691207305

Evalu

# Evaluating different number of layers for tfidf feature detection method

In [13]:
input_size_tfidf = X_train_tfidf.shape[1]
output_size_tfidf = len(label_encoder)
for num_layers in [1, 2, 3]:
    print(f"Training RNN with TF-IDF and {num_layers} layers\n")
    
    rnn_tfidf = RNN(input_size_tfidf, 256, output_size_tfidf, num_layers)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(rnn_tfidf.parameters(), lr=0.001)
    
    train_model(rnn_tfidf, criterion, optimizer, train_loader_tfidf, num_epochs=10)
    evaluate_model(rnn_tfidf, test_loader_tfidf)

Training RNN with TF-IDF and 1 layers

Epoch [1/10], Loss: 974.9493601024151
Epoch [2/10], Loss: 560.0621326118708
Epoch [3/10], Loss: 495.56233832240105
Epoch [4/10], Loss: 459.116337954998
Epoch [5/10], Loss: 432.02958865463734
Epoch [6/10], Loss: 412.2890088185668
Epoch [7/10], Loss: 397.56641874462366
Epoch [8/10], Loss: 384.2523663341999
Epoch [9/10], Loss: 375.58368457853794
Epoch [10/10], Loss: 364.91814502328634

Evaluations:
Accuracy: 79.35%
F1 Score: 79.27%
Precision: 79.29%
Recall: 79.32%


Training RNN with TF-IDF and 2 layers

Epoch [1/10], Loss: 944.0967095196247
Epoch [2/10], Loss: 586.5105360001326
Epoch [3/10], Loss: 524.6750490069389
Epoch [4/10], Loss: 484.3246194422245
Epoch [5/10], Loss: 457.8491253107786
Epoch [6/10], Loss: 432.71689858287573
Epoch [7/10], Loss: 417.4559953585267
Epoch [8/10], Loss: 405.78441286087036
Epoch [9/10], Loss: 391.60116408765316
Epoch [10/10], Loss: 376.27822683006525

Evaluations:
Accuracy: 79.81%
F1 Score: 79.65%
Precision: 79.93%
Rec

# Evaluating Activation Function for tfidf feature detection method

In [14]:
input_size_tfidf = X_train_tfidf.shape[1]
output_size_tfidf = len(label_encoder)
activation_functions = [nn.LeakyReLU(), nn.ELU(), nn.Sigmoid(), nn.Softmax(dim=1)]

for activation_function in activation_functions:
    print(f"Training RNN with tfidf and {activation_function.__class__.__name__} as an activation function\n")
    
    rnn_tfidf = RNN(input_size_tfidf, 256, output_size_tfidf, num_layers=2)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(rnn_tfidf.parameters(), lr=0.001)
    
    train_model(rnn_tfidf, criterion, optimizer, train_loader_tfidf, num_epochs=10)
    evaluate_model(rnn_tfidf, test_loader_tfidf)

Training RNN with tfidf and LeakyReLU as an activation function

Epoch [1/10], Loss: 959.5855261832476
Epoch [2/10], Loss: 586.7226179987192
Epoch [3/10], Loss: 520.5368458628654
Epoch [4/10], Loss: 482.04027332365513
Epoch [5/10], Loss: 456.0838768184185
Epoch [6/10], Loss: 435.62667295336723
Epoch [7/10], Loss: 418.85956501215696
Epoch [8/10], Loss: 405.2117108106613
Epoch [9/10], Loss: 389.3891863003373
Epoch [10/10], Loss: 380.1468401700258

Evaluations:
Accuracy: 79.97%
F1 Score: 79.87%
Precision: 79.95%
Recall: 79.90%


Training RNN with tfidf and ELU as an activation function

Epoch [1/10], Loss: 930.935423463583
Epoch [2/10], Loss: 590.0360115617514
Epoch [3/10], Loss: 524.5577806830406
Epoch [4/10], Loss: 484.2687980681658
Epoch [5/10], Loss: 459.3589107468724
Epoch [6/10], Loss: 436.2060491144657
Epoch [7/10], Loss: 417.8359133377671
Epoch [8/10], Loss: 405.26593001931906
Epoch [9/10], Loss: 392.54777332395315
Epoch [10/10], Loss: 378.93930273503065

Evaluations:
Accuracy: 80

# Evaluating hidden size

In [16]:

class FFNN_hidden_size(nn.Module):
    def __init__(
        self, input_size, hidden_sizes, output_size, activation=nn.ReLU()
    ):
        super(FFNN_hidden_size, self).__init__()  # Corrected super call
        self.num_layers = len(hidden_sizes) + 1
        self.hidden_layers = nn.ModuleList()
        for i in range(self.num_layers - 1):
            if i == 0:
                self.hidden_layers.append(nn.Linear(input_size, hidden_sizes[i]))
            else:
                self.hidden_layers.append(nn.Linear(hidden_sizes[i-1], hidden_sizes[i]))
            self.hidden_layers.append(activation)
        self.hidden_layers.append(nn.Linear(hidden_sizes[-1], output_size))

    def forward(self, x):
        for layer in self.hidden_layers:
            x = layer(x)
        return x

# Evaluating hidden size for Count Vectorizer

In [17]:

input_size_count = X_train_count.shape[1]
output_size_count = len(label_encoder)
hidden_sizes = [64, 128, 256]

for hidden_size in hidden_sizes:
    print(f"Training FFNN with CountVectorizer and {hidden_size} as hidden size\n")
    
    ffnn_count = FFNN_hidden_size(input_size_count, [hidden_size], output_size_count, activation=nn.Sigmoid())
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(ffnn_count.parameters(), lr=0.001)
    
    train_model(ffnn_count, criterion, optimizer, train_loader_count, num_epochs=10)
    evaluate_model(ffnn_count, test_loader_count)


Training FFNN with CountVectorizer and 64 as hidden size

Epoch [1/10], Loss: 928.1101182699203
Epoch [2/10], Loss: 521.2363376766443
Epoch [3/10], Loss: 431.4762702435255
Epoch [4/10], Loss: 367.9116633683443
Epoch [5/10], Loss: 313.4780855104327
Epoch [6/10], Loss: 265.27464877814054
Epoch [7/10], Loss: 220.85574132576585
Epoch [8/10], Loss: 179.75636037439108
Epoch [9/10], Loss: 142.45119661837816
Epoch [10/10], Loss: 109.49029067344964

Evaluations:
Accuracy: 81.19%
F1 Score: 81.05%
Precision: 81.11%
Recall: 81.15%


Training FFNN with CountVectorizer and 128 as hidden size

Epoch [1/10], Loss: 853.1412792801857
Epoch [2/10], Loss: 506.3931464701891
Epoch [3/10], Loss: 415.78018732368946
Epoch [4/10], Loss: 347.91063272953033
Epoch [5/10], Loss: 285.9519297629595
Epoch [6/10], Loss: 230.24208783730865
Epoch [7/10], Loss: 176.87036663293839
Epoch [8/10], Loss: 130.54939570464194
Epoch [9/10], Loss: 90.7079890165478
Epoch [10/10], Loss: 59.17798920162022

Evaluations:
Accuracy: 80.69

# Model for evaluating batch size

In [18]:
num_epochs = 10
learning_rate = 0.001
batch_sizes = [32, 64, 128, 256] 
results = {}
input_size_count = X_train_count.shape[1]
output_size_count = len(label_encoder)
for batch_size in batch_sizes:
    print(f"Evaluating with batch size: {batch_size}")
    
    model = FFNN(input_size_count, 256, output_size_count, num_layers=2, activation=nn.ReLU())
    
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    for batch_size in batch_sizes: 
        train_loader_count = DataLoader(train_dataset_count, batch_size, shuffle=True)
        test_loader_count = DataLoader(test_dataset_count, batch_size, shuffle=False)
    
    for epoch in range(num_epochs):
        model.train()
        for inputs, labels in train_loader_count:
            optimizer.zero_grad()
            inputs = inputs.float()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        accuracy = evaluate_model(model, test_loader_count)
    results[batch_size] = accuracy
    print(f"Accuracy with batch size {batch_size}: {accuracy}")



Evaluating with batch size: 32

Evaluations:
Accuracy: 69.58%
F1 Score: 63.04%
Precision: 59.33%
Recall: 69.50%



Evaluations:
Accuracy: 69.14%
F1 Score: 63.37%
Precision: 60.27%
Recall: 69.05%



Evaluations:
Accuracy: 68.45%
F1 Score: 62.82%
Precision: 59.68%
Recall: 68.36%



Evaluations:
Accuracy: 68.44%
F1 Score: 62.67%
Precision: 59.25%
Recall: 68.36%



Evaluations:
Accuracy: 67.90%
F1 Score: 62.61%
Precision: 60.48%
Recall: 67.84%



Evaluations:
Accuracy: 68.08%
F1 Score: 62.53%
Precision: 59.59%
Recall: 68.00%



Evaluations:
Accuracy: 67.51%
F1 Score: 62.35%
Precision: 59.82%
Recall: 67.43%



Evaluations:
Accuracy: 67.19%
F1 Score: 62.11%
Precision: 59.64%
Recall: 67.11%



Evaluations:
Accuracy: 67.50%
F1 Score: 62.37%
Precision: 59.92%
Recall: 67.42%



Evaluations:
Accuracy: 67.42%
F1 Score: 62.33%
Precision: 59.80%
Recall: 67.34%


Accuracy with batch size 256: None
Evaluating with batch size: 64

Evaluations:
Accuracy: 77.80%
F1 Score: 73.94%
Precision: 71.02%
Recall:

In [19]:
print("Evaluation results:")
for batch_size, accuracy in results.items():
    print(f"Batch size: {batch_size}, Accuracy: {accuracy}")

Evaluation results:
Batch size: 256, Accuracy: None


# Model for regularization

In [20]:


class FFNN_regularization(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dropout_rate=0.5):
        super(FFNN_regularization, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x


In [21]:
input_size_count = X_train_count.shape[1]
output_size_count = len(label_encoder)
print(f"FFNN regularization\n")

ffnn_count = FFNN_regularization(input_size_count, 256, output_size_count)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(ffnn_count.parameters(), lr=0.001)

train_model(ffnn_count, criterion, optimizer, train_loader_count, num_epochs=10)
evaluate_model(ffnn_count, test_loader_count)

FFNN regularization

Epoch [1/10], Loss: 222.71560445427895
Epoch [2/10], Loss: 141.26893106102943
Epoch [3/10], Loss: 114.34151920676231
Epoch [4/10], Loss: 94.36796480417252
Epoch [5/10], Loss: 76.94665214419365
Epoch [6/10], Loss: 62.584375970065594
Epoch [7/10], Loss: 49.860425889492035
Epoch [8/10], Loss: 40.20941513776779
Epoch [9/10], Loss: 33.51778922602534
Epoch [10/10], Loss: 27.590626560151577

Evaluations:
Accuracy: 82.67%
F1 Score: 82.60%
Precision: 82.65%
Recall: 82.65%


