In [3]:
# GOT THIS SCRIPT FROM https://www.geeksforgeeks.org/text-classification-using-cnn/


# importing the necessary libraries
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from keras.preprocessing.sequence import pad_sequences
from keras.datasets import imdb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Setting up the parameters
maximum_features = 5000  # Maximum number of words to consider as features
maximum_length = 100  # Maximum length of input sequences
word_embedding_dims = 50  # Dimension of word embeddings
no_of_filters = 250  # Number of filters in the convolutional layer
kernel_size = 3  # Size of the convolutional filters
hidden_dims = 250  # Number of neurons in the hidden layer
batch_size = 32  # Batch size for training
epochs = 2  # Number of training epochs
threshold = 0.5  # Threshold for binary classification

df = pd.read_csv('data/processed_data.csv')
data = df['abstract'] + df['author_names']
labels = df['cited_paper_id'].notnull().astype(int)  # Binary target: 1 for citing, 0 for non-citing

data.head()

0    The development of an automated system for the...
1    This paper proposes a novel hybrid forward alg...
2    Modern CCD cameras are usually capable of a sp...
3    This paper deals with the problem of fuzzy non...
4    A number of neural networks can be formulated ...
dtype: object

In [4]:
#remove stop words
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.lower() not in stop_words]
    return ' '.join(tokens)
data['abstract'] = data['abstract'].fillna('').apply(remove_stopwords)


#apply stemming
from nltk.stem import PorterStemmer
nltk.download('omw-1.4')
from nltk.tokenize import word_tokenize
stemmer = PorterStemmer()
def stemming(text):
    tokens = word_tokenize(text)
    tokens = [stemmer.stem(word) for word in tokens]
    return ' '.join(tokens)
data['abstract'] = data['abstract'].fillna('').apply(stemming)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


KeyError: 'abstract'

In [None]:

# Loading the IMDB dataset
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=maximum_features)

# Padding the sequences to ensure uniform length
x_train = pad_sequences(x_train, maxlen=maximum_length)
x_test = pad_sequences(x_test, maxlen=maximum_length)

In [1]:

# Building the model
model = Sequential()

# Adding the embedding layer to convert input sequences to dense vectors
model.add(Embedding(maximum_features, word_embedding_dims,
                    input_length=maximum_length))

# Adding the 1D convolutional layer with ReLU activation
model.add(Conv1D(no_of_filters, kernel_size, padding='valid',
                 activation='relu', strides=1))

# Adding the global max pooling layer to reduce dimensionality
model.add(GlobalMaxPooling1D())

# Adding the dense hidden layer with ReLU activation
model.add(Dense(hidden_dims, activation='relu'))

# Adding the output layer with sigmoid activation for binary classification
model.add(Dense(1, activation='sigmoid'))

# Compiling the model with binary cross-entropy loss and Adam optimizer
model.compile(loss='binary_crossentropy',
              optimizer='adam', metrics=['accuracy'])

# Training the model
model.fit(x_train, y_train, batch_size=batch_size,
          epochs=epochs, validation_data=(x_test, y_test))

# Predicting the probabilities for test data
y_pred_prob = model.predict(x_test)

# Converting the probabilities to binary classes based on threshold
y_pred = (y_pred_prob > threshold).astype(int)

# Calculating the evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Printing the evaluation metrics
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1-score:', f1)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 0us/step
Epoch 1/2




[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - accuracy: 0.6978 - loss: 0.5284 - val_accuracy: 0.8509 - val_loss: 0.3339
Epoch 2/2
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.8992 - loss: 0.2505 - val_accuracy: 0.8565 - val_loss: 0.3296
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Accuracy: 0.85652
Precision: 0.8357062146892655
Recall: 0.88752
F1-score: 0.8608341416100873
