In [11]:
# Import necessary libraries
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Download NLTK stopwords if not already downloaded
import nltk
nltk.download('stopwords')

# Initialize stopwords and stemmer
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# Function to preprocess text
def preprocess_text(text):
    # Remove non-alphanumeric characters
    text = re.sub(r'\W', ' ', text)
    # Convert to lowercase
    text = text.lower()
    # Remove stopwords and apply stemming
    text = ' '.join([stemmer.stem(word) for word in text.split() if word not in stop_words])
    return text

# Load the dataset
df = pd.read_csv('data/processed_data.csv')


# Assume 'abstract' is the text column and 'cited_paper_id' is the target column
df['abstract'] = df['abstract'].fillna('').apply(preprocess_text)  # Preprocess the text
texts = df['abstract'] + df['author_names'] # Preprocessed abstracts
labels = df['cited_paper_id'].notnull().astype(int)  # Binary target: 1 for citing, 0 for non-citing

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2)

# Tokenize the text
tokenizer = Tokenizer(num_words=10000)  # Use the top 10,000 words
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad the sequences
max_length = 200  # Maximum sequence length
X_train_padded = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_length, padding='post')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:

# Build the CNN model
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=max_length),  # Embedding layer
    Conv1D(filters=128, kernel_size=3, activation='relu'),  # Convolutional layer
    GlobalMaxPooling1D(),  # Global max pooling
    Dense(64, activation='relu'),  # Fully connected layer
    Dropout(0.5),  # Dropout for regularization
    Dense(1, activation='sigmoid')  # Output layer for binary classification
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_padded, y_train, epochs=5, batch_size=32, validation_split=0.2)

# Evaluate the model
loss, accuracy = model.evaluate(X_test_padded, y_test)
print(f"Test Accuracy: {accuracy:.2f}")

# Generate classification report
y_pred = (model.predict(X_test_padded) > 0.5).astype(int)
print(classification_report(y_test, y_pred))

Epoch 1/5




[1m2770/2770[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 10ms/step - accuracy: 0.7246 - loss: 0.5677 - val_accuracy: 0.7272 - val_loss: 0.5416
Epoch 2/5
[1m2770/2770[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 14ms/step - accuracy: 0.7356 - loss: 0.5172 - val_accuracy: 0.7298 - val_loss: 0.5443
Epoch 3/5
[1m2770/2770[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 10ms/step - accuracy: 0.7760 - loss: 0.4503 - val_accuracy: 0.7253 - val_loss: 0.5737
Epoch 4/5
[1m2770/2770[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 10ms/step - accuracy: 0.8528 - loss: 0.3296 - val_accuracy: 0.6838 - val_loss: 0.7381
Epoch 5/5
[1m2770/2770[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 10ms/step - accuracy: 0.9245 - loss: 0.1954 - val_accuracy: 0.6690 - val_loss: 0.9238
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.6633 - loss: 0.9511
Test Accuracy: 0.67
[1m866/866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [13]:
# Preprocess the abstracts

# Tokenize the abstracts
abstracts_seq = tokenizer.texts_to_sequences(df['abstract'].iloc[:106692] + df['author_names'].iloc[:106692])

# Pad the sequences
abstracts_padded = pad_sequences(abstracts_seq, maxlen=max_length, padding='post')

# Predict using the model
pred = model.predict(abstracts_padded)

[1m3335/3335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step


In [14]:
result = pd.DataFrame()
result['ID'] = df['paper_id'].iloc[:106692]
result['Label'] = pred 
result.to_csv('data/predictions.csv', index=False)