In [1]:
import warnings
warnings.filterwarnings("ignore", category= UserWarning)

import numpy as np
import pandas as pd 
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression

from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from keras_tuner.tuners import RandomSearch
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.layers import Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
train = pd.read_csv('/kaggle/input/fdl-arxiv-final/train_resampled.csv')
test = pd.read_csv('/kaggle/input/fdl-arxiv-final/test.csv')

In [3]:
X_train = train.abstract
X_test = test.abstract
y_train = train.labels
y_test = test.labels

In [4]:
rng = np.random.default_rng(seed = 42)
shuffler_train = rng.permutation(len(X_train))
shuffler_test = rng.permutation(len(X_test))

In [5]:
X_train, y_train = [X_train[i] for i in shuffler_train], [y_train[i] for i in shuffler_train]
X_test, y_test = [X_test[i] for i in shuffler_test], [y_test[i] for i in shuffler_test]

# logistic regression with BOW representation 

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features = 30000)

X_train_bow = vectorizer.fit_transform(X_train)

X_test_bow = vectorizer.transform(X_test)

logreg = LogisticRegression()


logreg.fit(X_train_bow, y_train)

# Evaluate the model
accuracy = logreg.score(X_test_bow, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.8412884333821377


# CNN WITH GLOVE EMBEDDINGS

In [7]:
vocab_size = 30000
embedding_dim = 100
sequence_length = 300
num_classes = 8

In [8]:
vectorizer = TextVectorization(max_tokens=30000,
                               output_mode='int',
                               output_sequence_length=sequence_length)
vectorizer.adapt(X_train)

X_train = vectorizer(X_train)
X_test= vectorizer(X_test)

In [9]:
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)

In [10]:
def build_embeddings_index(path):
    embeddings_index = {}
    with open(path, encoding = 'latin-1') as f:
        for line in f:
            word, coeffs = line.split(maxsplit=1)
            coeffs = np.fromstring(coeffs, "f", sep=" ")
            embeddings_index[word] = coeffs
    return embeddings_index

def embedding_matrix(X_train, vectorizer, vocab_size, sequence_length, path, embedding_dim = 100):
    embeddings_index = build_embeddings_index(path)
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for i, word in enumerate(vectorizer.get_vocabulary()):
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [11]:
embedding_matrix = embedding_matrix(X_train, vectorizer,
                                           vocab_size, sequence_length,
                                           path = '/kaggle/input/glove6b/glove.6B.100d.txt')

  coeffs = np.fromstring(coeffs, "f", sep=" ")


In [29]:
def tune_and_build_cnn(hp):
    
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim,
                        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
                        trainable=False))
    filters_1 = hp.Int('filters_1', min_value=128, max_value=256, step= 32)
    kernel_size_1 = hp.Choice('kernel_size_1', values=[3, 5, 7])
    filters_2 = hp.Int('filters_2', min_value= 64, max_value=128, step=16)
    kernel_size_2 = hp.Choice('kernel_size_2', values=[3, 5])
    filters_3 = hp.Int('filters_3', min_value= 32, max_value= 64, step= 8)
    kernel_size_3 = values= 3
    model.add(Conv1D(filters_1, kernel_size_1, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(rate=0.1))
    model.add(Conv1D(filters_2, kernel_size_2, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(rate=0.1))
    model.add(Conv1D(filters_3, kernel_size_3, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(GlobalMaxPooling1D())
    units = hp.Int('units', min_value=128, max_value=512, step= 64)
    model.add(Dense(units, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    return model

In [33]:
tuner = RandomSearch(tune_and_build_cnn, objective='val_accuracy',
                     max_trials=20, executions_per_trial=1,
                     directory='/kaggle/working', project_name='result')

In [31]:
early_stopping = EarlyStopping(monitor='val_loss', patience= 2, restore_best_weights=True)

In [34]:
tuner.search(X_train, y_train, epochs=10, validation_data= (X_test, y_test), callbacks = [early_stopping])

Trial 20 Complete [00h 02m 21s]
val_accuracy: 0.8056002855300903

Best val_accuracy So Far: 0.8287701606750488
Total elapsed time: 00h 32m 57s


In [35]:
best_model = tuner.get_best_models(num_models=1)[0]
best_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         3000000   
                                                                 
 conv1d (Conv1D)             (None, None, 256)         77056     
                                                                 
 max_pooling1d (MaxPooling1D  (None, None, 256)        0         
 )                                                               
                                                                 
 dropout (Dropout)           (None, None, 256)         0         
                                                                 
 conv1d_1 (Conv1D)           (None, None, 128)         163968    
                                                                 
 max_pooling1d_1 (MaxPooling  (None, None, 128)        0         
 1D)                                                    

In [36]:
tuner.results_summary()

Results summary
Results in /kaggle/working/result
Showing 10 best trials
Objective(name="val_accuracy", direction="max")

Trial 18 summary
Hyperparameters:
filters_1: 256
kernel_size_1: 3
filters_2: 128
kernel_size_2: 5
filters_3: 40
units: 256
Score: 0.8287701606750488

Trial 01 summary
Hyperparameters:
filters_1: 224
kernel_size_1: 7
filters_2: 96
kernel_size_2: 5
filters_3: 48
units: 384
Score: 0.8272327780723572

Trial 14 summary
Hyperparameters:
filters_1: 224
kernel_size_1: 3
filters_2: 96
kernel_size_2: 3
filters_3: 32
units: 384
Score: 0.8219985365867615

Trial 13 summary
Hyperparameters:
filters_1: 224
kernel_size_1: 7
filters_2: 80
kernel_size_2: 5
filters_3: 40
units: 192
Score: 0.8179721832275391

Trial 17 summary
Hyperparameters:
filters_1: 256
kernel_size_1: 3
filters_2: 96
kernel_size_2: 3
filters_3: 48
units: 128
Score: 0.8166544437408447

Trial 10 summary
Hyperparameters:
filters_1: 224
kernel_size_1: 5
filters_2: 80
kernel_size_2: 5
filters_3: 64
units: 320
Score: 0.8

In [37]:
best_model.save('/kaggle/working/cnn_with_embed')

In [None]:
import shutil

shutil.make_archive('/kaggle/working/directory_1', 'zip', '/kaggle/working/cnn_with_embed')

# cnn 2

In [39]:
def tune_and_build_cnn_2(hp):
    
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim,
                        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
                        trainable=False))
    filters = hp.Int('filters', min_value=64, max_value=256, step= 32)
    kernel_size = hp.Choice('kernel_size', values=[3, 5, 7])
    model.add(Conv1D(filters, kernel_size, activation='relu'))
    model.add(GlobalMaxPooling1D())
    units = hp.Int('units', min_value=128, max_value=512, step= 64)
    model.add(Dense(units, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    return model

In [42]:
tuner = RandomSearch(tune_and_build_cnn_2, objective='val_accuracy',
                     max_trials=10, executions_per_trial=1,
                     directory='/kaggle/working', project_name='result_2')

In [43]:
early_stopping = EarlyStopping(monitor='val_loss', patience= 2, restore_best_weights=True)

In [44]:
tuner.search(X_train, y_train, epochs=10, validation_data= (X_test, y_test), callbacks = [early_stopping])

Trial 10 Complete [00h 00m 52s]
val_accuracy: 0.8098828792572021

Best val_accuracy So Far: 0.8498901724815369
Total elapsed time: 00h 07m 07s


In [45]:
best_model_2 = tuner.get_best_models(num_models=1)[0]
best_model_2.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         3000000   
                                                                 
 conv1d (Conv1D)             (None, None, 96)          67296     
                                                                 
 global_max_pooling1d (Globa  (None, 96)               0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 256)               24832     
                                                                 
 dense_1 (Dense)             (None, 8)                 2056      
                                                                 
Total params: 3,094,184
Trainable params: 94,184
Non-trainable params: 3,000,000
_________________________________________

In [46]:
best_model_2.save('/kaggle/working/cnn_2_with_embed')

In [47]:
import shutil

shutil.make_archive('/kaggle/working/directory_2', 'zip', '/kaggle/working/cnn_2_with_embed')

'/kaggle/working/directory_2.zip'