In [None]:
from google.colab import drive
# Mount Google Drive so that files under /content/drive are readable from this runtime.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
# Load the training data from the mounted Drive.
# The `id` column holds dotted identifiers (e.g. "2010.13821"). Read it as text:
# when pandas infers a float dtype, trailing zeros are dropped and the identifier
# is silently altered (an id ending in "0" loses its last digit).
df = pd.read_csv(
    "/content/drive/MyDrive/dataset/train.csv",
    dtype={"id": str},
)
df.head()

Unnamed: 0,id,title,abstract,category
0,2009.0642,Completely Self-Supervised Crowd Counting via ...,Dense crowd counting is a challenging task t...,cs
1,2010.13821,Wavelet Flow: Fast Training of High Resolution...,Normalizing flows are a class of probabilist...,cs
2,1904.12782,Transversally Elliptic Complex and Cohomologic...,This work is a continuation of our previous ...,math
3,2105.00878,On the Malliavin-Rubel theorem on small entire...,"In the early 1960s, P. Malliavin and L. A. R...",math
4,1906.04024,On the Odd Cycle Game and Connected Rules,We study the positional game where two playe...,math


In [None]:
# Drop the rows whose category is one of the excluded labels.
excluded_categories = ['q-alg', 'funct-an', 'alg-geom']
keep_mask = ~df['category'].isin(excluded_categories)
df = df[keep_mask]

In [None]:
# Down-sample every category to the same number of rows so the classes are balanced.
# NOTE(review): 2538 is presumably the size of the smallest remaining category — confirm.
# Sampling raises ValueError if any category has fewer rows than this.
SAMPLES_PER_CATEGORY = 2538
SAMPLING_SEED = 42  # fixed seed so that a re-run draws the same rows

# GroupBy.sample draws within each category and returns the rows ordered by category;
# reset_index(drop=True) then gives the balanced frame a clean 0..N-1 index.
df_balanced = (
    df.groupby('category')
    .sample(n=SAMPLES_PER_CATEGORY, random_state=SAMPLING_SEED)
    .reset_index(drop=True)
)

In [None]:
# Sanity check on the sampling step: show how many rows each category ended up with.
df_balanced['category'].value_counts()

astro-ph    2538
cond-mat    2538
quant-ph    2538
q-fin       2538
q-bio       2538
physics     2538
nucl-th     2538
nucl-ex     2538
nlin        2538
math-ph     2538
math        2538
hep-th      2538
hep-ph      2538
hep-lat     2538
hep-ex      2538
gr-qc       2538
eess        2538
econ        2538
cs          2538
stat        2538
Name: category, dtype: int64

In [None]:
import nltk
# Fetch NLTK's stop-word corpus; stopwords.words('english') is read from it when the text is cleaned.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import logging
from numpy import random
from nltk.corpus import stopwords
import re

In [None]:
# Punctuation treated as a token separator: / ( ) { } [ ] | @ , ;
# Raw strings keep `\[`, `\]` and `\|` as regex escapes; in a plain string literal they are
# invalid Python escape sequences and trigger a warning when the cell is compiled.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Every character that is not a digit, a lowercase ASCII letter, a space, '#', '+' or '_'.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# English stop words, held in a set for fast membership tests.
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """Normalize one document before tokenization.

    Steps, in order: lowercase the text, replace every character matched by
    REPLACE_BY_SPACE_RE with a space, replace every character matched by
    BAD_SYMBOLS_RE with a space, then drop English stop words and collapse
    runs of whitespace into single spaces.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (for example the NaN pandas uses for an
        empty CSV field) is returned unchanged, so a later ``isna()`` check still
        reports it instead of this function raising AttributeError.

    Returns
    -------
    str
        The cleaned, space-separated text (or the input itself if it was not a string).
    """
    if not isinstance(text, str):
        return text

    text = text.lower()
    # Separator punctuation becomes a space so that neighbouring words stay apart.
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    # Remaining disallowed symbols are also replaced by a space (not deleted outright).
    text = BAD_SYMBOLS_RE.sub(' ', text)
    # split() with no argument also collapses the extra spaces introduced above.
    return ' '.join(word for word in text.split() if word not in STOPWORDS)

# Run clean_text over both free-text columns, replacing each column with its cleaned version.
for text_column in ('abstract', 'title'):
    df_balanced[text_column] = df_balanced[text_column].apply(clean_text)

In [None]:
# Count the missing values in each column of the balanced frame.
df_balanced.isna().sum()

id          0
title       0
abstract    0
category    0
dtype: int64

In [None]:
# Import the WordNet lemmatizer and download the two corpora requested below.
# NOTE(review): no cell in the visible part of this notebook calls WordNetLemmatizer —
# confirm whether lemmatization was meant to be part of clean_text.
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
# Display the balanced frame to eyeball the cleaned title/abstract text.
df_balanced

Unnamed: 0,id,title,abstract,category
0,2003.13862,technologies supporting high order geodesic me...,many important problems astrophysics space phy...,astro-ph
1,1902.03992,carmenes search exoplanets around dwarfs chrom...,chromospheric modeling observed differences st...,astro-ph
2,2102.05068,low metallicity young clusters outer galaxy ii...,deep near infrared imaging low metallicity rm ...,astro-ph
3,2005.13957,linear perturbations spectra dynamical dark en...,paper study particular modified gravity equati...,astro-ph
4,2009.10555,orbits five triple stars,joint analysis radial velocities position meas...,astro-ph
...,...,...,...,...
50755,2105.03109,laplace matching fast approximate inference ge...,bayesian inference generalized linear models g...,stat
50756,2102.07771,online learning riemannian hidden markov model...,hidden markov models observations euclidean sp...,stat
50757,2007.01675,stochastic variational bayesian inference nonl...,variational bayes vb used facilitate calculati...,stat
50758,2104.01115,local global topics text modeling web pages ne...,topic models popular models analyzing collecti...,stat


In [None]:
# Model inputs are the cleaned abstracts; targets are the category names.
texts = df_balanced['abstract'].to_numpy()
labels = df_balanced['category'].to_numpy()

In [None]:
# Show the label array; at this point it holds the category names as strings.
labels

array(['astro-ph', 'astro-ph', 'astro-ph', ..., 'stat', 'stat', 'stat'],
      dtype=object)

In [None]:
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, LSTM
from keras.models import Sequential
from keras.utils import to_categorical

In [None]:
# Number of target classes.
# Counted from the dataframe rather than from `labels`: `labels` is later rebound to a
# one-hot matrix, and re-running this cell after that point would fail on set(labels).
num_classes = df_balanced['category'].nunique()
num_classes

20

In [None]:
# Learn the vocabulary from the abstracts, then encode each abstract as a list of word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Bring every sequence to the length of the longest one (pad_sequences defaults otherwise).
max_length = max(map(len, sequences))
padded_sequences = pad_sequences(sequences, maxlen=max_length)

In [None]:
from sklearn.preprocessing import LabelEncoder
# Map each category name to an integer class index, then one-hot encode it for the softmax output.
# The names are read from the dataframe instead of from `labels`: this cell rebinds `labels`
# to the one-hot matrix, so encoding `labels` itself would break when the cell is run again.
le = LabelEncoder()
label = le.fit_transform(df_balanced['category'].values)
labels = to_categorical(label, num_classes=num_classes)

In [None]:
# Split the dataset into train and test sets (80% / 20%).
# A fixed random_state makes the split repeatable across runs. Stratifying on the class
# index (argmax of the one-hot rows) keeps every category at the same proportion in both parts.
SPLIT_SEED = 42
x_train, x_test, y_train, y_test = train_test_split(
    padded_sequences,
    labels,
    test_size=0.2,
    random_state=SPLIT_SEED,
    stratify=labels.argmax(axis=1),
)

In [None]:
# Baseline classifier: word embedding -> single LSTM -> softmax over the categories.
vocab_size = len(tokenizer.word_index) + 1
model = Sequential([
    Embedding(vocab_size, 100, input_length=max_length),
    LSTM(100),
    Dense(num_classes, activation='softmax'),
])
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Fit the baseline for 10 epochs; the held-out split is evaluated after every epoch.
history = model.fit(
    x=x_train,
    y=y_train,
    epochs=10,
    validation_data=(x_test, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Second classifier: same embedding and LSTM as the baseline, plus a 128-unit ReLU
# layer between the LSTM and the softmax output.
vocab_size = len(tokenizer.word_index) + 1
model_2 = Sequential([
    Embedding(vocab_size, 100, input_length=max_length),
    LSTM(100),
    Dense(128, activation='relu'),
    Dense(num_classes, activation='softmax'),
])
model_2.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Fit the second model on the same split and for the same number of epochs as the baseline.
history2 = model_2.fit(
    x=x_train,
    y=y_train,
    epochs=10,
    validation_data=(x_test, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Write the trained baseline model to a single file in the current working directory.
model.save(filepath='model_1.h5')

In [None]:
# Write the trained second model to a single file in the current working directory.
model_2.save(filepath='model_2.h5')