In [None]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
#Importing necessary libraries
import pandas as pd
import nltk
nltk.download('stopwords')
import logging
from numpy import random
from nltk.corpus import stopwords
import re
from nltk.util import ngrams
import random
from nltk.corpus import wordnet
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
#Loading data from google drive
train_data = pd.read_csv("/content/drive/MyDrive/dataset/train.csv")

In [None]:
train_data.head()

Unnamed: 0,id,title,abstract,category
0,2009.0642,Completely Self-Supervised Crowd Counting via ...,Dense crowd counting is a challenging task t...,cs
1,2010.13821,Wavelet Flow: Fast Training of High Resolution...,Normalizing flows are a class of probabilist...,cs
2,1904.12782,Transversally Elliptic Complex and Cohomologic...,This work is a continuation of our previous ...,math
3,2105.00878,On the Malliavin-Rubel theorem on small entire...,"In the early 1960s, P. Malliavin and L. A. R...",math
4,1906.04024,On the Odd Cycle Game and Connected Rules,We study the positional game where two playe...,math


In [None]:
#Let's visualize data distribution
train_data['category'].value_counts()

cs          262503
math        177679
cond-mat     71772
physics      66282
astro-ph     58737
stat         51308
eess         34330
quant-ph     25171
hep-th       19706
hep-ph       19207
gr-qc        15590
math-ph      12238
q-bio        10530
hep-ex        7435
nucl-th       6662
nlin          6047
q-fin         5942
econ          3924
nucl-ex       3627
hep-lat       2538
q-alg            2
funct-an         1
alg-geom         1
Name: category, dtype: int64

In [None]:
#Drop duplicate values
train_data = train_data.drop_duplicates(subset='title')
train_data['category'].value_counts()

cs          139225
math        105146
cond-mat     42786
physics      38243
astro-ph     38039
stat         22745
quant-ph     16020
eess         15843
hep-ph       11400
hep-th       10690
gr-qc         8662
q-bio         5665
hep-ex        3950
math-ph       3822
nucl-th       3517
q-fin         3053
nlin          2894
econ          2050
nucl-ex       1788
hep-lat       1132
funct-an         1
alg-geom         1
Name: category, dtype: int64

In [None]:
#dropping rows with just 1 sample
df = train_data[~train_data['category'].isin(['funct-an', 'alg-geom'])]

In [None]:
#Cleaning text data
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub(' ', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwors from text
    return text

df['abstract'] = df['abstract'].apply(clean_text)
df['title'] = df['title'].apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['abstract'] = df['abstract'].apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['title'] = df['title'].apply(clean_text)


In [None]:
#Visualize
df['category'].value_counts()

cs          139225
math        105146
cond-mat     42786
physics      38243
astro-ph     38039
stat         22745
quant-ph     16020
eess         15843
hep-ph       11400
hep-th       10690
gr-qc         8662
q-bio         5665
hep-ex        3950
math-ph       3822
nucl-th       3517
q-fin         3053
nlin          2894
econ          2050
nucl-ex       1788
hep-lat       1132
Name: category, dtype: int64

In [None]:
len(df)

476670

Since some classes have very few data, let's try to generate more data through augmentation so that the model will have enough data to be trained later.

In [None]:
#Function for text augmentation
#thorai data vako class ma apply gareko xa, to generate more data
def augment(sentences):

  augmented_sentences = []

  for sentence in sentences:
    tokens = word_tokenize(sentence)
    new_tokens = []
    for token in tokens:
        synonyms = wordnet.synsets(token)
        if synonyms:
            new_token = random.choice(synonyms).lemmas()[0].name()
            new_tokens.append(new_token)
        else:
            new_tokens.append(token)
    augmented_sentence = " ".join(new_tokens)
    augmented_sentences.append(augmented_sentence)

  return(augmented_sentences)

In [None]:
#Creating a copy
df_copy = df.copy()

In [None]:
df_copy['category'].value_counts()

cs          139225
math        105146
cond-mat     42786
physics      38243
astro-ph     38039
stat         22745
quant-ph     16020
eess         15843
hep-ph       11400
hep-th       10690
gr-qc         8662
q-bio         5665
hep-ex        3950
math-ph       3822
nucl-th       3517
q-fin         3053
nlin          2894
econ          2050
nucl-ex       1788
hep-lat       1132
Name: category, dtype: int64

In [None]:
#Listing classes which have low number of data
less = ['gr-qc', 'q-bio', 'hep-ex', 'math-ph', 'nucl-th', 'q-fin', 'nlin', 'econ', 'nucl-ex', 'hep-lat']

In [None]:
#classes with extremely high number of training data
#excess data vako class bata chai data remove garinxa (after sometime)
more = ['cs', 'math']
import numpy as np

In [None]:
#some ugly code
#ke trick le ramro banauna sakxau vane give it a try
A = df.loc[df['category'] == 'gr-qc']
B = df.loc[df['category'] == 'q-bio']
C = df.loc[df['category'] == 'hep-ex']
D = df.loc[df['category'] == 'math-ph']
E = df.loc[df['category'] == 'nucl-th']
F = df.loc[df['category'] == 'q-fin']
G = df.loc[df['category'] == 'nlin']
H = df.loc[df['category'] == 'econ']
I = df.loc[df['category'] == 'nucl-ex']
J = df.loc[df['category'] == 'hep-lat']

In [None]:
#some more ugly code XD
A1 = A['title'].values
A2 = A['abstract'].values
B1 = B['title'].values
B2 = B['abstract'].values
C1 = C['title'].values
C2 = C['abstract'].values
D1 = D['title'].values
D2 = D['abstract'].values
E1 = E['title'].values
E2 = E['abstract'].values
F1 = F['title'].values
F2 = F['abstract'].values
G1 = G['title'].values
G2 = G['abstract'].values
H1 = H['title'].values
H2 = H['abstract'].values
I1 = I['title'].values
I2 = I['abstract'].values
J1 = J['title'].values
J2 = J['abstract'].values

In [None]:
A11 = augment(A1)
A21 = augment(A2)
B11 = augment(B1)
B21 = augment(B2)
C11 = augment(C1)
C21 = augment(C2)
D11 = augment(D1)
D21 = augment(D2)
E11 = augment(E1)
E21 = augment(E2)
F11 = augment(F1)
F21 = augment(F2)
G11 = augment(G1)
G21 = augment(G2)
H11 = augment(H1)
H21 = augment(H2)
I11 = augment(I1)
I21 = augment(I2)
J11 = augment(J1)
J21 = augment(J2)

In [None]:
# Create a DataFrame from the lists
#less = ['gr-qc', 'q-bio', 'hep-ex', 'math-ph', 'nucl-th', 'q-fin', 'nlin', 'econ', 'nucl-ex', 'hep-lat']
dfA = pd.DataFrame({'title': A11, 'abstract': A21, 'category':'gr-qc'})
dfB = pd.DataFrame({'title': B11, 'abstract': B21, 'category':'q-bio'})
dfC = pd.DataFrame({'title': C11, 'abstract': C21, 'category':'hep-ex'})
dfD = pd.DataFrame({'title': D11, 'abstract': D21, 'category':'math-ph'})
dfE = pd.DataFrame({'title': E11, 'abstract': E21, 'category':'nucl-th'})
dfF = pd.DataFrame({'title': F11, 'abstract': F21, 'category':'q-fin'})
dfG = pd.DataFrame({'title': G11, 'abstract': G21, 'category':'nlin'})
dfH = pd.DataFrame({'title': H11, 'abstract': H21, 'category':'econ'})
dfI = pd.DataFrame({'title': I11, 'abstract': I21, 'category':'nucl-ex'})
dfJ = pd.DataFrame({'title': J11, 'abstract': J21, 'category':'hep-lat'})

Yo result_df ma hamile regenerate gareko data xa (10 ota class ko data)

result_df contains data generated from text augmentation

In [None]:
result_df = pd.concat([dfA, dfB, dfC, dfD, dfE, dfF, dfG, dfH, dfI, dfJ], ignore_index=True)

In [None]:
#Let's visualize
result_df.tail()

Unnamed: 0,title,abstract,category
36528,meson spectrum su nitrogen estimate theory qua...,gauge theory gauge group su N quark belong arb...,hep-lat
36529,actual time evolution heavy quark glasma,inaugurate fresh real fourth_dimension concept...,hep-lat
36530,color electric correlation function gradient flow,report_card advance study color electric corre...,hep-lat
36531,analyze pion pion scattering combination propa...,paper reputation Holocene development hal quan...,hep-lat
36532,wicket barn phase agent radius v_ cb,discus Holocene progress lattice calculation b...,hep-lat


In [None]:
#comparision with the original
df.tail()

Unnamed: 0,id,title,abstract,category
861212,2102.00941,fast greedy subset selection large candidate s...,subset selection interesting important topic f...,cs
861221,2106.06956,billiard tables rotational symmetry,generalize following simple geometric fact cen...,math
861222,1909.02882,implications increased lambda separation energ...,stimulated recent indications binding energy h...,nucl-th
861226,1902.05287,risk management machine learning based algorithms,propose machine learning based algorithms solv...,q-fin
861233,1904.00375,lightchain dht based blockchain resource const...,append distributed database blockchain utilize...,cs


In [None]:
#final_df = pd.concat([df, result_df], ignore_index=True)

In [None]:
#final_df.tail()

In [None]:
# Shuffling a Pandas dataframe with .shuffle()
#Shuffle gareko because category wise serially xa data haru
shuffled = result_df.sample(frac=1)

In [None]:
shuffled.head()

Unnamed: 0,title,abstract,category
32264,ambiguous opinion dynamic consistency,opinion plot transmitter telephone_receiver am...,econ
29311,conformal geodesic gravitational instantons,analyze integrability conformal geodesic flow ...,nlin
23130,antiproton output heavy ion collision subthres...,inside framework Lanzhou quantum molecular dyn...,nucl-th
22322,quarkonium phenomenology popularize Gauss law,salute better analytic parametrisation complex...,nucl-th
11627,comparative study educate intensity neurofeedback,neurofeedback prove utilitarian many exemplify...,q-bio


In [None]:
#Thorai vako class ma add garne kaam sakkiyo
#So aba dherai vako class bata remove garna try garam

In [None]:
#dherai data vako class haru
cs=df.loc[df['category'] == 'cs']
math = df.loc[df['category'] == 'math']
cond_mat = df.loc[df['category'] == 'cond-mat']

In [None]:
#Let's make a dataframe containing these 3 categories
major = pd.concat([cs, math, cond_mat], axis=0)

In [None]:
major.tail()

Unnamed: 0,id,title,abstract,category
861114,2006.00349,ii vi organic inorganic hybrid nanostructures ...,organic inorganic hybrids may offer material p...,cond-mat
861136,2106.05721,self oscillation synchronisation transitions e...,interplay activity elasticity often found acti...,cond-mat
861171,2011.13215,oscillatory shear flows dense suspensions impo...,oscillatory shear widely used study rheologica...,cond-mat
861192,2103.09143,maximal diversity zipf law,zipf law describes empirical size distribution...,cond-mat
861199,2005.09337,local thickness composition measurements scann...,measured local composition thickness sio2 base...,cond-mat


In [None]:
# Separating the independent variables from dependent variables
#major dataframe bata matra
X = major.iloc[:,1:-1]
y = major.iloc[:,-1]

In [None]:
#Now, let's do undersampling
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Apply the random under-sampling
rus = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = rus.fit_resample(X, y)

In [None]:
len(X_resampled), len(X)

(128358, 287157)

In [None]:
#X_resampled, y_resampled vaneko 3ta class ko dherai data lai thorai banayepachi ko ho hai
y_resampled.head()

0    cond-mat
1    cond-mat
2    cond-mat
3    cond-mat
4    cond-mat
Name: category, dtype: object

In [None]:
#Now let's combine X_resampled and y_resampled into a new dataframe --reduced
reduced = pd.concat([X_resampled, y_resampled], axis = 1)

In [None]:
#Visualize
reduced.tail()

Unnamed: 0,title,abstract,category
128353,crepant resolution holomorphic anomaly equatio...,study orbifold gromov witten theory quotient c...,math
128354,passive beamforming information transfer via l...,large intelligent surface lis emerged promisin...,math
128355,codes hierarchical locality covering maps curves,locally recoverable lrc codes provide ways rec...,math
128356,general affine invariants plane curves space c...,present fundamental theory curves affine plane...,math
128357,period mappings ampleness hodge line bundle,discuss progress towards conjectural hodge the...,math


In [None]:
reduced['category'].value_counts()

cond-mat    42786
cs          42786
math        42786
Name: category, dtype: int64

In [None]:
# so among those three classes, math had the smallest number(42786)
#That's why other two classes namely cond-mat and cs also are reduced to the size of math

In [None]:
#Mathi thorai vako data lai badhayera pani sakkiyo -- reduced
#Ani tespachi dherai vako data lai ghatayera pani sakkiyo -- shuffled
#Aba kunai kunai class ko data as it is xa - df
#Aba sabailai appropriately combine garera final dataset banaune

In [None]:
#Now original dataframe bata major class haru remove garna paryo (3 ota class)
rem_cs=df.loc[df['category'] != 'cs']
rem_math = rem_cs.loc[rem_cs['category'] != 'math']
major_removed = rem_math.loc[rem_math['category'] != 'cond-mat'] # 3ta class hatyo

In [None]:
#Let's see 3 ota class hatyo ki nai
major_removed['category'].value_counts()

physics     38243
astro-ph    38039
stat        22745
quant-ph    16020
eess        15843
hep-ph      11400
hep-th      10690
gr-qc        8662
q-bio        5665
hep-ex       3950
math-ph      3822
nucl-th      3517
q-fin        3053
nlin         2894
econ         2050
nucl-ex      1788
hep-lat      1132
Name: category, dtype: int64

In [None]:
#Visualize 
major_removed.head()

Unnamed: 0,id,title,abstract,category
5,2102.10644,surface tension destabiliser vortical interface,study dynamics initially flat interface two im...,physics
7,1902.08524,acoustically modulated optical emission hexago...,investigate effect surface acoustic waves atom...,physics
13,1912.02502,monte carlo studies spin chirality decoupling ...,extensive equilibrium monte carlo simulation p...,physics
14,1911.08413,igatelink gateway library linking iot edge fog...,recent years internet things iot growing popul...,eess
18,2005.00921,epidemic model sipherd application prediction ...,propose epidemic model sipherd three categorie...,q-bio


In [None]:
#Now, lets combine our dataframes -- reduced, shuffled, major_removed
final_df = pd.concat([reduced, shuffled, major_removed], ignore_index=True)

In [None]:

len(reduced)+ len(shuffled)+ len(major_removed) == len(final_df)

True

In [None]:
#Visualize
final_df.head()

Unnamed: 0,title,abstract,category,id
0,noise noise ratios correlation length calculat...,finite random systems possible define two type...,cond-mat,
1,exactly solvable model 4+1d beyond cohomology ...,construct exactly solvable commuting projector...,cond-mat,
2,n independent localized krylov bogoliubov de g...,propose ultra fast numerical approach large sc...,cond-mat,
3,molding 3d curved structures selective heating,interest fabricate curved surfaces three dimen...,cond-mat,
4,piezoelectric properties substitutionally dope...,modern semiconductor materials increasingly us...,cond-mat,


In [None]:
#Let's see how the data distribution has become
final_df['category'].value_counts()

cond-mat    42786
math        42786
cs          42786
physics     38243
astro-ph    38039
stat        22745
gr-qc       17324
quant-ph    16020
eess        15843
hep-ph      11400
q-bio       11330
hep-th      10690
hep-ex       7900
math-ph      7644
nucl-th      7034
q-fin        6106
nlin         5788
econ         4100
nucl-ex      3576
hep-lat      2264
Name: category, dtype: int64

In [None]:
#Finally, our dataset looks much better 
#Dataset has become way more balaned than original one

Now, let's preprocess our text data so that they can be used with RNN model

In [None]:
#Create features and labels
X1 = final_df['title'].values
X2 = final_df['abstract'].values
labels = final_df['category'].values
texts = X1+X2

In [None]:
texts[:3]

array(['noise noise ratios correlation length calculations near criticalityfinite random systems possible define two types variances noises demonstrated ratio useful calculating correlation length infinite rather general random system function temperature numerical method obtaining variables relevant real space numerical renormalization simulation method matter correlation length obtained novel technique may used obtain directly critical correlation exponent nu rather indirectly using scaling relations often done method demonstrated applying random field ising model',
       'exactly solvable model 4+1d beyond cohomology symmetry protected topological phaseconstruct exactly solvable commuting projector model 4+1 dimensional mathbb z _2 symmetry protected topological phase spt outside cohomology classification spts model described decorated domain wall construction three fermion walker wang phases domain walls describe anomalous nature phase several ways one interesting feature contrast

In [None]:
labels

array(['cond-mat', 'cond-mat', 'cond-mat', ..., 'quant-ph', 'nucl-th',
       'q-fin'], dtype=object)

In [None]:
num_classes = len(set(labels))
num_classes

20

In [None]:
from keras.utils import to_categorical

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
label = le.fit_transform(labels)
labels = to_categorical(label, num_classes=num_classes)

In [None]:
# Split the dataset into train and validation sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2)

In [None]:
#Let's use embedding (transfer learning)
embedding = "https://tfhub.dev/google/nnlm-en-dim50/2"

In [None]:
import tensorflow as tf
import tensorflow_hub as hub

In [None]:
hub_layer = hub.KerasLayer(embedding, input_shape=[], 
                           dtype=tf.string, trainable=True)

In [None]:
hub_layer(texts[:2])

<tf.Tensor: shape=(2, 50), dtype=float32, numpy=
array([[ 0.2638609 , -0.64824814,  0.04562867, -0.1098823 , -0.9232943 ,
         0.01212509, -0.07654639,  0.5181153 , -0.8579675 ,  0.22006272,
         0.14659154,  0.1276213 ,  0.41004118, -0.64290124, -0.22879317,
         0.03863401,  0.53961587, -0.6501783 ,  0.5105377 , -0.19373697,
        -0.09167834,  0.6161593 ,  0.74786407, -0.546173  ,  0.14825577,
        -0.34129947,  0.18156204,  0.12870675,  0.57867944, -0.24376369,
         0.20244217,  0.2578369 , -0.05890707, -0.44663632, -0.02562064,
        -0.5357646 ,  0.21947792, -0.82818544,  0.13461113, -0.17218429,
         0.21569909,  0.7806    ,  0.5461602 ,  0.42031467, -0.02079178,
        -0.4855936 , -0.44177014,  0.19496308,  1.2824658 ,  0.3102327 ],
       [ 0.30610925, -1.1763316 ,  0.10156737, -0.02282765, -0.9733205 ,
         0.00179539, -0.21293321,  0.57153076, -1.2882495 ,  0.3619177 ,
        -0.2925936 ,  0.20599039,  0.30590293, -0.1680086 ,  0.0947945 ,
 

In [None]:
#Now, let's build the model
model = tf.keras.Sequential()
model.add(hub_layer)
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(num_classes, activation='softmax'))

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 keras_layer_2 (KerasLayer)  (None, 50)                48190600  
                                                                 
 dense_2 (Dense)             (None, 16)                816       
                                                                 
 dense_3 (Dense)             (None, 20)                340       
                                                                 
Total params: 48,191,756
Trainable params: 48,191,756
Non-trainable params: 0
_________________________________________________________________


In [None]:
#Compile the model
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [None]:
#Train the model
history = model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs = 10)

In [None]:
from keras.models import load_model

# Save the model
model.save('model1.h5')


In [None]:
# Download the model file
from google.colab import files
files.download('model1.h5')

loaded_model = load_model('model1.h5')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>