In [10]:
import pandas as pd
import numpy as np

import re
import nltk

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer

In [11]:
# Load the raw arXiv dataset from the project's data folder
initial_data = pd.read_csv(filepath_or_buffer='data/arxiv-dataset.csv')

In [12]:
# Work on an independent (deep) copy so the frame loaded from disk stays untouched
processed_data = initial_data.copy(deep=True)

In [13]:
# Build one free-text field per paper: "<title> <authors> <abstract>".
# na_rep='' makes a missing title/authors/abstract contribute an empty string;
# without it, str.cat returns NaN for the whole row as soon as one part is missing.
processed_data['text'] = processed_data['title'].str.cat(
    processed_data[['authors', 'abstract']],
    sep=' ',
    na_rep='',
)

In [14]:
# The source columns are no longer needed once they have been merged into 'text'
processed_data = processed_data.drop(
    columns=['link', 'title', 'publishedDate', 'authors', 'abstract']
)

In [15]:
# Filled on first use so the stop-word list and the lemmatizer are built once,
# not once per processed document.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the (stop-word set, lemmatizer) pair, creating it on first call."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def textPreProcessing(text):
    """Normalize a raw text into a space-separated string of lemmatized tokens.

    Steps, in order: lowercase, strip every character that is not an ASCII
    letter or whitespace (digits, punctuation and accented letters are removed,
    so hyphenated words are fused), tokenize, drop English stop words and
    lemmatize each remaining token.

    Requires the NLTK tokenizer, stop-word and WordNet data to be installed
    (e.g. through ``nltk.download``).

    Parameters
    ----------
    text : str or None or float
        Text to clean. ``None`` and NaN (the pandas marker for a missing
        value) are treated as empty text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``''`` when nothing is left.
    """
    # Missing values coming from pandas (None / NaN) carry no text to clean.
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Convert to lowercase
    text = text.lower()

    # Remove special characters and symbols
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Nothing but whitespace left: skip tokenization altogether
    if not text.strip():
        return ''

    # Text tokenization (temporary)
    tokens = word_tokenize(text)

    # Stopwords removal followed by lemmatization of the surviving words
    stop_words, lemmatizer = _get_text_resources()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    # Join tokens array in a single string
    return ' '.join(tokens)

In [19]:
# Clean every document. Mapping over the 'text' column directly avoids building
# a full row Series per record, which is what DataFrame.apply(axis=1) does, and
# always yields a Series aligned on the same index (even for an empty frame).
processed_data['text'] = processed_data['text'].map(textPreProcessing)

In [29]:
# Lookup table: one row per distinct category group (in order of first
# appearance in the data) with a consecutive integer label starting at 0
category_L1 = (
    pd.DataFrame({'category': processed_data['categoryGroup'].unique()})
    .assign(label=lambda groups: np.arange(len(groups)))
)

In [32]:
# Attach the integer label of each row's category group, then discard the
# category descriptor columns (including the join key brought in by the merge)
processed_data_catL1 = (
    processed_data
    .merge(category_L1, how='left', left_on='categoryGroup', right_on='category')
    .drop(columns=['categoryId', 'categoryName', 'categoryGroup', 'category'])
)

In [35]:
# Persist the label lookup table and the labelled dataset.
# to_csv is used with its defaults, so the row index is written as well.
category_L1.to_csv(path_or_buf='data/categoryL1-label.csv')
processed_data_catL1.to_csv(path_or_buf='data/arxiv-dataset-processed-L1.csv')

In [37]:
# Number of documents per label, most frequent first
processed_data_catL1['label'].value_counts(sort=True, ascending=False)

4    25500
0    19708
3    16000
5     5000
6     4500
7     3000
2     2000
1     1500
Name: label, dtype: int64