In [19]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import re

### Import Dataset

In [15]:
train_df = pd.read_csv("../Data/arxiv_train.csv")
test_df = pd.read_csv("../Data/arxiv_test.csv")

### Dataset Overview

In [18]:
research_fields = sorted(train_df["label"].unique())

# Getting number of rows
num_rows = train_df.shape[0]
print(f"Number of articles: {num_rows}\n")

# Printing each unique element in an ordered list format
print("Research Fields")
for i, element in enumerate(research_fields, start=1):
    print(f"{i}. {element}")

Number of articles: 80000

Research Fields
1. astro-ph
2. cond-mat
3. cs
4. eess
5. hep-ph
6. hep-th
7. math
8. physics
9. quant-ph
10. stat


### Data Cleaning

In [20]:
def clean_text(text):

    # Remove special characters
    text = re.sub(r"\W", " ", text)

    # Remove single characters
    text = re.sub(r"\s+[a-zA-Z]\s+", " ", text)

    # Remove single characters from the start
    text = re.sub(r"\^[a-zA-Z]\s+", " ", text)

    # Substitute multiple spaces with single space
    text = re.sub(r"\s+", " ", text, flags=re.I)

    # Remove prefixed 'b'
    text = re.sub(r"^b\s+", "", text)

    # Converting to Lowercase
    text = text.lower()

    # Removing stopwords
    stopwords = set(ENGLISH_STOP_WORDS)
    text = " ".join([word for word in text.split() if word not in stopwords])

    return text


# Cleaning the abstracts
train_df["cleaned_abstract"] = train_df["abstract"].apply(clean_text)

In [23]:
train_df.head(10)

Unnamed: 0.1,Unnamed: 0,abstract,label,cleaned_abstract
0,31716,Automatic meeting analysis is an essential f...,eess,automatic meeting analysis essential fundament...
1,89533,We propose a protocol to encode classical bi...,quant-ph,propose protocol encode classical bits measure...
2,82700,A number of physically intuitive results for...,quant-ph,number physically intuitive results calculatio...
3,78830,In the last decade rare-earth hexaborides ha...,physics,decade rare earth hexaborides investigated fun...
4,94948,We introduce the weak barycenter of a family...,stat,introduce weak barycenter family probability d...
5,74849,Direct Statistical Simulation (DSS) solves t...,physics,direct statistical simulation dss solves equat...
6,66424,We introduce a notion of a girth-regular gra...,math,introduce notion girth regular graph k regular...
7,6562,Planet host stars with well-constrained ages...,astro-ph,planet host stars constrained ages provide rar...
8,84292,Unprecedented increase of complexity and sca...,quant-ph,unprecedented increase complexity scale data e...
9,18822,"The usual concepts of topological physics, s...",cond-mat,usual concepts topological physics berry curva...
