In [5]:
import pandas as pd
import spacy
import re
from tqdm import tqdm

In [2]:
df_train= pd.read_csv("./arxiv_train.csv")
df_test=pd.read_csv("./arxiv_test.csv")

In [4]:
#Removing Special character
def remove_special_characters(text):
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)
df_train['abstract'] = df_train['abstract'].apply(remove_special_characters)
df_train['label'] = df_train['label'].apply(remove_special_characters)
df_test['abstract'] = df_test['abstract'].apply(remove_special_characters)
df_test['label'] = df_test['label'].apply(remove_special_characters)

In [6]:
#Tokenizing
nlp = spacy.load("en_core_web_sm")
def tokenize_with_progress(texts):
    tokenized_texts = []
    for doc in tqdm(nlp.pipe(texts, batch_size=20), total=len(texts)):
        tokenized_texts.append([token.text for token in doc])
    return tokenized_texts

In [7]:
df_train['abstract_tokenized'] = tokenize_with_progress(df_train['abstract'].tolist())
df_train['label_tokenized'] = tokenize_with_progress(df_train['label'].tolist())

100%|██████████| 80000/80000 [17:52<00:00, 74.60it/s]
100%|██████████| 80000/80000 [00:43<00:00, 1845.35it/s]


In [8]:
def lemmatize_tokens(token_lists):
    lemmatized_texts = []
    # Processing each list of tokens
    for token_list in tqdm(token_lists, total=len(token_lists)):
        # Temporarily rejoining tokens for processing as Doc objects
        doc = nlp(" ".join(token_list))
        # Lemmatizing tokens and collecting lemmas
        lemmatized_texts.append([token.lemma_ for token in doc])
    return lemmatized_texts

In [9]:
df_train['abstract_tokenized_lemmatized'] = lemmatize_tokens(df_train['abstract_tokenized'].tolist())


100%|██████████| 80000/80000 [3:07:24<00:00,  7.11it/s]      


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
X_train_TF = tfidf_vectorizer.fit_transform(df_train['abstract'])
y_train_TF = df_train['label']

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer()
X_train_count = count_vectorizer.fit_transform(df_train['abstract'])
y_train_count = df_train['label']