In [1]:
import os
import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from tqdm import tqdm

In [2]:
def download_nltk_data():
    datasets = ["stopwords", "wordnet", "punkt"]
    for dataset in datasets:
        try:
            nltk.data.find(f'corpora/{dataset}')
        except LookupError:
            nltk.download(dataset)

In [3]:
def preprocess_text(text, stop_words, lemmatizer):
    """
    Preprocesses a given text.
    """
    if pd.isna(text):
        return ''

    # Cleaning operations
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'<.*?>', '', text)    # Remove HTML tags
    text = re.sub(r'\d+', '', text)      # Remove digits
    text = re.sub(r'\b\w{1,2}\b', '', text)  # Remove words with 1 or 2 characters
    text = re.sub(r'[^A-Za-z0-9]+', ' ', text)  # Keep only alphanumeric characters

    words = word_tokenize(text.lower())
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

    return ' '.join(words)

In [4]:
def preprocess_text(text, stop_words, lemmatizer):
    if pd.isna(text):
        return ''

    # Cleaning operations
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\b\w{1,2}\b', '', text)
    text = re.sub(r'[^A-Za-z0-9]+', ' ', text)

    words = word_tokenize(text.lower())
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]


    return ' '.join(words)



def preprocess_data(input_file, output_file, chunk_size=50000):
    download_nltk_data()

    stop_words = set(stopwords.words('english'))
    custom_stopwords = [
        "ability", "able", "absolute", "absolutely", "account", "accurate", "achieve", "address",
        "allowing", "also", "analyze", "analyzes", "answer", "application", "approach",
        "around", "art", "article", "aspect", "audience", "author", "available", "based", "begin", "best", "better",
        "beyond", "bound", "brief", "called", "capable", "capture", "carefully", "case", "certain", "challenging",
        "compare", "compared" "complex", "component", "comprehensive", "concept", "conceptual",
        "conclusion", "condition", "conduct", "conjecture", "consider", "construct", "content", "context", "cost",
        "cross", "crucial", "current", "demonstrate", "derive", "derived", "describe",
        "described", "describes", "detailed", "determine", "developed", "different", "difficult", "directly",
        "discourse", "discuss", "distinguish", "driven", "due", "effect", "effective", "efficient", "efficiently",
        "eight", "element", "emphasis", "end", "enhanced", "evaluate", "even", "example", "experiment", "experimental",
        "explain", "extensive", "family", "feature", "figure", "finally", "find", "fine", "finite", "finitely", "first",
        "fit", "five", "found", "four", "form", "framework", "function", "fundamental", "future", "general", "give",
        "given", "good", "grained", "graph", "group", "handed", "high", "higher", "however", "illustrate", "impact",
        "implement", "important", "include", "included", "including", "integrate", "interest", "introduce", "introduced", "introduction",
        "investigate", "issue", "iteration", "known", "large", "last", "leading", "left", "let", "like", "long", "low", "lower", "make",
        "many", "maximal", "may", "method", "methodology", "minimal", "model", "moreover", "multiple",
        "necessary", "need", "needed", "new", "news", "next", "nine", "non", "note", "novel", "number", "numerical",
        "objective", "observables", "observation", "obtain", 'obtained', "often", "one", "open", "operator", "optimal", "order",
        "outline", "outlines", "output", "paper", "papr", "parameter", "part", "particular", "perform",
        "performance", "performed", "performing", "performance", "phase", "point", "possible",
        "potential", "pre", "precisely", "present", "previous", "principle", "problem", "process", "prof",
        "proof", "proper", "property", "propose", "proposed", "proposes", "prove", "provide", "provided",
        "publicly", "publish", "purpose", "quality", "question", "range", "real", "recent", "recently",
        "recommendation", "related", "reliable", "representation", "require", "research", "result", "rev", "review",
        "right", "rigorous", "role", "scale", "scenario", "second", "section", "selection", "series", "serious", "set", "setting",
        "seven", "show", "shown", "significant", "significantly", "simulation", "simple", "single", "six", "solution", "state",
        "strongly", "structure", "studied", "study", "sufficient", "suggestion", "sum", "synthesize", "system",
        "table", "take", "taken", "task", "technique", "ten", "term", "theorem", "theory", "third",
        "though", "three", "thus", "time", "topic", "two", "type", "upper", "use", "used", "using", "utilize", "valid",
        "value", "variable", "variety", "various", "via", "view", "way", "well", "whether", "wide", "widely", "within",
        "without", "work", "world", "written", "year", "zero", "zeroth"]
    stop_words = stop_words.union(custom_stopwords)
    lemmatizer = WordNetLemmatizer()

    with open(input_file, 'r') as f:
        total_rows = sum(1 for _ in f)

    output_exists = os.path.isfile(output_file)

    with tqdm(total=total_rows) as pbar:
        for chunk in pd.read_json(input_file, lines=True, chunksize=chunk_size):
            # drop duplicates
            chunk = chunk[['title', 'abstract', 'categories', 'update_date']].drop_duplicates(
                subset=['title', 'abstract'])
            # convert update_date to datetime
            chunk['update_date'] = pd.to_datetime(chunk['update_date'])
            chunk = chunk[chunk['update_date'].dt.year > 2018]
            # preprocess text 
            chunk['title'] = chunk['title'].apply(preprocess_text, args=(stop_words, lemmatizer))
            chunk['abstract'] = chunk['abstract'].apply(preprocess_text, args=(stop_words, lemmatizer))
            # combine title and abstract into one column and drop original columns
            chunk['text'] = chunk['title'] + ' ' + chunk['abstract']
            chunk = chunk.drop(['title', 'abstract'], axis=1)
            chunk = chunk[chunk['text'] != '']
            # split categories into list of categories and save number of categories in a separate column and drop original column
            chunk['categories_list'] = chunk['categories'].str.split()
            chunk['num_categories'] = chunk['categories_list'].apply(len)
            chunk = chunk.drop(['categories'], axis=1)

            # save to csv
            if not output_exists:
                chunk.to_csv(output_file, mode='w', index=False)
                output_exists = True
            else:
                chunk.to_csv(output_file, mode='a', index=False, header=False)

            pbar.update(chunk_size)

In [5]:
raw_path = '../data/arxiv-metadata-oai.json'
processed_path = '../data/data_preprocessed.csv'
preprocess_data(raw_path, processed_path)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
2290000it [13:06, 2909.80it/s]                              
