In [None]:
import spacy
from pymongo import MongoClient

In [None]:
# Load English tokenizer, tagger, parser, and NER.
# `nlp` is the pipeline object that the processing cells below call on the text.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Increase the max_length limit so the concatenated abstracts can be passed to
# `nlp` in a single call further down.
# NOTE(review): spaCy documents this limit as a memory guard for the parser/NER;
# confirm the kernel has enough RAM for a text of this size.
nlp.max_length = 2500000  # Maximum text length (in characters) accepted by nlp(); must cover your text

In [None]:
# Connect to the default local MongoDB instance and select the article collection.
client = MongoClient()
db = client['article_recommendation']
article_collection = db['article']

# Fetch every document, but project only the `abstract` field: nothing else is
# read in this cell, so there is no need to transfer whole articles.
all_articles = article_collection.find({}, {'abstract': 1, '_id': 0})

# Collect the abstracts. Documents without an abstract are skipped, and so are
# abstracts that are null or not strings -- those would make the join below
# raise TypeError.
all_abstracts = [
    article['abstract']
    for article in all_articles
    if isinstance(article.get('abstract'), str)
]

# Concatenate all abstracts into a single string
all_abstracts_text = " ".join(all_abstracts)

# Tip: for a quick experiment, assign a short literal to `all_abstracts_text`
# instead, e.g. "This is an example sentence. John go to the school."

In [None]:
# Process the text: run the loaded pipeline over the concatenated abstracts.
# The resulting `doc` is what the tokenization, POS, NER and preprocessing
# cells below iterate over.
doc = nlp(all_abstracts_text)

Tokenization:
Tokenization is the process of splitting text into individual words or tokens.

In [None]:
# Iterate over tokens.
# Only the first 50 are shown: `doc` covers every abstract in the collection,
# so printing all of its tokens would flood the notebook output.
for token in doc[:50]:
    print(token.text)

Part-of-speech (POS) Tagging:
POS tagging assigns a grammatical label to each token, such as noun, verb, adjective, etc.

In [None]:
# Iterate over tokens with POS tags.
# Only the first 50 are shown: `doc` covers every abstract in the collection,
# so printing a line per token would flood the notebook output.
for token in doc[:50]:
    print(token.text, token.pos_)


Named Entity Recognition (NER):
NER identifies named entities such as persons, organizations, locations, etc.

In [None]:
# Extract named entities.
# `doc.ents` is a tuple of entity spans; only the first 50 are shown because
# the whole corpus can contain far more entities than is useful to print.
for ent in doc.ents[:50]:
    print(ent.text, ent.label_)

1-Removing Stopwords:
Stopwords are common words (e.g., "the", "is", "and") that are often removed during preprocessing.

In [None]:
from spacy.lang.en.stop_words import STOP_WORDS

In [None]:
# Remove stopwords: keep every token whose lowercase form is not in STOP_WORDS
# (lowercasing makes the lookup case-insensitive, so "The" is dropped too).
filtered_tokens = [token.text for token in doc if token.text.lower() not in STOP_WORDS]

# Join filtered tokens back into a single string
filtered_text = ' '.join(filtered_tokens)

# Re-run the pipeline so the following cells work on the stopword-free text.
doc = nlp(filtered_text)

# Show only the beginning: the full text spans every abstract in the collection.
print(filtered_text[:500])

In [None]:
# Filter out stopwords
# filtered_tokens = [token.text for token in doc if not token.is_stop]
# filtered_tokens

2-Removing Punctuation:
Punctuation tokens (e.g., ".", ",", "!") are dropped so that only words remain.

In [None]:
# Filter out punctuation: keep only the tokens spaCy does not flag as punctuation.
filtered_tokens = [token.text for token in doc if not token.is_punct]

# Join the remaining tokens into a string
clean_text = " ".join(filtered_tokens)

# Re-run the pipeline so the following cells work on the punctuation-free text.
doc = nlp(clean_text)

# Show only the beginning: the full text spans every abstract in the collection.
print(clean_text[:500])

3-Lemmatization:
Lemmatization reduces each word to its base (dictionary) form, called the lemma — e.g., "studies" → "study".

In [None]:
# Iterate over tokens with lemmatized forms.
# Only the first 50 are shown: `doc` covers every abstract in the collection,
# so printing a line per token would flood the notebook output.
for token in doc[:50]:
    print(token.text, token.lemma_)


In [None]:
# Generate the text from lemmatized tokens (the full result stays in
# `lemmatized_abstract`).
lemmatized_abstract = " ".join(token.lemma_ for token in doc)

# Display only the beginning: the full string spans every abstract in the collection.
lemmatized_abstract[:500]

In [1]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import string
from pymongo import MongoClient

# Load the small English pipeline (tokenizer, tagger, parser, NER, lemmatizer).
# NOTE: spaCy's `_sm` packages ship without static word vectors; switch to an
# `_md`/`_lg` package if vectors are ever needed.
# This rebinds `nlp` to a freshly loaded pipeline object.
nlp = spacy.load("en_core_web_sm")

# Function to preprocess the abstract
def preprocess_abstract(abstract):
    """Lowercase an abstract, drop stopwords/punctuation/whitespace, and lemmatize.

    Args:
        abstract: Raw abstract text. ``None``, non-string values and blank
            strings are treated as "no abstract".

    Returns:
        str: The lemmas of the surviving tokens joined by single spaces, or
        ``""`` when there is nothing to process.
    """
    # A missing or malformed abstract would otherwise crash on ``.lower()``.
    if not isinstance(abstract, str) or not abstract.strip():
        return ""

    # Lowercase first so the stopword lookup below is case-insensitive.
    tokens = nlp(abstract.lower())

    processed_tokens = [
        token.lemma_
        for token in tokens
        if token.text not in STOP_WORDS
        # is_punct also catches multi-character punctuation such as "..." or
        # "--", which a substring test against string.punctuation lets through.
        and not token.is_punct
        # Whitespace-only tokens (newlines, runs of spaces) carry no content.
        and not token.is_space
        # Keep dropping ASCII symbols such as "$" or "+": they are listed in
        # string.punctuation but are symbols, not punctuation, to is_punct.
        and token.text not in string.punctuation
    ]

    # Join the processed tokens back into a string
    return ' '.join(processed_tokens)

# Connect to MongoDB
client = MongoClient()  # Update with your MongoDB connection URI
db = client["article_recommendation"]  # Update with your database name
articles_collection = db["article"]  # Update with your collection name

# Retrieve articles from MongoDB. Only `_id` (returned by default) and
# `abstract` are needed below, so project just those fields.
articles = articles_collection.find({}, {"abstract": 1})

# Process each article
for article in articles:
    # Not every document is guaranteed to carry a usable abstract; indexing
    # with article["abstract"] would raise KeyError on the first one without it.
    abstract = article.get("abstract")
    if not isinstance(abstract, str):
        continue

    # Preprocess the abstract
    preprocessed_abstract = preprocess_abstract(abstract)

    # Store the result next to the original text, leaving `abstract` untouched.
    articles_collection.update_one(
        {"_id": article["_id"]},
        {"$set": {"preprocessed_abstract": preprocessed_abstract}},
    )
