In [4]:
import spacy
from pymongo import MongoClient
import fasttext


In [None]:
# Load English tokenizer, tagger, parser, and NER
# (the "en_core_web_sm" spaCy pipeline). `nlp` is the callable the later
# cells use to turn raw text into a processed document.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Raise the pipeline's maximum accepted text length so that the combined
# abstracts can be processed in one call; choose a value at least as large as
# the text you intend to pass in.
nlp.max_length = 2_500_000

In [None]:
# Connect to MongoDB (default host/port) and select the article collection
client = MongoClient()
db = client['article_recommendation']
article_collection = db['article']

# Fetch every document, but only its `abstract` field: nothing else is used
# below, so the projection avoids transferring the rest of each article.
all_articles = article_collection.find({}, {'abstract': 1})

# Collect the usable abstracts. Documents whose abstract is missing, null, or
# not a string are skipped, because " ".join() raises TypeError on any
# non-str item and would abort the whole cell.
all_abstracts = [
    article['abstract']
    for article in all_articles
    if isinstance(article.get('abstract'), str)
]

# Concatenate all abstracts into a single space-separated string
all_abstracts_text = " ".join(all_abstracts)


# Example text
# text = "This is an example sentence. John go to the school."

In [None]:
# Process the text: run the spaCy pipeline over the concatenated abstracts.
# The resulting `doc` is the object the following cells iterate over.
doc = nlp(all_abstracts_text)

Tokenization:
Tokenization is the process of splitting text into individual tokens, such as words, numbers, and punctuation marks.

In [None]:
# Walk the processed document and print the raw text of each token, one per line
for token_text in (token.text for token in doc):
    print(token_text)

Part-of-speech (POS) Tagging:
POS tagging assigns a grammatical label to each token, such as noun, verb, adjective, etc.

In [None]:
# Show every token next to its part-of-speech tag
for tok in doc:
    word, pos_tag = tok.text, tok.pos_
    print(word, pos_tag)


Named Entity Recognition (NER):
NER identifies named entities such as persons, organizations, locations, etc.

In [None]:
# List each named entity found in the document together with its label
for entity in doc.ents:
    print(f"{entity.text} {entity.label_}")

1-Removing Stopwords:
Stopwords are common words (e.g., "the", "is", "and") that are often removed during preprocessing.

In [None]:
from spacy.lang.en.stop_words import STOP_WORDS

In [None]:
# Keep only the tokens whose lowercased text is not in STOP_WORDS
filtered_tokens = []
for token in doc:
    if token.text.lower() in STOP_WORDS:
        continue
    filtered_tokens.append(token.text)

# Rebuild a space-separated text from the surviving tokens
filtered_text = ' '.join(filtered_tokens)

# Re-run the pipeline so `doc` now holds the stopword-free text, then show it
doc = nlp(filtered_text)
print(filtered_text)

In [None]:
# Filter out stopwords
# filtered_tokens = [token.text for token in doc if not token.is_stop]
# filtered_tokens

2-Removing Punctuation:

In [None]:
# Keep every token that is not punctuation (i.e. drop the punctuation tokens)
filtered_tokens = [tok.text for tok in doc if not tok.is_punct]

# Join the kept tokens into one string, re-run the pipeline on it so `doc`
# reflects the cleaned text, and show the result
clean_text = " ".join(filtered_tokens)
doc = nlp(clean_text)
print(clean_text)

3-Lemmatization:
Lemmatization reduces words to their base or root form.

In [None]:
# Print each token alongside its lemmatized form
for tok in doc:
    print(f"{tok.text} {tok.lemma_}")


In [None]:
# Build one space-separated string from the lemma of every token in `doc`;
# the bare name on the last line displays it as the cell output
lemmatized_abstract = " ".join(tok.lemma_ for tok in doc)
lemmatized_abstract

In [1]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import string
from pymongo import MongoClient

# Load the "en_core_web_sm" English spaCy pipeline. `preprocess_abstract`
# below calls `nlp` to tokenize and lemmatize each abstract.
nlp = spacy.load("en_core_web_sm")

# Function to preprocess the abstract
def preprocess_abstract(abstract):
    """Lowercase, tokenize, filter, and lemmatize a single abstract.

    Args:
        abstract: Raw abstract text. ``None``, non-string values, and
            empty or whitespace-only strings are accepted and yield ``""``.

    Returns:
        str: The lemmas of the retained tokens joined by single spaces.
        A token is dropped when its text is in ``STOP_WORDS``, when it is
        punctuation, or when it consists only of whitespace.
    """
    # Missing or blank abstracts have nothing to process; returning early
    # avoids an AttributeError on `None.lower()` and a pointless pipeline run.
    if not isinstance(abstract, str) or not abstract.strip():
        return ""

    # Tokenize the lowercased abstract
    tokens = nlp(abstract.lower())

    def _is_punctuation(token):
        # `text in string.punctuation` is a substring test, so runs such as
        # "..." or "--" slipped through. Checking every character catches
        # them, and `is_punct` additionally covers non-ASCII punctuation.
        return token.is_punct or all(ch in string.punctuation for ch in token.text)

    # Remove stopwords, punctuation and whitespace tokens; lemmatize the rest
    processed_tokens = [
        token.lemma_
        for token in tokens
        if token.text not in STOP_WORDS
        and not token.is_space
        and not _is_punctuation(token)
    ]

    # Join the processed tokens back into a string
    return ' '.join(processed_tokens)

# Connect to MongoDB
client = MongoClient()  # Update with your MongoDB connection URI
db = client["article_recommendation"]  # Update with your database name
articles_collection = db["article"]  # Update with your collection name

# Retrieve articles from MongoDB. Only `abstract` is needed for preprocessing
# (`_id` is returned by default), so project away every other field.
articles = articles_collection.find({}, {"abstract": 1})

# Process each article
for article in articles:
    abstract = article.get("abstract")

    # Skip documents with a missing or non-text abstract. Indexing
    # article["abstract"] directly raises KeyError on such documents and
    # would abort the run part-way through the collection.
    if not isinstance(abstract, str):
        continue

    # Preprocess the abstract
    preprocessed_abstract = preprocess_abstract(abstract)

    # Update the article in the collection with the preprocessed abstract
    articles_collection.update_one(
        {"_id": article["_id"]},
        {"$set": {"preprocessed_abstract": preprocessed_abstract}},
    )


In [1]:
import fasttext

In [2]:
# Load the pre-trained model
# The path is relative, so it resolves against the notebook's working
# directory (two levels up).
# NOTE(review): the file name suggests 300-dimensional English vectors — confirm.
model = fasttext.load_model('../../cc.en.300.bin')



In [4]:
# Get the vector embedding for a single word
# (the bare name on the last line displays the array as the cell output)
word_embedding = model.get_word_vector('word')
word_embedding

array([-1.70015432e-02,  8.15567933e-03, -1.88383684e-02,  1.23793781e-01,
        5.05270716e-03, -6.70323521e-02,  3.45998369e-02,  1.84294637e-02,
       -7.64054060e-02, -4.19045091e-02, -8.06465298e-02,  1.28976656e-02,
        2.00077564e-01,  1.82825729e-01,  2.11075842e-02,  7.39751160e-02,
        1.44422576e-02, -2.42733415e-02,  4.43353727e-02, -2.89499313e-02,
       -3.03028524e-02,  1.96899474e-03, -5.38224690e-02, -1.65010661e-01,
        4.10782881e-02, -2.79612606e-03,  1.20798722e-02,  7.80077726e-02,
       -1.24231517e-01,  9.36542973e-02,  1.25443518e-01, -4.22076993e-02,
        1.87173914e-02, -1.19053960e-01, -3.97234261e-02,  3.18592042e-02,
       -3.64079326e-03, -1.88531447e-02,  5.71619682e-02, -9.96990204e-02,
        9.28039253e-02, -1.79177329e-01,  2.02885523e-04,  1.45949662e-01,
       -1.19298488e-01,  2.01106481e-02, -2.17653401e-02,  5.99322319e-02,
        1.02034425e-02,  1.73299946e-02, -9.40109342e-02,  2.78837830e-02,
       -9.87280831e-02, -

In [6]:
# Get the vector embedding for a sentence
# (the bare name on the last line displays the array as the cell output)
sentence_embedding = model.get_sentence_vector('your sentence')
sentence_embedding

array([ 2.20564678e-02,  3.35800238e-02, -2.18866915e-02,  1.03436381e-01,
       -4.20465022e-02, -7.09157586e-02,  1.89646464e-02,  3.62958163e-02,
        8.37822072e-03, -7.26430714e-02, -4.14594412e-02, -1.50927966e-02,
        1.77079570e-02,  9.17252973e-02,  4.17290628e-02,  5.53209372e-02,
        5.23850434e-02, -2.83666775e-02,  1.02283200e-04, -6.09768033e-02,
       -2.66362354e-02, -2.81010419e-02,  9.60057369e-04, -6.37210011e-02,
        1.24849118e-02,  4.27029058e-02,  1.80861782e-02,  2.03714650e-02,
       -1.91458035e-02,  9.76150036e-02,  2.62749121e-02,  4.45653535e-02,
        4.37861308e-02, -1.23121291e-01, -3.74393351e-02, -4.68374044e-02,
        1.38125811e-02,  4.69143540e-02, -2.24425383e-02,  3.88402678e-03,
        2.34032907e-02, -2.84987241e-02, -1.53347040e-02,  8.29414278e-02,
       -1.49825159e-02,  2.90117897e-02, -1.39033813e-02, -5.84693477e-02,
        5.31740189e-02,  5.66520961e-04, -3.39121348e-03,  2.55819224e-02,
       -8.31171572e-02, -

In [1]:
from pymongo import MongoClient

# Open a connection to the local MongoDB server and pick the article collection
client = MongoClient('mongodb://localhost:27017/')
db = client['article_recommendation']
collection = db['article']

# Rename `vector_embedding` to `fasttext_vector_embedding` on every document.
# The call is the cell's last expression, so its UpdateResult is displayed.
match_all_documents = {}
rename_embedding_field = {'$rename': {'vector_embedding': 'fasttext_vector_embedding'}}
collection.update_many(match_all_documents, rename_embedding_field)


<pymongo.results.UpdateResult at 0x211e5193a08>

In [2]:
from transformers import AutoModel, AutoTokenizer

# Specify the SciBERT model name
# The same identifier is passed to both from_pretrained() calls below, so the
# tokenizer and the model weights come from the same checkpoint.
model_name = "allenai/scibert_scivocab_uncased"

# Load the SciBERT model
scibert_model = AutoModel.from_pretrained(model_name)

# Load the SciBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Example usage: tokenize one sentence into PyTorch tensors
# (return_tensors="pt") and feed them to the model as keyword arguments.
# NOTE(review): the forward pass is not wrapped in torch.no_grad(); presumably
# fine for a one-off demo — confirm before embedding many abstracts.
text = "This is an example sentence to tokenize."
inputs = tokenizer(text, return_tensors="pt")
outputs = scibert_model(**inputs)


Downloading: 100%|██████████| 385/385 [00:00<00:00, 64.2kB/s]
Downloading: 100%|██████████| 422M/422M [03:05<00:00, 2.39MB/s] 
Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClas