# Installations

In [None]:
# Install the third-party packages this notebook adds on top of the runtime:
#   colorama - ANSI colour codes used when printing the report
#   gensim   - provides the Word2Vec model used for sentence embeddings
# NOTE(review): nltk, numpy and scikit-learn are imported below but not
# installed here - presumably the runtime already ships them; confirm.
!pip install colorama
!pip install gensim

# Imports

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from colorama import Fore
# Fetch the NLTK data packages this notebook relies on (tokenizer models and
# the stop-word lists), in the same order as before.
for _resource in ('punkt', 'stopwords', 'punkt_tab'):
  nltk.download(_resource)

# Preprocessing + Model Creation + Training Phase

In [41]:
def summarize_text_word2vec(text:str, num_sentences:int=3) -> str:
  """Return an extractive summary of ``text`` built from its own sentences.

  The text is split into sentences; each sentence is lower-cased, word
  tokenized and stripped of English stop words and non-alphanumeric tokens.
  A Word2Vec model is trained on those token lists, every sentence is
  represented by the mean of its word vectors, and each sentence is scored by
  the sum of its cosine similarities to all scored sentences. The
  ``num_sentences`` highest-scoring sentences are returned, joined by single
  spaces, in descending score order (not in document order).

  Args:
    text: The document to summarise.
    num_sentences: Maximum number of sentences to include in the summary.

  Returns:
    The summary string. An empty string is returned when ``text`` is empty or
    whitespace-only, or when ``num_sentences`` is not positive. If no sentence
    contains a usable token, the first ``num_sentences`` sentences are
    returned unchanged because there is nothing to rank them by.

  Note:
    The model is trained from scratch on the input with ``workers=4`` and no
    explicit seed handling, so rankings may differ between runs - see the
    gensim documentation on reproducibility.
  """
  # Degenerate input: nothing to summarise, and Word2Vec cannot be trained on
  # an empty corpus.
  if num_sentences <= 0 or not text or not text.strip():
    return ""

  sentences:list = nltk.sent_tokenize(text)  # Sentence tokenization.

  stop_words = set(stopwords.words('english'))
  # Word tokenization + stop-word / punctuation removal, one list per sentence.
  tokenized = [
    [word for word in word_tokenize(sentence.lower()) if word.isalnum() and word not in stop_words]
    for sentence in sentences
  ]

  # Sentences that lost every token cannot be embedded; leave them out of the
  # training corpus (they contribute nothing to it anyway).
  corpus = [tokens for tokens in tokenized if tokens]
  if not corpus:
    # No content words anywhere: fall back to document order.
    return " ".join(sentences[:num_sentences])

  # Train a Word2Vec model on the text.
  model = Word2Vec(corpus, vector_size=300, window=5, min_count=1, workers=4)

  # Sentence embeddings, plus the index of the sentence each one belongs to.
  # Tracking the index keeps scores aligned with ``sentences`` even when some
  # sentences are skipped (np.mean of an empty list would yield NaN, not None).
  kept_indices:list = []
  sentence_embeddings:list = []
  for index, tokens in enumerate(tokenized):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if vectors:
      kept_indices.append(index)
      sentence_embeddings.append(np.mean(vectors, axis=0))

  if not sentence_embeddings:
    return " ".join(sentences[:num_sentences])

  # Calculate sentence similarity using cosine similarity.
  similarity_matrix = cosine_similarity(sentence_embeddings)

  # Score each sentence by its total similarity to the others and keep the
  # top ``num_sentences``; ties are broken by the higher sentence index.
  sentence_scores = np.sum(similarity_matrix, axis=1)
  ranked_sentences = sorted(zip(sentence_scores, kept_indices), reverse=True)
  summary_sentences = [sentences[index] for _score, index in ranked_sentences[:num_sentences]]

  # Return the summary.
  return " ".join(summary_sentences)

In [None]:
# Sample input for the demo: a short multi-sentence passage describing word2vec.
# The literal's line breaks and leading spaces are part of the text passed on.
text:str = """Word2vec is a technique in natural language processing (NLP) for obtaining vector representations of words.
 These vectors capture information about the meaning of the word based on the surrounding words.
 The word2vec algorithm estimates these representations by modeling text in a large corpus.
 Once trained, such a model can detect synonymous words or suggest additional words for a partial sentence.
 Word2vec was developed by Tomáš Mikolov and colleagues at Google and published in 2013.
Word2vec represents a word as a high-dimension vector of numbers which capture relationships between words.
In particular, words which appear in similar contexts are mapped to vectors which are nearby as measured by cosine similarity. This indicates the level of semantic similarity between the words,
 so for example the vectors for walk and ran are nearby, as are those for "but" and "however", and "Berlin" and "Germany"."""

# Summarise the sample using the function's default num_sentences (3).
summary:str = summarize_text_word2vec(text)

In [43]:
# Print the original text and its summary, each followed by a whitespace-split
# word count. The original is shown in blue and the summary in light cyan; the
# trailing Fore.RESET restores the default foreground colour so the colouring
# does not leak into whatever is printed afterwards.
print(f"""
{Fore.BLUE }
Original:

{text}

Count: {len(text.split())}

      --------------------------------------------------------------------------------
{Fore.LIGHTCYAN_EX}
Summary:

{summary}

Count: {len(summary.split())}
{Fore.RESET}""")


[34m
Original:

Word2vec is a technique in natural language processing (NLP) for obtaining vector representations of words.
 These vectors capture information about the meaning of the word based on the surrounding words.
 The word2vec algorithm estimates these representations by modeling text in a large corpus.
 Once trained, such a model can detect synonymous words or suggest additional words for a partial sentence.
 Word2vec was developed by Tomáš Mikolov and colleagues at Google and published in 2013.
Word2vec represents a word as a high-dimension vector of numbers which capture relationships between words.
In particular, words which appear in similar contexts are mapped to vectors which are nearby as measured by cosine similarity. This indicates the level of semantic similarity between the words,
 so for example the vectors for walk and ran are nearby, as are those for "but" and "however", and "Berlin" and "Germany".

Count: 141
      
      --------------------------------------