In [29]:
#word embedding Word2vec
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

In [30]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [31]:
text = "Word embedding is a powerful technique in natural language processing, capturing semantic relationships between words. Utilizing algorithms like Word2Vec or GloVe, it transforms words into high-dimensional vectors. These vectors preserve contextual information, facilitating tasks like sentiment analysis, machine translation, and document clustering"

In [32]:
sentences = sent_tokenize(text)
tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in sentences]


In [33]:
#train word2vec model
model = Word2Vec(tokenized_sentences, vector_size=100, window=5, min_count=1, sg=0)


In [34]:

#acess word vectors
word_vectors = model.wv
vector = word_vectors['word']
print("vector for word",vector)

vector for word [-4.7877301e-03 -4.3088831e-03 -4.8047639e-03 -9.8078838e-03
 -7.7606901e-04 -7.8003672e-03 -4.8259199e-03 -7.3728049e-03
  9.9300118e-03 -1.3649601e-03 -7.7508264e-03  5.5093332e-03
 -8.0684284e-03 -1.9051839e-03  2.7484007e-03 -7.0880977e-03
 -8.7190233e-03 -6.6439309e-03  7.4664587e-03  9.0433713e-03
  4.0041995e-03  2.4071313e-03  4.1500423e-03 -4.0752497e-03
  9.7585255e-03  6.8997387e-03  4.0977001e-03  8.9474991e-03
 -8.3900252e-03  9.3544787e-04  8.9066550e-03 -9.4757846e-04
 -6.7378129e-03 -2.2899285e-03 -2.7877369e-03 -2.5010272e-03
 -1.8444222e-03  7.2917496e-03 -3.1157737e-04  4.2268690e-03
 -3.7631996e-03  8.2187662e-03 -6.4854939e-03 -9.3289278e-03
 -4.1385097e-03 -7.3735467e-03 -4.8864479e-03  4.8679614e-04
  2.3911817e-03  1.7403081e-03  5.0138067e-03 -1.4038198e-03
  2.2743368e-03  3.5204491e-04  7.8353090e-03  8.0244029e-03
 -1.9645488e-03  1.2591110e-03 -6.0412632e-03 -4.7685327e-03
 -6.9669224e-03  1.4424011e-06 -4.8934356e-03 -4.0285895e-03
  1.4983

In [35]:
#finding similar words
similar_words = model.wv.most_similar('embedding',topn= 3)
print("similar words to'embedding' :", similar_words)

similar words to'embedding' : [('between', 0.31900984048843384), ('sentiment', 0.18884754180908203), ('in', 0.16206954419612885)]


Word2Vec is a popular word embedding technique in natural language processing. Developed by Google, it represents words as dense vectors in a continuous vector space. It employs two models: Continuous Bag of Words (CBOW) predicts a word from its context, while Skip-gram predicts context words given a target word. These embeddings capture semantic relationships, facilitating tasks like sentiment analysis and machine translation, and are learned from large text corpora, enabling efficient representation of words with similar meanings in a numerical format.







In [36]:
#co-occurrence vectors
from collections import defaultdict

In [37]:
window_size = 2
co_occurrence_matrix = defaultdict(lambda: defaultdict(int))
for sentence in tokenized_sentences:
  for i , target_word in enumerate(sentence):
    for j in range(max(0, i - window_size), min(len(sentence), i + window_size + 1)):
      if i != j:
        context_word = sentence[j]
        co_occurrence_matrix[target_word][context_word] +=1



In [38]:
for target_word, context_word in co_occurrence_matrix.items():
  print(target_word, context_word)
print()

word defaultdict(<class 'int'>, {'embedding': 1, 'is': 1})
embedding defaultdict(<class 'int'>, {'word': 1, 'is': 1, 'a': 1})
is defaultdict(<class 'int'>, {'word': 1, 'embedding': 1, 'a': 1, 'powerful': 1})
a defaultdict(<class 'int'>, {'embedding': 1, 'is': 1, 'powerful': 1, 'technique': 1})
powerful defaultdict(<class 'int'>, {'is': 1, 'a': 1, 'technique': 1, 'in': 1})
technique defaultdict(<class 'int'>, {'a': 1, 'powerful': 1, 'in': 1, 'natural': 1})
in defaultdict(<class 'int'>, {'powerful': 1, 'technique': 1, 'natural': 1, 'language': 1})
natural defaultdict(<class 'int'>, {'technique': 1, 'in': 1, 'language': 1, 'processing': 1})
language defaultdict(<class 'int'>, {'in': 1, 'natural': 1, 'processing': 1, ',': 1})
processing defaultdict(<class 'int'>, {'natural': 1, 'language': 1, ',': 1, 'capturing': 1})
, defaultdict(<class 'int'>, {'language': 1, 'processing': 1, 'capturing': 1, 'semantic': 1, 'or': 1, 'glove': 1, 'it': 1, 'transforms': 1, 'contextual': 1, 'information': 1, 

Co-occurrence matrices are a fundamental concept in NLP, and we can use them to represent the relationship between elements in a text corpus. Usually, in NLP, we work with a collection of text or text corpus. Elements of text corpus can refer to sentences, words, phrases, or any other linguistic unit of interest.

With co-occurrence matrices, it is possible to represent these elements using rows and columns of a matrix. More precisely, each row and column of a matrix represents a unique element of a text corpus. Cells of the matrix represent the number of times two elements appear together in a predefined context. The context can be a document, sentence, word window, or any other relevant unit

In [39]:
#doc2vec

In [40]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument


In [41]:
tagged_data = [TaggedDocument(words=words, tags = [str(idx)]) for idx ,words in enumerate(tokenized_sentences)]

In [42]:
#train the doc2vec model
model = Doc2Vec(vector_size=100, window=5, min_count = 1, dm=1, epochs = 20)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples = model.corpus_count, epochs = model.epochs)

In [43]:
doc_vector = model.infer_vector(word_tokenize("DOC2Vec is a powerful tool for documents embeddings"))
print(doc_vector)

[-3.8006394e-03 -7.6809665e-04 -1.6907499e-03 -2.1911014e-03
  1.6654968e-03  3.8139229e-03  3.3960289e-03 -4.2499080e-03
  3.9905277e-03 -2.0782156e-03 -4.0397733e-03  3.5720621e-03
  7.4088655e-04  3.4847155e-03 -3.3172825e-03  2.8531004e-03
 -1.6846898e-04  4.7689835e-03  2.8872620e-03 -2.0982616e-03
  3.5755602e-03  3.1527507e-03  3.5843158e-03  8.0920977e-04
 -3.2826650e-03 -3.4659245e-04  1.8716755e-03 -2.2667188e-03
 -2.8462415e-03  8.7195536e-04  1.2657329e-03  4.1245185e-03
  5.6471606e-04 -2.5968167e-03 -1.6592431e-03  1.3917380e-03
 -4.9890880e-03  4.6170740e-03 -2.7549677e-03 -1.3207658e-03
  8.7225251e-04 -3.7129656e-03  1.4312005e-03  3.9877831e-03
  3.8101235e-03 -1.4723151e-03 -2.4656674e-03  3.1810470e-03
  1.0947608e-03 -2.3028874e-03 -1.0190174e-03  1.4492406e-05
 -2.5434566e-03  3.1412757e-04  4.6439879e-03 -8.9946348e-04
 -4.1318811e-03 -1.7177417e-03 -1.4343650e-04  9.7687756e-05
  1.3573680e-03  4.9959756e-03 -1.7995589e-03 -4.3527400e-03
  4.5892517e-03 -4.41497

Doc2vec is a technique that extracts semantic information from documents and then uses that information to classify the documents. By applying Doc2vec to existing documents, it becomes possible for AI software to rapidly identify similar topics in a large collection of text without having to read the entire corpus

In [44]:
#textblob
from textblob import TextBlob

In [45]:
nltk.download('brown')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [46]:
text = 'Sunil Chhetri (born 3 August 1984) is an Indian professional footballer who plays as a forward and captains both Indian Super League club Bengaluru FC and the India national team. He is known for his link-up play, goal scoring abilities, and leadership.[3][4] He is the third-highest international goalscorer among active players, behind only Cristiano Ronaldo and Lionel Messi,[5][6] fourth overall, and is also the most-capped player and the all-time top goalscorer of the India national team. He is widely regarded as one of the greatest Indian footballers of all time for his contributions for the country'

In [47]:
# introduction to text blob
intro_blob = TextBlob(text)
print("introduction to text blob: ", intro_blob)

introduction to text blob:  Sunil Chhetri (born 3 August 1984) is an Indian professional footballer who plays as a forward and captains both Indian Super League club Bengaluru FC and the India national team. He is known for his link-up play, goal scoring abilities, and leadership.[3][4] He is the third-highest international goalscorer among active players, behind only Cristiano Ronaldo and Lionel Messi,[5][6] fourth overall, and is also the most-capped player and the all-time top goalscorer of the India national team. He is widely regarded as one of the greatest Indian footballers of all time for his contributions for the country


In [48]:
text1 = "I'm extremely disappointed with the quality of this item. It broke after just a few uses."

In [49]:
sentiment_blob = TextBlob(text1)
polarity = sentiment_blob.sentiment.polarity
sentiment = 'positive' if polarity >0 else 'negative' if polarity < 0 else 'neutral'
print('sentiment analysis:',sentiment, "(Polarity:" ,polarity,")")

sentiment analysis: negative (Polarity: -0.475 )


In [50]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [51]:
#part of speech tagging and noun phrase extraction
pos_blob = TextBlob("the black cat is sleeping on the soft mat")
pos_tags = pos_blob.tags
noun_phrases = pos_blob.noun_phrases
print("part of speech tags:" ,pos_tags)
print('noun pharse', noun_phrases)

part of speech tags: [('the', 'DT'), ('black', 'JJ'), ('cat', 'NN'), ('is', 'VBZ'), ('sleeping', 'VBG'), ('on', 'IN'), ('the', 'DT'), ('soft', 'JJ'), ('mat', 'NN')]
noun pharse ['black cat', 'soft mat']


Here, 'DT' stands for determiner, 'JJ' for adjective, 'NN' for noun, 'VBZ' for verb (present tense, 3rd person singular), 'VBG' for verb (gerund or present participle), 'IN' for preposition or subordinating conjunction, etc.

In [52]:
#subclass textblob to add custom methods
class ExtendedTextBlob(TextBlob):
  def my_custom_function(self,name):
    return "we create our sub class" ,name

extended_blob = ExtendedTextBlob("this is some text")
result = extended_blob.my_custom_function('class1')
print(result)

('we create our sub class', 'class1')


In [53]:
#language taranslation and language detection
!pip install googletrans==4.0.0-rc1



In [54]:
from googletrans import Translator
translator =Translator()
original_text = "अरे दोस्तों आप कैसे हो"
detected_lang = translator.detect(original_text).lang
translated_text = translator.translate(original_text, src = detected_lang, dest ='en').text

print("original text:",original_text)
print("Detected language",detected_lang)
print("Translated Text",translated_text)


original text: अरे दोस्तों आप कैसे हो
Detected language hi
Translated Text Hey guys how are you



TextBlob is a Python library for processing textual data in natural language processing (NLP). It simplifies common NLP tasks such as part-of-speech tagging, noun phrase extraction, sentiment analysis, classification, translation, and more. TextBlob is built on top of NLTK (Natural Language Toolkit) and provides a simple API for beginners while offering flexibility for advanced users. It is widely used for quick prototyping and analysis of text data due to its ease of use and effectiveness in handling various NLP tasks.