In [2]:
import spacy
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
import numpy as np

# Завантажуємо англійську модель Spacy
nlp = spacy.load("en_core_web_sm")

# Тестовий текст
text = "The quick brown fox jumps over the lazy dogs. Running, jumped, and better are words to test."

# === Токенізація ===
# Токенізація речень
sent_tokens = [sent.text for sent in nlp(text).sents]
print("Sentence Tokenization:", sent_tokens)

# Токенізація слів
word_tokens = [token.text for token in nlp(text)]
print("Word Tokenization:", word_tokens)

# === Стемінг ===
stemmer = PorterStemmer()
stems = [stemmer.stem(word) for word in word_tokens]
print("Stemming:", stems)

# === Лемматизація ===
lemmatizer = WordNetLemmatizer()
lemmas = [lemmatizer.lemmatize(word) for word in word_tokens]
print("Lemmatization (NLTK):", lemmas)

# Тестові дані
documents = [
    "The quick brown fox jumps over the lazy dog.",
    "The dog is quick and brown.",
    "Foxes are wild animals."
]

# === Bag of Words ===
bow_vectorizer = CountVectorizer()
bow = bow_vectorizer.fit_transform(documents)
print("Bag of Words:\n", bow.toarray())
print("Feature Names:", bow_vectorizer.get_feature_names_out())

# === TF-IDF ===
tfidf_vectorizer = TfidfVectorizer()
tfidf = tfidf_vectorizer.fit_transform(documents)
print("\nTF-IDF:\n", tfidf.toarray())
print("Feature Names:", tfidf_vectorizer.get_feature_names_out())

# === Word Embeddings (Word2Vec) ===
tokenized_docs = [doc.lower().split() for doc in documents]
word2vec_model = Word2Vec(sentences=tokenized_docs, vector_size=10, window=2, min_count=1)
print("\nWord Embedding (Word2Vec) for 'fox':", word2vec_model.wv['fox'])


Sentence Tokenization: ['The quick brown fox jumps over the lazy dogs.', 'Running, jumped, and better are words to test.']
Word Tokenization: ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dogs', '.', 'Running', ',', 'jumped', ',', 'and', 'better', 'are', 'words', 'to', 'test', '.']
Stemming: ['the', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazi', 'dog', '.', 'run', ',', 'jump', ',', 'and', 'better', 'are', 'word', 'to', 'test', '.']
Lemmatization (NLTK): ['The', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazy', 'dog', '.', 'Running', ',', 'jumped', ',', 'and', 'better', 'are', 'word', 'to', 'test', '.']
Bag of Words:
 [[0 0 0 1 1 1 0 0 1 1 1 1 2 0]
 [1 0 0 1 1 0 0 1 0 0 0 1 1 0]
 [0 1 1 0 0 0 1 0 0 0 0 0 0 1]]
Feature Names: ['and' 'animals' 'are' 'brown' 'dog' 'fox' 'foxes' 'is' 'jumps' 'lazy'
 'over' 'quick' 'the' 'wild']

TF-IDF:
 [[0.         0.         0.         0.26807016 0.26807016 0.35248004
  0.         0.         0.35248004 0.35248004 0.3524