In [4]:
### 1. Creating a Gensim Dictionary and Corpus
import gensim
from gensim import corpora

# Sample documents
documents = [
    "Natural language processing and machine learning are fun",
    "Python is my favorite programming language",
    "I enjoy reading about machine learning"
]

# Tokenize the documents
texts = [doc.lower().split() for doc in documents]

# Create a dictionary
dictionary = corpora.Dictionary(texts)

# Create a corpus (Bag of Words)
corpus = [dictionary.doc2bow(text) for text in texts]

# Print dictionary and corpus
print("Dictionary:", dictionary.token2id)
print("Corpus:", corpus)

Dictionary: {'and': 0, 'are': 1, 'fun': 2, 'language': 3, 'learning': 4, 'machine': 5, 'natural': 6, 'processing': 7, 'favorite': 8, 'is': 9, 'my': 10, 'programming': 11, 'python': 12, 'about': 13, 'enjoy': 14, 'i': 15, 'reading': 16}
Corpus: [[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(3, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)], [(4, 1), (5, 1), (13, 1), (14, 1), (15, 1), (16, 1)]]


In [5]:

### 2. Training a Topic Model (LDA)


from gensim.models import LdaModel

# Using the previously created corpus and dictionary
# Train an LDA model (Latent Dirichlet Allocation)
lda_model = LdaModel(corpus, num_topics=2, id2word=dictionary, passes=10)

# Print the topics
topics = lda_model.print_topics(num_words=3)
for topic in topics:
    print(topic)

(0, '0.111*"machine" + 0.111*"learning" + 0.067*"fun"')
(1, '0.104*"language" + 0.103*"programming" + 0.103*"is"')


In [6]:
### 3. Word2Vec Model for Word Embeddings


from gensim.models import Word2Vec

# Sample sentences
sentences = [
    "natural language processing and machine learning are fun".split(),
    "python is my favorite programming language".split(),
    "I enjoy reading about machine learning".split()
]

# Train a Word2Vec model
word2vec_model = Word2Vec(sentences, vector_size=50, window=5, min_count=1, workers=2)

# Get the vector for a word
vector = word2vec_model.wv['machine']
print("Vector for 'machine':", vector)

# Find the most similar words to 'language'
similar_words = word2vec_model.wv.most_similar('language')

print("Most similar words to 'language':", similar_words)


Vector for 'machine': [-0.01631559  0.00899109 -0.00827429  0.00164873  0.01699636 -0.00892452
  0.00903518 -0.01357433 -0.00709761  0.01879663 -0.00315458  0.0006425
 -0.0082806  -0.01536514 -0.00301648  0.0049403  -0.00177546  0.01106799
 -0.00548653  0.00451997  0.01091197  0.01669232 -0.00290673 -0.01841601
  0.00874086  0.00114438  0.01488423 -0.00162644 -0.00527695 -0.01750608
 -0.00171279  0.00565267  0.01080237  0.01410518 -0.01140631  0.00371709
  0.0121772  -0.00959642 -0.0062141   0.01359509  0.00326311  0.00038021
  0.00694774  0.00043512  0.01923844  0.01012189 -0.01783467 -0.0140836
  0.00180344  0.01278435]
Most similar words to 'language': [('I', 0.21057182550430298), ('my', 0.1670549064874649), ('programming', 0.15019884705543518), ('are', 0.13204392790794373), ('learning', 0.1267007291316986), ('reading', 0.0998455360531807), ('natural', 0.059367649257183075), ('machine', 0.0423765629529953), ('favorite', 0.04066995531320572), ('processing', 0.012373912148177624)]


In [7]:
### 4. TF-IDF Model


from gensim.models import TfidfModel

# Create a TF-IDF model from the corpus
tfidf_model = TfidfModel(corpus)

# Transform a document using the TF-IDF model
tfidf_corpus = tfidf_model[corpus]

# Print the TF-IDF values for the first document
for doc in tfidf_corpus:
    print(doc)


[(0, 0.42998768831312806), (1, 0.42998768831312806), (2, 0.42998768831312806), (3, 0.1586956620869655), (4, 0.1586956620869655), (5, 0.1586956620869655), (6, 0.42998768831312806), (7, 0.42998768831312806)]
[(3, 0.16284991207632715), (8, 0.44124367556640004), (9, 0.44124367556640004), (10, 0.44124367556640004), (11, 0.44124367556640004), (12, 0.44124367556640004)]
[(4, 0.17855490118826328), (5, 0.17855490118826328), (13, 0.48379652089574265), (14, 0.48379652089574265), (15, 0.48379652089574265), (16, 0.48379652089574265)]


In [8]:

### 5. Doc2Vec Model

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Sample documents
documents = [
    "I love playing football",
    "Football is a great sport",
    "I play football every weekend",
    "I love watching football matches"
]

# Prepare the training data for Doc2Vec
tagged_data = [TaggedDocument(words=doc.split(), tags=[str(i)]) for i, doc in enumerate(documents)]

# Train a Doc2Vec model
doc2vec_model = Doc2Vec(tagged_data, vector_size=20, window=2, min_count=1, workers=4)

# Get the vector for a specific document (e.g., document 0)
doc_vector = doc2vec_model.dv['0']
print("Vector for document 0:", doc_vector)

# Find most similar documents to the first one
similar_docs = doc2vec_model.dv.most_similar('0')
print("Most similar documents to document 0:", similar_docs)

Vector for document 0: [-0.02617824 -0.02989686 -0.04943514  0.04278     0.01787857  0.00126838
 -0.04944359 -0.02587047 -0.04864132  0.01000107  0.01413195  0.02325049
 -0.0215772  -0.0158158  -0.01545884 -0.04362763  0.01084296  0.04613708
 -0.04757599 -0.01728128]
Most similar documents to document 0: [('1', 0.26511624455451965), ('2', -0.13240714371204376), ('3', -0.3683653473854065)]


In [9]:

### 6. Applying LSI (Latent Semantic Indexing)


from gensim.models import LsiModel

# Train an LSI model using the corpus
lsi_model = LsiModel(corpus, id2word=dictionary, num_topics=2)

# Print the topics
lsi_topics = lsi_model.print_topics(num_words=3)
for topic in lsi_topics:
    print(topic)

(0, '0.431*"machine" + 0.431*"learning" + 0.352*"language"')
(1, '0.365*"language" + 0.365*"my" + 0.365*"programming"')


In [10]:
### 7. Finding Similar Documents with Similarity Queries


from gensim.similarities import MatrixSimilarity

# Build a similarity index using the LDA model
index = MatrixSimilarity(lda_model[corpus])

# Query the similarity of the first document against the corpus
query_bow = corpus[0]
query_lda = lda_model[query_bow]
sims = index[query_lda]

# Print similarity scores
print(list(enumerate(sims)))




[(0, 1.0), (1, 0.14832774), (2, 0.99989796)]


In [11]:
### NLU using Gensim

In [12]:
import gensim
from gensim import corpora
from gensim.models import LdaModel
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Download NLTK stopwords
nltk.download('punkt')
nltk.download('stopwords')

# Sample text documents (corpus)
documents = [
    "Natural language processing (NLP) is a field of artificial intelligence.",
    "Deep learning methods have greatly improved the accuracy of language models.",
    "Machine learning algorithms are essential in building NLP systems.",
    "Gensim is a popular library for topic modeling and text analysis.",
    "Artificial intelligence and machine learning are transforming various industries."
]

### Step 1: Text Preprocessing

# Tokenize the documents and remove stopwords
stop_words = set(stopwords.words('english'))

def preprocess(text):
    tokens = word_tokenize(text.lower())  # Convert to lowercase and tokenize
    return [word for word in tokens if word.isalnum() and word not in stop_words]

# Preprocess all documents
texts = [preprocess(doc) for doc in documents]

### Step 2: Create Gensim Dictionary and Corpus

# Create a dictionary from the texts
dictionary = corpora.Dictionary(texts)

# Create a bag-of-words representation of the corpus
corpus = [dictionary.doc2bow(text) for text in texts]

# Print the dictionary and corpus
print("Dictionary:", dictionary.token2id)
print("Corpus:", corpus)

### Step 3: Train the LDA Model

# Train an LDA model with 2 topics
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2, passes=10)

# Print the topics
print("\nTopics found by the LDA Model:")
topics = lda_model.print_topics(num_words=4)
for topic in topics:
    print(topic)

### Step 4: Understanding New Documents

# New unseen document
new_doc = "Artificial intelligence and deep learning are key technologies in NLP."

# Preprocess the new document
new_bow = dictionary.doc2bow(preprocess(new_doc))

# Get the topic distribution for the new document
new_doc_topics = lda_model.get_document_topics(new_bow)

# Print the topic distribution for the new document
print("\nTopic Distribution for the New Document:")
for topic, prob in new_doc_topics:
    print(f"Topic {topic}: {prob:.3f}")

### Step 5: Coherence Score for Model Evaluation

from gensim.models import CoherenceModel

# Compute the coherence score for the model
coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model_lda.get_coherence()
print(f"\nCoherence Score: {coherence_score:.4f}")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mattsalomon/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mattsalomon/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Dictionary: {'artificial': 0, 'field': 1, 'intelligence': 2, 'language': 3, 'natural': 4, 'nlp': 5, 'processing': 6, 'accuracy': 7, 'deep': 8, 'greatly': 9, 'improved': 10, 'learning': 11, 'methods': 12, 'models': 13, 'algorithms': 14, 'building': 15, 'essential': 16, 'machine': 17, 'systems': 18, 'analysis': 19, 'gensim': 20, 'library': 21, 'modeling': 22, 'popular': 23, 'text': 24, 'topic': 25, 'industries': 26, 'transforming': 27, 'various': 28}
Corpus: [[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)], [(3, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1)], [(5, 1), (11, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1)], [(19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1)], [(0, 1), (2, 1), (11, 1), (17, 1), (26, 1), (27, 1), (28, 1)]]

Topics found by the LDA Model:
(0, '0.069*"popular" + 0.069*"analysis" + 0.069*"topic" + 0.069*"gensim"')
(1, '0.081*"learning" + 0.058*"machine" + 0.058*"artificial" + 0.057*"nlp"')

Topic Distribution for the New Do

In [24]:
import gensim.downloader as api
path = api.load("word2vec-google-news-300", return_path=True)
print(path)

/Users/mattsalomon/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz


In [29]:
!gunzip /Users/mattsalomon/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz

In [33]:
!ls /Users/mattsalomon/gensim-data/word2vec-google-news-300/*

/Users/mattsalomon/gensim-data/word2vec-google-news-300/__init__.py
/Users/mattsalomon/gensim-data/word2vec-google-news-300/word2vec-google-news-300

/Users/mattsalomon/gensim-data/word2vec-google-news-300/__pycache__:
__init__.cpython-310.pyc


In [35]:
#from gensim.models import Word2Vec
from gensim.models import KeyedVectors

# Load pretrained model (since intermediate data is not included, the model cannot be refined with additional data)
model = KeyedVectors.load_word2vec_format('/Users/mattsalomon/gensim-data/word2vec-google-news-300/word2vec-google-news-300',binary=True)

dog = model['dog']
print(dog.shape)
print(dog[:10])

# Deal with an out of dictionary word: Özgür
if 'Özgür' in model:
    print(model['Özgür'].shape)
else:
    print('{0} is an out of dictionary word'.format('Özgür'))

# Some predefined functions that show content related information for given words
print(model.most_similar(positive=['woman', 'king'], negative=['man']))

print(model.doesnt_match("breakfast cereal dinner lunch".split()))

print(model.similarity('woman', 'man'))

(300,)
[ 0.05126953 -0.02233887 -0.17285156  0.16113281 -0.08447266  0.05737305
  0.05859375 -0.08251953 -0.01538086 -0.06347656]
(300,)
[('queen', 0.7118192911148071), ('monarch', 0.6189674735069275), ('princess', 0.5902431011199951), ('crown_prince', 0.5499460697174072), ('prince', 0.5377321243286133), ('kings', 0.5236844420433044), ('Queen_Consort', 0.5235945582389832), ('queens', 0.5181134343147278), ('sultan', 0.5098593235015869), ('monarchy', 0.5087411403656006)]
cereal
0.76640123
