In [None]:
# Word2Vec Visualizations

In [None]:
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from gensim.models import Word2Vec

# Load the saved Word2Vec model; `model` is read by the Word2Vec cells below.
word2vec_model_path = "models/word2vec.model"
model = Word2Vec.load(word2vec_model_path)

In [None]:
# Take the first `top_n` vocabulary keys (presumably the most frequent words,
# given gensim's default vocabulary ordering) and look up their vectors.
top_n = 500
common_words = model.wv.index_to_key[:top_n]
common_vectors = model.wv[common_words]

# Seeded 2-D t-SNE projection; `tsne` and `word_vectors_2d` are reused by later cells.
tsne = TSNE(n_components=2, random_state=42, perplexity=100)
word_vectors_2d = tsne.fit_transform(common_vectors)

# Static scatter plot with every point labelled by its word.
plt.figure(figsize=(15, 15))
xs, ys = word_vectors_2d[:, 0], word_vectors_2d[:, 1]
plt.scatter(xs, ys, s=10)
for word, x, y in zip(common_words, xs, ys):
    plt.annotate(word, xy=(x, y))
plt.show()

In [None]:
import plotly.graph_objects as go

# Interactive version of the word scatter plot (hover shows the word).
# The 2-D coordinates in `word_vectors_2d` come from the t-SNE cell above and are
# reused as-is: fitting the same seeded t-SNE on the same `common_vectors` again
# would only repeat an expensive computation.
trace = go.Scatter(
    x=word_vectors_2d[:, 0],
    y=word_vectors_2d[:, 1],
    mode='markers',
    text=common_words
)

data = [trace]

layout = go.Layout(
    title="Word2Vec t-SNE Visualization",
    hovermode='closest',
)

fig = go.Figure(data=data, layout=layout)
fig.show()



In [None]:
from sklearn.cluster import KMeans

# Cluster the vectors of the whole vocabulary, then colour the plotted words by cluster.
word_vectors = model.wv.vectors

# Seeded so cluster ids (and therefore colours) are the same on every run.
kmeans = KMeans(n_clusters=50, random_state=42)
vocab_labels = kmeans.fit_predict(word_vectors)

# Only the `common_words` subset is plotted, so keep just the labels of those words.
# `common_words` is the leading slice of `model.wv.index_to_key`, and gensim stores the
# rows of `model.wv.vectors` in `index_to_key` order, so the first len(common_words)
# labels line up one-to-one with the rows of `word_vectors_2d`.
labels = vocab_labels[:len(common_words)]

trace = go.Scatter(
    x=word_vectors_2d[:, 0],
    y=word_vectors_2d[:, 1],
    mode='markers',
    text=common_words,
    marker=dict(
        color=labels,
        colorscale='Viridis',
        size=8,
        line_width=1
    )
)

data = [trace]

layout = go.Layout(
    title="Word2Vec t-SNE Visualization with K-means Clusters",
    hovermode='closest',
)

fig = go.Figure(data=data, layout=layout)
fig.show()

In [None]:
# Group the plotted words by the cluster id paired with them in `labels`.
clusters = {}
for word, label in zip(common_words, labels):
    clusters.setdefault(label, []).append(word)

# Print out the words in each cluster
for label in clusters:
    words = clusters[label]
    print(f'Cluster {label}: {words}')

In [None]:
# Cluster only the plotted words, so `labels` has exactly one entry per point.
# Seeded so cluster ids (and therefore colours) are the same on every run.
kmeans = KMeans(n_clusters=50, random_state=42)
labels = kmeans.fit_predict(common_vectors)

trace = go.Scatter(
    x=word_vectors_2d[:, 0],
    y=word_vectors_2d[:, 1],
    mode='markers',
    text=common_words,
    marker=dict(
        color=labels,  # one cluster id per plotted word
        colorscale='Viridis',
        size=8,
        line_width=1
    )
)

data = [trace]

layout = go.Layout(
    title="Word2Vec t-SNE Visualization with K-means Clusters",
    hovermode='closest',
)

fig = go.Figure(data=data, layout=layout)
fig.show()

In [None]:
import plotly.graph_objects as go

# Zoom into one rectangular window of the t-SNE map and label every point inside it.
x_start, x_end = 4, 6.4
y_start, y_end = 0.65, 8

# Boolean mask selecting points with x in [x_start, x_end] and y in [y_start, y_end].
xs, ys = word_vectors_2d[:, 0], word_vectors_2d[:, 1]
in_x_range = (xs >= x_start) & (xs <= x_end)
in_y_range = (ys >= y_start) & (ys <= y_end)
indices = in_x_range & in_y_range

region_vectors = word_vectors_2d[indices]
region_labels = [word for word, inside in zip(common_words, indices) if inside]

trace = go.Scatter(
    x=region_vectors[:, 0],
    y=region_vectors[:, 1],
    mode='markers+text',
    text=region_labels,
    textposition="bottom center"
)

data = [trace]

# Axis ranges are pinned to the same window used for the mask.
layout = go.Layout(
    title="Word2Vec t-SNE Visualization",
    hovermode='closest',
    xaxis=dict(range=[x_start, x_end]),
    yaxis=dict(range=[y_start, y_end])
)

fig = go.Figure(data=data, layout=layout)
fig.show()


In [None]:
# Similarity score that the Word2Vec model assigns to one pair of words.
word1, word2 = "francés", "italiano"
similarity_score = model.wv.similarity(word1, word2)

print(f"Similarity between '{word1}' and '{word2}': {similarity_score}")

In [None]:
# 30 nearest neighbours of the combined query "francés" + "francia".
similar_words = model.wv.most_similar(positive=['francés', 'francia'], topn=30)
print(similar_words)

In [None]:
# 30 nearest neighbours of the combined query "mexicano" + "méxico".
similar_words = model.wv.most_similar(positive=['mexicano', 'méxico'], topn=30)
print(similar_words)

In [None]:
# 30 nearest neighbours of a combined query made of literature-related words.
literature_terms = ['literatura', 'escritor', 'poema', 'poesía', 'novela', 'libro']
similar_words = model.wv.most_similar(positive=literature_terms, topn=30)
print(similar_words)

In [None]:
# 30 nearest neighbours of "mexicano".
similar_words = model.wv.most_similar(positive='mexicano', topn=30)
print(similar_words)

In [None]:
# 30 nearest neighbours of "italiano".
similar_words = model.wv.most_similar(positive='italiano', topn=30)
print(similar_words)

In [None]:
# 30 nearest neighbours of "cubano".
similar_words = model.wv.most_similar(positive='cubano', topn=30)
print(similar_words)

In [None]:
# 30 nearest neighbours of "chino".
similar_words = model.wv.most_similar(positive='chino', topn=30)
print(similar_words)

In [None]:
# 30 nearest neighbours of "moda".
similar_words = model.wv.most_similar(positive='moda', topn=30)
print(similar_words)

In [None]:
# 30 nearest neighbours of "obrero".
similar_words = model.wv.most_similar(positive='obrero', topn=30)
print(similar_words)

In [None]:
# 30 nearest neighbours of "artista".
similar_words = model.wv.most_similar(positive='artista', topn=30)
print(similar_words)

In [None]:
# Scratch cell for ad-hoc queries: put the word to explore in `query_word`.
# The lookup is skipped when the word is not in the vocabulary (as with the empty
# placeholder), because `most_similar` raises KeyError for unknown keys and would
# otherwise stop a full notebook run at this cell.
query_word = ''
if query_word in model.wv.key_to_index:
    similar_words = model.wv.most_similar(query_word, topn=30)
    print(similar_words)
else:
    print(f"'{query_word}' is not in the Word2Vec vocabulary; set query_word to a known word.")

In [None]:
# Doc2Vec Visualizations

In [None]:
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from gensim.models import Doc2Vec

# Load the model
doc2vec_model = Doc2Vec.load("models/doc2vec.model")

# One vector per document tag stored in the model.
doc_vectors = doc2vec_model.dv.vectors

# 2-D projection used only for plotting; clustering below runs on the full vectors.
tsne = TSNE(n_components=2, random_state=0, perplexity=50)
doc_vectors_2d = tsne.fit_transform(doc_vectors)

# Seeded so cluster ids (and therefore colours) are the same on every run.
kmeans = KMeans(n_clusters=20, random_state=0)
labels = kmeans.fit_predict(doc_vectors)

plt.figure(figsize=(10, 10))
scatter = plt.scatter(doc_vectors_2d[:, 0], doc_vectors_2d[:, 1], c=labels)

# num=None asks for one legend handle per distinct label value. The default
# ("auto") switches to a tick locator when there are more than a handful of distinct
# values, so with 20 clusters the handles would not correspond one-to-one to clusters
# and naming them by position would mislabel colours.
handles, _ = scatter.legend_elements(prop='colors', num=None)
# Handles follow the sorted distinct label values, so name them from the same values.
cluster_names = [f'Cluster {cluster_id + 1}' for cluster_id in np.unique(labels)]
plt.legend(handles, cluster_names, loc='upper right')

plt.show()

In [None]:
import pickle

# NOTE(review): pickle.load can execute code embedded in the file; only load trusted data.
with open('preprocessing_data/tagged_documents.pkl', 'rb') as f:
    tagged_documents = pickle.load(f)

# clusters[c] collects the positions in `labels` whose cluster id is c.
clusters = [[] for _ in range(20)]
for doc_index, cluster_id in enumerate(labels):
    clusters[cluster_id].append(doc_index)

# One preview string (first 15 words, space-joined) per document, grouped by cluster.
# Assumes position i in `labels` refers to tagged_documents[i] - TODO confirm.
cluster_documents = [
    [' '.join(tagged_documents[doc_index].words[:15]) for doc_index in members]
    for members in clusters
]

In [None]:
# Show a bounded preview per cluster instead of dumping the whole nested list on a
# single line, which is unreadable and bloats the saved notebook output.
preview_docs_per_cluster = 5
for cluster_id, docs in enumerate(cluster_documents):
    print(f"Cluster {cluster_id + 1} ({len(docs)} documents)")
    for doc in docs[:preview_docs_per_cluster]:
        print(f"  {doc}")
    print()

In [None]:
import numpy as np
from sklearn.manifold import TSNE

# Load the precomputed combined document vectors from disk.
combined_vectors = np.load("models/combined_vectors.npy")

# Use t-SNE to reduce the combined vectors to two dimensions
tsne = TSNE(n_components=2, random_state=0, perplexity=50)
combined_vectors_2d = tsne.fit_transform(combined_vectors)

# Plain scatter of the 2-D projection (no cluster colouring in this cell).
plt.figure(figsize=(10, 10))
xs, ys = combined_vectors_2d[:, 0], combined_vectors_2d[:, 1]
plt.scatter(xs, ys)
plt.title("Combined Vectors t-SNE Visualization")
plt.show()

In [None]:
import numpy as np
from sklearn.manifold import TSNE
from scipy.spatial import distance

def find_closest_documents(target_document_index, num_closest=5):
    """Return indices of the documents nearest to the target in 2-D t-SNE space.

    Loads "models/combined_vectors.npy", projects it to two dimensions with
    t-SNE, and ranks every row by Euclidean distance to the target row.

    NOTE: a function with this name is defined again in a later cell; when the
    notebook runs top to bottom, that later definition is the one in effect.

    Args:
        target_document_index: Row index of the reference document.
        num_closest: Number of indices to return.

    Returns:
        Array of row indices ordered by increasing distance. The target row is
        part of the ranking (its distance to itself is 0).
    """
    embedded = TSNE(n_components=2, random_state=0, perplexity=50).fit_transform(
        np.load("models/combined_vectors.npy")
    )

    # Distance from the target point to every point, the target included.
    anchor = embedded[target_document_index]
    dists = [distance.euclidean(anchor, point) for point in embedded]

    return np.argsort(dists)[:num_closest]

In [None]:
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from gensim.models import Doc2Vec

combined_vectors = np.load("models/combined_vectors.npy")

# 2-D projection used only for plotting.
tsne = TSNE(n_components=2, random_state=0, perplexity=50)
combined_vectors_2d = tsne.fit_transform(combined_vectors)

# Perform K-means clustering on the vectors that are actually plotted.
# NOTE(review): this previously clustered `doc_vectors` from an earlier cell, which
# looks like a copy-paste leftover given the plot and title - confirm it was not intended.
# Seeded so cluster ids (and therefore colours) are the same on every run.
kmeans = KMeans(n_clusters=20, random_state=0)
labels = kmeans.fit_predict(combined_vectors)

plt.figure(figsize=(10, 10))
# Keep the artist: the legend below must be built from this scatter, not from a
# `scatter` variable left over from an earlier cell.
scatter = plt.scatter(combined_vectors_2d[:, 0], combined_vectors_2d[:, 1], c=labels)
plt.title("Combined Vectors t-SNE Visualization")

# num=None asks for one legend handle per distinct label value; the default ("auto")
# does not return one handle per cluster when there are this many clusters.
handles, _ = scatter.legend_elements(prop='colors', num=None)
# Handles follow the sorted distinct label values, so name them from the same values.
cluster_names = [f'Cluster {cluster_id + 1}' for cluster_id in np.unique(labels)]
plt.legend(handles, cluster_names, loc='upper right')

plt.show()

In [None]:
import numpy as np
from sklearn.manifold import TSNE
from scipy.spatial import distance

def find_closest_documents(target_document_index, num_closest=5, embedded_vectors=None):
    """Return indices of the documents nearest to a target document.

    Args:
        target_document_index: Row index of the reference document.
        num_closest: Number of indices to return.
        embedded_vectors: Optional precomputed array-like with one row of
            coordinates per document. When omitted, "models/combined_vectors.npy"
            is loaded and projected to 2-D with t-SNE (random_state=0,
            perplexity=50), as this function did before the parameter existed.
            Passing an existing projection avoids repeating the file load and
            the t-SNE fit on every call.

    Returns:
        numpy array of row indices ordered by increasing Euclidean distance to
        the target row. The target row is part of the ranking (its distance to
        itself is 0), so it is normally the first entry.
    """
    if embedded_vectors is None:
        combined_vectors = np.load("models/combined_vectors.npy")
        tsne = TSNE(n_components=2, random_state=0, perplexity=50)
        embedded_vectors = tsne.fit_transform(combined_vectors)
    else:
        embedded_vectors = np.asarray(embedded_vectors)

    target_vector = embedded_vectors[target_document_index]
    # Euclidean distance from the target to every row, computed in one vectorised step.
    distances = np.linalg.norm(embedded_vectors - target_vector, axis=1)

    return np.argsort(distances)[:num_closest]

# Query the nearest documents for one target document and show their indices.
target_document_index, num_closest = 0, 5

closest_indices = find_closest_documents(
    target_document_index,
    num_closest,
)
print("Closest document indices:", closest_indices)

In [None]:
# Highlight the documents most similar to a query on a fresh 2-D t-SNE projection of
# `combined_vectors`.
# NOTE(review): `similarities` and `most_similar_index` are not defined in any cell
# visible in this notebook, so this cell fails on a fresh kernel unless they are
# created elsewhere - TODO add or restore the cell that computes them.
# This projection leaves perplexity at the TSNE default, unlike the other
# combined-vector cells, which pass perplexity=50.
tsne = TSNE(n_components=2, random_state=0)
reduced_vectors = tsne.fit_transform(combined_vectors)

# Get the indices of the top 10 most similar documents
# (argsort is ascending, so the last 10 positions hold the largest values;
# presumably similarities[0] is one score per document - TODO confirm)
top10_indices = np.argsort(similarities[0])[-10:]

plt.figure(figsize=(10, 10))
# All documents in blue, the top 10 in red, and the single best match in green.
plt.scatter(reduced_vectors[:, 0], reduced_vectors[:, 1], color='blue', alpha=0.5)
plt.scatter(reduced_vectors[top10_indices, 0], reduced_vectors[top10_indices, 1], color='red', alpha=0.5)
plt.scatter(reduced_vectors[most_similar_index, 0], reduced_vectors[most_similar_index, 1], color='green', alpha=1)
plt.show()


In [None]:
import numpy as np
import pickle
from gensim.models import LdaModel
from gensim.corpora import Dictionary

# NOTE(review): pickle.load can execute code embedded in the file; only load trusted data.
with open('preprocessing_data/tagged_documents.pkl', 'rb') as f:
    tagged_documents = pickle.load(f)

# Saved dictionary and LDA model (the two loads are independent of each other).
dictionary = Dictionary.load('models/dictionary')
lda_model = LdaModel.load('models/lda_model')

# NOTE(review): `similarities` is not defined in any cell visible in this notebook.
# argsort is ascending, so the last 10 positions hold the largest values.
similarity_ranking = np.argsort(similarities[0])
top10_indices = similarity_ranking[-10:]

for idx in top10_indices:
    print(f"Document index: {idx}, similarity: {similarities[0][idx]}")
    print(f"Document: {tagged_documents[idx].words}")

    # Bag-of-words form of the document, then its topic mixture under the LDA model.
    bow_corpus = dictionary.doc2bow(tagged_documents[idx].words)
    topic_distribution = lda_model.get_document_topics(bow_corpus)

    # Print the topic distribution in each document
    for topic_id, prop in topic_distribution:
        print(f"Topic ID: {topic_id}, Proportion: {prop}")

    print("\n")

In [None]:
# Print 15 words from each topic (num_topics=-1 requests every topic).
all_topics = lda_model.print_topics(num_topics=-1, num_words=15)
for idx, topic in all_topics:
    print(f"Topic: {idx} \nWords: {topic}\n")


In [None]:
# Topic modeling visualizations

In [None]:
from gensim.corpora import MmCorpus
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
from gensim.models import LdaModel
from gensim.corpora import Dictionary

# Load the saved LDA model together with the dictionary and corpus that go with it.
lda_model = LdaModel.load('models/lda_model')
dictionary = Dictionary.load('models/dictionary')
corpus = MmCorpus('models/corpus.mm')

# Build the pyLDAvis data; sort_topics=False is passed through unchanged.
lda_display = gensimvis.prepare(
    lda_model,
    corpus,
    dictionary,
    sort_topics=False,
)

# Last expression of the cell, so the notebook renders the returned display object.
pyLDAvis.display(lda_display)

In [None]:
# def find_topics_with_word(lda_model, dictionary, word, num_topics):
#     word_id = dictionary.token2id.get(word)
#     if word_id is not None:
#         word_topics = lda_model.get_term_topics(word_id, minimum_probability=0.000001)
#         word_topics.sort(key=lambda x: x[1], reverse=True)
#         for topic_id, relevance in word_topics[:num_topics]:
#             print(f"Topic {topic_id}, relevance: {relevance}")
#             print(lda_model.show_topic(topic_id))
#     else:
#         print(f"The word '{word}' is not in the dictionary.")

# lda_model, dictionary, _ = topic_modeling_pipeline(lower_texts_words)
# find_topics_with_word(lda_model, dictionary, 'francia', 5)

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Request up to 15 topics as unformatted (word, weight) pairs.
topics = lda_model.show_topics(formatted=False, num_topics=15)

# Create a WordCloud for each topic
for idx, topic in topics:
    word_weights = dict(topic)  # word -> weight, used as the cloud's frequencies
    wc = WordCloud(background_color="white", max_words=20, width=800, height=400)
    wordcloud = wc.generate_from_frequencies(word_weights)

    plt.figure(figsize=(10, 10))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title(f"Topic #{idx}")
    plt.show()


In [None]:
import pandas as pd

def document_topic_distribution(lda_model, corpus):
    """Share of topic occurrences across the documents of a corpus.

    For every document, each topic reported by
    ``lda_model.get_document_topics`` counts once for that topic. The counts
    are then divided by their total so the row sums to 1.

    Args:
        lda_model: Object exposing ``num_topics`` and
            ``get_document_topics(bow)``; each item the latter yields must have
            the topic id as its first element.
        corpus: Iterable of bag-of-words documents accepted by
            ``get_document_topics``.

    Returns:
        pandas.DataFrame with a single row (index 0) and one column per topic
        id ``0 .. num_topics - 1`` holding the normalised counts. If no topic
        is reported for any document the total is zero and the values are NaN.
    """
    # Count in a plain list first and build the frame once; updating single
    # DataFrame cells inside a nested loop is far slower for the same result.
    topic_counts = [0] * lda_model.num_topics
    for bow in corpus:
        for entry in lda_model.get_document_topics(bow):
            topic_counts[entry[0]] += 1

    dist_df = pd.DataFrame.from_records([dict(enumerate(topic_counts))])

    # normalize the distribution so the single row sums to 1
    return dist_df.div(dist_df.sum(axis=1), axis=0)

# get the topic distribution
dist_df = document_topic_distribution(lda_model, corpus)

# plot the distribution: one bar per topic id
ax = dist_df.T.plot(kind='bar', legend=False)
ax.set_title('Topic Distribution across all documents')
ax.set_ylabel('Frequency')
ax.set_xlabel('Topic ID')
plt.show()

In [None]:
def plot_top_words(lda_model, num_topics, num_words=20):
    """Draw one horizontal bar chart per topic with its highest-weighted words.

    Args:
        lda_model: Topic model exposing ``num_topics`` and
            ``show_topic(topic_id, topn=...)``, where ``show_topic`` yields
            ``(word, weight)`` pairs.
        num_topics: Number of topics to plot, starting from topic 0. Values
            larger than ``lda_model.num_topics`` are capped at the number of
            topics the model has.
        num_words: Maximum number of words shown per chart.
    """
    for i in range(min(num_topics, lda_model.num_topics)):
        # One lookup per plotted topic, sized by `num_words`, so the bars and
        # the tick labels always have the same length.
        word_betas = lda_model.show_topic(i, topn=num_words)
        top_words = [word for word, _ in word_betas]
        top_betas = [beta for _, beta in word_betas]
        positions = range(len(word_betas))

        plt.figure(figsize=(10,10))
        plt.barh(positions, top_betas, align='center',color='blue', ecolor='black')
        plt.yticks(positions, top_words)
        plt.xlabel('Beta')
        plt.title('Top Words in Topic ' + str(i) + ' and their Betas')
        # Put the first (highest-weighted) word at the top of the chart.
        plt.gca().invert_yaxis()
        plt.show()

# plot the top words of the first 15 topics
plot_top_words(lda_model, num_topics=15)