<a href="https://colab.research.google.com/github/mukul-mschauhan/GenerativeAI/blob/main/Word%20Embedding%20Using%20Glove.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import gensim.downloader as api
# Load the pre-trained "glove-wiki-gigaword-50" vectors through gensim's downloader.
# `model` is indexed by word in the cells below (e.g. model['king']).
model = api.load("glove-wiki-gigaword-50")



In [None]:
# Ten nearest neighbours of the 'king' embedding.
# The query is passed as a raw vector, and the output lists 'king' itself
# as the closest match.
model.most_similar(positive=[model['king']], topn=10)

[('king', 1.0000001192092896),
 ('prince', 0.8236179351806641),
 ('queen', 0.7839043140411377),
 ('ii', 0.7746230363845825),
 ('emperor', 0.7736247777938843),
 ('son', 0.766719400882721),
 ('uncle', 0.7627150416374207),
 ('kingdom', 0.7542161345481873),
 ('throne', 0.7539914846420288),
 ('brother', 0.7492411136627197)]

In [None]:
# Look up the GloVe embedding for 'king' and print it
# (the printed output shows 50 float components).
king_vectors = model['king']
print(king_vectors)

[ 0.50451   0.68607  -0.59517  -0.022801  0.60046  -0.13498  -0.08813
  0.47377  -0.61798  -0.31012  -0.076666  1.493    -0.034189 -0.98173
  0.68229   0.81722  -0.51874  -0.31503  -0.55809   0.66421   0.1961
 -0.13495  -0.11476  -0.30344   0.41177  -2.223    -1.0756   -1.0783
 -0.34354   0.33505   1.9927   -0.04234  -0.64319   0.71125   0.49159
  0.16754   0.34344  -0.25663  -0.8523    0.1661    0.40102   1.1685
 -1.0137   -0.21585  -0.15155   0.78321  -0.91241  -1.6106   -0.64426
 -0.51042 ]


In [None]:
# Look up the GloVe embedding for 'queen' and print it
# (the printed output shows 50 float components, like 'king').
queen_vectors = model['queen']
print(queen_vectors)

[ 0.37854    1.8233    -1.2648    -0.1043     0.35829    0.60029
 -0.17538    0.83767   -0.056798  -0.75795    0.22681    0.98587
  0.60587   -0.31419    0.28877    0.56013   -0.77456    0.071421
 -0.5741     0.21342    0.57674    0.3868    -0.12574    0.28012
  0.28135   -1.8053    -1.0421    -0.19255   -0.55375   -0.054526
  1.5574     0.39296   -0.2475     0.34251    0.45365    0.16237
  0.52464   -0.070272  -0.83744   -1.0326     0.45946    0.25302
 -0.17837   -0.73398   -0.20025    0.2347    -0.56095   -2.2839
  0.0092753 -0.60284  ]


# prompt: What is the reshape function?

The `reshape()` method used in the cosine-similarity code below changes the dimensions of a NumPy array.  Specifically, it transforms the 1D word embedding vectors (`king_vectors` and `queen_vectors`) into 2D arrays.

The `cosine_similarity` function from scikit-learn expects 2D arrays as input.  Each word embedding, however, is a one-dimensional array of 50 numbers (shape `(50,)`), one number per embedding dimension, for the word 'king' or 'queen'.  To calculate the cosine similarity between these vectors, each one needs to be reshaped into a 2D array in which every row represents one vector.

`king_vectors.reshape(1, -1)` does the following:

* `1`: Specifies that the reshaped array should have one row.
* `-1`:  This is a special value in NumPy's reshape. It tells NumPy to automatically calculate the number of columns needed to maintain the original number of elements.  Since the original `king_vectors` has 50 elements, the reshaped array will have one row and 50 columns.


In essence, `reshape(1, -1)` converts the vector from a shape like `(50,)` to `(1, 50)`, making it compatible with functions that require 2D input, such as `cosine_similarity`.


In [None]:
# prompt: code for cosine similarity

from sklearn.metrics.pairwise import cosine_similarity

# cosine_similarity works on 2D inputs, so turn each 1D vector into a
# single-row matrix first.
king_vectors_2d, queen_vectors_2d = (
    vector.reshape(1, -1) for vector in (king_vectors, queen_vectors)
)

# With one row on each side the result is a 1x1 matrix; its only entry is
# the king/queen similarity.
similarity = cosine_similarity(king_vectors_2d, queen_vectors_2d)

print(f"Cosine similarity between 'king' and 'queen': {similarity[0][0]}")


Cosine similarity between 'king' and 'queen': 0.7839043140411377


In [None]:
# prompt: Create couple of examples to understand word embedding using the above code.

# Cosine Similarity between words
# The pairs are compared in the order listed; one loop replaces four
# copy-pasted stanzas, so adding another comparison is a one-line change.
word_pairs = [
    ("king", "man"),
    ("king", "woman"),
    ("queen", "woman"),
    ("queen", "man"),
]

for first_word, second_word in word_pairs:
    # cosine_similarity expects 2D inputs, hence reshape(1, -1) on each vector.
    similarity = cosine_similarity(
        model[first_word].reshape(1, -1),
        model[second_word].reshape(1, -1),
    )
    print(f"Cosine similarity between '{first_word}' and '{second_word}': {similarity[0][0]}")

Cosine similarity between 'king' and 'man': 0.5309377312660217
Cosine similarity between 'king' and 'woman': 0.41133782267570496
Cosine similarity between 'queen' and 'woman': 0.6003105640411377
Cosine similarity between 'queen' and 'man': 0.5366700291633606


In [None]:
# prompt: Create a small dataset with text and how Word Embedding is used to convert the text into numbers

import gensim.downloader as api
from sklearn.metrics.pairwise import cosine_similarity

# Download the pre-trained GloVe model (this may take some time on the first
# run; gensim's downloader keeps a local copy afterwards — see its docs).
#
# A failure here is reported with a hint and then re-raised. Installing gensim
# from this handler could never help: `import gensim.downloader` above would
# already have failed if the package were missing, and silently retrying the
# identical call only hides the original error.
try:
    model = api.load("glove-wiki-gigaword-50")
except Exception as e:
    print(f"Error loading the model: {e}")
    print("Please ensure you have the gensim library installed and internet access to download the model.")
    raise


# Words whose embeddings and pairwise similarities are shown below
words = [
    "king",
    "queen",
    "man",
    "woman",
]


# Function to print word vectors and similarities
def process_words(words_list):
    """Print each word's embedding and its cosine similarity to the other words.

    Uses the notebook-level ``model`` for lookups and ``cosine_similarity``
    for the comparison. For every word in ``words_list`` that ``model`` knows,
    prints the word, its vector, and one similarity line per other
    in-vocabulary word of the list. A word that is missing from the
    vocabulary produces exactly one warning, under its own name.

    Parameters
    ----------
    words_list : list of str
        Words to look up; their order determines the print order.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    for word in words_list:
        try:
            word_vector = model[word]
        except KeyError:
            print(f"Warning: Word '{word}' not found in the model's vocabulary.")
            continue

        print(f"\nWord: {word}")
        print("Word Vector:")
        print(word_vector)

        # Calculate cosine similarity with other words
        for other_word in words_list:
            if word == other_word:
                continue
            try:
                other_vector = model[other_word]
            except KeyError:
                # Skip quietly: the missing word gets its own warning when the
                # outer loop reaches it. Handling it here (rather than in one
                # shared try block) keeps the warning from blaming `word` and
                # keeps the remaining comparisons for `word` from being lost.
                continue
            # cosine_similarity expects 2D inputs, hence reshape(1, -1).
            similarity = cosine_similarity(word_vector.reshape(1, -1), other_vector.reshape(1, -1))
            print(f"Cosine similarity between '{word}' and '{other_word}': {similarity[0][0]}")


# Print the vector and pairwise similarities for each example word.
process_words(words)


Word: king
Word Vector:
[ 0.50451   0.68607  -0.59517  -0.022801  0.60046  -0.13498  -0.08813
  0.47377  -0.61798  -0.31012  -0.076666  1.493    -0.034189 -0.98173
  0.68229   0.81722  -0.51874  -0.31503  -0.55809   0.66421   0.1961
 -0.13495  -0.11476  -0.30344   0.41177  -2.223    -1.0756   -1.0783
 -0.34354   0.33505   1.9927   -0.04234  -0.64319   0.71125   0.49159
  0.16754   0.34344  -0.25663  -0.8523    0.1661    0.40102   1.1685
 -1.0137   -0.21585  -0.15155   0.78321  -0.91241  -1.6106   -0.64426
 -0.51042 ]
Cosine similarity between 'king' and 'queen': 0.7839043140411377
Cosine similarity between 'king' and 'man': 0.5309377312660217
Cosine similarity between 'king' and 'woman': 0.41133782267570496

Word: queen
Word Vector:
[ 0.37854    1.8233    -1.2648    -0.1043     0.35829    0.60029
 -0.17538    0.83767   -0.056798  -0.75795    0.22681    0.98587
  0.60587   -0.31419    0.28877    0.56013   -0.77456    0.071421
 -0.5741     0.21342    0.57674    0.3868    -0.12574    0.2

In [None]:
import seaborn as sns

# Load seaborn's example dataset named "tips"; its text-valued columns
# (e.g. 'time') are used below to demonstrate embedding lookups.
tips = sns.load_dataset("tips")

In [None]:
# Preview the first rows to see which columns are available.
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [None]:
# Convert the 'time' column to embeddings
def get_embedding(word, model):
    """Return the embedding of ``word``, or a zero vector if it is unknown.

    Parameters
    ----------
    word : str
        Token to look up. Callers are expected to normalise case themselves.
    model
        Any object that supports ``model[word]`` (raising ``KeyError`` for
        unknown words) and exposes a ``vector_size`` attribute.

    Returns
    -------
    The value of ``model[word]`` when the word is known; otherwise a NumPy
    array of zeros with ``model.vector_size`` elements.
    """
    # Imported here because no visible cell of the notebook imports NumPy;
    # without it the fallback raised NameError instead of returning zeros.
    import numpy as np

    try:
        return model[word]
    except KeyError:
        # If word not in the vocabulary, return a vector of zeros
        return np.zeros(model.vector_size)

In [None]:
# Cast to plain strings before applying. Applied directly, this raised
# "TypeError: unhashable type: 'numpy.ndarray'" — presumably because seaborn
# loads 'time' as a categorical column (confirm with `tips.dtypes`), and pandas
# then maps the function over the categories and has to hash the returned arrays.
tips['time'].astype(str).apply(lambda x: get_embedding(x.lower(), model))

TypeError: unhashable type: 'numpy.ndarray'