In [2]:
import pandas as pd

# Read the text file
with open('../data/glove.6B/glove.6B.50d.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Split each line into words and embedding vectors
data = [line.strip().split(' ', maxsplit=1) for line in lines]

# Create a DataFrame from the data
data = pd.DataFrame(data, columns=['word', 'embedding'])

### Create word to vector mapping dictionary

In [3]:
# Convert DataFrame to dictionary
glove_dict = data.set_index('word')['embedding'].to_dict()

# Convert string values to NumPy arrays
for key, value in glove_dict.items():
    glove_dict[key] = np.array(value.split(), dtype=np.float32)
    
# Print the head of the dictionary (first n elements)
n = 5
head_dict = {}
for key in list(glove_dict.keys())[:n]:
    head_dict[key] = glove_dict[key]

print(head_dict)

{'the': array([ 4.1800e-01,  2.4968e-01, -4.1242e-01,  1.2170e-01,  3.4527e-01,
       -4.4457e-02, -4.9688e-01, -1.7862e-01, -6.6023e-04, -6.5660e-01,
        2.7843e-01, -1.4767e-01, -5.5677e-01,  1.4658e-01, -9.5095e-03,
        1.1658e-02,  1.0204e-01, -1.2792e-01, -8.4430e-01, -1.2181e-01,
       -1.6801e-02, -3.3279e-01, -1.5520e-01, -2.3131e-01, -1.9181e-01,
       -1.8823e+00, -7.6746e-01,  9.9051e-02, -4.2125e-01, -1.9526e-01,
        4.0071e+00, -1.8594e-01, -5.2287e-01, -3.1681e-01,  5.9213e-04,
        7.4449e-03,  1.7778e-01, -1.5897e-01,  1.2041e-02, -5.4223e-02,
       -2.9871e-01, -1.5749e-01, -3.4758e-01, -4.5637e-02, -4.4251e-01,
        1.8785e-01,  2.7849e-03, -1.8411e-01, -1.1514e-01, -7.8581e-01],
      dtype=float32), ',': array([ 0.013441,  0.23682 , -0.16899 ,  0.40951 ,  0.63812 ,  0.47709 ,
       -0.42852 , -0.55641 , -0.364   , -0.23938 ,  0.13001 , -0.063734,
       -0.39575 , -0.48162 ,  0.23291 ,  0.090201, -0.13324 ,  0.078639,
       -0.41634 , -0.1542

### Tokenize

In [4]:
import nltk
from nltk.tokenize import word_tokenize

# Tokenize the words
tokens = word_tokenize(" ".join(glove_dict.keys()))

In [5]:
from nltk.probability import FreqDist

# Create a frequency distribution
freq_dist = FreqDist(tokens)

# Get the most common words
most_common_words = freq_dist.most_common()
most_common_words

[('.', 942),
 ('--', 608),
 (':', 347),
 ('http', 337),
 ('@', 204),
 ('&', 196),
 (')', 170),
 ('(', 169),
 ('’', 95),
 ('globe.com', 71),
 ("'", 54),
 ('a', 46),
 ('d', 46),
 ('o', 40),
 ('#', 38),
 ('?', 37),
 ('l', 36),
 ('s', 28),
 ('!', 28),
 ('m', 28),
 ('p', 28),
 ('i', 27),
 ('c', 27),
 ('b', 24),
 ('-', 23),
 ('nytimes.com', 22),
 ('$', 21),
 ('r', 21),
 ('t', 20),
 (';', 16),
 ('e', 16),
 ('n', 15),
 ('w', 15),
 ('ajc.com', 15),
 ('h', 14),
 ('coxnews.com', 14),
 ('g', 12),
 ('f', 12),
 ('chron.com', 12),
 ('‘', 11),
 ('j', 11),
 ('ap.org', 11),
 ('latimes.com', 10),
 ('pbpost.com', 9),
 ('statesman.com', 9),
 ('u', 8),
 ('k', 7),
 ('hearstdc.com', 7),
 ('“', 6),
 ('”', 6),
 ('y', 6),
 ('*', 6),
 ('me', 5),
 ('amp', 5),
 ('v', 5),
 ('q', 5),
 (']', 5),
 ('hawai', 5),
 ('latimescolumnists.com', 5),
 ('at', 4),
 ('home', 4),
 ('art', 4),
 ('express', 4),
 ('ad', 4),
 ('ma', 4),
 ('na', 4),
 ('shi', 4),
 ('212', 4),
 ('ve', 4),
 ('baha', 4),
 ('latwp', 4),
 ('timesunion.com', 4

### Word to index mapping

In [24]:
words = ['<UNK>'] + list(glove_dict.keys())

word_index = {}
for index, word in enumerate(words):
    if word == '<UNK>':
        continue
    word_index[word] = index

### Create matrix

In [25]:
vocab_len = len(glove_dict)
embed_vector_len = 50

emb_matrix = np.zeros((vocab_len + 1, embed_vector_len)) 

for word, vector in glove_dict.items():
    index = word_index[word]
    emb_matrix[index, :] = vector

In [26]:
emb_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.41800001,  0.24968   , -0.41242   , ..., -0.18411   ,
        -0.11514   , -0.78580999],
       [ 0.013441  ,  0.23682   , -0.16899   , ..., -0.56656998,
         0.044691  ,  0.30392   ],
       ...,
       [-0.51181   ,  0.058706  ,  1.09130001, ..., -0.25003001,
        -1.125     ,  1.58630002],
       [-0.75897998, -0.47426   ,  0.47369999, ...,  0.78953999,
        -0.014116  ,  0.64480001],
       [ 0.072617  , -0.51393002,  0.47279999, ..., -0.18907   ,
        -0.59021002,  0.55558997]])

In [9]:
column_index = 3

specific_column = emb_matrix[:, column_index]

In [10]:
specific_column

array([ 0.        ,  0.1217    ,  0.40950999, ..., -0.55163002,
        0.77249998, -0.52201998])

In [13]:
word_index['of']

4

In [47]:
emb_matrix.mean(axis=0)

array([-0.12920029, -0.28866167, -0.01224891, -0.05676675, -0.20211058,
       -0.08389005,  0.33359654,  0.16045106,  0.03867486,  0.17833048,
        0.04696609, -0.00285778,  0.29099778,  0.04613712, -0.20923789,
       -0.06613084, -0.06822431,  0.07665865,  0.3133984 ,  0.17848468,
       -0.12257688, -0.09916903, -0.07495954,  0.0641319 ,  0.14441219,
        0.60894448,  0.17463057,  0.0533539 , -0.01273822,  0.03474099,
       -0.81239363, -0.04688716,  0.20193483,  0.20311064, -0.03935645,
        0.069675  , -0.01553651, -0.03405266, -0.06528008,  0.12250061,
        0.1399197 , -0.17446261, -0.08011821,  0.08495198, -0.01041642,
       -0.13704867,  0.20127038,  0.10069269,  0.00653005,  0.01685146])

In [29]:
list_of_lists = []
for row in emb_matrix:
    list_of_lists.append(list(row))

In [32]:
list_of_lists

[[0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 [0.4180000126361847,
  0.24967999756336212,
  -0.41242000460624695,
  0.1216999962925911,
  0.3452700078487396,
  -0.044456999748945236,
  -0.4968799948692322,
  -0.17861999571323395,
  -0.0006602299981750548,
  -0.6565999984741211,
  0.2784300148487091,
  -0.14767000079154968,
  -0.5567700266838074,
  0.14657999575138092,
  -0.009509500116109848,
  0.011657999828457832,
  0.10204000025987625,
  -0.127920001745224,
  -0.8442999720573425,
  -0.12180999666452408,
  -0.016800999641418457,
  -0.33278998732566833,
  -0.15520000457763672,
  -0.23130999505519867,
  -0.1918099969625473,
  -1.8823000192642212,
  -0.7674599885940552,
  0.099050998687

In [49]:
mean_values = np.mean(list_of_lists, axis=0)

In [50]:
mean_values

array([-0.12920029, -0.28866167, -0.01224891, -0.05676675, -0.20211058,
       -0.08389005,  0.33359654,  0.16045106,  0.03867486,  0.17833048,
        0.04696609, -0.00285778,  0.29099778,  0.04613712, -0.20923789,
       -0.06613084, -0.06822431,  0.07665865,  0.3133984 ,  0.17848468,
       -0.12257688, -0.09916903, -0.07495954,  0.0641319 ,  0.14441219,
        0.60894448,  0.17463057,  0.0533539 , -0.01273822,  0.03474099,
       -0.81239363, -0.04688716,  0.20193483,  0.20311064, -0.03935645,
        0.069675  , -0.01553651, -0.03405266, -0.06528008,  0.12250061,
        0.1399197 , -0.17446261, -0.08011821,  0.08495198, -0.01041642,
       -0.13704867,  0.20127038,  0.10069269,  0.00653005,  0.01685146])

In [51]:
word_index['king']

692