In [108]:
from itertools import chain

import numpy as np
import pandas as pd

# Pull every row of the GloVe text file into memory
with open('../data/glove.6B/glove.6B.50d.txt', 'r', encoding='utf-8') as file:
    lines = list(file)

# Cut each row once, at its first space, giving [word, "rest of the row"],
# and tabulate the pairs as a two-column frame (the embedding stays a string here)
data = pd.DataFrame(
    [line.strip().split(' ', maxsplit=1) for line in lines],
    columns=['word', 'embedding'],
)

### Create word to vector mapping dictionary

In [109]:
# Key the embedding strings by their word
glove_dict = data.set_index('word')['embedding'].to_dict()

# Parse each whitespace-separated embedding string into a float32 vector
glove_dict = {
    key: np.array(value.split(), dtype=np.float32)
    for key, value in glove_dict.items()
}

# Preview: the first n entries, in the dictionary's own order
n = 5
head_dict = {key: glove_dict[key] for key in list(glove_dict)[:n]}

print(head_dict)

{'the': array([ 4.1800e-01,  2.4968e-01, -4.1242e-01,  1.2170e-01,  3.4527e-01,
       -4.4457e-02, -4.9688e-01, -1.7862e-01, -6.6023e-04, -6.5660e-01,
        2.7843e-01, -1.4767e-01, -5.5677e-01,  1.4658e-01, -9.5095e-03,
        1.1658e-02,  1.0204e-01, -1.2792e-01, -8.4430e-01, -1.2181e-01,
       -1.6801e-02, -3.3279e-01, -1.5520e-01, -2.3131e-01, -1.9181e-01,
       -1.8823e+00, -7.6746e-01,  9.9051e-02, -4.2125e-01, -1.9526e-01,
        4.0071e+00, -1.8594e-01, -5.2287e-01, -3.1681e-01,  5.9213e-04,
        7.4449e-03,  1.7778e-01, -1.5897e-01,  1.2041e-02, -5.4223e-02,
       -2.9871e-01, -1.5749e-01, -3.4758e-01, -4.5637e-02, -4.4251e-01,
        1.8785e-01,  2.7849e-03, -1.8411e-01, -1.1514e-01, -7.8581e-01],
      dtype=float32), ',': array([ 0.013441,  0.23682 , -0.16899 ,  0.40951 ,  0.63812 ,  0.47709 ,
       -0.42852 , -0.55641 , -0.364   , -0.23938 ,  0.13001 , -0.063734,
       -0.39575 , -0.48162 ,  0.23291 ,  0.090201, -0.13324 ,  0.078639,
       -0.41634 , -0.1542

### Tokenize

In [110]:
import nltk
from nltk.tokenize import word_tokenize

# Join the whole GloVe vocabulary into one space-separated string and run
# NLTK's tokenizer over it (iterating a dict yields its keys)
tokens = word_tokenize(" ".join(glove_dict))

In [111]:
from nltk.probability import FreqDist

# Count how often each token occurs in `tokens`
freq_dist = FreqDist(tokens)

# most_common() with no argument returns every (token, count) pair,
# ordered from the most to the least frequent.
# NOTE(review): echoing the full list gives very long output; a slice would read better.
most_common_words = freq_dist.most_common()
most_common_words

[('.', 942),
 ('--', 608),
 (':', 347),
 ('http', 337),
 ('@', 204),
 ('&', 196),
 (')', 170),
 ('(', 169),
 ('’', 95),
 ('globe.com', 71),
 ("'", 54),
 ('a', 46),
 ('d', 46),
 ('o', 40),
 ('#', 38),
 ('?', 37),
 ('l', 36),
 ('s', 28),
 ('!', 28),
 ('m', 28),
 ('p', 28),
 ('i', 27),
 ('c', 27),
 ('b', 24),
 ('-', 23),
 ('nytimes.com', 22),
 ('$', 21),
 ('r', 21),
 ('t', 20),
 (';', 16),
 ('e', 16),
 ('n', 15),
 ('w', 15),
 ('ajc.com', 15),
 ('h', 14),
 ('coxnews.com', 14),
 ('g', 12),
 ('f', 12),
 ('chron.com', 12),
 ('‘', 11),
 ('j', 11),
 ('ap.org', 11),
 ('latimes.com', 10),
 ('pbpost.com', 9),
 ('statesman.com', 9),
 ('u', 8),
 ('k', 7),
 ('hearstdc.com', 7),
 ('“', 6),
 ('”', 6),
 ('y', 6),
 ('*', 6),
 ('me', 5),
 ('amp', 5),
 ('v', 5),
 ('q', 5),
 (']', 5),
 ('hawai', 5),
 ('latimescolumnists.com', 5),
 ('at', 4),
 ('home', 4),
 ('art', 4),
 ('express', 4),
 ('ad', 4),
 ('ma', 4),
 ('na', 4),
 ('shi', 4),
 ('212', 4),
 ('ve', 4),
 ('baha', 4),
 ('latwp', 4),
 ('timesunion.com', 4

In [112]:
# Echo the FreqDist itself; its repr shows only the leading entries followed by "...".
freq_dist

FreqDist({'.': 942, '--': 608, ':': 347, 'http': 337, '@': 204, '&': 196, ')': 170, '(': 169, '’': 95, 'globe.com': 71, ...})

### Word to index mapping

In [113]:
# Position 0 is taken by the '<UNK>' placeholder, so GloVe words start at 1
words = ['<UNK>'] + list(glove_dict.keys())

# Word -> position in `words`; the placeholder itself gets no entry
word_index = {
    word: index
    for index, word in enumerate(words)
    if word != '<UNK>'
}

### Create matrix

In [114]:
# One row per GloVe word plus one extra row at position 0
vocab_len = len(glove_dict)
embed_vector_len = 50

emb_matrix = np.zeros(shape=(vocab_len + 1, embed_vector_len))

# Copy every vector into the row that word_index assigns to its word
for word in glove_dict:
    emb_matrix[word_index[word]] = glove_dict[word]

In [115]:
# Inspect the filled matrix; row 0 keeps its initial zeros because word_index starts at 1.
emb_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.41800001,  0.24968   , -0.41242   , ..., -0.18411   ,
        -0.11514   , -0.78580999],
       [ 0.013441  ,  0.23682   , -0.16899   , ..., -0.56656998,
         0.044691  ,  0.30392   ],
       ...,
       [-0.51181   ,  0.058706  ,  1.09130001, ..., -0.25003001,
        -1.125     ,  1.58630002],
       [-0.75897998, -0.47426   ,  0.47369999, ...,  0.78953999,
        -0.014116  ,  0.64480001],
       [ 0.072617  , -0.51393002,  0.47279999, ..., -0.18907   ,
        -0.59021002,  0.55558997]])

In [116]:
# column_index = 3

# specific_column = emb_matrix[:, column_index]

In [117]:
# specific_column

In [None]:
# word_index['of']

In [None]:
# emb_matrix.mean(axis=0)

In [None]:
# list_of_lists = []
# for row in emb_matrix:
#     list_of_lists.append(list(row))

In [None]:
# list_of_lists

In [None]:
# mean_values = np.mean(list_of_lists, axis=0)

In [None]:
# mean_values

In [None]:
# word_index['king']

In [118]:
# Load the cleaned text dataset.
# NOTE(review): this rebinds `data`, which held the GloVe DataFrame until now.
data = pd.read_csv('../data/data_cleaned.csv',  encoding='latin-1')

In [119]:
# Peek at the `text` column, the one that gets tokenized below.
data.text

0         upset update facebook texting cry result schoo...
1         hey long time see yes rain bite bit lol fine t...
2         want go promote gear groove unfornately ride b...
3         ok sick spend hour sit shower cause sick stand...
4                 ill tell ya story later good day ill hour
                                ...                        
207339    watchin espn first take favorite mornin show l...
207340    muhaha thankgoodness miss last date rmcaat jun...
207341    good morning people twitter tgifriday thats wa...
207342    today message church service deliver skype fre...
207343    back home thought do week call alter something...
Name: text, Length: 207344, dtype: object

In [120]:
from collections import Counter
from nltk.tokenize import word_tokenize

# Run NLTK's tokenizer over every entry of the `text` column,
# giving one token list per row
tok_text = data['text'].apply(word_tokenize)
tok_text

0         [upset, update, facebook, texting, cry, result...
1         [hey, long, time, see, yes, rain, bite, bit, l...
2         [want, go, promote, gear, groove, unfornately,...
3         [ok, sick, spend, hour, sit, shower, cause, si...
4         [ill, tell, ya, story, later, good, day, ill, ...
                                ...                        
207339    [watchin, espn, first, take, favorite, mornin,...
207340    [muhaha, thankgoodness, miss, last, date, rmca...
207341    [good, morning, people, twitter, tgifriday, th...
207342    [today, message, church, service, deliver, sky...
207343    [back, home, thought, do, week, call, alter, s...
Name: text, Length: 207344, dtype: object

In [121]:
def get_word_index_mapping(tok_text):
    """Build frequency-ranked vocabulary lookups from a tokenized corpus.

    Parameters
    ----------
    tok_text : iterable of iterables of str
        One token sequence per document, e.g. a pandas Series of token
        lists. The documents must already be tokenized; a raw string
        would be iterated character by character.

    Returns
    -------
    word2idx : dict
        Token -> index. Indices start at 1 for the most frequent token;
        0 is never assigned. Tokens with equal counts keep the order in
        which they were first seen.
    idx2word : dict
        The inverse lookup, index -> token.
    """
    # Count straight from the lazily flattened token stream. This avoids
    # materialising a NumPy string array of the whole corpus and keeps the
    # dictionary keys as plain ``str`` objects.
    token_counts = Counter(chain.from_iterable(tok_text))

    word2idx = {}
    idx2word = {}
    for rank, (token, _count) in enumerate(token_counts.most_common(), start=1):
        word2idx[token] = rank
        idx2word[rank] = token

    return word2idx, idx2word

In [122]:
# Build both vocabulary lookups (word -> index, index -> word) from the tokenized corpus.
w2ix, ix2w = get_word_index_mapping(tok_text)

In [123]:
def corpus_mapping(w2ix, tok_text, glove_dict):
    """Translate each tokenized document into a list of vocabulary indices.

    Tokens without an entry in ``glove_dict`` are dropped; every remaining
    token is replaced by ``w2ix[token]``. A token that is in ``glove_dict``
    but missing from ``w2ix`` raises ``KeyError``.

    Returns a list holding one index list per document, in input order.
    """
    return [
        [w2ix[token] for token in document if token in glove_dict]
        for document in tok_text
    ]

In [124]:
# Encode every tokenized document as a list of w2ix indices, keeping only words found in glove_dict.
number_corpus = corpus_mapping(w2ix, tok_text, glove_dict)

In [125]:
# NOTE(review): this echoes the entire encoded corpus; a slice (e.g. number_corpus[:5]) would keep the output short.
number_corpus

[[482, 249, 512, 1953, 166, 794, 68, 9, 132, 783],
 [131, 71, 10, 16, 165, 114, 736, 153, 17, 451, 100],
 [18, 2, 2717, 1552, 6058, 423, 193, 2, 9179, 2915],
 [126, 93, 322, 56, 189, 392, 175, 93, 584, 485, 12, 2211, 5202, 61],
 [228, 97, 195, 454, 181, 7, 4, 228, 56],
 [61, 244, 5, 503, 244, 4, 37, 420, 20, 1634],
 [41, 41, 41, 21, 66, 386, 74, 38, 22],
 [299, 348, 171, 5947, 106, 471, 50, 41, 331, 137, 218],
 [216, 1321, 4849, 1125, 32, 22, 182, 336, 22, 18794],
 [196, 1013, 317, 16, 896, 94, 4141, 903, 110],
 [152, 740, 446, 359, 534, 306, 679, 556, 276, 297, 414],
 [138, 11891, 33, 18, 57, 14, 2212, 98, 730, 7665, 64, 1781],
 [41, 386, 1970, 2, 60, 1, 101, 35, 14, 60, 77, 229, 139, 1329],
 [8, 172, 14, 8, 127, 2540, 864, 791, 8, 50, 268, 171, 3152, 15],
 [31, 95, 179, 47, 261, 105, 729, 568, 6361],
 [53, 405, 57, 19, 32, 642, 136, 35, 131, 449, 64, 823, 28, 717],
 [503,
  12730,
  9,
  9180,
  27282,
  12731,
  406,
  474,
  760,
  1917,
  3408,
  560,
  472,
  144,
  368,
  106],


Instantiate a zero matrix of shape (vocab_size + 1, embed_dim).

For each word in w2ix, look the word up as a key in the GloVe dictionary; if it has a vector:
        
        read the word's integer index i from w2ix
        
        assign the word's GloVe vector to the i-th row of the matrix

In [126]:
def make_matrix(w2ix, number_corpus, glove_dict):
    """Build an embedding matrix whose row ``i`` holds the GloVe vector of the word indexed ``i``.

    Parameters
    ----------
    w2ix : dict
        Word -> integer row index.
    number_corpus : any
        Not used; kept so existing calls keep working.
    glove_dict : dict
        Word -> 1-D embedding vector.

    Returns
    -------
    numpy.ndarray
        Zero-initialised matrix (NumPy's default float dtype) with at least
        ``len(glove_dict) + 1`` rows. Rows of words that have no entry in
        ``glove_dict`` stay all-zero.
    """
    # Take the width from the vectors themselves instead of hard-coding it;
    # 50 is only the fallback for an empty glove_dict.
    first_vector = next(iter(glove_dict.values()), None)
    embed_dim = 50 if first_vector is None else len(first_vector)

    # Look words up in glove_dict directly. The previous check against the
    # notebook-global list `words` was a linear scan per word and could let
    # '<UNK>' through, which has no vector.
    hits = [(i, glove_dict[word]) for word, i in w2ix.items() if word in glove_dict]

    # Keep the historical row count, but grow it when a word with a vector
    # has a larger index, so the assignment below can never go out of bounds.
    highest_index = max((i for i, _ in hits), default=0)
    n_rows = max(len(glove_dict), highest_index) + 1

    embed_matrix = np.zeros((n_rows, embed_dim))
    for i, word_vector in hits:
        embed_matrix[i] = word_vector

    return embed_matrix

In [127]:
# NOTE(review): the returned matrix is only displayed here, not bound to a name.
make_matrix(w2ix, number_corpus, glove_dict)

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.1591    , -0.21427999,  0.63099003, ...,  0.021215  ,
        -0.14218999,  0.66955   ],
       [ 0.14827999,  0.17761   ,  0.42346001, ..., -0.2182    ,
         0.12971   ,  0.32953   ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])