In [13]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import gensim.downloader as api

# Load data

In [14]:
filepath = '../data/interim/clean_keywords_2019-01-01_2022-01-01.json'
df = pd.read_json(filepath, orient ='split', compression = 'infer')

# Load Word2vec model

In [15]:
# Loads word2vec google model
wv = api.load('word2vec-google-news-300')

In [4]:
print('Words are represented with a vector of dimension: ', wv.vector_size)
print('Number of words (vocabulary) in the model:', len(wv))
print('Example of words:', wv.index_to_key[0:10])

Words are represented with a vector of dimension:  300
Number of words (vocabulary) in the model: 3000000
Example of words: ['</s>', 'in', 'for', 'that', 'is', 'on', '##', 'The', 'with', 'said']


# Word embedding

In [5]:
# Extract keywords
keywords_str = df['keywordStringsCleanAfterFuzz'][0:3].astype(str)
keywords_str

0          ['NASA', 'OSIRIS-REx', 'Bennu', 'asteroid']
1    ['English Channel', 'migration', 'boats', 'ill...
2    ['Brazil', 'Jair Bolsonaro', 'Chicago economic...
Name: keywordStringsCleanAfterFuzz, dtype: object

In [6]:
# Creating the vectorizer 
vectorizer = CountVectorizer(stop_words='english')

# Fit the model with our data (each keyword becomes a feature, some are split)
X = vectorizer.fit_transform(keywords_str)

# Make an array and fills it in
CountVectorizedData=pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Words in the vocabulary (some keywords are split)
WordsVocab=CountVectorizedData.columns

CountVectorizedData.head()

Unnamed: 0,asteroid,bennu,boats,bolsonaro,brazil,channel,chicago,economics,english,guedes,hamilton,illegal,immigration,jair,migration,mourao,nasa,osiris,paulo,rex
0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1
1,0,0,1,0,0,1,0,0,1,0,0,1,1,0,1,0,0,0,0,0
2,0,0,0,1,1,0,1,1,0,1,1,0,0,1,0,1,0,0,1,0


In [7]:
# Creating empty dataframe to hold sentences
W2Vec_Data=pd.DataFrame()

# Looping through each row for the data
for i in range(CountVectorizedData.shape[0]):

    # initiating a sentence with all zeros
    sentence = np.zeros(300)

    # Looping thru each word in the sentence and if its present in 
    # the Word2Vec model then storing its vector
    for word in WordsVocab[CountVectorizedData.iloc[i , :] >= 1]:
        if word in wv.index_to_key:   
            sentence = sentence + wv[word]
        else:
            print(word) 
    # Appending the sentence to the dataframe
    W2Vec_Data = W2Vec_Data.append(pd.DataFrame([sentence]))

bennu
osiris
bolsonaro
guedes
jair
mourao


In [8]:
W2Vec_Data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.182617,0.047119,0.154297,0.467773,-0.137695,-0.589355,0.196259,-0.612854,0.220215,-0.068726,...,0.154785,0.447266,-0.489746,0.336731,0.023804,-0.051758,-0.420166,-0.286865,0.342285,0.759033
0,-0.154053,0.11377,0.041992,-0.263184,-0.71936,-0.577271,-0.123596,0.30957,0.201172,0.320068,...,0.016357,0.215561,0.845909,0.062622,0.300781,0.254883,0.032059,-0.284668,0.453857,0.143555
0,-0.426941,0.242188,0.20929,1.017578,0.179688,0.282715,-0.306396,-0.980957,-0.179443,0.467773,...,-0.016602,0.634399,0.228271,0.607666,-0.173828,-0.10376,-0.198181,-1.099854,-0.002441,0.262787


# Link with data

In [12]:
df

Unnamed: 0,id,lastModifiedDate,keywordStrings,keywordStringsCleanAfterFuzz
0,46912921,2019-01-01T03:57:28.904Z,"[NASA, OSIRIS-REx, Bennu, asteroid]","[NASA, OSIRIS-REx, Bennu, asteroid]"
1,46911356,2019-01-01T06:11:50.527Z,"[English Channel, migration, boats, illegal im...","[English Channel, migration, boats, illegal im..."
2,46909694,2019-01-01T06:14:35.563Z,"[Brazil, Jair Bolsonaro, Chicago economics, Ha...","[Brazil, Jair Bolsonaro, Chicago economics, Ha..."
3,46912694,2019-01-01T08:26:11.599Z,"[Japan, Tokyo, Harajuku, attack]","[Japan, Tokyo, Harajuku, attack]"
4,46910092,2019-01-01T09:05:00.736Z,"[Asia, Bangladesh, elections, Kamal Hossain, S...","[Asia, Bangladesh, elections, Kamal Hossain, S..."
...,...,...,...,...
33824,60304782,2021-12-31T18:47:57.479Z,"[Putin, New Year, Coronavirus, Navalny, Ukraine]","[Putin, New Year, Coronavirus, Navalny, Ukraine]"
33825,60299904,2021-12-31T19:06:43.423Z,"[Germany, Olaf Scholz, New Year, New Year's ad...","[Germany, Olaf Scholz, New Year, New Year's ad..."
33826,60300458,2021-12-31T20:27:51.092Z,"[Colorado, wildfires, Boulder County, evacuati...","[Colorado, wildfires, Boulder County, evacuati..."
33827,60267980,2021-12-31T20:32:20.303Z,"[RCEP, Regional Comprehensive Economic Partner...","[RCEP, Regional Comprehensive Economic Partner..."
