# 1) Import the libraries
---

In [1]:
import tensorflow as tf
print(tf.__version__)

2.3.0


In [2]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
import nltk 
import pandas as pd
import gensim 
from gensim.models import Word2Vec, KeyedVectors

# 2) Data Preporcessing
---
I will use google pretrained model from https://www.kaggle.com/rootuser/worldnews-on-reddit

In [9]:
# install kaggle API
!pip install -q kaggle

In [10]:
# create a directory as kaggle
! mkdir -p ~/.kaggle

In [12]:
# copy api key to kaggle directory, I got my api from kaggle my account side
! cp kaggle.json ~/.kaggle

In [13]:
# disable the api key
! chmod 600 /root/.kaggle/kaggle.json

In [15]:
# import the dataset
! kaggle datasets download -d rootuser/worldnews-on-reddit

Downloading worldnews-on-reddit.zip to /content
 94% 25.0M/26.6M [00:00<00:00, 77.9MB/s]
100% 26.6M/26.6M [00:00<00:00, 76.4MB/s]


In [16]:
# unzip the dataset
! unzip /content/worldnews-on-reddit

Archive:  /content/worldnews-on-reddit.zip
  inflating: reddit_worldnews_start_to_2016-11-22.csv  


In [17]:
df = pd.read_csv('/content/reddit_worldnews_start_to_2016-11-22.csv')

In [18]:
df.head()

Unnamed: 0,time_created,date_created,up_votes,down_votes,title,over_18,author,subreddit
0,1201232046,2008-01-25,3,0,Scores killed in Pakistan clashes,False,polar,worldnews
1,1201232075,2008-01-25,2,0,Japan resumes refuelling mission,False,polar,worldnews
2,1201232523,2008-01-25,3,0,US presses Egypt on Gaza border,False,polar,worldnews
3,1201233290,2008-01-25,1,0,Jump-start economy: Give health care to all,False,fadi420,worldnews
4,1201274720,2008-01-25,4,0,Council of Europe bashes EU&UN terror blacklist,False,mhermans,worldnews


In [19]:
# take title column 
news_titles = df['title'].values

In [20]:
news_titles

array(['Scores killed in Pakistan clashes',
       'Japan resumes refuelling mission',
       'US presses Egypt on Gaza border', ...,
       'Professor receives Arab Researchers Award',
       'Nigel Farage attacks response to Trump ambassador tweet',
       'Palestinian wielding knife shot dead in West Bank: Israel police'],
      dtype=object)

In [21]:
new_vec = [nltk.word_tokenize(title) for title in news_titles]

In [22]:
new_vec[0]

['Scores', 'killed', 'in', 'Pakistan', 'clashes']

# 3) Build the model
---

In [24]:
model = Word2Vec(new_vec, min_count = 1, size = 32)
# text, min word count, size of each vector

# 4) Prediction
---

In [25]:
# find 10 closest words in the vector space that we have created
model.wv.most_similar('man')

  if np.issubdtype(vec.dtype, np.int):


[('woman', 0.9614757895469666),
 ('girl', 0.9084784984588623),
 ('boy', 0.9049885272979736),
 ('teenager', 0.887347936630249),
 ('couple', 0.8825281858444214),
 ('mother', 0.8791694641113281),
 ('teacher', 0.8633185625076294),
 ('doctor', 0.8462897539138794),
 ('father', 0.8423820734024048),
 ('daughter', 0.834402322769165)]

In [26]:
# see the vector
model.wv['man']
# this is how man is represented in our vector space

array([-1.4602176 , -1.9595683 ,  4.535476  ,  2.7137613 ,  2.7913425 ,
        2.6203039 , -3.0124233 ,  3.0069335 , -1.7149007 ,  2.4456196 ,
       -4.0629725 ,  3.9683924 , -2.7026136 , -1.9403701 , -3.319267  ,
        1.8004957 ,  4.0040917 , -0.79773223, -1.1542594 ,  0.9860294 ,
        0.73917013,  1.8245063 ,  1.3079715 ,  2.3284605 , -3.1651723 ,
        3.5842786 , -0.9030513 , -1.6465981 , -3.8863165 , -0.75770605,
        3.5090864 , -1.345486  ], dtype=float32)

In [27]:
# elt us try the famous relationship
vec = model.wv['king'] - model.wv['man'] + model.wv['women']
model.wv.most_similar([vec])

  if np.issubdtype(vec.dtype, np.int):


[('women', 0.6970425844192505),
 ('religious', 0.6946334838867188),
 ('clerics', 0.6817834377288818),
 ('discrimination', 0.6480409502983093),
 ('Muslim', 0.645355224609375),
 ('minorities', 0.643412709236145),
 ('Muslims', 0.6427332162857056),
 ('organisations', 0.6398724913597107),
 ('poverty.Extreme', 0.6374896764755249),
 ('gay', 0.6313363313674927)]

In [28]:
# relationship
vec = model.wv['Germany'] - model.wv['Berlin'] + model.wv['Paris']
model.wv.most_similar([vec])

  if np.issubdtype(vec.dtype, np.int):


[('Belgium', 0.8639992475509644),
 ('France', 0.8631689548492432),
 ('Germany', 0.8304797410964966),
 ('Paris', 0.7994371056556702),
 ('Sweden', 0.7919212579727173),
 ('UK', 0.788618803024292),
 ('Britain', 0.7670121788978577),
 ('Brussels', 0.7649995684623718),
 ('Turkey', 0.7430115938186646),
 ('Norway', 0.7224845290184021)]

In [29]:
# relationship 

vec = model.wv['Messi'] - model.wv['Football'] + model.wv['Cricket']
model.wv.most_similar([vec])

  if np.issubdtype(vec.dtype, np.int):


[('tricycle', 0.7903921008110046),
 ('fainted', 0.7723792195320129),
 ('Rasputin-like', 0.7684838771820068),
 ('46-year-old', 0.7593949437141418),
 ('46-year', 0.7553173899650574),
 ('British-Iranian', 0.7524728775024414),
 ('77-year-old', 0.7462000250816345),
 ('10-month-old', 0.743880033493042),
 ('caucasian', 0.7244365811347961),
 ('mauling', 0.7212578058242798)]