# Headline Word2Vec

For this notebook, I am instatiating two Word2Vec models, but I don't plan to use them for my final model. Just trying to get an understanding of how it works. 

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
%matplotlib inline

import nltk
from nltk.corpus import stopwords, wordnet
from imblearn.pipeline import Pipeline as ImPipeline
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split, GridSearchCV
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, plot_confusion_matrix 
import string
import re
from sklearn.metrics import ConfusionMatrixDisplay

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('../data/labeled_newscatcher_dataset.csv', sep=";")
df.head()

Unnamed: 0,topic,link,domain,published_date,title,lang
0,SCIENCE,https://www.eurekalert.org/pub_releases/2020-0...,eurekalert.org,2020-08-06 13:59:45,A closer look at water-splitting's solar fuel ...,en
1,SCIENCE,https://www.pulse.ng/news/world/an-irresistibl...,pulse.ng,2020-08-12 15:14:19,"An irresistible scent makes locusts swarm, stu...",en
2,SCIENCE,https://www.express.co.uk/news/science/1322607...,express.co.uk,2020-08-13 21:01:00,Artificial intelligence warning: AI will know ...,en
3,SCIENCE,https://www.ndtv.com/world-news/glaciers-could...,ndtv.com,2020-08-03 22:18:26,Glaciers Could Have Sculpted Mars Valleys: Study,en
4,SCIENCE,https://www.thesun.ie/tech/5742187/perseid-met...,thesun.ie,2020-08-12 19:54:36,Perseid meteor shower 2020: What time and how ...,en


In [4]:
df2 = pd.read_csv('../data/scrape_news_articles.csv', sep=";", index_col=[0])
df2.head()

Unnamed: 0,topic,link,domain,published_date,title,lang
0,BEAUTY,https://www.politico.com/newsletters/politico-...,politico.com,2022-03-08 15:45:00,Moderna eases up on vaccine patent protections,en
1,BEAUTY,https://stylecaster.com/queen-elizabeth-lilibet,stylecaster.com,2022-03-08 15:10:14,The Queen ‘Desperately' Wants to Meet Lilibet—...,en
2,BEAUTY,https://stylecaster.com/kim-kardashian-kanye-p...,stylecaster.com,2022-03-08 14:45:04,Here's What Kim Really Thinks of Kanye's ‘Dist...,en
3,BEAUTY,https://www.complex.com/style/novesta-ss22,complex.com,2022-03-08 14:44:22,Novesta Unveils Colourful Spring/Summer 2022 F...,en
4,BEAUTY,https://www.qatarliving.com/forum/fashion/ladi...,qatarliving.com,2022-03-08 13:50:35,Ladies! Dress up with the Perfect Dresses from...,en


In [5]:
corpus = pd.concat([df, df2], ignore_index=True)
corpus.head()

Unnamed: 0,topic,link,domain,published_date,title,lang
0,SCIENCE,https://www.eurekalert.org/pub_releases/2020-0...,eurekalert.org,2020-08-06 13:59:45,A closer look at water-splitting's solar fuel ...,en
1,SCIENCE,https://www.pulse.ng/news/world/an-irresistibl...,pulse.ng,2020-08-12 15:14:19,"An irresistible scent makes locusts swarm, stu...",en
2,SCIENCE,https://www.express.co.uk/news/science/1322607...,express.co.uk,2020-08-13 21:01:00,Artificial intelligence warning: AI will know ...,en
3,SCIENCE,https://www.ndtv.com/world-news/glaciers-could...,ndtv.com,2020-08-03 22:18:26,Glaciers Could Have Sculpted Mars Valleys: Study,en
4,SCIENCE,https://www.thesun.ie/tech/5742187/perseid-met...,thesun.ie,2020-08-12 19:54:36,Perseid meteor shower 2020: What time and how ...,en


In [6]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\deaud\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [7]:
# cleaning my test data
sw = stopwords.words('english') + list(string.punctuation)

corpus.title = [word.lower() for word in corpus.title]
pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
corpus.title = [nltk.regexp_tokenize(line, pattern) for line in corpus.title]
lemma = WordNetLemmatizer()
corpus.title= [[lemma.lemmatize(word) for word in line] for line in corpus.title]
corpus.title = [[word for word in line if word not in sw] for line in corpus.title]

## Word2Vec

### Continuous Bag of Words (CBOW)

In [18]:
from gensim.models import Word2Vec

cbow_model= Word2Vec(corpus.title)
cbow_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)

(21727891, 23588160)

In [19]:
cbow_model.wv.most_similar("phone")

[('smartphone', 0.6666989922523499),
 ('smartphones', 0.6441646814346313),
 ('gmail', 0.5499178767204285),
 ('android', 0.5434715747833252),
 ('laptop', 0.534205436706543),
 ('handset', 0.5260448455810547),
 ('qualcomm', 0.5201058983802795),
 ('nord', 0.5150251388549805),
 ('lte', 0.5147441625595093),
 ('mobile', 0.5103725790977478)]

In [20]:
cbow_model= Word2Vec(corpus.title, vector_size=50, min_count=2, window=3)
cbow_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)

(22584308, 23588160)

In [21]:
cbow_model.wv.most_similar("phone")

[('smartphone', 0.7068692445755005),
 ('smartphones', 0.6994327306747437),
 ('apps', 0.6779199242591858),
 ('android', 0.6700101494789124),
 ('nord', 0.622630774974823),
 ('nokia', 0.6171594858169556),
 ('lte', 0.6152294874191284),
 ('oneplus', 0.6097477078437805),
 ('snapdragon', 0.6058156490325928),
 ('gmail', 0.6033675074577332)]

In [23]:
cbow_model= Word2Vec(corpus.title, vector_size=50, min_count=4, window=3)
cbow_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)

(21961214, 23588160)

In [24]:
cbow_model.wv.most_similar("phone")

[('smartphone', 0.7119123339653015),
 ('smartphones', 0.7074189782142639),
 ('android', 0.6493915319442749),
 ('nokia', 0.6452564597129822),
 ('gmail', 0.6290686130523682),
 ('oneplus', 0.6285195350646973),
 ('apps', 0.6153964400291443),
 ('samsung', 0.5960041284561157),
 ('snapdragon', 0.5931641459465027),
 ('foldable', 0.5928024053573608)]

In [25]:
cbow_model= Word2Vec(corpus.title, vector_size=50, min_count=2, window=4)
cbow_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)

(22585230, 23588160)

In [26]:
cbow_model.wv.most_similar("phone")

[('smartphones', 0.7083776593208313),
 ('smartphone', 0.6987574696540833),
 ('android', 0.6955434679985046),
 ('mobile', 0.6585747003555298),
 ('gmail', 0.6514365077018738),
 ('user', 0.6341714859008789),
 ('apps', 0.6273295283317566),
 ('malware', 0.6259533762931824),
 ("google's", 0.6248986124992371),
 ('lte', 0.6179178953170776)]

In [27]:
cbow_model= Word2Vec(corpus.title, vector_size=40, min_count=2, window=2)
cbow_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)

(22583422, 23588160)

In [28]:
cbow_model.wv.most_similar("phone")

[('apps', 0.781324565410614),
 ('smartphones', 0.780327558517456),
 ('android', 0.7668615579605103),
 ('smartphone', 0.7590717673301697),
 ('gamers', 0.7000181674957275),
 ('device', 0.6812880635261536),
 ('nord', 0.6759445667266846),
 ('pin', 0.6703640818595886),
 ("samsung's", 0.6667801737785339),
 ('foldable', 0.6646199226379395)]

In [29]:
cbow_model= Word2Vec(corpus.title, vector_size=30, min_count=2, window=2)
cbow_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)

(22583966, 23588160)

In [30]:
cbow_model.wv.most_similar("phone")

[('android', 0.8016839623451233),
 ('apps', 0.7938687205314636),
 ('smartphones', 0.7737550139427185),
 ('smartphone', 0.7488154172897339),
 ('gamers', 0.7452980279922485),
 ('tablet', 0.7114000916481018),
 ('gmail', 0.707675039768219),
 ('device', 0.7053384184837341),
 ('user', 0.6988659501075745),
 ('io', 0.6952767372131348)]

In [31]:
cbow_model= Word2Vec(corpus.title, vector_size=20, min_count=2, window=2)
cbow_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)

(22584586, 23588160)

In [32]:
cbow_model.wv.most_similar("phone")

[('android', 0.8948840498924255),
 ('smartphone', 0.8523215055465698),
 ('smartphones', 0.8505614399909973),
 ('samsung', 0.8315171599388123),
 ('pin', 0.8278328776359558),
 ('user', 0.8080447912216187),
 ('gamers', 0.7988781332969666),
 ('apps', 0.7914980053901672),
 ('chrome', 0.7909753918647766),
 ('gmail', 0.7789469957351685)]

In [33]:
cbow_model= Word2Vec(corpus.title, vector_size=10, min_count=2, window=2)
cbow_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)

(22584748, 23588160)

In [34]:
cbow_model.wv.most_similar("phone")

[('iphones', 0.9687383770942688),
 ('bug', 0.9558156728744507),
 ('flagship', 0.9426276087760925),
 ('android', 0.9367027282714844),
 ('apps', 0.9349798560142517),
 ('macbook', 0.91974937915802),
 ('redesign', 0.9187734127044678),
 ('tablet', 0.9159047603607178),
 ('iphone', 0.9141308069229126),
 ('smartphones', 0.9132530689239502)]

In [35]:
cbow_model.wv.most_similar(positive="apple",topn=5)

[('google', 0.9611325860023499),
 ("apple's", 0.9549362063407898),
 ('direct', 0.9398136138916016),
 ('microsoft', 0.9220123291015625),
 ('amazon', 0.9129121899604797)]

In [36]:
cbow_model= Word2Vec(corpus.title, vector_size=5, min_count=2, window=2)
cbow_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)

(22584201, 23588160)

In [37]:
cbow_model.wv.most_similar("phone")

[('glass', 0.9962368011474609),
 ('model', 0.994067370891571),
 ('noise', 0.9936598539352417),
 ('accessory', 0.9933869242668152),
 ('android', 0.9931384325027466),
 ('ingredient', 0.9923892617225647),
 ('microsoft', 0.9923272728919983),
 ('clock', 0.9917428493499756),
 ('io', 0.9911788105964661),
 ('cheap', 0.9910125732421875)]

In [38]:
cbow_model.wv.most_similar(positive="tech",topn=5)

[('wral', 0.9993358254432678),
 ('biopharma', 0.9980018138885498),
 ('scope', 0.9968476891517639),
 ('downward', 0.9960850477218628),
 ('veracyte', 0.9959495067596436)]

In [39]:
cbow_model= Word2Vec(corpus.title, vector_size=10, min_count=2, window=2, alpha=0.005)
cbow_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)

(22583474, 23588160)

In [40]:
cbow_model.wv.most_similar("phone")

[('ahepa', 0.9764924049377441),
 ('iphone', 0.9724928140640259),
 ('p', 0.969786524772644),
 ('android', 0.9671738147735596),
 ('smartphones', 0.9595897793769836),
 ('flagship', 0.9520423412322998),
 ('controller', 0.9514220952987671),
 ('window', 0.9469561576843262),
 ('version', 0.9420189261436462),
 ("survivor's", 0.9416032433509827)]

In [41]:
cbow_model.wv.most_similar(positive="rain",topn=5)

[('fredericton', 0.9710844159126282),
 ('temperature', 0.9709977507591248),
 ('europe', 0.9672083258628845),
 ('rate', 0.9606303572654724),
 ('alarm', 0.9552730321884155)]

In [42]:
cbow_model.wv.similarity('phone','meteor')

0.38735545

In [44]:
#i'm saving the whole thing for now just in case i need to retrain
cbow_model.save('cbow.model')

### Skip-Gram

In [46]:
skip_model= Word2Vec(corpus.title, sg=1)
skip_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)

(21727204, 23588160)

In [47]:
skip_model.wv.most_similar("phone")

[('android', 0.7761050462722778),
 ('smartphones', 0.698005199432373),
 ('axon', 0.6612927317619324),
 ('foldable', 0.6588737964630127),
 ('qualcomm', 0.6584909558296204),
 ('snapdragon', 0.6437031626701355),
 ('lte', 0.6431047320365906),
 ('stylus', 0.6357678771018982),
 ('smartphone', 0.632291853427887),
 ('google', 0.6283547878265381)]

In [48]:
skip_model= Word2Vec(corpus.title, sg=1, vector_size=50)
skip_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)
skip_model.wv.most_similar("phone")

[('smartphones', 0.8384672403335571),
 ('smartphone', 0.8342218995094299),
 ('android', 0.833698034286499),
 ('snapdragon', 0.7907917499542236),
 ('qualcomm', 0.7794982194900513),
 ('foldable', 0.7743945717811584),
 ('handset', 0.7705978155136108),
 ('chipset', 0.765535295009613),
 ('nokia', 0.7594048976898193),
 ('dsp', 0.7592836618423462)]

In [49]:
skip_model= Word2Vec(corpus.title, sg=1, vector_size=50, min_count=2)
skip_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)
skip_model.wv.most_similar("phone")

[('smartphones', 0.8648689985275269),
 ('android', 0.8354758620262146),
 ('smartphone', 0.8187859654426575),
 ('snapdragon', 0.7923830151557922),
 ('oneplus', 0.7889497876167297),
 ('foldable', 0.7808762192726135),
 ('qualcomm', 0.7760719656944275),
 ('downloading', 0.7647323608398438),
 ('stalkerware', 0.7631919980049133),
 ('mobile', 0.7627070546150208)]

In [50]:
skip_model= Word2Vec(corpus.title, sg=1, vector_size=50, min_count=2, window=2)
skip_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)
skip_model.wv.most_similar("phone")

[('smartphones', 0.8272982835769653),
 ('smartphone', 0.8183382153511047),
 ('android', 0.8006417751312256),
 ('lte', 0.7776314616203308),
 ('gmail', 0.7556366324424744),
 ('midrange', 0.745169997215271),
 ('foldable', 0.7415573000907898),
 ('snapdragon', 0.7351503968238831),
 ('oneplus', 0.7284305691719055),
 ('nokia', 0.7271036505699158)]

In [51]:
skip_model= Word2Vec(corpus.title, sg=1, vector_size=50, min_count=2, window=3)
skip_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)
skip_model.wv.most_similar("phone")

[('android', 0.8256362676620483),
 ('smartphones', 0.7978739738464355),
 ('lte', 0.7945852875709534),
 ('gmail', 0.7843851447105408),
 ('smartphone', 0.7826085686683655),
 ('apps', 0.7821012735366821),
 ('oneplus', 0.7796486020088196),
 ('mobile', 0.761017918586731),
 ('removable', 0.7566685080528259),
 ('snapdragon', 0.751564621925354)]

In [52]:
skip_model= Word2Vec(corpus.title, sg=1, vector_size=50, min_count=2, window=4)
skip_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)
skip_model.wv.most_similar("phone")

[('android', 0.8546546101570129),
 ('smartphones', 0.8370323181152344),
 ('smartphone', 0.822310745716095),
 ('gmail', 0.7807543873786926),
 ('apps', 0.7766550183296204),
 ('foldable', 0.7724974155426025),
 ('mobile', 0.7696863412857056),
 ('handset', 0.7621003985404968),
 ('pixel', 0.7595537304878235),
 ('dsp', 0.758064329624176)]

In [53]:
skip_model= Word2Vec(corpus.title, sg=1, vector_size=40, min_count=2)
skip_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)
skip_model.wv.most_similar("phone")

[('smartphones', 0.8766257762908936),
 ('smartphone', 0.8583959937095642),
 ('handset', 0.8362529873847961),
 ('android', 0.8349930047988892),
 ('pixel', 0.831344485282898),
 ('foldable', 0.8272930383682251),
 ('oneplus', 0.8102177977561951),
 ('samsung', 0.8083470463752747),
 ('qualcomm', 0.8072943687438965),
 ('google', 0.7987962365150452)]

In [54]:
skip_model= Word2Vec(corpus.title, sg=1, vector_size=30, min_count=2)
skip_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)
skip_model.wv.most_similar("phone")

[('smartphones', 0.9025223255157471),
 ('android', 0.8986257314682007),
 ('handset', 0.88899827003479),
 ('smartphone', 0.8825555443763733),
 ('pixel', 0.8608438372612),
 ('dsp', 0.851475179195404),
 ('snapdragon', 0.8506773710250854),
 ('samsung', 0.8422503471374512),
 ('oneplus', 0.8347543478012085),
 ('vivaldi', 0.8324556350708008)]

In [55]:
skip_model= Word2Vec(corpus.title, sg=1, vector_size=30, min_count=2, alpha=0.005)
skip_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)
skip_model.wv.most_similar("phone")

[('smartphones', 0.8883024454116821),
 ('android', 0.8837844729423523),
 ('smartphone', 0.8756042122840881),
 ('snapdragon', 0.8640411496162415),
 ('qualcomm', 0.8634973764419556),
 ('flagship', 0.8626512289047241),
 ('handset', 0.8619552254676819),
 ('midrange', 0.8567699193954468),
 ('apps', 0.8505262732505798),
 ('cheaper', 0.8491857051849365)]

In [56]:
skip_model= Word2Vec(corpus.title, sg=1, vector_size=20, min_count=2, alpha=0.005)
skip_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)
skip_model.wv.most_similar("phone")

[('android', 0.9400346875190735),
 ('iphone', 0.9228743314743042),
 ('oneplus', 0.9104474782943726),
 ('smartphones', 0.9068314433097839),
 ('samsung', 0.9065344333648682),
 ('smartphone', 0.8994105458259583),
 ('snapdragon', 0.8939518928527832),
 ('flaw', 0.888489305973053),
 ('pixel', 0.884287416934967),
 ('qualcomm', 0.8837185502052307)]

In [57]:
skip_model= Word2Vec(corpus.title, sg=1, vector_size=10, min_count=2, alpha=0.005)
skip_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)
skip_model.wv.most_similar("phone")

[('iphone', 0.9665110111236572),
 ("huawei's", 0.9591425061225891),
 ('macbook', 0.9569219946861267),
 ('affordable', 0.9556488394737244),
 ('android', 0.9543747305870056),
 ('internal', 0.9542542099952698),
 ('cheaper', 0.9541267156600952),
 ('flaw', 0.9531579613685608),
 ('mac', 0.9526991248130798),
 ('introduced', 0.9505051374435425)]

In [58]:
skip_model.wv.most_similar(positive="rain",topn=5)

[('flash', 0.9680849313735962),
 ('flooding', 0.9576914310455322),
 ('tornado', 0.9506045579910278),
 ('thunderstorm', 0.9504642486572266),
 ('heavy', 0.9439990520477295)]

In [59]:
skip_model.wv.similarity('phone','meteor')

0.5477683

In [60]:
skip_model= Word2Vec(corpus.title, sg=1, vector_size=20, min_count=2, alpha=0.005)
skip_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)
skip_model.wv.most_similar("phone")

[('android', 0.9390363693237305),
 ('smartphones', 0.9093863368034363),
 ('samsung', 0.9052277207374573),
 ('oneplus', 0.9050357341766357),
 ('snapdragon', 0.8870840072631836),
 ('flaw', 0.8840975165367126),
 ('pixel', 0.8810487985610962),
 ("google's", 0.8802744150161743),
 ('fitbit', 0.8749974966049194),
 ('smartphone', 0.8747354745864868)]

In [61]:
skip_model.wv.most_similar(positive="rain",topn=5)


[('flooding', 0.9336934089660645),
 ('flash', 0.9080342054367065),
 ('thunderstorm', 0.9023416638374329),
 ('karachi', 0.8830739855766296),
 ('heavy', 0.8815587759017944)]

In [63]:
skip_model.wv.similarity('phone','meteor')

0.43500376

In [64]:
skip_model.save("skipgram.model")