# Headline Word2Vec

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import nltk
from nltk.corpus import stopwords, wordnet
from imblearn.pipeline import Pipeline as ImPipeline
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split, GridSearchCV
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, plot_confusion_matrix 
import string
import re
from sklearn.metrics import ConfusionMatrixDisplay

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
corpus = pd.read_csv('data/labeled_newscatcher_dataset.csv', sep=";")
corpus.head()

Unnamed: 0,topic,link,domain,published_date,title,lang
0,SCIENCE,https://www.eurekalert.org/pub_releases/2020-0...,eurekalert.org,2020-08-06 13:59:45,A closer look at water-splitting's solar fuel ...,en
1,SCIENCE,https://www.pulse.ng/news/world/an-irresistibl...,pulse.ng,2020-08-12 15:14:19,"An irresistible scent makes locusts swarm, stu...",en
2,SCIENCE,https://www.express.co.uk/news/science/1322607...,express.co.uk,2020-08-13 21:01:00,Artificial intelligence warning: AI will know ...,en
3,SCIENCE,https://www.ndtv.com/world-news/glaciers-could...,ndtv.com,2020-08-03 22:18:26,Glaciers Could Have Sculpted Mars Valleys: Study,en
4,SCIENCE,https://www.thesun.ie/tech/5742187/perseid-met...,thesun.ie,2020-08-12 19:54:36,Perseid meteor shower 2020: What time and how ...,en


In [4]:
# cleaning my test data
sw = stopwords.words('english') + list(string.punctuation)

corpus.title = [word.lower() for word in corpus.title]
pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
corpus.title = [nltk.regexp_tokenize(line, pattern) for line in corpus.title]
lemma = WordNetLemmatizer()
corpus.title= [[lemma.lemmatize(word) for word in line] for line in corpus.title]
corpus.title = [[word for word in line if word not in sw] for line in corpus.title]

## Word2Vec

### Continuous Bag of Words (CBOW)

In [5]:
from gensim.models import Word2Vec

cbow_model= Word2Vec(corpus.title)
cbow_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)

(16911179, 18492420)

In [6]:
cbow_model.most_similar("phone")

[('smartphone', 0.6154003143310547),
 ('smartphones', 0.611376941204071),
 ('qualcomm', 0.5446385145187378),
 ('android', 0.5332754850387573),
 ('nord', 0.5194308161735535),
 ('handset', 0.5045873522758484),
 ('huawei', 0.5039869546890259),
 ('lte', 0.5010994076728821),
 ('mobile', 0.5003875494003296),
 ('flagship', 0.49242961406707764)]

In [7]:
cbow_model= Word2Vec(corpus.title, size=50, min_count=2, window=3)
cbow_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)

(17624288, 18492420)

In [8]:
cbow_model.most_similar("phone")

[('smartphone', 0.7844327688217163),
 ('smartphones', 0.7375056147575378),
 ('samsung', 0.6573551297187805),
 ('android', 0.6406950950622559),
 ('apps', 0.6405025720596313),
 ('dni', 0.6323405504226685),
 ('gmail', 0.6300261616706848),
 ("samsung's", 0.6288827657699585),
 ('cal', 0.6248137950897217),
 ('flagship', 0.6238561272621155)]

In [9]:
cbow_model= Word2Vec(corpus.title, size=50, min_count=4, window=3)
cbow_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)

(17110171, 18492420)

In [10]:
cbow_model.most_similar("phone")

[('smartphone', 0.785496175289154),
 ('smartphones', 0.7146581411361694),
 ('android', 0.6582589745521545),
 ('gmail', 0.6500667929649353),
 ('flagship', 0.6304707527160645),
 ('samsung', 0.6189596056938171),
 ('mobile', 0.6175329685211182),
 ('apps', 0.6033751964569092),
 ('malware', 0.5983377695083618),
 ('qualcomm', 0.5966525673866272)]

In [11]:
cbow_model= Word2Vec(corpus.title, size=50, min_count=2, window=4)
cbow_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)

(17623944, 18492420)

In [12]:
cbow_model.most_similar("phone")

[('smartphone', 0.7294553518295288),
 ('smartphones', 0.6931628584861755),
 ('android', 0.6760907173156738),
 ('gmail', 0.6232016086578369),
 ("google's", 0.6175945997238159),
 ('malware', 0.6173057556152344),
 ('user', 0.6126487851142883),
 ('mobile', 0.6080271601676941),
 ('samsung', 0.5896323919296265),
 ('apps', 0.5843027830123901)]

In [13]:
cbow_model= Word2Vec(corpus.title, size=40, min_count=2, window=2)
cbow_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)

(17624840, 18492420)

In [14]:
cbow_model.most_similar("phone")

[('user', 0.7644056081771851),
 ('smartphone', 0.74847811460495),
 ('android', 0.7450239062309265),
 ('apps', 0.7088167071342468),
 ('smartphones', 0.7059887051582336),
 ('samsung', 0.6790426969528198),
 ('device', 0.6654797196388245),
 ('qualcomm', 0.6642196178436279),
 ('flagship', 0.649077832698822),
 ('pixel', 0.6423380374908447)]

In [15]:
cbow_model= Word2Vec(corpus.title, size=30, min_count=2, window=2)
cbow_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)

(17624697, 18492420)

In [16]:
cbow_model.most_similar("phone")

[('smartphone', 0.7832538485527039),
 ('apps', 0.7789807915687561),
 ('android', 0.7738235592842102),
 ('smartphones', 0.7379433512687683),
 ('user', 0.7279594540596008),
 ('flagship', 0.7254611253738403),
 ('pin', 0.7240836024284363),
 ("samsung's", 0.7182045578956604),
 ('bug', 0.7152889370918274),
 ('gmail', 0.7066550850868225)]

In [17]:
cbow_model= Word2Vec(corpus.title, size=20, min_count=2, window=2)
cbow_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)

(17624227, 18492420)

In [18]:
cbow_model.most_similar("phone")

[('android', 0.8654757738113403),
 ('flagship', 0.8339540958404541),
 ('samsung', 0.831294596195221),
 ('apps', 0.7941136956214905),
 ('mac', 0.7933629155158997),
 ('user', 0.7875032424926758),
 ('smartphone', 0.7867851257324219),
 ('iphone', 0.7864605188369751),
 ('ipads', 0.7780517339706421),
 ('handset', 0.7746727466583252)]

In [29]:
cbow_model= Word2Vec(corpus.title, size=10, min_count=2, window=2)
cbow_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)

(17624559, 18492420)

In [30]:
cbow_model.most_similar("phone")

[('smartphones', 0.9153740406036377),
 ('flagship', 0.9151458740234375),
 ('android', 0.9133061766624451),
 ('nokia', 0.9085803031921387),
 ('iphone', 0.8996119499206543),
 ('smartphone', 0.8941638469696045),
 ('iphones', 0.8874186277389526),
 ('cheaper', 0.8841144442558289),
 ('screen', 0.8760377168655396),
 ('vulnerability', 0.8757739663124084)]

In [32]:
cbow_model.wv.most_similar(positive="apple",topn=5)

[("apple's", 0.9351379871368408),
 ('ipad', 0.9248727560043335),
 ('macbook', 0.9237585663795471),
 ('pro', 0.9095596075057983),
 ('slim', 0.9075995683670044)]

In [21]:
cbow_model= Word2Vec(corpus.title, size=5, min_count=2, window=2)
cbow_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)

(17624905, 18492420)

In [28]:
cbow_model.most_similar("phone")

[('cheap', 0.999397873878479),
 ('android', 0.9986871480941772),
 ('upgraded', 0.9952649474143982),
 ('content', 0.9902476668357849),
 ('parallel', 0.987636387348175),
 ('cheaper', 0.9862022399902344),
 ('internet', 0.9858311414718628),
 ('xcloud', 0.9854042530059814),
 ('ecg', 0.9849103093147278),
 ('include', 0.9845090508460999)]

In [23]:
cbow_model.wv.most_similar(positive="tech",topn=5)

[('xstream', 0.9962033629417419),
 ('cheese', 0.9937506914138794),
 ('marine', 0.9927660226821899),
 ('plastic', 0.9925734996795654),
 ('optic', 0.991940975189209)]

In [33]:
cbow_model= Word2Vec(corpus.title, size=10, min_count=2, window=2, alpha=0.005)
cbow_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)

(17623842, 18492420)

In [34]:
cbow_model.most_similar("phone")

[('apps', 0.9723376631736755),
 ('iphone', 0.968980610370636),
 ('android', 0.9649627208709717),
 ('feature', 0.9615161418914795),
 ('io', 0.9581089019775391),
 ('apple', 0.9544820785522461),
 ('nokia', 0.9525927305221558),
 ('unveils', 0.9463881850242615),
 ('chrome', 0.9462530016899109),
 ('app', 0.9457876682281494)]

In [45]:
cbow_model.wv.most_similar(positive="rain",topn=5)

[('country', 0.9843270778656006),
 ('thunderstorm', 0.9771208763122559),
 ('area', 0.9692520499229431),
 ('temperature', 0.9638366103172302),
 ('largest', 0.9625722169876099)]

In [46]:
cbow_model.similarity('phone','meteor')

0.24463597

In [47]:
#i'm saving the whole thing for now just in case i need to retrain
cbow_model.save('decent_cbow.model')

### Skip-Gram

In [49]:
skip_model= Word2Vec(corpus.title, sg=1)
skip_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)

(16911616, 18492420)

In [50]:
skip_model.most_similar("phone")

[('android', 0.7327057123184204),
 ('smartphone', 0.7226364612579346),
 ('smartphones', 0.6805227994918823),
 ('mobile', 0.6706423759460449),
 ('qualcomm', 0.65846848487854),
 ('samsung', 0.6353456974029541),
 ('oneplus', 0.621649980545044),
 ('snapdragon', 0.6216216087341309),
 ('huawei', 0.62026047706604),
 ('axon', 0.6186046004295349)]

In [51]:
skip_model= Word2Vec(corpus.title, sg=1, size=50)
skip_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)
skip_model.most_similar("phone")

[('smartphones', 0.8373992443084717),
 ('android', 0.805121898651123),
 ('smartphone', 0.7734361886978149),
 ('mobile', 0.7719205617904663),
 ('qualcomm', 0.7585907578468323),
 ('foldable', 0.7568054795265198),
 ('nokia', 0.7562271356582642),
 ('handset', 0.7496894001960754),
 ('snapdragon', 0.7433872222900391),
 ('rog', 0.7418987154960632)]

In [52]:
skip_model= Word2Vec(corpus.title, sg=1, size=50, min_count=2)
skip_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)
skip_model.most_similar("phone")

[('smartphones', 0.8356647491455078),
 ('android', 0.8088092803955078),
 ('smartphone', 0.8022580742835999),
 ('snapdragon', 0.7866570353507996),
 ('foldable', 0.7632825970649719),
 ('mobile', 0.7510761618614197),
 ('cal', 0.7489628791809082),
 ('qualcomm', 0.7479882836341858),
 ('dsp', 0.7445710897445679),
 ('vrr', 0.7396570444107056)]

In [53]:
skip_model= Word2Vec(corpus.title, sg=1, size=50, min_count=2, window=2)
skip_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)
skip_model.most_similar("phone")

[('android', 0.8271772265434265),
 ('smartphones', 0.7964219450950623),
 ('smartphone', 0.773986279964447),
 ('samsung', 0.7339814901351929),
 ('apps', 0.731086015701294),
 ("samsung's", 0.728066623210907),
 ('oneplus', 0.7279577255249023),
 ('wearable', 0.7265996932983398),
 ('stylus', 0.7235832810401917),
 ('midrange', 0.7217894792556763)]

In [54]:
skip_model= Word2Vec(corpus.title, sg=1, size=50, min_count=2, window=3)
skip_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)
skip_model.most_similar("phone")

[('android', 0.8061527609825134),
 ('smartphones', 0.7921007871627808),
 ('mobile', 0.7693087458610535),
 ('snapdragon', 0.7648885846138),
 ('smartphone', 0.7618951201438904),
 ('foldable', 0.7560170888900757),
 ('dsp', 0.7547757625579834),
 ('vrr', 0.7490206360816956),
 ('oneplus', 0.7483361959457397),
 ('pricey', 0.7448790073394775)]

In [55]:
skip_model= Word2Vec(corpus.title, sg=1, size=50, min_count=2, window=4)
skip_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)
skip_model.most_similar("phone")

[('android', 0.8185698390007019),
 ('smartphones', 0.8171932697296143),
 ('foldable', 0.7915523052215576),
 ('smartphone', 0.7801628112792969),
 ('mobile', 0.7771994471549988),
 ('qualcomm', 0.7723207473754883),
 ('snapdragon', 0.7701205611228943),
 ('cal', 0.7552905082702637),
 ('rog', 0.7538713812828064),
 ('removable', 0.7534988522529602)]

In [56]:
skip_model= Word2Vec(corpus.title, sg=1, size=40, min_count=2)
skip_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)
skip_model.most_similar("phone")

[('android', 0.8619210124015808),
 ('smartphones', 0.8495360612869263),
 ('dsp', 0.8066348433494568),
 ('qualcomm', 0.8013862371444702),
 ('smartphone', 0.8007746934890747),
 ('snapdragon', 0.7920383214950562),
 ('handset', 0.787638783454895),
 ('oneplus', 0.7784186601638794),
 ('foldable', 0.7762075662612915),
 ('nokia', 0.7737550735473633)]

In [57]:
skip_model= Word2Vec(corpus.title, sg=1, size=30, min_count=2)
skip_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)
skip_model.most_similar("phone")

[('smartphones', 0.9000354409217834),
 ('smartphone', 0.8800143003463745),
 ('android', 0.8718509674072266),
 ('dsp', 0.8603364825248718),
 ('altzlife', 0.8558120131492615),
 ('nokia', 0.853026807308197),
 ('qualcomm', 0.8472353219985962),
 ('samsung', 0.8448466062545776),
 ('pixel', 0.8415809273719788),
 ('handset', 0.8410998582839966)]

In [58]:
skip_model= Word2Vec(corpus.title, sg=1, size=30, min_count=2, alpha=0.005)
skip_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)
skip_model.most_similar("phone")

[('qualcomm', 0.892565906047821),
 ('pixel', 0.8905010223388672),
 ('smartphone', 0.8892290592193604),
 ('snapdragon', 0.8871335387229919),
 ('gmail', 0.8795095086097717),
 ('android', 0.8766889572143555),
 ('nokia', 0.8732967972755432),
 ('achilles', 0.8689762353897095),
 ('gadget', 0.8684418201446533),
 ('handset', 0.8678849339485168)]

In [59]:
skip_model= Word2Vec(corpus.title, sg=1, size=20, min_count=2, alpha=0.005)
skip_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)
skip_model.most_similar("phone")

[('android', 0.9311205148696899),
 ('smartphone', 0.930152416229248),
 ('smartphones', 0.9230425953865051),
 ('gmail', 0.9167789816856384),
 ('snapdragon', 0.9108394384384155),
 ('achilles', 0.910788893699646),
 ('qualcomm', 0.9099057912826538),
 ('pixel', 0.9082645773887634),
 ('samsung', 0.9082576632499695),
 ('nokia', 0.906872570514679)]

In [61]:
skip_model= Word2Vec(corpus.title, sg=1, size=10, min_count=2, alpha=0.005)
skip_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)
skip_model.most_similar("phone")

[('achilles', 0.9563428163528442),
 ('android', 0.9560943841934204),
 ('gadget', 0.9556223750114441),
 ('flaw', 0.9552171230316162),
 ('smartphone', 0.9550544619560242),
 ('flagship', 0.9523184299468994),
 ('tablet', 0.9504671096801758),
 ('iphone', 0.9492576122283936),
 ('gmail', 0.9483956098556519),
 ('smartphones', 0.9483415484428406)]

In [62]:
skip_model.wv.most_similar(positive="rain",topn=5)

[('tornado', 0.972477912902832),
 ('flooding', 0.9565954804420471),
 ('recorded', 0.9557629823684692),
 ('overnight', 0.9525306224822998),
 ('fatality', 0.9473415017127991)]

In [63]:
skip_model.similarity('phone','meteor')

0.58668965

In [64]:
skip_model= Word2Vec(corpus.title, sg=1, size=20, min_count=2, alpha=0.005)
skip_model.train(corpus.title, total_examples=len(corpus.title), epochs=20)
skip_model.most_similar("phone")

[('android', 0.9340760707855225),
 ('smartphones', 0.9209192991256714),
 ('gmail', 0.9206162095069885),
 ('smartphone', 0.920419454574585),
 ('qualcomm', 0.9128236770629883),
 ('snapdragon', 0.9104718565940857),
 ('achilles', 0.909115195274353),
 ('rolling', 0.9010049104690552),
 ('tablet', 0.8996930122375488),
 ('handset', 0.8996843695640564)]

In [65]:
skip_model.wv.most_similar(positive="rain",topn=5)


[('flooding', 0.945341944694519),
 ('thunderstorm', 0.9417746067047119),
 ('weather', 0.9193456768989563),
 ('karachi', 0.9133889675140381),
 ('isaias', 0.9068201780319214)]

In [66]:
skip_model.similarity('phone','meteor')

0.53411216

In [67]:
skip_model.save("skipgram.model")