In [1]:
import datetime, os, re
import pickle

import nltk
from nltk import tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer

import gensim
from gensim import corpora
from gensim.models import Phrases

import datetime
from newsplease import NewsPlease

In [2]:
## helper method to filter articles by date & length
def get_relevant_articles(articles):
	filtered_articles = []
	for article in articles:
		if article.date_publish is not None:
			if article.date_publish < start or article.date_publish >= end:
				continue
		if article.text is None:
			continue
		elif len(article.text.split()) <= 100:
			continue 
		filtered_articles.append(article)
	return filtered_articles


## helper method to retrieve articles from path
def get_articles_from_filepath(path):
	articles = []
	for (dirpath, dirnames, filenames) in os.walk(path):
		for filename in filenames:
			if filename.endswith(".pkl"):
				filepath = os.path.join(dirpath, filename)
				with open(filepath, 'rb') as input_file:
					e = pickle.load(input_file)
					articles.append(e)
	filtered_articles = get_relevant_articles(articles)
	return filtered_articles

In [3]:

## helper method for text cleanup
def text_cleanup(text):
	def get_lemma(word):
		return WordNetLemmatizer().lemmatize(word)

	stop_words = set(stopwords.words('english'))
	months = ["january","february","march","april","may","june","july","august","september","october","november","december"]

	tokens = nltk.word_tokenize(text)
	tokens = [token for token in tokens if (token).isalpha() and len(token)>2]
	tokens = [token for token in tokens if token.lower() not in stop_words]
	tokens = [token for token in tokens if token.lower() not in months]
	tokens = [get_lemma(token) for token in tokens]
	return tokens


In [6]:
global start
global end
start = datetime.datetime(2012, 6, 1)
end = datetime.datetime(2013, 7, 1)
    
articles = get_articles_from_filepath("Thesis-2019/")

In [8]:
docs = []
for article in articles:
    sentences = tokenize.sent_tokenize(article.text)
    sentences = [gensim.utils.simple_preprocess(s) for s in sentences]
    docs += sentences

model = gensim.models.Word2Vec(
    docs,
    size=150,
    window=10,
    min_count=2,
    workers=10)

model.train(docs, total_examples=len(docs), epochs=10)

(1196499, 1591400)

In [10]:
model.wv.most_similar(positive="obama",topn=20)

[('romney', 0.9185003638267517),
 ('karzai', 0.887995183467865),
 ('bush', 0.83978670835495),
 ('mitt', 0.8270744681358337),
 ('sucked', 0.8187577724456787),
 ('outrageous', 0.8186334371566772),
 ('venizelos', 0.8179527521133423),
 ('signaled', 0.8080883026123047),
 ('party', 0.8036176562309265),
 ('verrilli', 0.7991970181465149),
 ('clinton', 0.7894609570503235),
 ('republican', 0.7821438908576965),
 ('messina', 0.7807579636573792),
 ('campaign', 0.7807223796844482),
 ('relentless', 0.7764478325843811),
 ('barden', 0.7763988971710205),
 ('abbas', 0.775560736656189),
 ('gerald', 0.774519681930542),
 ('myanmar', 0.774312436580658),
 ('term', 0.7719337940216064)]

In [12]:
model.wv.most_similar(positive="romney",topn=20)

[('obama', 0.9185003638267517),
 ('mitt', 0.9115840196609497),
 ('mr', 0.8750324845314026),
 ('sucked', 0.8718211650848389),
 ('presidency', 0.8685856461524963),
 ('bush', 0.8644351363182068),
 ('karzai', 0.8534864783287048),
 ('underscore', 0.8522833585739136),
 ('venizelos', 0.8520539402961731),
 ('clinton', 0.8518027067184448),
 ('campaign', 0.8389893770217896),
 ('shalit', 0.8386415839195251),
 ('verrilli', 0.8340803980827332),
 ('probability', 0.8307307362556458),
 ('existed', 0.8305507898330688),
 ('barden', 0.8277979493141174),
 ('messina', 0.8234184384346008),
 ('govern', 0.819625198841095),
 ('clement', 0.8174552321434021),
 ('vote', 0.8162410259246826)]

In [20]:
model.wv.most_similar(positive="democrats",topn=50)

[('deciders', 0.9504735469818115),
 ('gallup', 0.9487624168395996),
 ('polls', 0.9483890533447266),
 ('pathway', 0.9469236731529236),
 ('electorate', 0.946081817150116),
 ('pacs', 0.9433695077896118),
 ('vigorous', 0.9410778284072876),
 ('younger', 0.940981388092041),
 ('voters', 0.9366654753684998),
 ('houses', 0.9321030378341675),
 ('slightly', 0.9306408166885376),
 ('populists', 0.9294018149375916),
 ('fewer', 0.9267346858978271),
 ('reach', 0.9257991909980774),
 ('vitriol', 0.9257526993751526),
 ('clearer', 0.924370527267456),
 ('republicans', 0.9223094582557678),
 ('undecided', 0.9216620326042175),
 ('latino', 0.9211448431015015),
 ('bullish', 0.9207743406295776),
 ('choosing', 0.9191440343856812),
 ('points', 0.9188754558563232),
 ('certainly', 0.9181658625602722),
 ('away', 0.9180541634559631),
 ('politically', 0.9175894260406494),
 ('repair', 0.9169368743896484),
 ('independents', 0.9162728190422058),
 ('demanding', 0.9147025346755981),
 ('votes', 0.914176344871521),
 ('disagre

In [21]:
model.wv.most_similar(positive="republicans",topn=50)

[('probability', 0.9417006373405457),
 ('democrats', 0.9223093390464783),
 ('vote', 0.9072288870811462),
 ('even', 0.902953565120697),
 ('leaning', 0.8940107822418213),
 ('wed', 0.8919506669044495),
 ('persuasive', 0.8892962336540222),
 ('voters', 0.8805223703384399),
 ('far', 0.8795624375343323),
 ('disagree', 0.8720409274101257),
 ('polls', 0.8718546628952026),
 ('win', 0.8590664267539978),
 ('estrada', 0.8581550717353821),
 ('choosing', 0.8504106402397156),
 ('negotiate', 0.8498350977897644),
 ('reach', 0.8483158946037292),
 ('signaled', 0.8472540378570557),
 ('gallup', 0.8452020883560181),
 ('frustrated', 0.8431543111801147),
 ('nowhere', 0.8418095707893372),
 ('childish', 0.8413732647895813),
 ('run', 0.8369129300117493),
 ('politically', 0.836152195930481),
 ('clear', 0.8360646367073059),
 ('govern', 0.8357300758361816),
 ('still', 0.8354736566543579),
 ('presidency', 0.832348108291626),
 ('repair', 0.8321173191070557),
 ('say', 0.8303933143615723),
 ('electorate', 0.827650070190

In [23]:
model.wv.most_similar(positive="liberals",topn=20)

[('conservatives', 0.9893098473548889),
 ('slower', 0.9802690148353577),
 ('peers', 0.9750739336013794),
 ('alternatives', 0.9726169109344482),
 ('divide', 0.9720026850700378),
 ('ideas', 0.9679479002952576),
 ('assimilation', 0.9660034775733948),
 ('already', 0.9657910466194153),
 ('populists', 0.9656911492347717),
 ('looted', 0.9656208753585815),
 ('whom', 0.9648095369338989),
 ('hurt', 0.963603675365448),
 ('disappeared', 0.9625953435897827),
 ('away', 0.9624364972114563),
 ('suspect', 0.962234616279602),
 ('vitriol', 0.9591072797775269),
 ('judging', 0.9586976766586304),
 ('definitely', 0.9586789011955261),
 ('persuade', 0.9582372903823853),
 ('topic', 0.9582177996635437)]

In [26]:
model.wv.most_similar(positive="men",topn=20)

[('young', 0.9602299928665161),
 ('among', 0.9573314189910889),
 ('adopters', 0.9525433778762817),
 ('fastest', 0.9495570063591003),
 ('hispanics', 0.9479889869689941),
 ('african', 0.9436430931091309),
 ('asian', 0.9426509141921997),
 ('percentage', 0.9414992332458496),
 ('compared', 0.9388124942779541),
 ('invested', 0.9370081424713135),
 ('usage', 0.9332296848297119),
 ('majority', 0.9313706159591675),
 ('cases', 0.9293162822723389),
 ('overweight', 0.9284192323684692),
 ('population', 0.9270365834236145),
 ('thousands', 0.9249094128608704),
 ('diagnoses', 0.9247716069221497),
 ('resonates', 0.9236060380935669),
 ('illicit', 0.9211124777793884),
 ('incapable', 0.9196344614028931)]

In [37]:
model.wv.most_similar(positive="white",topn=20)

[('supporters', 0.9827489256858826),
 ('whiter', 0.9759966135025024),
 ('construction', 0.9670623540878296),
 ('moms', 0.9666427373886108),
 ('tied', 0.966031551361084),
 ('photos', 0.9646417498588562),
 ('devised', 0.9640914797782898),
 ('juvenile', 0.9606707692146301),
 ('narrow', 0.958895206451416),
 ('afghanistan', 0.9559107422828674),
 ('assigned', 0.9558216333389282),
 ('premises', 0.9557905197143555),
 ('jesse', 0.9557439088821411),
 ('antonio', 0.9552028179168701),
 ('castro', 0.9544353485107422),
 ('alienating', 0.954036295413971),
 ('franklin', 0.9537797570228577),
 ('featuring', 0.9535329937934875),
 ('roosevelt', 0.9532613754272461),
 ('foreign', 0.953176736831665)]