## Creating word vectors to investigate relationships between words and check for potential bias

In [2]:
from __future__ import print_function, division
import pandas as pd 
import numpy as np
import pickle
from matplotlib import pyplot as plt
%matplotlib inline
# gensim
from gensim import corpora, models, similarities, matutils
# sklearn
from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
import sklearn.metrics.pairwise as smp
from nltk.tokenize import sent_tokenize
from sklearn.decomposition import NMF

# Route log records at INFO level and above to the console with a timestamp,
# so gensim's training progress messages are visible in the notebook.
import logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Load the pickled dataset and keep only its 'contents' entry.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only open 'doj_c.pkl' if it comes from a trusted source.
with open('doj_c.pkl', 'rb') as f:
    doj = pickle.load(f)

# presumably one text string per document (the next cell lower-cases each item) - confirm
data = doj['contents']

In [3]:
import string
import unidecode

# Lower-case every document and join them into a single string.
# A newline separator is inserted between documents: without it the last
# token of one document is fused with the first token of the next one,
# producing artificial vocabulary entries. Using str.join also avoids the
# repeated reallocation caused by `+=` on an ever-growing string.
corpus = '\n'.join(doc.lower() for doc in data)

# Transliterate the text with unidecode before tokenizing.
corpus = unidecode.unidecode(corpus)

In [4]:
# Split the whole corpus string into a list of sentences.
sentences = sent_tokenize(text=corpus)

In [5]:
from nltk.tokenize import word_tokenize

# One token list per sentence; this list-of-lists is the training input
# handed to Word2Vec in the next cell.
phrases = [word_tokenize(sentence) for sentence in sentences]

In [6]:
# Train word vectors on the tokenized sentences.
# NOTE(review): `size` was renamed `vector_size` in gensim 4.0 - adjust when upgrading.
doj_vecs = models.word2vec.Word2Vec(
    sentences=phrases,
    size=100,      # dimensionality of each word vector
    window=5,      # context window, in tokens
    min_count=5,   # ignore words occurring fewer than 5 times
    workers=4,     # training threads
)

2018-11-16 21:27:01,281 : INFO : collecting all words and their counts
2018-11-16 21:27:01,282 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-11-16 21:27:01,356 : INFO : PROGRESS: at sentence #10000, processed 339800 words, keeping 15231 word types
2018-11-16 21:27:01,436 : INFO : PROGRESS: at sentence #20000, processed 695328 words, keeping 23550 word types
2018-11-16 21:27:01,516 : INFO : PROGRESS: at sentence #30000, processed 1041117 words, keeping 28567 word types
2018-11-16 21:27:01,596 : INFO : PROGRESS: at sentence #40000, processed 1388818 words, keeping 33211 word types
2018-11-16 21:27:01,676 : INFO : PROGRESS: at sentence #50000, processed 1736964 words, keeping 37514 word types
2018-11-16 21:27:01,762 : INFO : PROGRESS: at sentence #60000, processed 2082734 words, keeping 41974 word types
2018-11-16 21:27:01,843 : INFO : PROGRESS: at sentence #70000, processed 2430141 words, keeping 44988 word types
2018-11-16 21:27:01,925 : INFO : PROGRESS

### Testing words related to Muslims and Islam

In [7]:
# Query the trained vectors via .wv; most_similar on the model object itself is deprecated.
doj_vecs.wv.most_similar('islam')

  """Entry point for launching an IPython kernel.
2018-11-16 21:27:29,995 : INFO : precomputing L2-norms of word weight vectors


[('sword', 0.6522782444953918),
 ('crusaders', 0.6412355303764343),
 ('muslims', 0.6121676564216614),
 ('infidels', 0.5829067826271057),
 ('hatred', 0.5803992748260498),
 ('backdrop', 0.5726880431175232),
 ('prejudice', 0.5712781548500061),
 ('retaliate', 0.5708475112915039),
 ('intolerance', 0.5645120739936829),
 ('enemies', 0.5580153465270996)]

In [8]:
# Query the trained vectors via .wv; most_similar on the model object itself is deprecated.
doj_vecs.wv.most_similar('muslim')

  """Entry point for launching an IPython kernel.


[('somali', 0.6999996304512024),
 ('jewish', 0.6500884294509888),
 ('latino', 0.6450031995773315),
 ('native', 0.639795184135437),
 ('sikh', 0.6370640993118286),
 ('african', 0.6262637376785278),
 ('amish', 0.6157890558242798),
 ('pregnant', 0.6096385717391968),
 ('gay', 0.607383668422699),
 ('young', 0.5889614820480347)]

In [9]:
# Query the trained vectors via .wv; most_similar on the model object itself is deprecated.
doj_vecs.wv.most_similar('mosque')

  """Entry point for launching an IPython kernel.


[('motel', 0.7222611308097839),
 ('entrance', 0.6853206157684326),
 ('church', 0.6730661392211914),
 ('shotgun', 0.6614291667938232),
 ('bombs', 0.6608598232269287),
 ('restaurant', 0.6592603325843811),
 ('synagogue', 0.6552081108093262),
 ('shoot', 0.6549220681190491),
 ('gunpoint', 0.6405906081199646),
 ('location', 0.6367930173873901)]

In [10]:
# Query the trained vectors via .wv; most_similar on the model object itself is deprecated.
doj_vecs.wv.most_similar('allah')

  """Entry point for launching an IPython kernel.


[('me', 0.7312315106391907),
 ('you', 0.6976401209831238),
 ('...', 0.6827571988105774),
 ('something', 0.6817971467971802),
 ('die', 0.6699077486991882),
 ('let', 0.6330971717834473),
 ('[', 0.6289156675338745),
 ("'m", 0.6286628246307373),
 ('see', 0.6107150912284851),
 ('speak', 0.5957574844360352)]

### Comparing with words for other religions that exist in the corpus

In [16]:
# Query the trained vectors via .wv; most_similar on the model object itself is deprecated.
doj_vecs.wv.most_similar('christ')

  """Entry point for launching an IPython kernel.


[('macedonia', 0.8161101341247559),
 ('god', 0.6900001168251038),
 ('church', 0.6628494262695312),
 ('walnut', 0.6463581919670105),
 ('boulder', 0.6450909972190857),
 ('glade', 0.6440331935882568),
 ('conneaut', 0.6425847411155701),
 ('reverend', 0.6404812335968018),
 ('tuscaloosa', 0.6396113634109497),
 ('highland', 0.6354373693466187)]

In [11]:
# Query the trained vectors via .wv; most_similar on the model object itself is deprecated.
doj_vecs.wv.most_similar('church')

  """Entry point for launching an IPython kernel.


[('congregation', 0.6941131353378296),
 ('synagogue', 0.6867195963859558),
 ('mosque', 0.6730661392211914),
 ('christ', 0.6628494262695312),
 ('motel', 0.6553056240081787),
 ('neighborhood', 0.6517730951309204),
 ('park', 0.6368703246116638),
 ('god', 0.623152494430542),
 ('restaurant', 0.6084576845169067),
 ('macedonia', 0.6068291068077087)]

In [20]:
# Query the trained vectors via .wv; most_similar on the model object itself is deprecated.
doj_vecs.wv.most_similar('christian')

  """Entry point for launching an IPython kernel.


[('elizabeth', 0.7822296023368835),
 ('joshua', 0.7735604047775269),
 ('pamela', 0.7709989547729492),
 ('michelle', 0.7707457542419434),
 ('kyle', 0.7706141471862793),
 ('van', 0.7702932953834534),
 ('estrada', 0.7655531167984009),
 ('aaron', 0.7650339603424072),
 ('jeremy', 0.7648410201072693),
 ('nicholas', 0.7627168893814087)]

In [19]:
# Query the trained vectors via .wv; most_similar on the model object itself is deprecated.
doj_vecs.wv.most_similar('jewish')

  """Entry point for launching an IPython kernel.


[('muslim', 0.6500884890556335),
 ('latino', 0.6453544497489929),
 ('male', 0.6229506134986877),
 ('lived', 0.6065863370895386),
 ('violently', 0.5581741333007812),
 ('african-americans', 0.5568193197250366),
 ('gay', 0.5551722049713135),
 ('girls', 0.5549834966659546),
 ('impoverished', 0.554520308971405),
 ('pregnant', 0.5518898963928223)]

In [12]:
# Query the trained vectors via .wv; most_similar on the model object itself is deprecated.
doj_vecs.wv.most_similar('synagogue')

  """Entry point for launching an IPython kernel.


[('restaurant', 0.708699643611908),
 ('church', 0.6867195963859558),
 ('congregation', 0.6682054996490479),
 ('backyard', 0.6602610945701599),
 ('mosque', 0.6552080512046814),
 ('motel', 0.6551803946495056),
 ('crowded', 0.631760835647583),
 ('nightclub', 0.6286150813102722),
 ('trailer', 0.6282986402511597),
 ('machete', 0.6265313625335693)]

### Looking at names of presidents

In [13]:
# Query the trained vectors via .wv; most_similar on the model object itself is deprecated.
doj_vecs.wv.most_similar('trump')

  """Entry point for launching an IPython kernel.


[('obama', 0.7955515384674072),
 ('barack', 0.6396726369857788),
 ('bush', 0.6285247802734375),
 ('biden', 0.6207939982414246),
 ('keeper', 0.5928440093994141),
 ('chun', 0.5360488295555115),
 ('president', 0.5348680019378662),
 ('hrc', 0.5300405621528625),
 ('committee', 0.5146782398223877),
 ('formally', 0.5133247375488281)]

In [14]:
# Query the trained vectors via .wv; most_similar on the model object itself is deprecated.
doj_vecs.wv.most_similar('obama')

  """Entry point for launching an IPython kernel.


[('barack', 0.8377145528793335),
 ('trump', 0.795551598072052),
 ('president', 0.6371892690658569),
 ('keeper', 0.5776113867759705),
 ('committee', 0.54123455286026),
 ('championed', 0.5251625776290894),
 ('biden', 0.5005807876586914),
 ('declassification', 0.49753686785697937),
 ('clemency', 0.49681779742240906),
 ('interagency', 0.49499958753585815)]

In [15]:
# Query the trained vectors via .wv; most_similar on the model object itself is deprecated.
doj_vecs.wv.most_similar('bush')

  """Entry point for launching an IPython kernel.


[('biden', 0.6811291575431824),
 ('clinton', 0.6657521724700928),
 ('trump', 0.6285247802734375),
 ('leahy', 0.622563898563385),
 ('wiley', 0.6110332012176514),
 ('cooke', 0.6040787100791931),
 ('roosevelt', 0.6008068323135376),
 ('ceo', 0.5986308455467224),
 ('hillary', 0.5924505591392517),
 ('janet', 0.5904005169868469)]

### Looking at different races

In [21]:
# Query the trained vectors via .wv; most_similar on the model object itself is deprecated.
doj_vecs.wv.most_similar('black')

  """Entry point for launching an IPython kernel.


[('horns', 0.7097222208976746),
 ('elephant', 0.6883867979049683),
 ('ivory', 0.6882531642913818),
 ('rhinoceros', 0.6848993301391602),
 ('coral', 0.6826488971710205),
 ('rhino', 0.6808892488479614),
 ('powder', 0.6472471356391907),
 ('deer', 0.6288437843322754),
 ('synthetic', 0.6279706954956055),
 ('turtle', 0.6259269118309021)]

In [22]:
# Query the trained vectors via .wv; most_similar on the model object itself is deprecated.
doj_vecs.wv.most_similar('african')

  """Entry point for launching an IPython kernel.


[('native', 0.7752351760864258),
 ('samoa', 0.7111507058143616),
 ('indians', 0.7051369547843933),
 ('natives', 0.6471330523490906),
 ('latino', 0.6406341791152954),
 ('muslim', 0.6262638568878174),
 ('eels', 0.6235378980636597),
 ('dream', 0.6160451769828796),
 ('african-american', 0.5895832777023315),
 ('asian', 0.5815019011497498)]

In [23]:
# Query the trained vectors via .wv; most_similar on the model object itself is deprecated.
doj_vecs.wv.most_similar('african-american')

  """Entry point for launching an IPython kernel.


[('hispanic', 0.7146862149238586),
 ('interracial', 0.7027782797813416),
 ('female', 0.674068808555603),
 ('woman', 0.6406142711639404),
 ('predominantly', 0.6387567520141602),
 ('arrestee', 0.6347012519836426),
 ('male', 0.6327521204948425),
 ('latino', 0.6239545345306396),
 ('tenants', 0.6197518706321716),
 ('pregnant', 0.605392575263977)]

In [24]:
# Query the trained vectors via .wv; most_similar on the model object itself is deprecated.
doj_vecs.wv.most_similar('hispanic')

  """Entry point for launching an IPython kernel.


[('asian/pacific', 0.7177717685699463),
 ('african-american', 0.7146862745285034),
 ('borrowers', 0.6560112833976746),
 ('tenants', 0.6470454931259155),
 ('hispanics', 0.6349380016326904),
 ('female', 0.6300790905952454),
 ('islander', 0.6165289878845215),
 ('elderly', 0.5847363471984863),
 ('latino', 0.5710458755493164),
 ('pregnant', 0.558635950088501)]

In [25]:
# Query the trained vectors via .wv; most_similar on the model object itself is deprecated.
doj_vecs.wv.most_similar('latino')

  """Entry point for launching an IPython kernel.


[('amish', 0.6721432209014893),
 ('somali', 0.6702308058738708),
 ('predominantly', 0.6497206687927246),
 ('pregnant', 0.6465837955474854),
 ('bisexual', 0.6457959413528442),
 ('jewish', 0.6453544497489929),
 ('muslim', 0.6450033187866211),
 ('african', 0.6406341195106506),
 ('gay', 0.6405932307243347),
 ('african-american', 0.6239545345306396)]

In [26]:
# Query the trained vectors via .wv; most_similar on the model object itself is deprecated.
doj_vecs.wv.most_similar('mexican')

  """Entry point for launching an IPython kernel.


[('colombian', 0.7612261772155762),
 ('philippine', 0.665427565574646),
 ('salvadoran', 0.6654013395309448),
 ('colombia', 0.6227753162384033),
 ('romanian', 0.6035144329071045),
 ('canadian', 0.5905613899230957),
 ('brazilian', 0.5902118682861328),
 ('japanese', 0.5792630910873413),
 ('ukrainian', 0.5683580636978149),
 ('sinaloa', 0.5681815147399902)]

In [27]:
# Query the trained vectors via .wv; most_similar on the model object itself is deprecated.
doj_vecs.wv.most_similar('asian')

  """Entry point for launching an IPython kernel.


[('africa', 0.6763811111450195),
 ('east', 0.6450565457344055),
 ('descent', 0.6407113075256348),
 ('adjacent', 0.6246041655540466),
 ('sargasso', 0.6220220923423767),
 ('coral', 0.5958296060562134),
 ('fork', 0.5894865989685059),
 ('slope', 0.5855900049209595),
 ('african', 0.581501841545105),
 ('antiques', 0.5763877630233765)]

In [30]:
# Query the trained vectors via .wv; most_similar on the model object itself is deprecated.
doj_vecs.wv.most_similar('arab')

  """Entry point for launching an IPython kernel.


[('emirates', 0.937386155128479),
 ('kingdom-based', 0.8184065818786621),
 ('kingdom', 0.8140121698379517),
 ('states-based', 0.786821186542511),
 ('states.the', 0.7718714475631714),
 ('therapeutics', 0.7572633624076843),
 ('airlines', 0.7431352138519287),
 ('ama', 0.7358680367469788),
 ('newark/columbia', 0.7345608472824097),
 ('delta', 0.7266659736633301)]

In [32]:
# Query the trained vectors via .wv; most_similar on the model object itself is deprecated.
doj_vecs.wv.most_similar('cuban')

  """Entry point for launching an IPython kernel.


[('russian', 0.695533812046051),
 ('vietnamese', 0.6044529676437378),
 ('chinese', 0.5887904763221741),
 ('syrian', 0.5766353607177734),
 ('venezuelan', 0.5460034608840942),
 ('foreign', 0.5426225662231445),
 ('prc', 0.5318738222122192),
 ('cuba', 0.5309222936630249),
 ('haitian', 0.527751624584198),
 ('clandestine', 0.5158886909484863)]

In [36]:
# Query the trained vectors via .wv; most_similar on the model object itself is deprecated.
doj_vecs.wv.most_similar('white')

  """Entry point for launching an IPython kernel.


[('linden', 0.5440186262130737),
 ('vance', 0.5274033546447754),
 ('subcommittee', 0.5052212476730347),
 ('male', 0.4933227300643921),
 ('predominantly', 0.49181997776031494),
 ('house', 0.48627427220344543),
 ('supremacy/separatism', 0.4849385917186737),
 ('supremacist', 0.48281997442245483),
 ('african-american', 0.4760252833366394),
 ('40-year-old', 0.4663178324699402)]

### Looking at political parties

In [38]:
# Query the trained vectors via .wv; most_similar on the model object itself is deprecated.
doj_vecs.wv.most_similar('republican')

  """Entry point for launching an IPython kernel.


[('mcclain', 0.4784807860851288),
 ('hoc', 0.4412802755832672),
 ('azerbaijan', 0.4404869079589844),
 ('ibori', 0.43897145986557007),
 ('soviet', 0.43720245361328125),
 ('panamanian', 0.4346635639667511),
 ('186,600', 0.43110916018486023),
 ('al-assad', 0.43054527044296265),
 ('governor', 0.42465198040008545),
 ('guinean', 0.4147576093673706)]

In [39]:
# Query the trained vectors via .wv; most_similar on the model object itself is deprecated.
# 'democrat' does not exist in the corpus, so 'democratic' is queried instead.
doj_vecs.wv.most_similar('democratic')

  """Entry point for launching an IPython kernel.


[('electoral', 0.5676555037498474),
 ('political', 0.548275351524353),
 ('guinean', 0.5172073841094971),
 ('sovereignty', 0.5091806054115295),
 ('furthering', 0.48730793595314026),
 ('competitiveness', 0.4866899251937866),
 ('government-to-government', 0.4819740653038025),
 ('soccer', 0.4811435341835022),
 ('bidding', 0.4773799777030945),
 ('weakens', 0.4754981994628906)]

In [40]:
# Query the trained vectors via .wv; most_similar on the model object itself is deprecated.
doj_vecs.wv.most_similar('partisan')

  """Entry point for launching an IPython kernel.


[('couch', 0.7727024555206299),
 ('shave', 0.738537609577179),
 ('denomination', 0.7216039299964905),
 ('noka', 0.7139601111412048),
 ('bcfl', 0.7085400819778442),
 ('dishwasher', 0.7064012885093689),
 ('retainers', 0.7049406170845032),
 ('irrelevant', 0.7001870274543762),
 ('exhibition', 0.6992817521095276),
 ('provocation', 0.6988611221313477)]

In [41]:
# Query the trained vectors via .wv; most_similar on the model object itself is deprecated.
doj_vecs.wv.most_similar('party')

  """Entry point for launching an IPython kernel.


[('bidder', 0.6397594213485718),
 ('consultant', 0.6176154613494873),
 ('buyer', 0.5998591184616089),
 ('vendor', 0.5894853472709656),
 ('winning', 0.5756090879440308),
 ('person', 0.5597124099731445),
 ('parties', 0.5400657653808594),
 ('servicemember', 0.5336257219314575),
 ('entity', 0.5169912576675415),
 ('seller', 0.5057758092880249)]