In [1]:
from pyspark.sql import SparkSession, SQLContext

import sparknlp
# required imports.  It's probably better to import specific classes rather than *
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from sparknlp.pretrained import PretrainedPipeline

spark = (SparkSession.builder
         .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.4.0")
         .config("spark.driver.memory","16g") 
         .getOrCreate())

sc = spark.sparkContext
print(type(sc))
sqlContext = SQLContext(sc)
print(type(sqlContext))

print(sparknlp.version())

spark

<class 'pyspark.context.SparkContext'>
<class 'pyspark.sql.context.SQLContext'>
2.4.0


In [None]:
VER = 'addvec'

In [2]:
# load wrangled data
raw_sdf = spark.read.json('./sample4.json')
raw_sdf.printSchema()

root
 |-- authors: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- date_download: string (nullable = true)
 |-- date_modify: string (nullable = true)
 |-- date_publish: string (nullable = true)
 |-- description: string (nullable = true)
 |-- language: string (nullable = true)
 |-- language_guess: string (nullable = true)
 |-- month: long (nullable = true)
 |-- published: string (nullable = true)
 |-- source_domain: string (nullable = true)
 |-- text: string (nullable = true)
 |-- text_or_desc: string (nullable = true)
 |-- title: string (nullable = true)
 |-- url: string (nullable = true)
 |-- year: long (nullable = true)



In [3]:
raw_sdf.take(2)

[Row(authors=[], date_download='2019-11-27 22:51:37', date_modify='2019-11-27 22:51:37', date_publish='2015-03-27 17:17:42', description="Federal Reserve Chairman Ben Bernanke will begin holding news conferences four times a year to answer questions about the Federal Reserve's policy decisions. It represents a significant shift in...", language='en', language_guess='en', month=3, published='2015-03-27T17:17:42.000Z', source_domain='foxnews.com', text='Federal Reserve Chairman Ben Bernanke will begin holding news conferences four times a year to answer questions about the Federal Reserve\'s policy decisions. It represents a significant shift in strategy for the central bank, one that will give it a chance to defend actions that in recent months have faced harsh criticism.\nThe decision, which was announced Thursday, comes after the Fed held an unusual videoconference last fall in large part to discuss the need to improve its communications with the public. A Fed committee also had been 

In [4]:
import pyspark.sql.functions as F
from pyspark.sql.functions import when, col

In [5]:
# add id column
processed_sdf = raw_sdf.select("*").withColumn("id", F.monotonically_increasing_id())

In [6]:
import contractions
contractions_udf = F.udf(contractions.fix)
# simple fix to contractions
processed_sdf = processed_sdf.withColumn('clean_text', contractions_udf('text_or_desc'))

In [7]:
processed_sdf.select('clean_text').take(2)

[Row(clean_text='Federal Reserve Chairman Ben Bernanke will begin holding news conferences four times a year to answer questions about the Federal Reserve\'s policy decisions. It represents a significant shift in strategy for the central bank, one that will give it a chance to defend actions that in recent months have faced harsh criticism.\nThe decision, which was announced Thursday, comes after the Fed held an unusual videoconference last fall in large part to discuss the need to improve its communications with the public. A Fed committee also had been studying whether to begin holding periodic news conferences.\nBernanke\'s first news conference will take place after the Fed\'s April 27 meeting. That will augment the current communications strategy: a brief statement released after each of the Fed\'s eight policy-making meetings with no officials available to answer questions.\nit is a notable change for the Fed. Its chairmen rarely take questions from reporters, and when they do, t

In [8]:
# remove possessives
processed_sdf = processed_sdf.withColumn('clean_text', F.regexp_replace('clean_text', "'s", ""))

In [9]:
processed_sdf.select("clean_text").take(2)

[Row(clean_text='Federal Reserve Chairman Ben Bernanke will begin holding news conferences four times a year to answer questions about the Federal Reserve policy decisions. It represents a significant shift in strategy for the central bank, one that will give it a chance to defend actions that in recent months have faced harsh criticism.\nThe decision, which was announced Thursday, comes after the Fed held an unusual videoconference last fall in large part to discuss the need to improve its communications with the public. A Fed committee also had been studying whether to begin holding periodic news conferences.\nBernanke first news conference will take place after the Fed April 27 meeting. That will augment the current communications strategy: a brief statement released after each of the Fed eight policy-making meetings with no officials available to answer questions.\nit is a notable change for the Fed. Its chairmen rarely take questions from reporters, and when they do, they are ofte

In [10]:
# remove non text 
processed_sdf = processed_sdf.withColumn('clean_text', F.regexp_replace('clean_text',"[^A-Za-z\. \n]", ""))

In [11]:
# standardize US
processed_sdf = processed_sdf.withColumn('clean_text', F.regexp_replace('clean_text',"(US|U.S.|USA)", "United States"))

In [12]:
processed_sdf.select("clean_text").take(2)

[Row(clean_text='Federal Reserve Chairman Ben Bernanke will begin holding news conferences four times a year to answer questions about the Federal Reserve policy decisions. It represents a significant shift in strategy for the central bank one that will give it a chance to defend actions that in recent months have faced harsh criticism.\nThe decision which was announced Thursday comes after the Fed held an unusual videoconference last fall in large part to discuss the need to improve its communications with the public. A Fed committee also had been studying whether to begin holding periodic news conferences.\nBernanke first news conference will take place after the Fed April  meeting. That will augment the current communications strategy a brief statement released after each of the Fed eight policymaking meetings with no officials available to answer questions.\nit is a notable change for the Fed. Its chairmen rarely take questions from reporters and when they do they are often guarded

In [13]:
documentAssembler = DocumentAssembler().setInputCol("clean_text").setOutputCol("document")

In [14]:
sentenceDetector = SentenceDetector().setInputCols(["document"]).setOutputCol("sentence")

In [15]:
tokenizer = (Tokenizer() 
    .setInputCols(["sentence"]) 
    .setOutputCol("token") 
            )

In [16]:
lemma = (LemmatizerModel.pretrained('lemma_antbnc')
        .setInputCols(['token'])
         .setOutputCol('lemma')
        )

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


In [17]:
stop_words_cleaner = (StopWordsCleaner()
                      .setInputCols(['lemma'])
                      .setOutputCol('stopless')
                     )
stop_words = stop_words_cleaner.getStopWords()
stop_words.remove('not')
stop_words.remove('cannot')
stop_words.remove('against')
stop_words.remove('nor')
stop_words.remove('no')
stop_words = [word.replace("'", "") for word in stop_words]
stop_words_cleaner.setStopWords(stop_words)
stop_words_cleaner.getStopWords()

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 'her',
 'hers',
 'herself',
 'it',
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each',
 'few',
 'more',
 'most',
 'other',
 'some',
 'such',
 'only',
 'own',
 'same',
 's

In [18]:
normalizer = (Normalizer() \
    .setInputCols(["stopless"]) \
    .setOutputCol("normalized")
             )

In [19]:
word_embed = (WordEmbeddingsModel.pretrained()
             .setInputCols(['sentence', 'stopless'])
             .setOutputCol('embedding')
             )

glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


In [20]:
ner_dl = (NerDLModel().pretrained()
          .setInputCols(['document', 'stopless', 'embedding'])
          .setOutputCol('ner')
         )

ner_dl download started this may take some time.
Approximate size to download 13.5 MB
[OK!]


In [21]:
ner_conv = (NerConverter()
            .setInputCols(["document", "normalized", "ner"])
            .setOutputCol('ner_converted')
           )
print(ner_conv.explainParams())

inputCols: previous annotations columns, if renamed (current: ['document', 'normalized', 'ner'])
lazyAnnotator: Whether this AnnotatorModel acts as lazy in RecursivePipelines (default: False)
outputCol: output annotation column. can be left default. (current: ner_converted)
whiteList: If defined, list of entities to process. The rest will be ignored. Do not include IOB prefix on labels (undefined)


In [22]:
sentiment_model = (ViveknSentimentModel.pretrained()
                   .setInputCols(['document', 'token'])
                   .setOutputCol('sentiment')
                  )

sentiment_vivekn download started this may take some time.
Approximate size to download 873.6 KB
[OK!]


In [23]:
finisher = Finisher().setInputCols(["sentence", "token", "lemma", "stopless", "embedding", "normalized", "ner", "ner_converted", 'sentiment']).setCleanAnnotations(True)

In [24]:
pipeline = (Pipeline()
    .setStages([
        documentAssembler,
        sentenceDetector,
        tokenizer,
        lemma,
        stop_words_cleaner,
        normalizer,
        word_embed,
        ner_dl,
        ner_conv,
        sentiment_model,
        finisher
    ]))

In [25]:
pipeline_model = pipeline.fit(processed_sdf)

In [26]:
processed_sdf = pipeline_model.transform(processed_sdf)

In [27]:
processed_sdf.printSchema()

root
 |-- authors: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- date_download: string (nullable = true)
 |-- date_modify: string (nullable = true)
 |-- date_publish: string (nullable = true)
 |-- description: string (nullable = true)
 |-- language: string (nullable = true)
 |-- language_guess: string (nullable = true)
 |-- month: long (nullable = true)
 |-- published: string (nullable = true)
 |-- source_domain: string (nullable = true)
 |-- text: string (nullable = true)
 |-- text_or_desc: string (nullable = true)
 |-- title: string (nullable = true)
 |-- url: string (nullable = true)
 |-- year: long (nullable = true)
 |-- id: long (nullable = false)
 |-- clean_text: string (nullable = true)
 |-- finished_sentence: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- finished_token: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- finished_lemma: array (nullable = true)
 |    |-- element: string (conta

In [28]:
processed_sdf.select('clean_text', 'finished_sentiment').take(50)

[Row(clean_text='Federal Reserve Chairman Ben Bernanke will begin holding news conferences four times a year to answer questions about the Federal Reserve policy decisions. It represents a significant shift in strategy for the central bank one that will give it a chance to defend actions that in recent months have faced harsh criticism.\nThe decision which was announced Thursday comes after the Fed held an unusual videoconference last fall in large part to discuss the need to improve its communications with the public. A Fed committee also had been studying whether to begin holding periodic news conferences.\nBernanke first news conference will take place after the Fed April  meeting. That will augment the current communications strategy a brief statement released after each of the Fed eight policymaking meetings with no officials available to answer questions.\nit is a notable change for the Fed. Its chairmen rarely take questions from reporters and when they do they are often guarded

In [29]:
processed_sdf = processed_sdf.withColumn('sentiment',  
          when(col('finished_sentiment')[0] == 'positive', 1).when(col('finished_sentiment')[0] == 'negative', -1).otherwise(0)
                )

In [30]:
processed_sdf.select('finished_sentiment', 'sentiment').take(5)

[Row(finished_sentiment=['positive'], sentiment=1),
 Row(finished_sentiment=['negative'], sentiment=-1),
 Row(finished_sentiment=['positive'], sentiment=1),
 Row(finished_sentiment=['positive'], sentiment=1),
 Row(finished_sentiment=['negative'], sentiment=-1)]

In [1]:
processed_sdf.take(3)

NameError: name 'processed_sdf' is not defined

In [31]:
from pyspark.ml.feature import CountVectorizer, IDF

In [32]:
cv = CountVectorizer(inputCol="finished_ner_converted", 
                     outputCol="ner_vectors",
                     maxDF=0.8,
                     minDF=3
                    )

In [33]:
print(cv.explainParams())

binary: Binary toggle to control the output vector values. If True, all nonzero counts (after minTF filter applied) are set to 1. This is useful for discrete probabilistic models that model binary events rather than integer counts. Default False (default: False)
inputCol: input column name. (current: finished_ner_converted)
maxDF: Specifies the maximum number of different documents a term could appear in to be included in the vocabulary. A term that appears more than the threshold will be ignored. If this is an integer >= 1, this specifies the maximum number of documents the term could appear in; if this is a double in [0,1), then this specifies the maximum fraction of documents the term could appear in. Default (2^63) - 1 (default: 9.223372036854776e+18, current: 0.8)
minDF: Specifies the minimum number of different documents a term must appear in to be included in the vocabulary. If this is an integer >= 1, this specifies the number of documents the term must appear in; if this is a 

In [34]:
cv.getVocabSize()

262144

In [35]:
cv_model = cv.fit(processed_sdf)

In [36]:
processed_sdf = cv_model.transform(processed_sdf)

In [37]:
processed_sdf.printSchema()

root
 |-- authors: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- date_download: string (nullable = true)
 |-- date_modify: string (nullable = true)
 |-- date_publish: string (nullable = true)
 |-- description: string (nullable = true)
 |-- language: string (nullable = true)
 |-- language_guess: string (nullable = true)
 |-- month: long (nullable = true)
 |-- published: string (nullable = true)
 |-- source_domain: string (nullable = true)
 |-- text: string (nullable = true)
 |-- text_or_desc: string (nullable = true)
 |-- title: string (nullable = true)
 |-- url: string (nullable = true)
 |-- year: long (nullable = true)
 |-- id: long (nullable = false)
 |-- clean_text: string (nullable = true)
 |-- finished_sentence: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- finished_token: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- finished_lemma: array (nullable = true)
 |    |-- element: string (conta

In [38]:
processed_sdf.take(1)

[Row(authors=[], date_download='2019-11-27 22:51:37', date_modify='2019-11-27 22:51:37', date_publish='2015-03-27 17:17:42', description="Federal Reserve Chairman Ben Bernanke will begin holding news conferences four times a year to answer questions about the Federal Reserve's policy decisions. It represents a significant shift in...", language='en', language_guess='en', month=3, published='2015-03-27T17:17:42.000Z', source_domain='foxnews.com', text='Federal Reserve Chairman Ben Bernanke will begin holding news conferences four times a year to answer questions about the Federal Reserve\'s policy decisions. It represents a significant shift in strategy for the central bank, one that will give it a chance to defend actions that in recent months have faced harsh criticism.\nThe decision, which was announced Thursday, comes after the Fed held an unusual videoconference last fall in large part to discuss the need to improve its communications with the public. A Fed committee also had been 

In [39]:
cv_model.vocabulary

['Trump',
 'United States',
 'Obama',
 'American',
 'Democrats',
 'Americans',
 'Congress',
 'Republican',
 'Bush',
 'Republicans',
 'America',
 'Democratic',
 'White House',
 'Russian',
 'Senate',
 'Russia',
 'Iraq',
 'Washington',
 'China',
 'Israel',
 'Clinton',
 'New York',
 'GOP',
 'Chinese',
 'Iran',
 'McCain',
 'Donald Trump',
 'House',
 'Hillary',
 'British',
 'Trumps',
 'Israeli',
 'FBI',
 'Facebook',
 'God',
 'French',
 'Privacy Policy',
 'Supreme Court',
 'Syria',
 'Nations',
 'European',
 'Europe',
 'Texas',
 'California',
 'Take Action',
 'Chip',
 'Nation Travels',
 'Ukraine',
 'Iraqi',
 'EU',
 'Chicago',
 'Houston',
 'Romney',
 'Florida',
 'Google',
 'AP',
 'Barack Obama',
 'Syrian',
 'Afghanistan',
 'New York Times',
 'Jewish',
 'Reuters',
 'Muslim',
 'Moscow',
 'CNN',
 'German',
 'NPR',
 'Edwards',
 'Palestinian',
 'Americas',
 'Japanese',
 'India',
 'Internet',
 'Iowa',
 'UK',
 'Medicare',
 'English',
 'Britain',
 'Western',
 'Putin',
 'Iranian',
 'NFL',
 'Johnson',
 '

In [40]:
cv.getVocabSize()

262144

In [41]:
idf = IDF(inputCol="ner_vectors", outputCol="ner_vectors_idf")

In [42]:
idfModel = idf.fit(processed_sdf)

In [43]:
processed_sdf = idfModel.transform(processed_sdf)

In [44]:
processed_sdf.take(1)

[Row(authors=[], date_download='2019-11-27 22:51:37', date_modify='2019-11-27 22:51:37', date_publish='2015-03-27 17:17:42', description="Federal Reserve Chairman Ben Bernanke will begin holding news conferences four times a year to answer questions about the Federal Reserve's policy decisions. It represents a significant shift in...", language='en', language_guess='en', month=3, published='2015-03-27T17:17:42.000Z', source_domain='foxnews.com', text='Federal Reserve Chairman Ben Bernanke will begin holding news conferences four times a year to answer questions about the Federal Reserve\'s policy decisions. It represents a significant shift in strategy for the central bank, one that will give it a chance to defend actions that in recent months have faced harsh criticism.\nThe decision, which was announced Thursday, comes after the Fed held an unusual videoconference last fall in large part to discuss the need to improve its communications with the public. A Fed committee also had been 

In [45]:
ner_vectors = processed_sdf.select('ner_vectors')
sv_size = ner_vectors.first()[0].size
sv_size

7379

In [46]:
ner_vectors_idf = processed_sdf.select('ner_vectors_idf')

In [47]:
data_size = processed_sdf.count()
data_size

10000

In [48]:
import numpy as np
sum = np.zeros(sv_size)


In [49]:
sum_scale = np.zeros(sv_size)

In [50]:
for row in ner_vectors.take(data_size):
    sum += row[0].toArray()

In [51]:
for row in ner_vectors_idf.take(data_size):
    sum_scale += row[0].toArray()

In [52]:
sum[0:20]

array([2799., 2630., 2552., 2002., 1224., 1045., 1042., 1011., 1001.,
        969.,  962.,  896.,  879.,  848.,  840.,  833.,  766.,  731.,
        719.,  668.])

In [53]:
accumulate = []
for i,j in enumerate(sum):
    accumulate.append((i,j))

In [54]:
accumulate.sort(key=lambda x: -x[1])

In [55]:
accumulate_scale = []
for i,j in enumerate(sum_scale):
    accumulate_scale.append((i,j))
accumulate_scale.sort(key=lambda x: -x[1])

In [56]:
for i in range(len(accumulate)):
    item = accumulate[i]
    print(i+1, "|", cv_model.vocabulary[item[0]], "| ", item[1])


1 | Trump |  2799.0
2 | United States |  2630.0
3 | Obama |  2552.0
4 | American |  2002.0
5 | Democrats |  1224.0
6 | Americans |  1045.0
7 | Congress |  1042.0
8 | Republican |  1011.0
9 | Bush |  1001.0
10 | Republicans |  969.0
11 | America |  962.0
12 | Democratic |  896.0
13 | White House |  879.0
14 | Russian |  848.0
15 | Senate |  840.0
16 | Russia |  833.0
17 | Iraq |  766.0
18 | Washington |  731.0
19 | China |  719.0
20 | Israel |  668.0
21 | Clinton |  617.0
22 | New York |  597.0
23 | GOP |  581.0
24 | Chinese |  558.0
25 | Iran |  544.0
26 | McCain |  538.0
27 | Donald Trump |  527.0
28 | House |  511.0
29 | Hillary |  473.0
30 | British |  444.0
31 | Trumps |  431.0
32 | Israeli |  427.0
33 | FBI |  406.0
34 | Facebook |  403.0
35 | God |  396.0
36 | French |  395.0
37 | Privacy Policy |  395.0
38 | Supreme Court |  390.0
39 | Syria |  384.0
40 | Nations |  376.0
41 | European |  375.0
42 | Europe |  370.0
43 | Texas |  369.0
44 | California |  367.0
45 | Take Action | 

253 | Jets |  89.0
254 | Cuba |  89.0
255 | Arizona |  89.0
256 | Read Full Post |  88.0
257 | Visit |  88.0
258 | New Orleans |  88.0
259 | EBITDA |  88.0
260 | Ohio |  87.0
261 | Kremlin |  87.0
262 | Vladimir Putin |  87.0
263 | Asia |  87.0
264 | Swiss |  87.0
265 | Hong Kong |  87.0
266 | San Diego |  87.0
267 | Kennedy |  87.0
268 | Italy |  86.0
269 | Wisconsin |  86.0
270 | Olympics |  86.0
271 | South Carolina |  85.0
272 | AfricanAmerican |  85.0
273 | Britains |  85.0
274 | Amendment |  85.0
275 | Kavanaugh |  84.0
276 | Irish |  84.0
277 | Jordan |  84.0
278 | Soviet Union |  84.0
279 | Alaska |  83.0
280 | Israelis |  83.0
281 | Harvard |  83.0
282 | Starbucks |  83.0
283 | Slate |  83.0
284 | Latinos |  83.0
285 | Assad |  83.0
286 | Gore |  82.0
287 | Israels |  82.0
288 | Hispanic |  82.0
289 | Patriots |  82.0
290 | Tehran |  82.0
291 | Eagles |  81.0
292 | New Hampshire |  81.0
293 | District |  81.0
294 | Huawei |  81.0
295 | NASA |  81.0
296 | Spain |  81.0
297 | Ra

617 | IAEA |  41.0
618 | Cabinet |  41.0
619 | Epstein |  40.0
620 | Brady |  40.0
621 | African Americans |  40.0
622 | AFP |  40.0
623 | Court |  40.0
624 | Abbas |  40.0
625 | Rasmussen |  40.0
626 | Powell |  40.0
627 | Romneys |  40.0
628 | Bruins |  40.0
629 | Martinez |  40.0
630 | Macron |  39.0
631 | F |  39.0
632 | Brennan |  39.0
633 | Mattis |  39.0
634 | Graham |  39.0
635 | UNITED STATES |  39.0
636 | Evans |  39.0
637 | Magic |  39.0
638 | George W. Bush |  39.0
639 | Morgan |  39.0
640 | Astros |  39.0
641 | Mitch McConnell |  39.0
642 | Maduro |  38.0
643 | FIFA |  38.0
644 | PAC |  38.0
645 | alQaeda |  38.0
646 | Duncan |  38.0
647 | McCarthy |  38.0
648 | Newsday |  38.0
649 | Andrew |  38.0
650 | Redskins |  38.0
651 | Marines |  38.0
652 | Merkel |  38.0
653 | Colombia |  38.0
654 | Morris |  38.0
655 | Sweden |  38.0
656 | Ohio State |  38.0
657 | Lee |  38.0
658 | Puerto Rico |  37.0
659 | West Virginia |  37.0
660 | OConnor |  37.0
661 | Pruitt |  37.0
662 | Ho

983 | Ronaldo |  25.0
984 | Chelsea |  25.0
985 | Tampa Bay |  25.0
986 | Robert |  25.0
987 | Medicareforall |  25.0
988 | Marine |  25.0
989 | Booker |  25.0
990 | eBay |  25.0
991 | Iraq War |  25.0
992 | Jeb Bush |  25.0
993 | Secret Service |  25.0
994 | Bidens |  25.0
995 | Britons |  25.0
996 | Ted Cruz |  25.0
997 | Baker |  25.0
998 | Rhode Island |  25.0
999 | Iraq and Afghanistan |  25.0
1000 | Sunnis |  25.0
1001 | Scotland |  25.0
1002 | Peru |  25.0
1003 | Giffords |  25.0
1004 | New Republic |  25.0
1005 | Adams |  25.0
1006 | Tories |  25.0
1007 | Dylan |  25.0
1008 | South African |  25.0
1009 | Duffy |  25.0
1010 | Lakewood |  24.0
1011 | Ward |  24.0
1012 | Geneva |  24.0
1013 | NEW YORK |  24.0
1014 | Jazz |  24.0
1015 | Santorum |  24.0
1016 | Boehner |  24.0
1017 | Acosta |  24.0
1018 | Washington DC |  24.0
1019 | Leonard |  24.0
1020 | Saudis |  24.0
1021 | Karachi |  24.0
1022 | Chevron |  24.0
1023 | IG |  24.0
1024 | WhatsApp |  24.0
1025 | Labor Department |

1321 | Man |  19.0
1322 | Phillies |  19.0
1323 | Confederate |  19.0
1324 | CDC |  19.0
1325 | Nashville |  19.0
1326 | Five |  19.0
1327 | Juul |  19.0
1328 | Little |  19.0
1329 | United States of America |  19.0
1330 | Portugal |  19.0
1331 | Diaz |  19.0
1332 | ACA |  19.0
1333 | Greeks |  19.0
1334 | Wray |  19.0
1335 | Steve Jobs |  19.0
1336 | Morales |  19.0
1337 | Basra |  19.0
1338 | Conservative Party |  19.0
1339 | Shaw |  19.0
1340 | Aguilar |  19.0
1341 | Robert Muellers |  19.0
1342 | Minneapolis |  19.0
1343 | Financial Times |  19.0
1344 | Diallo |  19.0
1345 | Communist Party |  19.0
1346 | Garrett |  19.0
1347 | Ladens |  19.0
1348 | Scaife |  19.0
1349 | Kuwait |  19.0
1350 | Times Square |  19.0
1351 | West African |  19.0
1352 | Vietnamese |  19.0
1353 | Mark Zuckerberg |  19.0
1354 | Michael Brown |  19.0
1355 | Chuck Schumer |  19.0
1356 | Government |  19.0
1357 | Senegal |  19.0
1358 | SNP |  19.0
1359 | Palestinian Authority |  19.0
1360 | Sharpton |  19.0
1

1658 | North Koreans |  16.0
1659 | Andrew Cuomo |  16.0
1660 | Ku Klux Klan |  16.0
1661 | Adolf Hitler |  16.0
1662 | Kos |  16.0
1663 | Gase |  16.0
1664 | National Guard |  16.0
1665 | MRI |  16.0
1666 | Buenos Aires |  16.0
1667 | AFDC |  16.0
1668 | Bachmann |  16.0
1669 | Kingwood |  16.0
1670 | Ivanka |  16.0
1671 | Issue View |  16.0
1672 | Rio |  16.0
1673 | Delaney |  16.0
1674 | ONeill |  16.0
1675 | Bills |  16.0
1676 | National Assembly |  16.0
1677 | Justices |  16.0
1678 | Brent |  16.0
1679 | Holloway |  16.0
1680 | Jesus Christ |  16.0
1681 | Oliver |  16.0
1682 | Marco Rubio |  16.0
1683 | BLM |  16.0
1684 | Kirk |  16.0
1685 | Verizon |  16.0
1686 | Enquirer |  16.0
1687 | Tampa |  15.0
1688 | Seven |  15.0
1689 | Zionism |  15.0
1690 | Clements |  15.0
1691 | Russia and China |  15.0
1692 | Beto ORourke |  15.0
1693 | John F. Kennedy |  15.0
1694 | NATOs |  15.0
1695 | Alas |  15.0
1696 | Company |  15.0
1697 | Kim Jongun |  15.0
1698 | Fort Bend County |  15.0
169

1995 | Williamson |  13.0
1996 | Progressive |  13.0
1997 | Dean Baker |  13.0
1998 | Ted |  13.0
1999 | Hoyer |  13.0
2000 | Stevenson |  13.0
2001 | Warner |  13.0
2002 | Zika |  13.0
2003 | Angel |  13.0
2004 | Guantanamo |  13.0
2005 | China and Russia |  13.0
2006 | Chicagos |  13.0
2007 | Donald |  13.0
2008 | Ahmadinejad |  13.0
2009 | Nichols |  13.0
2010 | KBRA |  13.0
2011 | SCOTUnited States |  13.0
2012 | ResearchAndMarkets.com |  13.0
2013 | Albanian |  13.0
2014 | General Motors |  13.0
2015 | Rubios |  13.0
2016 | Communists |  13.0
2017 | LinkedIn |  13.0
2018 | Filipino |  13.0
2019 | GQ |  13.0
2020 | Rosenthal |  13.0
2021 | Kuwaiti |  13.0
2022 | Fitzgerald |  13.0
2023 | Gomez |  13.0
2024 | Minsk |  13.0
2025 | AFC |  13.0
2026 | Crosby |  13.0
2027 | Harriet Miers |  13.0
2028 | Vladimir Putins |  13.0
2029 | UN General Assembly |  13.0
2030 | ICC |  13.0
2031 | Rookie |  13.0
2032 | Arum |  13.0
2033 | WSU |  13.0
2034 | Route |  13.0
2035 | de Blasio |  13.0
20

2326 | Arnold Schwarzenegger |  11.0
2327 | Womens |  11.0
2328 | Department of Education |  11.0
2329 | Cisco |  11.0
2330 | LA |  11.0
2331 | Circuit |  11.0
2332 | Iowans |  11.0
2333 | Waymo |  11.0
2334 | Ricci |  11.0
2335 | Donetsk |  11.0
2336 | Najarian |  11.0
2337 | Thatcher |  11.0
2338 | Sidney Blumenthal |  11.0
2339 | Jobs |  11.0
2340 | Tech |  11.0
2341 | United States and China |  11.0
2342 | Mitterrand |  11.0
2343 | Gill |  11.0
2344 | SunTimes |  11.0
2345 | Bureau of Labor Statistics |  11.0
2346 | Obama White House |  11.0
2347 | Serb |  11.0
2348 | Arsenal |  11.0
2349 | Department |  11.0
2350 | Miliband |  11.0
2351 | Valentine |  11.0
2352 | Sports Illustrated |  11.0
2353 | World |  11.0
2354 | John Boehner |  11.0
2355 | Condoleezza Rice |  11.0
2356 | Churkin |  11.0
2357 | Intel |  11.0
2358 | Master |  11.0
2359 | Satan |  11.0
2360 | Puerto Rican |  11.0
2361 | Spears |  11.0
2362 | Northport |  11.0
2363 | Dunleavy |  11.0
2364 | SLs |  11.0
2365 | Mej

2646 | Roman Catholic |  10.0
2647 | La |  10.0
2648 | Serena |  10.0
2649 | Sochi |  10.0
2650 | UC Davis |  10.0
2651 | Gaffney |  10.0
2652 | Italys |  10.0
2653 | Academy Award |  10.0
2654 | RKy |  10.0
2655 | University of Virginia |  10.0
2656 | Eisenhower |  10.0
2657 | Telegraph |  10.0
2658 | Malcolm |  10.0
2659 | Balkans |  10.0
2660 | Dublin |  10.0
2661 | Mark |  10.0
2662 | Egypts |  10.0
2663 | IVF |  10.0
2664 | Jericho |  10.0
2665 | Chanel |  10.0
2666 | antiTrump |  10.0
2667 | World Trade Organization |  10.0
2668 | Warner Bros |  10.0
2669 | United StatesD |  10.0
2670 | Busch |  10.0
2671 | Dolphins |  10.0
2672 | Central Park |  10.0
2673 | Douglas |  10.0
2674 | Toms |  10.0
2675 | Alan Greenspan |  10.0
2676 | CSU |  10.0
2677 | Rob |  10.0
2678 | Heisman Trophy |  10.0
2679 | Mannings |  10.0
2680 | Gallagher |  10.0
2681 | Masters |  10.0
2682 | Sean Spicer |  10.0
2683 | BLS |  10.0
2684 | Majority |  10.0
2685 | National Enquirer |  10.0
2686 | Oprah Winfr

2985 | UIL |  9.0
2986 | Israelis and Palestinians |  9.0
2987 | JPMorgan Chase |  9.0
2988 | PhD |  9.0
2989 | Martin Luther King |  9.0
2990 | Gary Johnson |  9.0
2991 | West Texas |  9.0
2992 | Santiago |  9.0
2993 | Spaniard |  9.0
2994 | Wales |  9.0
2995 | Roy Moore |  9.0
2996 | Sisi |  9.0
2997 | Zhao |  9.0
2998 | Bushera |  9.0
2999 | Corcoran |  8.0
3000 | Jens Stoltenberg |  8.0
3001 | Ban |  8.0
3002 | Koreans |  8.0
3003 | Sarah |  8.0
3004 | Nikkei |  8.0
3005 | Akron |  8.0
3006 | Des Moines |  8.0
3007 | Berman |  8.0
3008 | Heaven |  8.0
3009 | United StatesB |  8.0
3010 | Ralph Northam |  8.0
3011 | Macrons |  8.0
3012 | Nicolas Cage |  8.0
3013 | FSB |  8.0
3014 | Bradys |  8.0
3015 | Homs |  8.0
3016 | University of Maryland |  8.0
3017 | AllAmerican |  8.0
3018 | State House |  8.0
3019 | Jon Stewart |  8.0
3020 | Democratic Republic of the Congo |  8.0
3021 | Nike |  8.0
3022 | Fairfax |  8.0
3023 | CFPB |  8.0
3024 | DoD |  8.0
3025 | Washington Nationals |  8.0

3333 | Daily Telegraph |  8.0
3334 | TrumpRussia |  8.0
3335 | Center |  8.0
3336 | UAE |  8.0
3337 | Alex Jones |  8.0
3338 | John McCains |  8.0
3339 | Said |  8.0
3340 | Floridians |  8.0
3341 | Sam Darnold |  8.0
3342 | UWs |  8.0
3343 | South Sudan |  8.0
3344 | Free |  8.0
3345 | Steyn |  8.0
3346 | Abraham |  8.0
3347 | Tucker Carlson |  8.0
3348 | Environmental Protection Agency EPA |  8.0
3349 | State Duma |  8.0
3350 | Pete Buttigieg |  8.0
3351 | Florence |  8.0
3352 | FrankWalter Steinmeier |  8.0
3353 | Qatari |  8.0
3354 | Drew |  8.0
3355 | Serena Williams |  8.0
3356 | Office |  8.0
3357 | Idol |  8.0
3358 | Newt |  8.0
3359 | Trayvon Martin |  8.0
3360 | Clarke |  8.0
3361 | Lindsey |  8.0
3362 | Hyde Park |  8.0
3363 | Deshaun Watson |  8.0
3364 | Super Bowls |  8.0
3365 | Singer |  8.0
3366 | Amy Klobuchar |  8.0
3367 | Indianapolis Colts |  8.0
3368 | Elliotts |  8.0
3369 | FEC |  8.0
3370 | Australian Open |  8.0
3371 | Great Recession |  8.0
3372 | Glenn |  8.0
33

3666 | Paraguay |  7.0
3667 | Robin Williams |  7.0
3668 | According |  7.0
3669 | Siberian |  7.0
3670 | Frisco |  7.0
3671 | Scandinavian |  7.0
3672 | Warren Buffett |  7.0
3673 | Xerox |  7.0
3674 | Vasquez |  7.0
3675 | Joe Bidens |  7.0
3676 | Best Buy |  7.0
3677 | Sergei Lavrov |  7.0
3678 | Second World War |  7.0
3679 | MELISSA BLOCK |  7.0
3680 | Kochs |  7.0
3681 | Netflixs |  7.0
3682 | Chicago Cubs |  7.0
3683 | Sadrists |  7.0
3684 | Highway |  7.0
3685 | antiAmericanism |  7.0
3686 | ACL |  7.0
3687 | CORNISH |  7.0
3688 | Atlantic Ocean |  7.0
3689 | AIPAC |  7.0
3690 | Latina |  7.0
3691 | Jefferson |  7.0
3692 | Islands |  7.0
3693 | Advertisement Continue |  7.0
3694 | Bundesliga |  7.0
3695 | Orleans |  7.0
3696 | Hollywood Reporter |  7.0
3697 | Pulitzers |  7.0
3698 | Philadelphia Eagles |  7.0
3699 | Joey |  7.0
3700 | Belgrade |  7.0
3701 | McGahn |  7.0
3702 | Obama and Edwards |  7.0
3703 | Dominican Republic |  7.0
3704 | Syria and Iraq |  7.0
3705 | Army Co

3992 | Village Voice |  6.0
3993 | Securities and Exchange Commission SEC |  6.0
3994 | Pope John Paul |  6.0
3995 | Charles Koch |  6.0
3996 | Myanmar |  6.0
3997 | Bumble |  6.0
3998 | Valero |  6.0
3999 | Lib Dem |  6.0
4000 | Douma |  6.0
4001 | Washington PostABC News |  6.0
4002 | BBC Radio |  6.0
4003 | Nagy |  6.0
4004 | Kellyanne Conway |  6.0
4005 | Heathrow |  6.0
4006 | Z |  6.0
4007 | Republicancontrolled Congress |  6.0
4008 | Department of Corrections |  6.0
4009 | Special |  6.0
4010 | Department of Public Works |  6.0
4011 | Madison Square Garden |  6.0
4012 | Mayan |  6.0
4013 | Little League |  6.0
4014 | House Minority |  6.0
4015 | antiBush |  6.0
4016 | Congolese |  6.0
4017 | Trojan |  6.0
4018 | GoFundMe |  6.0
4019 | Daily Caller |  6.0
4020 | Make |  6.0
4021 | Willie |  6.0
4022 | United Nations Security Council |  6.0
4023 | Winston Churchill |  6.0
4024 | Michael Bloomberg |  6.0
4025 | House Budget Committee |  6.0
4026 | Carters |  6.0
4027 | Sunday Times

4309 | Lake Houston |  6.0
4310 | Freedom Caucus |  6.0
4311 | Sanaa |  6.0
4312 | Veterans Day |  6.0
4313 | Tyler |  6.0
4314 | Andrew M. Cuomo |  6.0
4315 | Ralph Nader |  5.0
4316 | George Tenet |  5.0
4317 | Home Office |  5.0
4318 | Disneyland |  5.0
4319 | Asher |  5.0
4320 | Bass |  5.0
4321 | CBS Evening News |  5.0
4322 | Rapid Recap |  5.0
4323 | Jake Tapper |  5.0
4324 | Ayatollah Ali Khamenei |  5.0
4325 | House Energy and Commerce Committee |  5.0
4326 | Byrd |  5.0
4327 | Jair Bolsonaro |  5.0
4328 | Judaism |  5.0
4329 | Northeastern University |  5.0
4330 | Brownstein |  5.0
4331 | Free Syrian Army |  5.0
4332 | Frankfurt |  5.0
4333 | Georgians |  5.0
4334 | Lazio |  5.0
4335 | Black Caucus |  5.0
4336 | Milwaukees |  5.0
4337 | Charlie Black |  5.0
4338 | Blake |  5.0
4339 | Khans |  5.0
4340 | DC Comics |  5.0
4341 | Cornyn |  5.0
4342 | ALBANY |  5.0
4343 | North Africa |  5.0
4344 | Salah |  5.0
4345 | South Vietnam |  5.0
4346 | Ventura |  5.0
4347 | East and Wes

4607 | Csuite |  5.0
4608 | Corrections |  5.0
4609 | Tulane |  5.0
4610 | Boston University |  5.0
4611 | Nancy Pelosi DCalif |  5.0
4612 | Mayor Villaraigosa |  5.0
4613 | UFO |  5.0
4614 | Sandy Hook |  5.0
4615 | Mediaite |  5.0
4616 | Tanzania |  5.0
4617 | alMaliki |  5.0
4618 | Cleveland Cavaliers |  5.0
4619 | Silverman |  5.0
4620 | Leafs |  5.0
4621 | Latin Americas |  5.0
4622 | Mike Smith |  5.0
4623 | Feiler Faster Thesis |  5.0
4624 | Emmanuel |  5.0
4625 | Mega Millions |  5.0
4626 | Toulouse |  5.0
4627 | MOSCOW |  5.0
4628 | Libyas |  5.0
4629 | Oslo Accords |  5.0
4630 | Air Forces |  5.0
4631 | Timothy Geithner |  5.0
4632 | Naval |  5.0
4633 | New Zealands |  5.0
4634 | Dakota |  5.0
4635 | Federal Reserve System |  5.0
4636 | Mohamed Salah |  5.0
4637 | Stalins |  5.0
4638 | Transportation Security Administration |  5.0
4639 | Clearly |  5.0
4640 | Coke |  5.0
4641 | Spirit |  5.0
4642 | Mecca |  5.0
4643 | Jerusalem as Israels |  5.0
4644 | Alan Dershowitz |  5.0


4932 | Anglo American |  5.0
4933 | ANSA |  5.0
4934 | Arne Duncan |  5.0
4935 | United States Open |  5.0
4936 | Pentagons |  5.0
4937 | Bill Kristol |  5.0
4938 | Aristotle |  5.0
4939 | Morrissey |  5.0
4940 | United States and Mexico |  5.0
4941 | Welsh |  5.0
4942 | Title IX |  5.0
4943 | Name |  5.0
4944 | Evo Morales |  5.0
4945 | New Orleans Pelicans |  5.0
4946 | Board of Elections |  5.0
4947 | Hofstra |  5.0
4948 | British Open |  5.0
4949 | Cape Cod |  5.0
4950 | United Arab Emirates |  5.0
4951 | Kessler |  5.0
4952 | Golden State Warriors |  5.0
4953 | Copyright  The Associated Press |  5.0
4954 | Cologne |  5.0
4955 | Scott Pruitt |  5.0
4956 | FM |  5.0
4957 | EXCLUnited StatesIVE  Dem Andrew Yang |  5.0
4958 | White House Office of Management and Budget |  5.0
4959 | C. |  5.0
4960 | Walter |  5.0
4961 | Romenesko |  5.0
4962 | Kenyas |  5.0
4963 | Burbank |  5.0
4964 | Robert Wright |  5.0
4965 | Trustees |  5.0
4966 | antiCatholic |  5.0
4967 | Bellingcat |  5.0
4968

5242 | Newport |  4.0
5243 | Democratic National Committee DNC |  4.0
5244 | Kirkuk |  4.0
5245 | Indianapolis Star |  4.0
5246 | Minnesota Timberwolves |  4.0
5247 | Utahs |  4.0
5248 | Clayton |  4.0
5249 | Siri |  4.0
5250 | Jonah Hill |  4.0
5251 | Altman |  4.0
5252 | Lou |  4.0
5253 | Lizzo |  4.0
5254 | Statue of Liberty |  4.0
5255 | Cardiff |  4.0
5256 | Stockholm |  4.0
5257 | BuzzFeed |  4.0
5258 | Armenia |  4.0
5259 | Cheryl |  4.0
5260 | Regional Analysis
North America |  4.0
5261 | Robert Mueller Robert Bob Swan MuellerTrump |  4.0
5262 | Evan Longoria |  4.0
5263 | Pulse |  4.0
5264 | Cleveland Indians |  4.0
5265 | Sixties |  4.0
5266 | nonAmerican |  4.0
5267 | Snowdens |  4.0
5268 | Books |  4.0
5269 | Article |  4.0
5270 | ABS |  4.0
5271 | RS. |  4.0
5272 | Christiane Amanpour |  4.0
5273 | East German |  4.0
5274 | Israel and the Palestinians |  4.0
5275 | Newsmax |  4.0
5276 | Burger King |  4.0
5277 | Tony Rezko |  4.0
5278 | Stadium |  4.0
5279 | Camp |  4.0
52

5549 | Xi Jinping |  4.0
5550 | Creole |  4.0
5551 | Thanks |  4.0
5552 | Santas |  4.0
5553 | Andy Murray |  4.0
5554 | Lindsay Graham |  4.0
5555 | Carmen |  4.0
5556 | Geraldine Ferraro |  4.0
5557 | Bmovie |  4.0
5558 | Jo |  4.0
5559 | Eastwood |  4.0
5560 | Emily Dickinson |  4.0
5561 | Academy Awards |  4.0
5562 | Pacific Northwest |  4.0
5563 | Pol Pot |  4.0
5564 | Police |  4.0
5565 | Easter |  4.0
5566 | Fort Wayne |  4.0
5567 | Clinton and Obama |  4.0
5568 | Santa Monica |  4.0
5569 | Tiananmen Square |  4.0
5570 | Dear Abby |  4.0
5571 | Joe Allbaugh |  4.0
5572 | Santa Barbara |  4.0
5573 | Raul Castro |  4.0
5574 | S.Mexica |  4.0
5575 | Thomas Friedman |  4.0
5576 | Joe Girardi |  4.0
5577 | J.J. Watt |  4.0
5578 | Hills |  4.0
5579 | Ohio State University |  4.0
5580 | Jon Gruden |  4.0
5581 | Ben Rhodes |  4.0
5582 | Corker |  4.0
5583 | Accenture |  4.0
5584 | Al Davis |  4.0
5585 | World War Two |  4.0
5586 | North Vietnamese |  4.0
5587 | Mark Zaid |  4.0
5588 | A

5862 | Leno |  4.0
5863 | Molotov |  4.0
5864 | Stephen Colbert |  4.0
5865 | Slavoj Zizek |  4.0
5866 | Jonathan Martin |  4.0
5867 | Dick Gephardt |  4.0
5868 | GOP Senate |  4.0
5869 | Wayne |  4.0
5870 | Eurasian |  4.0
5871 | Ron Wyden DOre |  4.0
5872 | Nawaz Sharif |  4.0
5873 | Forrester |  4.0
5874 | Xray |  4.0
5875 | NRAs |  4.0
5876 | John Kelly |  4.0
5877 | Mercer |  4.0
5878 | Lady Gaga |  4.0
5879 | Mitch McConnell Addison Mitch Mitchell McConnellSenate |  4.0
5880 | Dylann Roof |  4.0
5881 | Gap |  4.0
5882 | Packer |  4.0
5883 | Iran and Russia |  4.0
5884 | Austrians |  4.0
5885 | Secure Fence Act |  4.0
5886 | Dispatch |  4.0
5887 | DWI |  4.0
5888 | United States House of Representatives |  4.0
5889 | Area Central |  4.0
5890 | Hank |  4.0
5891 | Mayor Michael Bloomberg |  4.0
5892 | Lindsay Lohan |  4.0
5893 | Bradleys |  4.0
5894 | Kazakh |  4.0
5895 | Kansas State |  4.0
5896 | Manuel |  4.0
5897 | Centers for Disease Control and Prevention CDC |  4.0
5898 | Sec

6162 | Scarlett Johansson |  3.0
6163 | Emma Stone |  3.0
6164 | Kamath |  3.0
6165 | David Barrett |  3.0
6166 | Terry Branstad |  3.0
6167 | AWR Hawkins |  3.0
6168 | Pulitzer Prizewinning |  3.0
6169 | Tommy Thompson |  3.0
6170 | Peshmerga |  3.0
6171 | Ian Hanchett |  3.0
6172 | Freeport |  3.0
6173 | Beats |  3.0
6174 | CHICAGO AP |  3.0
6175 | Martha Coakley |  3.0
6176 | Adam Serwer |  3.0
6177 | Arianna Huffington |  3.0
6178 | PASADENA Calif |  3.0
6179 | Ryan Zinke |  3.0
6180 | Lexus |  3.0
6181 | Mignon Clyburn |  3.0
6182 | Lewis Carroll |  3.0
6183 | TMZ.co |  3.0
6184 | Democrat in the House |  3.0
6185 | Malcolm Gladwell |  3.0
6186 | Dearborn Michigan |  3.0
6187 | Zion Williamson |  3.0
6188 | Jim Armstrong |  3.0
6189 | Wall Streeters |  3.0
6190 | al Qaedalinked |  3.0
6191 | Following |  3.0
6192 | Marcos |  3.0
6193 | Besides |  3.0
6194 | Keynes |  3.0
6195 | Freddie |  3.0
6196 | Grahams |  3.0
6197 | NPR.or |  3.0
6198 | Chris Hayes |  3.0
6199 | Helen Regan |

6445 | Scott Gottlieb |  3.0
6446 | Anthony Weiner |  3.0
6447 | New York Stock Exchange NYSE |  3.0
6448 | Robert Kuttner |  3.0
6449 | Norman Mailer |  3.0
6450 | Ice |  3.0
6451 | Beatle |  3.0
6452 | Cali |  3.0
6453 | Aleksey Navalny |  3.0
6454 | Eric Shinseki |  3.0
6455 | Michael Graham |  3.0
6456 | Honey |  3.0
6457 | Kwame Brown |  3.0
6458 | Galen |  3.0
6459 | Dana |  3.0
6460 | Peterson Institute for International Economics |  3.0
6461 | HTML |  3.0
6462 | Trump in Senate |  3.0
6463 | David Denby |  3.0
6464 | CAIR |  3.0
6465 | John Kennedy |  3.0
6466 | Center for American Progress |  3.0
6467 | National Book Award |  3.0
6468 | Exxon Mobil Corp |  3.0
6469 | Paul Johnson |  3.0
6470 | George Springer |  3.0
6471 | Veterans Affairs Department |  3.0
6472 | Lisa Murkowski |  3.0
6473 | George H |  3.0
6474 | Securities Exchange Act |  3.0
6475 | Slate on Facebook |  3.0
6476 | Federal Government |  3.0
6477 | LaGuardia Airport |  3.0
6478 | Zionists |  3.0
6479 | Old So

6741 | Pat Toomey |  3.0
6742 | PTA |  3.0
6743 | Puritan |  3.0
6744 | State Senate |  3.0
6745 | Eastern Washington |  3.0
6746 | Go |  3.0
6747 | Joe the Plumber |  3.0
6748 | Moroccan |  3.0
6749 | Conan OBrien |  3.0
6750 | Best Defense |  3.0
6751 | Florida Lotterys Pick |  3.0
6752 | Obamacares |  3.0
6753 | WTI |  3.0
6754 | Fitbit |  3.0
6755 | Upton |  3.0
6756 | Boston Celtics |  3.0
6757 | Peter Dawson |  3.0
6758 | Independence Day |  3.0
6759 | Wise |  3.0
6760 | Yitzhak Rabin |  3.0
6761 | Fray |  3.0
6762 | Vermonts |  3.0
6763 | Ban KiMoon |  3.0
6764 | Carlos Carrasco |  3.0
6765 | Libertarians |  3.0
6766 | Air Raid |  3.0
6767 | Nick Cleggs |  3.0
6768 | Khabibs |  3.0
6769 | Tesla Model |  3.0
6770 | John F. Kerry |  3.0
6771 | Bernies |  3.0
6772 | Courtesy |  3.0
6773 | Blink |  3.0
6774 | Concord |  3.0
6775 | Rocky |  3.0
6776 | Albert Almora Jr |  3.0
6777 | TEXAS UNITED STATES |  3.0
6778 | Oakland Athletics |  3.0
6779 | Chamberlain |  3.0
6780 | Clint Eastw

7034 | Pedro Sanchez |  3.0
7035 | Foundation for Defense of Democracies |  3.0
7036 | Black Panthers |  3.0
7037 | Merrick |  3.0
7038 | Housing and Urban Development |  3.0
7039 | Copyright Business Wire .
PUB   PMDISC |  3.0
7040 | Plain |  3.0
7041 | nonCatholic |  3.0
7042 | Midshipmen |  3.0
7043 | Joanne Ostrow |  3.0
7044 | HBOs |  3.0
7045 | Country |  3.0
7046 | Rafik Hariri |  3.0
7047 | Ankaras |  3.0
7048 | Downton Abbey |  3.0
7049 | Scotland Yard |  3.0
7050 | Arabs and Muslims |  3.0
7051 | Lone Star State |  3.0
7052 | Moscow Region |  3.0
7053 | Diane Sawyer |  3.0
7054 | Parkland Florida |  3.0
7055 | Big Oil |  3.0
7056 | Bachelor |  3.0
7057 | Pepsi |  3.0
7058 | Crime Stoppers |  3.0
7059 | Englands |  3.0
7060 | App Store |  3.0
7061 | alNusra Front |  3.0
7062 | Central Valley |  3.0
7063 | Marx |  3.0
7064 | Catalan |  3.0
7065 | Jason Campbell |  3.0
7066 | Legislatures |  3.0
7067 | Super Bowl MVP |  3.0
7068 | Jinpings |  3.0
7069 | Henry Hazlitt |  3.0
7070

7328 | Miamis |  3.0
7329 | Bono |  3.0
7330 | Clint Capela |  3.0
7331 | notat |  3.0
7332 | Carolina Panthers |  3.0
7333 | Adam Silver |  3.0
7334 | Qaeda in Iraq |  3.0
7335 | DENVER |  3.0
7336 | Free Democrats |  3.0
7337 | Daniel Patrick Moynihan |  3.0
7338 | Justin Rose |  3.0
7339 | BMI |  3.0
7340 | Chinayou. |  3.0
7341 | Boy Scout |  3.0
7342 | Chuck Grassley |  3.0
7343 | Israeli Jews |  3.0
7344 | Rep. Ron Paul |  3.0
7345 | Indepth |  3.0
7346 | antiIslamic |  3.0
7347 | Charlize Theron |  3.0
7348 | Javy Baez |  3.0
7349 | Budapest |  3.0
7350 | Rebecca Hennes |  3.0
7351 | National Oceanic and Atmospheric Administration |  3.0
7352 | Patti |  3.0
7353 | Hubert Humphrey |  3.0
7354 | Hudson Valley |  3.0
7355 | People on the Move |  3.0
7356 | Mike Brown |  3.0
7357 | Gingrichs |  3.0
7358 | Henry Waxman |  3.0
7359 | Public Religion Research Institute |  3.0
7360 | Volcker |  3.0
7361 | Sandra Maler |  3.0
7362 | Bear |  3.0
7363 | Ellen Ladowsky |  3.0
7364 | Nativit

In [57]:
processed_sdf.count()

10000

In [58]:
for i in range(len(accumulate_scale)):
    item = accumulate_scale[i]
    print(i+1, "|", cv_model.vocabulary[item[0]], "| ", item[1])


1 | Trump |  7380.291799844406
2 | Obama |  6801.311483350598
3 | United States |  5627.45376305372
4 | American |  4401.046795485617
5 | Democrats |  3654.719491746246
6 | Bush |  3396.7225362203126
7 | Congress |  3016.6758248032265
8 | Republican |  2989.1342825104593
9 | Republicans |  2973.282469586091
10 | Americans |  2919.4249482650716
11 | Russia |  2867.2854361952395
12 | Russian |  2847.785728347319
13 | Iraq |  2792.774689094833
14 | Democratic |  2792.7160548505085
15 | America |  2722.765752533795
16 | Senate |  2666.965504939473
17 | White House |  2649.302694240963
18 | China |  2602.3869848688078
19 | Israel |  2600.0700092523803
20 | McCain |  2546.409711244091
21 | Clinton |  2366.2949288833875
22 | Hillary |  2353.8059169053895
23 | Washington |  2219.7942663074978
24 | Iran |  2200.8359898208323
25 | Chinese |  2174.6567884675806
26 | GOP |  2035.4347893942643
27 | New York |  1941.4533573462045
28 | Israeli |  1813.723379577993
29 | House |  1786.8175639097192
30 

235 | Huawei |  499.43935226950066
236 | Michigan |  497.8141030753176
237 | Donald Trumps |  497.36917954675033
238 | Greek |  496.0693535176289
239 | San Francisco |  494.1881775974614
240 | NCAA |  493.14693440451424
241 | Hong Kong |  491.9930305773778
242 | Asian |  491.23256736778916
243 | Swiss |  487.1584535269095
244 | Kennedy |  482.57844871367746
245 | Dallas |  482.4928747686546
246 | Treasury |  481.9954199952082
247 | New Orleans |  481.6038258849971
248 | Capitol Hill Publishing Corp |  480.98682244654947
249 | News Communications Inc |  480.98682244654947
250 | Hill  K Street NW Suite  Washington DC |  480.98682244654947
251 | Rangers |  479.3251633686653
252 | Snowden |  478.5325231935345
253 | Flynn |  477.5128939203846
254 | Pelosi |  471.55914517780514
255 | San Diego |  470.1286753169395
256 | Instagram |  469.79482737636613
257 | Thompson |  468.6097626232351
258 | Miami |  468.3529380285213
259 | Assad |  464.76036370958036
260 | Olympics |  462.83471146191795
26

396 | Kansas |  360.7908135593035
397 | North |  360.7908135593035
398 | Federal Reserve |  360.2804861084448
399 | Politico |  358.9458019291488
400 | Kosovo |  357.6314591230869
401 | Bradley |  357.6232398966795
402 | NYPD |  357.16207910517426
403 | Washington D. |  355.8314187351997
404 | Cold War |  353.6671871948966
405 | Kurdish |  353.3798984232052
406 | Disney |  352.6941514080789
407 | Indians |  352.41295022634176
408 | Eddie |  352.08657026354814
409 | Ebola |  351.4573219674262
410 | Philadelphia |  351.38245874165386
411 | FDA |  351.188285908347
412 | Northam |  350.6607894820148
413 | Qaeda |  350.61572294020027
414 | NPRs |  350.2979716051925
415 | Verbtm Inc. an NPR |  350.2979716051925
416 | Walker |  349.878419124457
417 | Blair |  347.4397998030854
418 | Silicon Valley |  347.1703921686021
419 | Bell |  345.5655324947789
420 | Howard |  345.2914040381732
421 | Seattle |  343.05873697705334
422 | Kim |  343.047003056387
423 | New England |  341.57086971426986
424 |

628 | New Yorkers |  247.0222919596019
629 | Cain |  246.9940274093209
630 | Rasmussen |  246.63671717012363
631 | Syracuse |  245.88316735205422
632 | Broadway |  245.69911063693166
633 | Latin America |  245.41207055358183
634 | TSA |  245.25162339041327
635 | McCabe |  244.8164696256992
636 | Court |  244.775916544728
637 | Redskins |  244.6383624999959
638 | Macron |  244.37405412459293
639 | North America |  242.94868016595362
640 | State |  242.9486801659536
641 | HBO |  242.6110328246376
642 | Cabinet |  242.49874353985973
643 | Sarah Palin |  241.95762142637741
644 | Huskies |  241.77493458938648
645 | Watson |  241.77493458938648
646 | Senate Majority |  240.80191294249835
647 | Sandy |  240.5884361373494
648 | Andrew |  240.16260714505336
649 | Merkel |  240.16260714505336
650 | Ohio State |  240.16260714505336
651 | Midwest |  239.9851778314216
652 | Einstein |  239.72949719139967
653 | Hernandez |  239.23767634253926
654 | Guardian |  237.5138768304583
655 | Nancy |  237.39

856 | Pistons |  185.40596945771367
857 | Ben |  185.40596945771364
858 | Gods |  184.52265869760612
859 | McDonald |  183.9987250461152
860 | IDF |  183.99872504611517
861 | Spurs |  183.99872504611517
862 | Koch |  183.99872504611517
863 | Apollo |  183.99872504611514
864 | Johnsons |  183.99872504611514
865 | Goldman |  183.93871754280994
866 | Montreal |  183.581937408546
867 | Theresa |  183.581937408546
868 | North Dakota |  183.581937408546
869 | Latin |  183.35270853013787
870 | Page  Buy Reprints |  182.84584977632122
871 | Lakewood |  182.424058909018
872 | Roy |  182.424058909018
873 | Edison |  182.424058909018
874 | Chevron |  182.42405890901796
875 | Sandoval |  182.42405890901796
876 | Ehrlich |  182.42405890901796
877 | NBC News |  182.24838453142098
878 | Colts |  182.0669246444806
879 | Hmm |  182.0669246444806
880 | Duffy |  181.61325544803006
881 | Angela Merkel |  181.1374806466914
882 | New Yorks |  180.97159609885708
883 | Goldman Sachs |  180.97159609885705
884 

1079 | WNBA |  154.68727549533585
1080 | Christie |  154.68727549533585
1081 | Athens |  154.68727549533585
1082 | Mormon |  154.68727549533585
1083 | Big East |  154.68727549533583
1084 | Kings |  154.68727549533583
1085 | IG |  154.50843947368165
1086 | Holder |  154.50843947368162
1087 | Sullivan |  154.50843947368162
1088 | Labor Department |  154.5084394736816
1089 | Green New Deal |  154.29074737208654
1090 | Electoral College |  154.29074737208654
1091 | TPP |  154.29074737208654
1092 | Mexicos |  154.2907473720865
1093 | Robert |  154.14794823132732
1094 | Copyright   NPR |  153.77969102527692
1095 | Trump Donald John TrumpTrump |  153.77969102527692
1096 | Geneva |  153.05344855008715
1097 | Board |  153.05344855008715
1098 | Ted Cruz |  152.984947840455
1099 | New Republic |  152.984947840455
1100 | Chris |  152.84629321884455
1101 | Austria |  152.84629321884455
1102 | Ortega |  152.02004909084832
1103 | Zelenskiy |  152.02004909084832
1104 | INSKEEP |  152.02004909084832
11

1304 | Sean Illing |  133.01048209956264
1305 | Beaumont |  133.01048209956264
1306 | Roseanne |  133.01048209956264
1307 | Becker |  133.01048209956264
1308 | Ryder Cup |  133.01048209956264
1309 | AIG |  133.0104820995626
1310 | Caribbean |  132.90982019029963
1311 | Korea |  132.9098201902996
1312 | Bloom |  132.9098201902996
1313 | Munich |  132.90982019029957
1314 | United StatesA Today |  132.72144079068738
1315 | Labour Party |  132.72144079068738
1316 | David Cameron |  131.58602914401163
1317 | National Weather Service |  131.58602914401163
1318 | Emmanuel Macron |  131.5860291440116
1319 | Forbes |  131.5860291440116
1320 | Cheney |  131.42766074722513
1321 | Richardson |  131.42766074722513
1322 | ObamaCare |  131.42766074722513
1323 | Premier League |  131.42766074722513
1324 | Henry |  131.42766074722513
1325 | United StatesC |  131.42766074722513
1326 | Phillies |  131.24925020566695
1327 | Orange |  131.24925020566693
1328 | Katy |  130.76154392258164
1329 | Drake |  130

1526 | Vodafone |  117.36219008784937
1527 | Leningrad |  117.36219008784937
1528 | Benjamin |  117.36219008784937
1529 | Qaddafi |  117.36219008784937
1530 | Brewers |  117.36219008784937
1531 | Diamond |  117.36219008784937
1532 | Nigerian |  117.04302298573751
1533 | Austrian |  117.0430229857375
1534 | Marxist |  117.0430229857375
1535 | Pro Bowl |  117.04302298573748
1536 | Blues |  116.23248348673926
1537 | Safeway |  116.23248348673926
1538 | Duma |  116.23248348673926
1539 | Victoria |  116.23248348673926
1540 | Michigans |  116.23248348673926
1541 | Bachmann |  116.23248348673926
1542 | Kirk |  116.23248348673926
1543 | Islamophobia |  116.23248348673924
1544 | EIA |  116.23248348673924
1545 | Enquirer |  116.23248348673924
1546 | Nasdaq |  115.88132960526124
1547 | EUs |  115.88132960526123
1548 | Foreign Ministry |  115.88132960526123
1549 | Snyder |  115.81326660102849
1550 | Cubans |  115.81326660102849
1551 | Four |  115.81326660102849
1552 | Santa Ana |  115.813266601028

1739 | NonGAAP |  103.86153256847845
1740 | Greene |  103.86153256847845
1741 | Freeman |  103.86153256847845
1742 | Turin |  103.86153256847845
1743 | Trump Jr |  103.86153256847845
1744 | Aggies |  103.86153256847845
1745 | Schwartz |  103.86153256847845
1746 | Callaway |  103.86153256847845
1747 | Atkinson |  103.86153256847845
1748 | Feinstein |  103.86153256847845
1749 | Mack |  103.86153256847845
1750 | Chandler |  103.86153256847844
1751 | St. Petersburg |  103.61782910973707
1752 | Delta |  103.61782910973707
1753 | Bob Wright |  103.61782910973707
1754 | Villanova |  103.61782910973706
1755 | Georgian |  103.61782910973704
1756 | Gulf of Mexico |  103.61782910973704
1757 | Anbar |  103.61782910973704
1758 | Maguire |  103.61782910973704
1759 | PGA Tour |  103.61782910973704
1760 | Kemp |  103.61782910973704
1761 | Houstons |  103.61782910973704
1762 | Century |  103.00562631578778
1763 | Catholic Church |  103.00562631578777
1764 | Army Corps of Engineers |  103.00562631578777

1962 | ACC |  94.1574720406392
1963 | Huffman |  93.8897520702795
1964 | Aruba |  93.8897520702795
1965 | Bearcats |  93.8897520702795
1966 | MAX |  93.8897520702795
1967 | Katy ISD |  93.8897520702795
1968 | South Africas |  93.8897520702795
1969 | Flores |  93.8897520702795
1970 | Curran |  93.8897520702795
1971 | McCrory |  93.8897520702795
1972 | Gibbons |  93.8897520702795
1973 | Romero |  93.8897520702795
1974 | Agent Orange |  93.8897520702795
1975 | Scotts |  93.8897520702795
1976 | Southeast Asians |  93.88975207027948
1977 | Iceland |  93.88975207027948
1978 | American Heart Association |  93.88975207027948
1979 | Carla |  93.88975207027948
1980 | Beatrice |  93.88975207027948
1981 | Tinder |  93.88975207027948
1982 | NRSC |  93.88975207027948
1983 | Rossi |  93.88975207027948
1984 | Allah |  93.03687413320974
1985 | Kim Jong Un |  93.03687413320972
1986 | Michael Cohen |  93.03687413320972
1987 | Persian Gulf |  93.03687413320972
1988 | Hamid Karzai |  93.03687413320971
1989

2181 | S.Russi |  85.57198590356016
2182 | Fred |  85.57198590356016
2183 | Webb |  85.57198590356016
2184 | DCCC |  85.57198590356016
2185 | Corps |  85.57198590356016
2186 | University of Chicago Medical Center |  85.57198590356016
2187 | De Blasio |  85.57198590356016
2188 | Houston Texans |  85.57198590356016
2189 | OSHA |  85.57198590356016
2190 | Bush White House |  85.57198590356016
2191 | Gulf Coast |  85.42797948569634
2192 | Judiciary Committee |  85.42797948569634
2193 | Michael Flynn |  85.42797948569634
2194 | Many |  85.42797948569634
2195 | Union |  85.42797948569634
2196 | fromMarch   Page  View |  85.42797948569634
2197 | Rush Limbaugh |  85.42797948569634
2198 | Dean Baker |  85.42797948569634
2199 | ResearchAndMarkets.com |  85.42797948569634
2200 | Vladimir Putins |  85.42797948569634
2201 | Lawmakers |  85.42797948569634
2202 | Democrats in Congress |  85.42797948569634
2203 | QA |  85.42797948569634
2204 | FOX News Network LLC |  85.42797948569634
2205 | Centers f

2397 | UW |  78.24146005856625
2398 | Mendez |  78.24146005856625
2399 | Nassau Class |  78.24146005856625
2400 | Coliseum |  78.24146005856625
2401 | NRC |  78.24146005856625
2402 | BST |  78.24146005856625
2403 | Continental |  78.24146005856625
2404 | Elsa |  78.24146005856625
2405 | Jericho |  78.24146005856625
2406 | Bundy |  78.24146005856625
2407 | Customs and Border Protection |  77.14537368604327
2408 | Nick |  77.14537368604327
2409 | Auschwitz |  77.14537368604327
2410 | Abe |  77.14537368604327
2411 | United States Supreme Court |  77.14537368604327
2412 | Womens |  77.14537368604327
2413 | Iowans |  77.14537368604327
2414 | United States and China |  77.14537368604327
2415 | House Judiciary Committee |  77.14537368604327
2416 | Sergei Skripal |  77.14537368604327
2417 | Franklin |  77.14537368604327
2418 | Cambridge |  77.14537368604327
2419 | Julian Assange |  77.14537368604327
2420 | Ferrari |  77.14537368604327
2421 | Shia |  77.14537368604327
2422 | Instapundit |  77.1

2613 | Slovakia |  70.41731405270963
2614 | Iseman |  70.41731405270963
2615 | Saturn |  70.41731405270963
2616 | Coons |  70.41731405270963
2617 | Baez |  70.41731405270963
2618 | Alaskas |  70.41731405270963
2619 | Girardi |  70.41731405270963
2620 | Siena |  70.41731405270963
2621 | Slovak |  70.41731405270963
2622 | Antetokounmpo |  70.41731405270963
2623 | Hallmark |  70.41731405270963
2624 | Landrieu |  70.41731405270963
2625 | Blackburn |  70.41731405270963
2626 | Avs |  70.41731405270963
2627 | Verizon Wireless |  70.41731405270963
2628 | Herrera |  70.41731405270963
2629 | Rusal |  70.41731405270963
2630 | Dylans |  70.41731405270963
2631 | Gujarat |  70.41731405270963
2632 | Northwell |  70.41731405270963
2633 | Cavs |  70.41731405270963
2634 | Lind |  70.41731405270963
2635 | Ingles |  70.41731405270963
2636 | Commerce Clause |  70.41731405270963
2637 | AMOS |  70.41731405270963
2638 | Shimkus |  70.41731405270963
2639 | Sinclair |  70.41731405270963
2640 | Salinas |  70.417

2834 | Jonathan |  65.38077196129083
2835 | OBAMA |  65.38077196129083
2836 | Emanuel |  65.38077196129083
2837 | OMB |  65.38077196129083
2838 | Chechnya |  65.38077196129083
2839 | Hawkins |  65.38077196129083
2840 | Orwell |  65.38077196129083
2841 | Reyes |  65.38077196129083
2842 | Ohios |  65.38077196129083
2843 | Armenian |  65.38077196129083
2844 | Bolivian |  65.38077196129083
2845 | Boise |  65.38077196129083
2846 | Bush Administration |  65.38077196129083
2847 | Titanic |  65.38077196129083
2848 | Dodd |  65.38077196129083
2849 | Jason |  65.38077196129083
2850 | Matt Damon |  65.38077196129083
2851 | OMalley |  65.38077196129083
2852 | IPCC |  65.38077196129083
2853 | PLO |  65.38077196129083
2854 | Algeria |  65.38077196129083
2855 | Microsofts |  65.38077196129083
2856 | East Jerusalem |  65.38077196129083
2857 | Martin Luther King |  65.38077196129083
2858 | Spaniard |  65.38077196129083
2859 | Frazier |  65.38077196129082
2860 | Democraticleaning |  65.38077196129082
28

3054 | Daniel |  62.17069746584223
3055 | WASHINGTON AP |  62.17069746584223
3056 | Mark Penn |  62.17069746584223
3057 | Dmitry Peskov |  62.17069746584223
3058 | Sarah Huckabee Sanders |  62.17069746584223
3059 | Senate Finance Committee |  62.17069746584223
3060 | Department of Defense |  62.17069746584223
3061 | Congressional Democrats |  62.17069746584223
3062 | Wales |  62.17069746584223
3063 | Corcoran |  60.80801963633933
3064 | Simeon |  60.80801963633933
3065 | Navajo |  60.80801963633933
3066 | Calgary |  60.80801963633933
3067 | Kong |  60.80801963633933
3068 | Xavier |  60.80801963633933
3069 | Hodges |  60.80801963633933
3070 | Arlington |  60.80801963633933
3071 | EC |  60.80801963633933
3072 | NSC |  60.80801963633933
3073 | MuslimAmerican |  60.80801963633933
3074 | Marion |  60.80801963633933
3075 | Sinn Fein |  60.80801963633933
3076 | Noonan |  60.80801963633933
3077 | Uzbekistan |  60.80801963633933
3078 | Sinai |  60.80801963633933
3079 | Najaf |  60.8080196363393

3275 | Getty Images |  57.04799060237344
3276 | Corner |  57.04799060237344
3277 | Bayern Munich |  57.04799060237344
3278 | United States and Britain |  57.04799060237344
3279 | Buttigieg Warren |  57.04799060237344
3280 | Baltic |  57.04799060237344
3281 | FOX News |  57.04799060237344
3282 | NBA Finals |  57.04799060237344
3283 | Gary Hart |  57.04799060237344
3284 | Rose Garden |  57.04799060237344
3285 | Floridas |  57.04799060237344
3286 | BYU |  57.04799060237344
3287 | Steny Hoyer |  57.04799060237344
3288 | Mark Steyn |  57.04799060237344
3289 | Liberia |  57.04799060237344
3290 | George Steinbrenner |  57.04799060237344
3291 | Black Sea |  57.04799060237344
3292 | Ontario |  57.04799060237344
3293 | Madeleine Albright |  57.04799060237344
3294 | Brotherhood |  57.04799060237344
3295 | Novak Djokovic |  57.04799060237344
3296 | New York Daily News |  57.04799060237344
3297 | Grammys |  57.04799060237344
3298 | Staten Island |  57.04799060237344
3299 | Board of Supervisors |  5

3487 | Big Four |  53.207017181796914
3488 | Paterson |  53.207017181796914
3489 | Paraguay |  53.207017181796914
3490 | Xerox |  53.207017181796914
3491 | Sergei Lavrov |  53.207017181796914
3492 | Jefferson |  53.207017181796914
3493 | Bundesliga |  53.207017181796914
3494 | Kim Jong Il |  53.207017181796914
3495 | Bay of Pigs |  53.207017181796914
3496 | Blackwater |  53.207017181796914
3497 | DSCC |  53.207017181796914
3498 | Patti Solis Doyle |  53.207017181796914
3499 | Dinkins |  53.207017181796914
3500 | Daly |  53.20701718179691
3501 | DoJ |  53.20701718179691
3502 | Portlands |  53.20701718179691
3503 | Derek Carr |  53.20701718179691
3504 | University of Oregon |  53.20701718179691
3505 | UNC |  53.20701718179691
3506 | Gladwell |  53.20701718179691
3507 | Golan Heights |  53.20701718179691
3508 | Ellison |  53.20701718179691
3509 | Texas Tech |  53.20701718179691
3510 | Temple Mount |  53.20701718179691
3511 | Lane |  53.20701718179691
3512 | Joker |  53.20701718179691
3513

3700 | IL |  50.851711525448415
3701 | ICE |  50.851711525448415
3702 | House Judiciary |  50.851711525448415
3703 | Rahm Emanuel |  50.851711525448415
3704 | API |  50.851711525448415
3705 | Allies |  50.851711525448415
3706 | Tour |  50.851711525448415
3707 | postSoviet |  49.91699177707676
3708 | American Civil Liberties Union |  49.91699177707676
3709 | Hat |  49.91699177707676
3710 | New York Citys |  49.91699177707676
3711 | Alexis Tsipras |  49.91699177707676
3712 | Far East |  49.91699177707676
3713 | Roger Ailes |  49.91699177707676
3714 | ISISISIL |  49.91699177707676
3715 | Subscribe to the Entertainment |  49.91699177707676
3716 | HoustonChronicle.com  Follow him on Twitter at JayRJordan |  49.91699177707676
3717 | Michael Booth |  49.91699177707676
3718 | ROBERT SIEGEL |  49.91699177707676
3719 | Nikki Haley |  49.91699177707676
3720 | Roma |  49.91699177707676
3721 | Marxism |  49.91699177707676
3722 | Noam Scheiber |  49.91699177707676
3723 | Himalayas |  49.916991777076

3906 | Tikrit |  46.94487603513975
3907 | Huntington Beach |  46.94487603513975
3908 | General Mills |  46.94487603513975
3909 | Twins |  46.94487603513975
3910 | Negroes |  46.94487603513975
3911 | Costas |  46.94487603513975
3912 | Christopher Hitchens |  46.94487603513975
3913 | TSX Venture Exchange |  46.94487603513975
3914 | McIntyre |  46.94487603513975
3915 | Monrovia |  46.94487603513975
3916 | Walton |  46.94487603513975
3917 | Plymouth |  46.94487603513975
3918 | Lula |  46.94487603513975
3919 | Boucher |  46.94487603513975
3920 | Tilman Fertitta |  46.94487603513975
3921 | Springer |  46.94487603513975
3922 | Andrew Breitbart |  46.94487603513975
3923 | Spartan |  46.94487603513975
3924 | Roethlisberger |  46.94487603513975
3925 | Dobson |  46.94487603513975
3926 | Kennesaw State |  46.94487603513975
3927 | Robinsons |  46.94487603513975
3928 | DiCaprio |  46.94487603513975
3929 | ElBaradei |  46.94487603513975
3930 | Simmonds |  46.94487603513975
3931 | UCLAs |  46.94487603

4127 | Fukushima Daiichi |  44.51208538649076
4128 | Village Voice |  44.51208538649076
4129 | Pope John Paul |  44.51208538649076
4130 | Kellyanne Conway |  44.51208538649076
4131 | Little League |  44.51208538649076
4132 | antiBush |  44.51208538649076
4133 | Trojan |  44.51208538649076
4134 | Daily Caller |  44.51208538649076
4135 | Harry |  44.51208538649076
4136 | Belgiums |  44.51208538649076
4137 | Juan Williams |  44.51208538649076
4138 | Sugar Bowl |  44.51208538649076
4139 | Fish and Wildlife Service |  44.51208538649076
4140 | Fifth Amendment |  44.51208538649076
4141 | Trump Administration |  44.51208538649076
4142 | Jennings |  44.51208538649076
4143 | Himalayan |  44.51208538649076
4144 | BNP Paribas |  44.51208538649076
4145 | Jeff Macke |  44.51208538649076
4146 | Churchs |  44.51208538649076
4147 | Zacks Investment Research |  44.51208538649076
4148 | Belmont |  44.51208538649076
4149 | Geological Survey |  44.51208538649076
4150 | Aaron |  44.51208538649076
4151 | Luc

4328 | Baucus |  39.12073002928312
4329 | Gutierrez |  39.12073002928312
4330 | Fowler |  39.12073002928312
4331 | Insta |  39.12073002928312
4332 | Tustin |  39.12073002928312
4333 | Instagrams |  39.12073002928312
4334 | West Bank and Gaza Strip |  39.12073002928312
4335 | Dickson |  39.12073002928312
4336 | ICBM |  39.12073002928312
4337 | Caps |  39.12073002928312
4338 | Peoples Daily |  39.12073002928312
4339 | MacDonald |  39.12073002928312
4340 | Edsall |  39.12073002928312
4341 | Waze |  39.12073002928312
4342 | Christian Coalition |  39.12073002928312
4343 | ProPublica |  39.12073002928312
4344 | Bulgarian |  39.12073002928312
4345 | Ventura County |  39.12073002928312
4346 | Summer Creek |  39.12073002928312
4347 | Hemingway |  39.12073002928312
4348 | Poles |  39.12073002928312
4349 | Ben Ali |  39.12073002928312
4350 | Jim Acosta |  39.12073002928312
4351 | Detroits |  39.12073002928312
4352 | Gibsons |  39.12073002928312
4353 | Belfast |  39.12073002928312
4354 | Sunni Isl

4548 | IR |  38.00501227271208
4549 | New York Rangers |  38.00501227271208
4550 | Air Force One |  38.00501227271208
4551 | Kris Kobach |  38.00501227271208
4552 | Hayden |  38.00501227271208
4553 | Ethiopian |  38.00501227271208
4554 | Mullen |  38.00501227271208
4555 | Peace Corps |  38.00501227271208
4556 | TripleA |  38.00501227271208
4557 | Pritzker |  38.00501227271208
4558 | White Helmets |  38.00501227271208
4559 | Darling |  38.00501227271208
4560 | Alphabet |  38.00501227271208
4561 | Kaufman |  38.00501227271208
4562 | MaraLago |  38.00501227271208
4563 | Burnett |  38.00501227271208
4564 | Islamophobic |  38.00501227271208
4565 | Hyundai |  38.00501227271208
4566 | Jesse Jackson |  38.00501227271208
4567 | Fields |  38.00501227271208
4568 | ADL |  38.00501227271208
4569 | Northern Hemisphere |  38.00501227271208
4570 | Federal Deposit Insurance Corp |  38.00501227271208
4571 | Alberta |  38.00501227271208
4572 | JCPOA |  38.00501227271208
4573 | LEGO |  38.00501227271208
4

4757 | UEFA |  37.0934044887423
4758 | Philadelphia Phillies |  37.0934044887423
4759 | Central Intelligence Agency |  37.0934044887423
4760 | Midway |  37.0934044887423
4761 | Peyton Manning |  37.0934044887423
4762 | CALIFORNIA UNITED STATES |  37.0934044887423
4763 | Social Democrats |  37.0934044887423
4764 | Facebook and Twitter |  37.0934044887423
4765 | St. Louis Cardinals |  37.0934044887423
4766 | Brian Cashman |  37.0934044887423
4767 | Salon |  37.0934044887423
4768 | James Madison |  37.0934044887423
4769 | MacArthur |  37.0934044887423
4770 | Elizabeth Edwards |  37.0934044887423
4771 | American Prospect |  37.0934044887423
4772 | Edmonton |  37.0934044887423
4773 | Rhodes |  37.0934044887423
4774 | Robert Reich |  37.0934044887423
4775 | Award |  37.0934044887423
4776 | Arapahoe County |  37.0934044887423
4777 | New America Foundation |  37.0934044887423
4778 | PITTSBURGH |  37.0934044887423
4779 | Aaron Boone |  37.0934044887423
4780 | Rochester |  37.0934044887423
4781 

4952 | Lear |  37.0934044887423
4953 | Kaiser |  37.0934044887423
4954 | European Tour |  37.0934044887423
4955 | Consumer Reports |  37.0934044887423
4956 | John McCain RAriz |  37.0934044887423
4957 | antiSemites |  37.0934044887423
4958 | ATM |  37.0934044887423
4959 | Andrew McCabe |  37.0934044887423
4960 | Vic Fangio |  37.0934044887423
4961 | Russell Westbrook |  37.0934044887423
4962 | Oliver Darcy |  37.0934044887423
4963 | Joseph Stalin |  37.0934044887423
4964 | H.R. McMaster |  37.0934044887423
4965 | Oval |  37.0934044887423
4966 | MAGA |  37.0934044887423
4967 | Blue |  37.0934044887423
4968 | UNITED STATES NORTH AMERICA CALIFORNIA |  37.0934044887423
4969 | Health Department |  37.0934044887423
4970 | Bruce Rauner |  37.0934044887423
4971 | AUDIE CORNISH |  37.0934044887423
4972 | United States Army |  37.0934044887423
4973 | North America AsiaPacific UK Europe Central and South America Middle East and Africa |  37.0934044887423
4974 | Carney |  37.0934044887423
4975 | K

5159 | BandAid |  31.2965840234265
5160 | Fresno |  31.2965840234265
5161 | Metra |  31.2965840234265
5162 | Western Pennsylvania |  31.2965840234265
5163 | Barry Manilow |  31.2965840234265
5164 | Bruce Ohr |  31.2965840234265
5165 | Revolutionary War |  31.2965840234265
5166 | East End |  31.2965840234265
5167 | Hoffmans |  31.2965840234265
5168 | Hebron |  31.2965840234265
5169 | UNICEF |  31.2965840234265
5170 | Jimenez |  31.2965840234265
5171 | Karen |  31.2965840234265
5172 | Central Division |  31.2965840234265
5173 | Student Council |  31.2965840234265
5174 | Anthony Davis |  31.2965840234265
5175 | Kandahar |  31.2965840234265
5176 | Press Association |  31.2965840234265
5177 | Antetokounmpos |  31.2965840234265
5178 | Golden Globes |  31.2965840234265
5179 | DeAndre Hopkins |  31.2965840234265
5180 | Tulsa |  31.2965840234265
5181 | Eagle |  31.2965840234265
5182 | DIA |  31.2965840234265
5183 | Nova |  31.2965840234265
5184 | Frankens |  31.2965840234265
5185 | Lollapalooza

5379 | Kosovo Liberation Army |  31.2965840234265
5380 | Baqouba |  31.2965840234265
5381 | Sudans |  31.2965840234265
5382 | SPLC |  31.2965840234265
5383 | UA |  31.2965840234265
5384 | Space.co |  31.2965840234265
5385 | RNCs |  31.2965840234265
5386 | Lisa |  31.2965840234265
5387 | Deutsche Welle |  31.2965840234265
5388 | National League East |  31.2965840234265
5389 | Abdullah Abdullah |  31.2965840234265
5390 | Kathleen Sebelius |  31.2965840234265
5391 | BlackLivesMatter |  31.2965840234265
5392 | Emmett Till |  31.2965840234265
5393 | Murdochs |  31.2965840234265
5394 | Google Glass |  31.2965840234265
5395 | Iraqi Shiites |  31.2965840234265
5396 | Eugene |  31.2965840234265
5397 | Algerian |  31.2965840234265
5398 | Montauk |  30.404009818169662
5399 | University of California at Berkeley |  30.404009818169662
5400 | WiFi |  30.404009818169662
5401 | Bolsonaro |  30.404009818169662
5402 | Helmand |  30.404009818169662
5403 | Syrian Observatory for Human Rights |  30.4040098

5573 | Paramount |  30.404009818169662
5574 | Boris Johnsons |  30.404009818169662
5575 | Didi Gregorius |  30.404009818169662
5576 | Ben Smith |  30.404009818169662
5577 | Howard Kurtz |  30.404009818169662
5578 | Brad |  30.404009818169662
5579 | American Federation of Teachers |  30.404009818169662
5580 | El |  30.404009818169662
5581 | Beverly Hills |  30.404009818169662
5582 | Hempstead Town |  30.404009818169662
5583 | Didier Drogba |  30.404009818169662
5584 | Hill Morning Report  Sponsored by AdvaMed |  30.404009818169662
5585 | Texas Department of Public Safety |  30.404009818169662
5586 | MondayFriday |  30.404009818169662
5587 | Britney Spears |  30.404009818169662
5588 | nonEnglish |  30.404009818169662
5589 | VIDEO |  30.404009818169662
5590 | IVt |  30.404009818169662
5591 | Mayor Rahm Emanuel |  30.404009818169662
5592 | Huh |  30.404009818169662
5593 | ATLANTA |  30.404009818169662
5594 | OHare |  30.404009818169662
5595 | Mika Brzezinski |  30.404009818169662
5596 | Ra

5762 | Florida State |  30.404009818169662
5763 | British Airways |  30.404009818169662
5764 | Millwall |  30.404009818169662
5765 | Dean Rusk |  30.404009818169662
5766 | High |  30.404009818169662
5767 | Vogue |  30.404009818169662
5768 | Drug Administration |  30.404009818169662
5769 | Washington Redskins |  30.404009818169662
5770 | Page |  30.404009818169662
5771 | Antarctica |  30.404009818169662
5772 | Britain and the United States |  30.404009818169662
5773 | Charles and David Koch |  30.404009818169662
5774 | Phoenix Coyotes |  30.404009818169662
5775 | CALIFORNIA UNITED STATES NORTH AMERICA |  30.404009818169662
5776 | Federal Reserve Board |  30.404009818169662
5777 | Berry |  30.404009818169662
5778 | Occupy |  30.404009818169662
5779 | Howard University |  30.404009818169662
5780 | James Dobson |  30.404009818169662
5781 | Siegel |  30.404009818169662
5782 | Bogan |  30.404009818169662
5783 | Woodrow Wilson |  30.404009818169662
5784 | Wang |  30.404009818169662
5785 | Enh

5954 | RUtah |  30.404009818169662
5955 | Stanley McChrystal |  30.404009818169662
5956 | Great Plains |  30.404009818169662
5957 | Toronto Blue Jays |  30.404009818169662
5958 | Pedro Ribeiro |  30.404009818169662
5959 | Va |  30.404009818169662
5960 | California Highway Patrol |  30.404009818169662
5961 | Paul Volcker |  30.404009818169662
5962 | Harvard Law |  30.404009818169662
5963 | Security |  30.404009818169662
5964 | Indiana University |  30.404009818169662
5965 | Rings |  30.404009818169662
5966 | Protestants |  30.404009818169662
5967 | Abdel Fattah alSisi |  30.404009818169662
5968 | Bristol |  30.404009818169662
5969 | FiveThirtyEights |  30.404009818169662
5970 | Ferguson Missouri |  30.404009818169662
5971 | Tip ONeill |  30.404009818169662
5972 | Jason Chaffetz |  30.404009818169662
5973 | Lai |  30.404009818169662
5974 | Farage |  30.404009818169662
5975 | White House and Congress |  30.404009818169662
5976 | Sunshine State |  30.404009818169662
5977 | Miami Herald |  

6148 | Pam Key |  23.472438017569875
6149 | Urban Institute |  23.472438017569875
6150 | St. Paul |  23.472438017569875
6151 | AMERICA TEXAS |  23.472438017569875
6152 | Los Angeles Angels |  23.472438017569875
6153 | Berthoud |  23.472438017569875
6154 | Paul Wellstone |  23.472438017569875
6155 | House Democratic Caucus |  23.472438017569875
6156 | Force |  23.472438017569875
6157 | Eleven GOP |  23.472438017569875
6158 | Communications |  23.472438017569875
6159 | Jay Bybee |  23.472438017569875
6160 | Vonnegut |  23.472438017569875
6161 | University of Illinois |  23.472438017569875
6162 | Scarlett Johansson |  23.472438017569875
6163 | Emma Stone |  23.472438017569875
6164 | Kamath |  23.472438017569875
6165 | David Barrett |  23.472438017569875
6166 | Terry Branstad |  23.472438017569875
6167 | AWR Hawkins |  23.472438017569875
6168 | Pulitzer Prizewinning |  23.472438017569875
6169 | Tommy Thompson |  23.472438017569875
6170 | Peshmerga |  23.472438017569875
6171 | Ian Hanchett 

6343 | Peter Tatchell |  23.472438017569875
6344 | Overnight Energy Automakers |  23.472438017569875
6345 | Emailer S. |  23.472438017569875
6346 | Johnny |  23.472438017569875
6347 | Allen Robinson |  23.472438017569875
6348 | Rachel Maddow |  23.472438017569875
6349 | Matthew Yglesias |  23.472438017569875
6350 | Vitaly Mutko |  23.472438017569875
6351 | ISS Securities Class Action Services |  23.472438017569875
6352 | Director of National Intelligence James Clapper |  23.472438017569875
6353 | Sugar Land |  23.472438017569875
6354 | CBS Worldwide Inc. All Rights Reserved |  23.472438017569875
6355 | Popovich |  23.472438017569875
6356 | Atlantic Wire |  23.472438017569875
6357 | Norm Coleman |  23.472438017569875
6358 | John Nichols |  23.472438017569875
6359 | Robots |  23.472438017569875
6360 | Oklahoma State |  23.472438017569875
6361 | Power Five |  23.472438017569875
6362 | Audacity of Hope |  23.472438017569875
6363 | Sicilian |  23.472438017569875
6364 | Anfield |  23.4724380

6526 | Tommy Vietor |  23.472438017569875
6527 | Kevin Donohoe |  23.472438017569875
6528 | Rudy Giulianis |  23.472438017569875
6529 | Chicago Blackhawks |  23.472438017569875
6530 | Trump Donald John TrumpIran |  23.472438017569875
6531 | Stefan |  23.472438017569875
6532 | Brett Favre |  23.472438017569875
6533 | Galilee |  23.472438017569875
6534 | Rex Ryan |  23.472438017569875
6535 | Geithners |  23.472438017569875
6536 | Milk |  23.472438017569875
6537 | DeKalb County |  23.472438017569875
6538 | Seuss |  23.472438017569875
6539 | California Democratic |  23.472438017569875
6540 | United States House |  23.472438017569875
6541 | George Pataki |  23.472438017569875
6542 | Muqtada alSadr |  23.472438017569875
6543 | Griffith Park |  23.472438017569875
6544 | Kennedy Space Center |  23.472438017569875
6545 | Shutterstock.co |  23.472438017569875
6546 | BMWs |  23.472438017569875
6547 | Sonia Sotomayor |  23.472438017569875
6548 | Cape |  23.472438017569875
6549 | America and Europe

6719 | Jerry Falwell |  23.472438017569875
6720 | Regnery |  23.472438017569875
6721 | Van Jones |  23.472438017569875
6722 | Elvis Presley |  23.472438017569875
6723 | Emirati |  23.472438017569875
6724 | SATs |  23.472438017569875
6725 | Kiev Ukraine |  23.472438017569875
6726 | Yemen  Dems |  23.472438017569875
6727 | Robert Smith |  23.472438017569875
6728 | Israeli and Palestinian |  23.472438017569875
6729 | Holy Spirit |  23.472438017569875
6730 | Walkers |  23.472438017569875
6731 | Herbert Hoover |  23.472438017569875
6732 | Township |  23.472438017569875
6733 | Write to Charlotte Alter |  23.472438017569875
6734 | Bret Baier |  23.472438017569875
6735 | TEXAS SPORTS NATION |  23.472438017569875
6736 | Sochi Games |  23.472438017569875
6737 | Al Jazeera English |  23.472438017569875
6738 | Gene McCarthy |  23.472438017569875
6739 | Sundar Pichai |  23.472438017569875
6740 | Julian Gill |  23.472438017569875
6741 | Pat Toomey |  23.472438017569875
6742 | PTA |  23.4724380175698

6914 | Sarajevo |  23.472438017569875
6915 | Pride |  23.472438017569875
6916 | Heather |  23.472438017569875
6917 | Westchester |  23.472438017569875
6918 | Twilight |  23.472438017569875
6919 | Mark Jackson |  23.472438017569875
6920 | Plato |  23.472438017569875
6921 | University of California |  23.472438017569875
6922 | Bill Parcells |  23.472438017569875
6923 | LP |  23.472438017569875
6924 | Jared Polis |  23.472438017569875
6925 | Robert Mueller Robert Bob Swan MuellerSchiff Trump |  23.472438017569875
6926 | Quentin Tarantino |  23.472438017569875
6927 | Medical Center |  23.472438017569875
6928 | Los Angelesarea |  23.472438017569875
6929 | East Garfield Park |  23.472438017569875
6930 | RACHEL MARTIN |  23.472438017569875
6931 | Progressive Democrat Congresswomen |  23.472438017569875
6932 | Frances Tiafoe |  23.472438017569875
6933 | Jill Stewart |  23.472438017569875
6934 | SAN FRANCISCOBUnited |  23.472438017569875
6935 | Justice Neil Gorsuch |  23.472438017569875
6936 | 

7110 | National Hockey League |  23.472438017569875
7111 | Department of Agriculture |  23.472438017569875
7112 | Mark Dayton |  23.472438017569875
7113 | Agee |  23.472438017569875
7114 | TOKYO Reuters |  23.472438017569875
7115 | Nuri Kamal alMaliki |  23.472438017569875
7116 | Fairfield |  23.472438017569875
7117 | GMOs |  23.472438017569875
7118 | Kerala |  23.472438017569875
7119 | Alan Blinder |  23.472438017569875
7120 | Battalion |  23.472438017569875
7121 | EditoratLarge at Breitbart News |  23.472438017569875
7122 | Malaysia Airlines Flight |  23.472438017569875
7123 | PMs |  23.472438017569875
7124 | Toronto Raptors |  23.472438017569875
7125 | JPMorgan Chase  Co |  23.472438017569875
7126 | Latest on California |  23.472438017569875
7127 | EW.co |  23.472438017569875
7128 | Abraham Lincolns |  23.472438017569875
7129 | Muammar Gaddafi |  23.472438017569875
7130 | Armstrongs |  23.472438017569875
7131 | Rep. Gabrielle Giffords |  23.472438017569875
7132 | GaryHart |  23.4724

7302 | Robert McNamara |  23.472438017569875
7303 | MBS |  23.472438017569875
7304 | Trump and Clinton |  23.472438017569875
7305 | Hamilton County |  23.472438017569875
7306 | Engels |  23.472438017569875
7307 | itto |  23.472438017569875
7308 | Ted Cruz RTexas |  23.472438017569875
7309 | WMD |  23.472438017569875
7310 | David Simon |  23.472438017569875
7311 | Tim Anderson |  23.472438017569875
7312 | St Petersburg |  23.472438017569875
7313 | VP |  23.472438017569875
7314 | midApril |  23.472438017569875
7315 | Coolidge |  23.472438017569875
7316 | David Dao |  23.472438017569875
7317 | Palo Alto |  23.472438017569875
7318 | Greenwich Village |  23.472438017569875
7319 | Upper West Side |  23.472438017569875
7320 | Marathon |  23.472438017569875
7321 | Byron |  23.472438017569875
7322 | West th Street |  23.472438017569875
7323 | Georgetown University School of Medicine |  23.472438017569875
7324 | Tina Brown |  23.472438017569875
7325 | Wayne Simmonds |  23.472438017569875
7326 | 

In [59]:
processed_sdf.write.parquet(f'features/{VER}')

In [60]:
cv_model.save(f'vocab_model/{VER}')

In [62]:
processed_sdf.printSchema()

root
 |-- authors: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- date_download: string (nullable = true)
 |-- date_modify: string (nullable = true)
 |-- date_publish: string (nullable = true)
 |-- description: string (nullable = true)
 |-- language: string (nullable = true)
 |-- language_guess: string (nullable = true)
 |-- month: long (nullable = true)
 |-- published: string (nullable = true)
 |-- source_domain: string (nullable = true)
 |-- text: string (nullable = true)
 |-- text_or_desc: string (nullable = true)
 |-- title: string (nullable = true)
 |-- url: string (nullable = true)
 |-- year: long (nullable = true)
 |-- id: long (nullable = false)
 |-- clean_text: string (nullable = true)
 |-- finished_sentence: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- finished_token: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- finished_lemma: array (nullable = true)
 |    |-- element: string (conta

In [63]:
processed_sdf.count()

10000