# NLP: Use LDA to identify additional stop words

## Imports

In [1]:
import pandas as pd
from nltk.corpus import stopwords
from gensim import corpora, models, similarities, matutils
from sklearn.feature_extraction.text import CountVectorizer



## Functions

In [2]:
def remove_stop_words(raw_text, stop_list):
    words = raw_text.split()
    cleaned = [w for w in words if not w in stop_list]
    return( " ".join(cleaned))

## Load Data

In [3]:
df = pd.read_csv('news_articles_cleaned_trimmed.csv')

In [4]:
df.sample(5)

Unnamed: 0,article_text,datetime,source,title,url,cleaned_article_text
1743,"\nThe crisis in Greece, a slowdown in China an...",2015-03-26 13:58:21,Guardian,Bank of England warns of danger to markets fro...,https://www.theguardian.com/business/2015/mar/...,The crisis Greece slowdown China eurozone main...
9352,"\nIn a chapter of their new book, The Body Eco...",2013-06-09 21:51:02,Guardian,IMF and Greece: Institutional Monstrous Failure,https://www.theguardian.com/commentisfree/2013...,In chapter book The_Body_Economic_academics_Da...
39682,(Adds close of U.S. markets) * Stronger U.S. Q...,2015-09-25 00:00:00,Reuters,"GLOBAL MARKETS-Dollar, shares gain on Yellen s...",http://uk.reuters.com/article/markets-global-i...,World equity dollar advanced Friday_to_end_a_r...
27579,* European shares fall for 2nd straight sessi...,2010-05-03 00:00:00,Reuters,Europe shares fall; Greece aid fails to calm n...,http://uk.reuters.com/article/markets-europe-s...,European shares edged concerns massive bailout...
49345,"TOKYO, March 10 (Reuters) - Japan's Nikkei av...",2011-03-10,Reuters,Nikkei falls but supported by dividend prospects,http://uk.reuters.com/article/markets-japan-st...,Japan Nikkei U_S stocks weaker buying stocks d...


## LDA

Take a sample from the entire dataset.

In [5]:
sample_df = df.sample(10000)

In [6]:
count_vectorizer = CountVectorizer(stop_words='english')
count_vectorizer.fit(sample_df['cleaned_article_text'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [7]:
counts = count_vectorizer.transform(sample_df['cleaned_article_text']).transpose()

In [8]:
counts.shape

(99162, 10000)

In [9]:
corpus = matutils.Sparse2Corpus(counts)

In [10]:
id2word = dict((v, k) for k, v in count_vectorizer.vocabulary_.items())

In [11]:
len(id2word)

99162

In [12]:
lda = models.LdaMulticore(corpus=corpus, num_topics=10, id2word=id2word, passes=1)

In [13]:
lda.print_topics()

[(0,
  '0.005*"growth" + 0.004*"rate" + 0.004*"economy" + 0.004*"bank" + 0.004*"rates" + 0.003*"financial" + 0.003*"global" + 0.003*"banks" + 0.003*"government" + 0.003*"fed"'),
 (1,
  '0.006*"prices" + 0.006*"shares" + 0.005*"economy" + 0.005*"investors" + 0.005*"stocks" + 0.004*"european" + 0.003*"u_s" + 0.003*"yen" + 0.003*"rates" + 0.003*"financial"'),
 (2,
  '0.006*"shares" + 0.005*"stocks" + 0.004*"economy" + 0.004*"investors" + 0.004*"financial" + 0.003*"bank" + 0.003*"economic" + 0.003*"dollar" + 0.003*"stock" + 0.003*"u_s"'),
 (3,
  '0.006*"shares" + 0.005*"stocks" + 0.004*"growth" + 0.004*"investors" + 0.004*"economic" + 0.004*"prices" + 0.004*"european" + 0.003*"expected" + 0.003*"u_s" + 0.003*"economy"'),
 (4,
  '0.004*"government" + 0.004*"growth" + 0.004*"oil" + 0.004*"expected" + 0.004*"economic" + 0.003*"yen" + 0.003*"tax" + 0.003*"business" + 0.003*"shares" + 0.003*"european"'),
 (5,
  '0.006*"shares" + 0.004*"government" + 0.004*"growth" + 0.003*"yen" + 0.003*"investo

## Remove extra stop words

In [14]:
extra_stopwords = ['shares', 'economy', 'economic', 'prices', 'investors']

In [15]:
stop_list = stopwords.words('english')
len(stop_list)

153

In [16]:
stop_list = stop_list + extra_stopwords
len(stop_list)

158

In [17]:
sample_df['article_text_extra_stops'] = sample_df['cleaned_article_text'].apply(remove_stop_words, stop_list=stop_list)

In [18]:
sample_df.sample(5)

Unnamed: 0,article_text,datetime,source,title,url,cleaned_article_text,article_text_extra_stops
68176,"* June ADP employment number rises to 172,000 ...",2016-07-07 00:00:00,Reuters,"US STOCKS-S&P, Dow little changed ahead of cru...",http://uk.reuters.com/article/usa-stocks-idUKL...,The S P Dow little changed late_morning tradin...,The S P Dow little changed late_morning tradin...
60875,"(Updates to midsession) TOKYO, May 25 (Reut...",2007-05-25 00:00:00,Reuters,"Tokyo stocks down broadly on lower U.S., China...",http://uk.reuters.com/article/markets-japan-st...,Japanese stocks broadly investors concerned fa...,Japanese stocks broadly concerned falls U_S Ch...
41671,(Updates to U.S. market open) * U.S. dollar se...,2017-02-03,Reuters,GLOBAL MARKETS-U.S. dollar and bond yields low...,http://uk.reuters.com/article/global-markets-i...,The U_S dollar headed fourth straight weekly l...,The U_S dollar headed fourth straight weekly l...
79440,Wall Street eased overnight but it was the dol...,2016-11-16 23:23:00,FT,"Fast Asia Open: Dollar daze, Aussie jobs",https://www.ft.com/content/62d851e6-14c5-31f9-...,Wall Street eased overnight dollar capturing a...,Wall Street eased overnight dollar capturing a...
60680,"ZURICH, Sept 25 (Reuters) - The Swiss blue-chi...",2015-09-25 00:00:00,Reuters,Swiss stocks - Factors to watch on Sept 25,http://uk.reuters.com/article/markets-swiss-st...,The Swiss blue chip SMI seen opening up_percen...,The Swiss blue chip SMI seen opening up_percen...


In [19]:
len(sample_df.iloc[2028]['cleaned_article_text'])

4826

In [20]:
len(sample_df.iloc[2028]['article_text_extra_stops'])

4734

## Repeat process

### LDA

In [21]:
count_vectorizer = CountVectorizer(stop_words='english')
count_vectorizer.fit(sample_df['article_text_extra_stops'])
counts = count_vectorizer.transform(sample_df['article_text_extra_stops']).transpose()
counts.shape

(99162, 10000)

In [22]:
corpus = matutils.Sparse2Corpus(counts)
id2word = dict((v, k) for k, v in count_vectorizer.vocabulary_.items())
len(id2word)

99162

In [23]:
lda = models.LdaMulticore(corpus=corpus, num_topics=10, id2word=id2word, passes=1)
lda.print_topics()

[(0,
  '0.005*"yen" + 0.004*"oil" + 0.004*"stocks" + 0.004*"growth" + 0.003*"company" + 0.003*"china" + 0.003*"government" + 0.003*"trading" + 0.003*"sector" + 0.003*"stock"'),
 (1,
  '0.004*"government" + 0.004*"financial" + 0.004*"growth" + 0.004*"u_s" + 0.004*"rates" + 0.004*"stocks" + 0.004*"uk" + 0.003*"european" + 0.003*"oil" + 0.003*"tax"'),
 (2,
  '0.006*"growth" + 0.005*"european" + 0.003*"government" + 0.003*"global" + 0.003*"stocks" + 0.003*"policy" + 0.003*"uk" + 0.003*"china" + 0.003*"euro" + 0.002*"tax"'),
 (3,
  '0.006*"stocks" + 0.005*"growth" + 0.004*"government" + 0.004*"european" + 0.003*"yen" + 0.003*"financial" + 0.003*"bank" + 0.003*"rate" + 0.003*"sector" + 0.003*"global"'),
 (4,
  '0.005*"growth" + 0.004*"fed" + 0.003*"stocks" + 0.003*"bank" + 0.003*"financial" + 0.003*"european" + 0.003*"expected" + 0.003*"yen" + 0.003*"rate" + 0.003*"oil"'),
 (5,
  '0.004*"bank" + 0.004*"nikkei" + 0.003*"banks" + 0.003*"stocks" + 0.003*"growth" + 0.003*"companies" + 0.003*"u_s

### Stop words

In [24]:
extra_stopwords = extra_stopwords + ['government', 'growth', 'banks']
len(extra_stopwords)

8

In [25]:
stop_list = stopwords.words('english')
stop_list = stop_list + extra_stopwords
len(stop_list)

161

In [26]:
sample_df['article_text_extra_stops_v2'] = sample_df['cleaned_article_text'].apply(remove_stop_words, stop_list=stop_list)

In [27]:
sample_df.sample(5)

Unnamed: 0,article_text,datetime,source,title,url,cleaned_article_text,article_text_extra_stops,article_text_extra_stops_v2
724,\nThe United States' reputation for sound econ...,2013-12-10 13:54:54,Guardian,America's reputation for sound economic policy...,https://www.theguardian.com/business/economics...,The_United_States reputation sound economic po...,The_United_States reputation sound policymakin...,The_United_States reputation sound policymakin...
38832,* Blue-chip FTSE 100 index steadies in late tr...,2014-11-06 00:00:00,Reuters,FTSE steadies after gaining on Draghi's comments,http://uk.reuters.com/article/markets-stocks-f...,Britain steadied late trading surrendering ear...,Britain steadied late trading surrendering ear...,Britain steadied late trading surrendering ear...
54869,* Nikkei slips on profit-taking after bounce ...,2009-08-07,Reuters,Nikkei slips on nerves as jobs data looms,http://uk.reuters.com/article/markets-japan-st...,Japan Nikkei stock edged investors nervous ahe...,Japan Nikkei stock edged nervous ahead crucial...,Japan Nikkei stock edged nervous ahead crucial...
27070,* FTSEurofirst 300 closes 0.6 pct lower; 3rd ...,2010-09-29 00:00:00,Reuters,Europe shares close at three-wk low; retailers...,http://uk.reuters.com/article/markets-europe-s...,European shares slipped three_week closing Hen...,European slipped three_week closing Hennes_Mau...,European slipped three_week closing Hennes_Mau...
81278,Shares in Monster Beverage led the S&P 500 on ...,2017-03-02 21:25:00,FT,Monster shares lead Wall Street gainers after ...,https://www.ft.com/content/48be67d6-ff69-11e6-...,Shares Monster_Beverage led S P posted biggest...,Shares Monster_Beverage led S P posted biggest...,Shares Monster_Beverage led S P posted biggest...


In [28]:
print(len(sample_df.iloc[2028]['cleaned_article_text']))
print(len(sample_df.iloc[2028]['article_text_extra_stops']))
print(len(sample_df.iloc[2028]['article_text_extra_stops_v2']))

4826
4734
4673


## And again..

### LDA

In [29]:
count_vectorizer = CountVectorizer(stop_words='english')
count_vectorizer.fit(sample_df['article_text_extra_stops_v2'])
counts = count_vectorizer.transform(sample_df['article_text_extra_stops_v2']).transpose()
counts.shape

(99162, 10000)

In [30]:
corpus = matutils.Sparse2Corpus(counts)
id2word = dict((v, k) for k, v in count_vectorizer.vocabulary_.items())
len(id2word)

99162

In [31]:
lda = models.LdaMulticore(corpus=corpus, num_topics=10, id2word=id2word, passes=1)
lda.print_topics()

[(0,
  '0.004*"expected" + 0.004*"rate" + 0.003*"rates" + 0.003*"sector" + 0.003*"u_s" + 0.003*"financial" + 0.003*"uk" + 0.003*"spending" + 0.003*"stocks" + 0.003*"trading"'),
 (1,
  '0.005*"rate" + 0.004*"earnings" + 0.004*"fed" + 0.004*"european" + 0.003*"rates" + 0.003*"yen" + 0.003*"dollar" + 0.003*"inflation" + 0.003*"bank" + 0.003*"profit"'),
 (2,
  '0.004*"rates" + 0.004*"rate" + 0.003*"debt" + 0.003*"european" + 0.003*"financial" + 0.003*"global" + 0.003*"stocks" + 0.003*"oil" + 0.003*"bank" + 0.003*"business"'),
 (3,
  '0.003*"stocks" + 0.003*"yen" + 0.003*"expected" + 0.003*"european" + 0.003*"financial" + 0.002*"bank" + 0.002*"rates" + 0.002*"earnings" + 0.002*"debt" + 0.002*"china"'),
 (4,
  '0.006*"european" + 0.005*"stocks" + 0.005*"u_s" + 0.003*"uk" + 0.003*"bank" + 0.003*"rate" + 0.003*"expected" + 0.003*"company" + 0.003*"sector" + 0.003*"rates"'),
 (5,
  '0.004*"u_s" + 0.004*"stocks" + 0.003*"china" + 0.003*"stock" + 0.003*"expected" + 0.003*"financial" + 0.003*"yen"

### Stop words

In [32]:
extra_stopwords = extra_stopwords + ['expected', 'rate', 'rates', 'stocks', 'financial']
len(extra_stopwords)

13

In [33]:
stop_list = stopwords.words('english')
stop_list = stop_list + extra_stopwords
len(stop_list)

166

In [34]:
sample_df['article_text_extra_stops_v3'] = sample_df['cleaned_article_text'].apply(remove_stop_words, stop_list=stop_list)

In [35]:
sample_df.sample(5)

Unnamed: 0,article_text,datetime,source,title,url,cleaned_article_text,article_text_extra_stops,article_text_extra_stops_v2,article_text_extra_stops_v3
31197,* Pan-European FTSEurofirst 300 index falls 0....,2015-09-10 00:00:00,Reuters,"European shares down, Brazil-exposed stocks un...",http://uk.reuters.com/article/markets-stocks-e...,European shares retreated companies exposed Br...,European retreated companies exposed Brazil ca...,European retreated companies exposed Brazil ca...,European retreated companies exposed Brazil ca...
35836,* FTSEurofirst 300 up 1.1 percent * Germany's ...,2013-09-19 00:00:00,Reuters,European shares surge as Fed leaves stimulus i...,http://uk.reuters.com/article/markets-europe-s...,European shares strongly the_U_S_Federal_Reser...,European strongly the_U_S_Federal_Reserve surp...,European strongly the_U_S_Federal_Reserve surp...,European strongly the_U_S_Federal_Reserve surp...
45977,"(Updates to close) TOKYO, Jan 22 (Reuters) ...",2007-01-22,Reuters,"Nikkei at 9-mth high as Olympus,security firms...",http://uk.reuters.com/article/markets-japan-st...,The Nikkei highest nine_months shares precisio...,The Nikkei highest nine_months precision elect...,The Nikkei highest nine_months precision elect...,The Nikkei highest nine_months precision elect...
35959,"LONDON, April 12 (Reuters) - European shares ...",2010-04-12 00:00:00,Reuters,"European shares turn flat; banks up, miners fall",http://uk.reuters.com/article/markets-europe-s...,European shares turned flat morning stronger b...,European turned flat morning stronger banking ...,European turned flat morning stronger banking ...,European turned flat morning stronger banking ...
20942,"PARIS, Oct 1 (Reuters) - European stocks inch...",2007-10-01 00:00:00,Reuters,"Airlines, miners help Europe stocks recover fr...",http://uk.reuters.com/article/markets-europe-s...,European stocks inched early_on reversing earl...,European stocks inched early_on reversing earl...,European stocks inched early_on reversing earl...,European inched early_on reversing early losse...


In [36]:
print(len(sample_df.iloc[2028]['cleaned_article_text']))
print(len(sample_df.iloc[2028]['article_text_extra_stops']))
print(len(sample_df.iloc[2028]['article_text_extra_stops_v2']))
print(len(sample_df.iloc[2028]['article_text_extra_stops_v3']))

4826
4734
4673
4641


## Again..

### LDA

In [37]:
count_vectorizer = CountVectorizer(stop_words='english')
count_vectorizer.fit(sample_df['article_text_extra_stops_v3'])
counts = count_vectorizer.transform(sample_df['article_text_extra_stops_v3']).transpose()
counts.shape

(99162, 10000)

In [38]:
corpus = matutils.Sparse2Corpus(counts)
id2word = dict((v, k) for k, v in count_vectorizer.vocabulary_.items())
len(id2word)

99162

In [39]:
lda = models.LdaMulticore(corpus=corpus, num_topics=10, id2word=id2word, passes=1)
lda.print_topics()

[(0,
  '0.005*"uk" + 0.005*"european" + 0.003*"u_s" + 0.002*"tax" + 0.002*"britain" + 0.002*"inflation" + 0.002*"country" + 0.002*"bank" + 0.002*"business" + 0.002*"spending"'),
 (1,
  '0.004*"dollar" + 0.003*"oil" + 0.003*"u_s" + 0.003*"company" + 0.003*"stock" + 0.003*"yen" + 0.003*"fed" + 0.002*"britain" + 0.002*"business" + 0.002*"bank"'),
 (2,
  '0.004*"oil" + 0.003*"earnings" + 0.003*"business" + 0.003*"sector" + 0.003*"european" + 0.003*"u_s" + 0.003*"uk" + 0.002*"inflation" + 0.002*"yen" + 0.002*"company"'),
 (3,
  '0.005*"u_s" + 0.004*"china" + 0.004*"nikkei" + 0.004*"yen" + 0.004*"stock" + 0.003*"global" + 0.003*"trading" + 0.003*"bank" + 0.003*"oil" + 0.003*"european"'),
 (4,
  '0.005*"european" + 0.004*"dollar" + 0.003*"bank" + 0.003*"global" + 0.003*"policy" + 0.003*"world" + 0.003*"u_s" + 0.003*"uk" + 0.003*"debt" + 0.003*"china"'),
 (5,
  '0.005*"european" + 0.004*"earnings" + 0.003*"world" + 0.003*"u_s" + 0.003*"biggest" + 0.003*"uk" + 0.003*"stock" + 0.003*"trade" + 0.

### Stop words

In [54]:
extra_stopwords = extra_stopwords + ['uk', 'european', 'u_s', 'tax', 'britain', 'inflation', 'country', 'bank', 
                                    'business', 'spending', 'dollar', 'oil', 'company', 'stock', 'yen', 'fed', 'earnings',
                                    'sector', 'global', 'policy', 'world', 'debt', 'china', 'biggest', 'trade', 'bank',
                                    'work', 'nikkei', 'profit', 'business', 'European', 'UK', 'U_S', 'Bank', 'Nikkei', 'Fed', 
                                     'Britain', 'Dollar', 'Yen', 'Global', 'World', 'China']
len(extra_stopwords)

115

In [55]:
stop_list = stopwords.words('english')
stop_list = stop_list + extra_stopwords
len(stop_list)

268

In [56]:
stop_list = set(stop_list)
len(stop_list)

206

In [57]:
sample_df['article_text_extra_stops_v4'] = sample_df['cleaned_article_text'].apply(remove_stop_words, stop_list=stop_list)

In [58]:
print(len(sample_df.iloc[2000]['cleaned_article_text']))
print(len(sample_df.iloc[2000]['article_text_extra_stops']))
print(len(sample_df.iloc[2000]['article_text_extra_stops_v2']))
print(len(sample_df.iloc[2000]['article_text_extra_stops_v3']))
print(len(sample_df.iloc[2000]['article_text_extra_stops_v4']))

2357
2289
2269
2253
2106


## Again

### LDA

In [59]:
count_vectorizer = CountVectorizer(stop_words='english')
count_vectorizer.fit(sample_df['article_text_extra_stops_v4'])
counts = count_vectorizer.transform(sample_df['article_text_extra_stops_v4']).transpose()
counts.shape

(99160, 10000)

In [60]:
corpus = matutils.Sparse2Corpus(counts)
id2word = dict((v, k) for k, v in count_vectorizer.vocabulary_.items())
len(id2word)

99160

In [61]:
lda = models.LdaMulticore(corpus=corpus, num_topics=10, id2word=id2word, passes=1)
lda.print_topics()

[(0,
  '0.003*"crisis" + 0.002*"euro" + 0.002*"europe" + 0.002*"long" + 0.002*"countries" + 0.002*"money" + 0.002*"japan" + 0.002*"central" + 0.002*"public" + 0.002*"says"'),
 (1,
  '0.003*"companies" + 0.003*"hit" + 0.003*"japan" + 0.002*"crisis" + 0.002*"cut" + 0.002*"central" + 0.002*"level" + 0.002*"billion" + 0.002*"gained" + 0.002*"gains"'),
 (2,
  '0.003*"companies" + 0.002*"jobs" + 0.002*"price" + 0.002*"euro" + 0.002*"eu" + 0.002*"europe" + 0.002*"trading" + 0.002*"hit" + 0.002*"reporting" + 0.002*"showed"'),
 (3,
  '0.002*"eu" + 0.002*"euro" + 0.002*"companies" + 0.002*"chief" + 0.002*"crisis" + 0.002*"hit" + 0.002*"term" + 0.002*"public" + 0.002*"trading" + 0.002*"deal"'),
 (4,
  '0.002*"trading" + 0.002*"europe" + 0.002*"hit" + 0.002*"gains" + 0.002*"companies" + 0.002*"investment" + 0.002*"expectations" + 0.002*"japan" + 0.002*"according" + 0.002*"cut"'),
 (5,
  '0.003*"sales" + 0.003*"trading" + 0.002*"companies" + 0.002*"billion" + 0.002*"japan" + 0.002*"euro" + 0.002*"p

### Stop words

In [62]:
extra_stopwords = extra_stopwords + ['Euro', 'euro', 'EU', 'eu', 'Europe', 'europe', 'long', 'Japan', 'japan', 'says', 
                                    'hit', 'reporting', 'Reporting']
len(extra_stopwords)

128

In [63]:
stop_list = stopwords.words('english')
stop_list = stop_list + extra_stopwords
len(stop_list)

281

In [64]:
sample_df['article_text_extra_stops_v5'] = sample_df['cleaned_article_text'].apply(remove_stop_words, stop_list=stop_list)

In [65]:
print(len(sample_df.iloc[2000]['cleaned_article_text']))
print(len(sample_df.iloc[2000]['article_text_extra_stops']))
print(len(sample_df.iloc[2000]['article_text_extra_stops_v2']))
print(len(sample_df.iloc[2000]['article_text_extra_stops_v3']))
print(len(sample_df.iloc[2000]['article_text_extra_stops_v4']))
print(len(sample_df.iloc[2000]['article_text_extra_stops_v5']))

2357
2289
2269
2253
2106
2056


## Again

### LDA

In [66]:
count_vectorizer = CountVectorizer(stop_words='english')
count_vectorizer.fit(sample_df['article_text_extra_stops_v5'])
counts = count_vectorizer.transform(sample_df['article_text_extra_stops_v5']).transpose()
counts.shape

(99159, 10000)

In [67]:
corpus = matutils.Sparse2Corpus(counts)
id2word = dict((v, k) for k, v in count_vectorizer.vocabulary_.items())
len(id2word)

99159

In [68]:
lda = models.LdaMulticore(corpus=corpus, num_topics=10, id2word=id2word, passes=1)
lda.print_topics()

[(0,
  '0.003*"companies" + 0.003*"trading" + 0.002*"ftse" + 0.002*"gains" + 0.002*"sales" + 0.002*"reported" + 0.002*"chief" + 0.002*"greece" + 0.002*"expectations" + 0.002*"crisis"'),
 (1,
  '0.002*"crisis" + 0.002*"public" + 0.002*"gains" + 0.002*"record" + 0.002*"cut" + 0.002*"gained" + 0.002*"countries" + 0.002*"according" + 0.002*"central" + 0.002*"major"'),
 (2,
  '0.003*"crisis" + 0.002*"investment" + 0.002*"price" + 0.002*"credit" + 0.002*"like" + 0.002*"level" + 0.002*"trading" + 0.002*"going" + 0.002*"companies" + 0.002*"gains"'),
 (3,
  '0.002*"public" + 0.002*"chancellor" + 0.002*"labour" + 0.002*"price" + 0.002*"jobs" + 0.002*"companies" + 0.002*"crisis" + 0.002*"need" + 0.002*"make" + 0.002*"countries"'),
 (4,
  '0.003*"billion" + 0.002*"money" + 0.002*"pay" + 0.002*"bond" + 0.002*"central" + 0.002*"jobs" + 0.002*"chief" + 0.002*"demand" + 0.002*"fund" + 0.002*"sales"'),
 (5,
  '0.003*"gains" + 0.003*"trading" + 0.002*"companies" + 0.002*"jobs" + 0.002*"price" + 0.002*"s

### Stop words

In [55]:
extra_stopwords = extra_stopwords + ['strong', 'financial', 'Strong', 'Financial']

In [69]:
['shares',
 'economy',
 'economic',
 'prices',
 'investors',
 'government',
 'growth',
 'banks',
 'expected',
 'rate',
 'rates',
 'stocks',
 'financial',
 'uk',
 'european',
 'u_s',
 'tax',
 'britain',
 'inflation',
 'country',
 'bank',
 'business',
 'spending',
 'dollar',
 'oil',
 'company',
 'stock',
 'yen',
 'fed',
 'earnings',
 'sector',
 'global',
 'policy',
 'world',
 'debt',
 'china',
 'biggest',
 'trade',
 'bank',
 'work',
 'nikkei',
 'profit',
 'business',
 'uk',
 'european',
 'u_s',
 'tax',
 'britain',
 'inflation',
 'country',
 'bank',
 'business',
 'spending',
 'dollar',
 'oil',
 'company',
 'stock',
 'yen',
 'fed',
 'earnings',
 'sector',
 'global',
 'policy',
 'world',
 'debt',
 'china',
 'biggest',
 'trade',
 'bank',
 'work',
 'nikkei',
 'profit',
 'business',
 'uk',
 'european',
 'u_s',
 'tax',
 'britain',
 'inflation',
 'country',
 'bank',
 'business',
 'spending',
 'dollar',
 'oil',
 'company',
 'stock',
 'yen',
 'fed',
 'earnings',
 'sector',
 'global',
 'policy',
 'world',
 'debt',
 'china',
 'biggest',
 'trade',
 'bank',
 'work',
 'nikkei',
 'profit',
 'business',
 'European',
 'UK',
 'U_S',
 'Bank',
 'Nikkei',
 'Fed',
 'Britain',
 'Dollar',
 'Yen',
 'Global',
 'World',
 'China',
 'Euro',
 'euro',
 'EU',
 'eu',
 'Europe',
 'europe',
 'long',
 'Japan',
 'japan',
 'says',
 'hit',
 'reporting',
 'Reporting']

['shares',
 'economy',
 'economic',
 'prices',
 'investors',
 'government',
 'growth',
 'banks',
 'expected',
 'rate',
 'rates',
 'stocks',
 'financial',
 'uk',
 'european',
 'u_s',
 'tax',
 'britain',
 'inflation',
 'country',
 'bank',
 'business',
 'spending',
 'dollar',
 'oil',
 'company',
 'stock',
 'yen',
 'fed',
 'earnings',
 'sector',
 'global',
 'policy',
 'world',
 'debt',
 'china',
 'biggest',
 'trade',
 'bank',
 'work',
 'nikkei',
 'profit',
 'business',
 'uk',
 'european',
 'u_s',
 'tax',
 'britain',
 'inflation',
 'country',
 'bank',
 'business',
 'spending',
 'dollar',
 'oil',
 'company',
 'stock',
 'yen',
 'fed',
 'earnings',
 'sector',
 'global',
 'policy',
 'world',
 'debt',
 'china',
 'biggest',
 'trade',
 'bank',
 'work',
 'nikkei',
 'profit',
 'business',
 'uk',
 'european',
 'u_s',
 'tax',
 'britain',
 'inflation',
 'country',
 'bank',
 'business',
 'spending',
 'dollar',
 'oil',
 'company',
 'stock',
 'yen',
 'fed',
 'earnings',
 'sector',
 'global',
 'policy',
 '

In [56]:
extra_stopwords = list(set(extra_stopwords))

## Apply extra stop word removal to full data set

In [57]:
stop_list = stopwords.words('english')
stop_list = stop_list + extra_stopwords
len(stop_list)

218

In [58]:
df['article_text_extra_stops'] = df['cleaned_article_text'].apply(remove_stop_words, stop_list=stop_list)

In [59]:
df.sample(5)

Unnamed: 0,article_text,datetime,source,title,url,cleaned_article_text,article_text_extra_stops
57471,(Refiles to correct typographical error in fir...,2009-10-26,Reuters,REFILE-Late selloff extends European stocks' l...,http://uk.reuters.com/article/markets-europe-s...,European stocks fell third session row Monday ...,European third session row sharp drop oil rebo...
1932,"\nThe US president, Barack Obama, has signalle...",2016-11-13 19:39:08,Guardian,Barack Obama calls for 'meaningful debt relief...,https://www.theguardian.com/us-news/2016/nov/1...,The US president Barack_Obama signalled use cr...,The US president Barack_Obama signalled use cr...
4653,"\nHamlet without the prince. That, bluntly, is...",2017-01-17 10:29:43,Guardian,Davos without Donald Trump is like Hamlet with...,https://www.theguardian.com/business/2017/jan/...,Hamlet without prince That bluntly Davos feels...,Hamlet without prince That bluntly Davos feels...
11583,\nMark Carney has backed the prime minister’s ...,2016-10-06 19:18:21,Guardian,Mark Carney backs prime minister's call for ec...,https://www.theguardian.com/politics/2016/oct/...,Mark_Carney backed prime minister call rethink...,Mark_Carney backed prime minister call rethink...
72597,"Hedge funds may not like Donald Trump, but th...",2016-09-26 00:00:00,CNBC,\n This trade will tell you whether Tru...,https://www.cnbc.com/2016/09/26/this-trade-wil...,Hedge funds may like Donald_Trump mean try mak...,Hedge funds may like Donald_Trump mean try mak...


## Save data

In [60]:
df.to_csv('news_articles_extra_stops.csv', index=None, encoding='utf-8')