# News Data Analysis

### Libraries

In [1]:
import pandas as pd
from pathlib import Path
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import plotly.express as px
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
from nltk.stem.snowball import SnowballStemmer
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

### Data preprocessing

Data preprocessing and cleaning is an important step before any text mining task, in this step, we will remove the punctuation, stop words and normalize the reviews as much as possible. After every preprocessing step, it is a good practice to check the most frequent words in the data. Therefore, let’s define a function that would plot a bar graph of n most frequent words in the data.

In [2]:
search_news_raw=pd.read_csv('/work/news-topic-modeling/data/raw/raw_search_news_dataset.csv')

In [3]:
search_news_raw=search_news_raw.iloc[:, [ False, True, True,True,True, False,True,True, False,True]]

In [4]:
latest_news_raw=pd.read_csv('/work/news-topic-modeling/data/raw/raw_latest_news_dataset.csv')

In [5]:
latest_news_raw

Unnamed: 0.1,Unnamed: 0,author,title,description,url,urlToImage,publishedAt,content,source.id,source.name
0,0,"Denny Jacob, Quentin Webb, Margot Patrick, Cha...",Stock Market Today: Dow Rises More Than 300 Po...,"Live coverage of stocks and financial news, in...",https://www.wsj.com/livecoverage/stock-market-...,https://images.wsj.net/im-785590/social,2023-05-26T14:28:00Z,Stocks are moving higher to end the week as a ...,the-wall-street-journal,The Wall Street Journal
1,1,Jonathan M. Gitlin,Massive trove of Tesla files contains thousand...,A Tesla employee gave more than 100GB of data ...,https://arstechnica.com/cars/2023/05/massive-t...,https://cdn.arstechnica.net/wp-content/uploads...,2023-05-26T13:39:38Z,The German publication Handelsblatt is in poss...,ars-technica,Ars Technica
2,2,Hayden Field,OpenAI's Sam Altman reverses threat to cease E...,OpenAI's CEO threatened to cease operations in...,https://www.cnbc.com/2023/05/26/openai-ceo-sam...,https://image.cnbcfm.com/api/v1/image/10719649...,2023-05-26T13:21:23Z,"Sam Altman, president of Y Combinator, pauses ...",,CNBC
3,3,Investor's Business Daily,The Key Fed Inflation Rate Heated Up In April ...,,https://www.investors.com/news/economy/key-fed...,,2023-05-26T13:05:00Z,The Federal Reserve's primary inflation rate s...,,Investor's Business Daily
4,4,"Hayden Field, Jonathan Vanian",Tech layoffs ravage the teams that fight onlin...,"Meta, Amazon, Alphabet and Twitter have all dr...",https://www.cnbc.com/2023/05/26/tech-companies...,https://image.cnbcfm.com/api/v1/image/10716909...,2023-05-26T13:00:01Z,In this article META\n\nAMZN\n\nMSFT\n\nGOOGL ...,,CNBC
5,5,Diego Colman,US Dollar Perks Up While Gold Turns Lower Foll...,Gold prices turned lower and retraced session ...,https://www.dailyfx.com/news/forex-us-dollar-p...,https://a.c-dn.net/b/3aQ6FM/USD_DXY.jpg,2023-05-26T12:50:00Z,PCE REPORT KEY POINTS:\n\nApril U.S. consumer ...,,DailyFX
6,6,CNBC,This author spent 3 years interviewing over 10...,,https://news.google.com/rss/articles/CBMidGh0d...,,2023-05-26T12:44:46Z,Simone Stolzoff spent much of his 20s in searc...,google-news,Google News
7,7,Morgan Smith,"How to be happier at work, according to author...",The people who were happiest in their careers ...,https://www.cnbc.com/2023/05/26/this-author-in...,https://image.cnbcfm.com/api/v1/image/10724624...,2023-05-26T12:44:46Z,Simone Stolzoff spent much of his 20s in searc...,,CNBC
8,8,ZeeBiz WebTeam,"BHEL Q4 Result: PAT slips more than 30%, reven...",BHEL Q4 Result: The earnings released showed t...,https://www.zeebiz.com/companies/news-bhel-q4-...,https://cdn.zeebiz.com/sites/default/files/202...,2023-05-26T12:38:16Z,BHEL Q4 Result: BHEL or Bharat Heavy Electrica...,,Zee Business
9,9,,A debt ceiling deal will help the market break...,"Tobin Marcus, Evercore ISI senior U.S. policy ...",https://www.youtube.com/watch?v=fwc51ZA-MDQ,https://i.ytimg.com/vi/fwc51ZA-MDQ/maxresdefau...,2023-05-26T12:31:40Z,,,YouTube


In [6]:
latest_news_raw=latest_news_raw.iloc[:, [ False, True, True,True,True, False,True,True, False,True]]

In [7]:
latest_news_raw

Unnamed: 0,author,title,description,url,publishedAt,content,source.name
0,"Denny Jacob, Quentin Webb, Margot Patrick, Cha...",Stock Market Today: Dow Rises More Than 300 Po...,"Live coverage of stocks and financial news, in...",https://www.wsj.com/livecoverage/stock-market-...,2023-05-26T14:28:00Z,Stocks are moving higher to end the week as a ...,The Wall Street Journal
1,Jonathan M. Gitlin,Massive trove of Tesla files contains thousand...,A Tesla employee gave more than 100GB of data ...,https://arstechnica.com/cars/2023/05/massive-t...,2023-05-26T13:39:38Z,The German publication Handelsblatt is in poss...,Ars Technica
2,Hayden Field,OpenAI's Sam Altman reverses threat to cease E...,OpenAI's CEO threatened to cease operations in...,https://www.cnbc.com/2023/05/26/openai-ceo-sam...,2023-05-26T13:21:23Z,"Sam Altman, president of Y Combinator, pauses ...",CNBC
3,Investor's Business Daily,The Key Fed Inflation Rate Heated Up In April ...,,https://www.investors.com/news/economy/key-fed...,2023-05-26T13:05:00Z,The Federal Reserve's primary inflation rate s...,Investor's Business Daily
4,"Hayden Field, Jonathan Vanian",Tech layoffs ravage the teams that fight onlin...,"Meta, Amazon, Alphabet and Twitter have all dr...",https://www.cnbc.com/2023/05/26/tech-companies...,2023-05-26T13:00:01Z,In this article META\n\nAMZN\n\nMSFT\n\nGOOGL ...,CNBC
5,Diego Colman,US Dollar Perks Up While Gold Turns Lower Foll...,Gold prices turned lower and retraced session ...,https://www.dailyfx.com/news/forex-us-dollar-p...,2023-05-26T12:50:00Z,PCE REPORT KEY POINTS:\n\nApril U.S. consumer ...,DailyFX
6,CNBC,This author spent 3 years interviewing over 10...,,https://news.google.com/rss/articles/CBMidGh0d...,2023-05-26T12:44:46Z,Simone Stolzoff spent much of his 20s in searc...,Google News
7,Morgan Smith,"How to be happier at work, according to author...",The people who were happiest in their careers ...,https://www.cnbc.com/2023/05/26/this-author-in...,2023-05-26T12:44:46Z,Simone Stolzoff spent much of his 20s in searc...,CNBC
8,ZeeBiz WebTeam,"BHEL Q4 Result: PAT slips more than 30%, reven...",BHEL Q4 Result: The earnings released showed t...,https://www.zeebiz.com/companies/news-bhel-q4-...,2023-05-26T12:38:16Z,BHEL Q4 Result: BHEL or Bharat Heavy Electrica...,Zee Business
9,,A debt ceiling deal will help the market break...,"Tobin Marcus, Evercore ISI senior U.S. policy ...",https://www.youtube.com/watch?v=fwc51ZA-MDQ,2023-05-26T12:31:40Z,,YouTube


In [8]:
latest_news_raw=latest_news_raw.rename(columns={'publishedAt':'Date','source.name':'Source'})

In [9]:
search_news_raw=search_news_raw.rename(columns={'publishedAt':'Date','source.name':'Source'})

In [10]:
latest_news_raw['Date']

0     2023-05-26T14:28:00Z
1     2023-05-26T13:39:38Z
2     2023-05-26T13:21:23Z
3     2023-05-26T13:05:00Z
4     2023-05-26T13:00:01Z
5     2023-05-26T12:50:00Z
6     2023-05-26T12:44:46Z
7     2023-05-26T12:44:46Z
8     2023-05-26T12:38:16Z
9     2023-05-26T12:31:40Z
10    2023-05-26T12:21:22Z
11    2023-05-26T12:20:00Z
12    2023-05-26T12:06:37Z
13    2023-05-26T12:05:19Z
14    2023-05-26T12:01:00Z
15    2023-05-26T12:00:01Z
16    2023-05-26T12:00:00Z
17    2023-05-26T12:00:00Z
18    2023-05-26T11:59:17Z
19    2023-05-26T11:50:00Z
Name: Date, dtype: object

In [11]:
latest_news_raw['Date']=pd.to_datetime(latest_news_raw['Date'], format='%Y-%m-%dT%H:%M:%SZ')

In [12]:
search_news_raw['Date']=pd.to_datetime(search_news_raw['Date'], format='%Y-%m-%dT%H:%M:%SZ')

In [13]:
# remove unwanted characters, numbers and symbols
search_news_raw['content']=search_news_raw['content'].str.replace(" [^a-zA-Z#] |\n|\'s|[:@#$&=^*!?~-]|[()]|[/]|\[|\]|\"|\'", " ")
latest_news_raw['content']=latest_news_raw['content'].str.replace(" [^a-zA-Z#] |\n|\'s|[:@#$&=^*!?~-]|[()]|[/]|\[|\]|\"|\'", " ")

  search_news_raw['content']=search_news_raw['content'].str.replace(" [^a-zA-Z#] |\n|\'s|[:@#$&=^*!?~-]|[()]|[/]|\[|\]|\"|\'", " ")
  latest_news_raw['content']=latest_news_raw['content'].str.replace(" [^a-zA-Z#] |\n|\'s|[:@#$&=^*!?~-]|[()]|[/]|\[|\]|\"|\'", " ")


In [14]:
latest_news_raw

Unnamed: 0,author,title,description,url,Date,content,Source
0,"Denny Jacob, Quentin Webb, Margot Patrick, Cha...",Stock Market Today: Dow Rises More Than 300 Po...,"Live coverage of stocks and financial news, in...",https://www.wsj.com/livecoverage/stock-market-...,2023-05-26 14:28:00,Stocks are moving higher to end the week as a ...,The Wall Street Journal
1,Jonathan M. Gitlin,Massive trove of Tesla files contains thousand...,A Tesla employee gave more than 100GB of data ...,https://arstechnica.com/cars/2023/05/massive-t...,2023-05-26 13:39:38,The German publication Handelsblatt is in poss...,Ars Technica
2,Hayden Field,OpenAI's Sam Altman reverses threat to cease E...,OpenAI's CEO threatened to cease operations in...,https://www.cnbc.com/2023/05/26/openai-ceo-sam...,2023-05-26 13:21:23,"Sam Altman, president of Y Combinator, pauses ...",CNBC
3,Investor's Business Daily,The Key Fed Inflation Rate Heated Up In April ...,,https://www.investors.com/news/economy/key-fed...,2023-05-26 13:05:00,The Federal Reserve primary inflation rate sh...,Investor's Business Daily
4,"Hayden Field, Jonathan Vanian",Tech layoffs ravage the teams that fight onlin...,"Meta, Amazon, Alphabet and Twitter have all dr...",https://www.cnbc.com/2023/05/26/tech-companies...,2023-05-26 13:00:01,In this article META AMZN MSFT GOOGL Follow...,CNBC
5,Diego Colman,US Dollar Perks Up While Gold Turns Lower Foll...,Gold prices turned lower and retraced session ...,https://www.dailyfx.com/news/forex-us-dollar-p...,2023-05-26 12:50:00,PCE REPORT KEY POINTS April U.S. consumer sp...,DailyFX
6,CNBC,This author spent 3 years interviewing over 10...,,https://news.google.com/rss/articles/CBMidGh0d...,2023-05-26 12:44:46,Simone Stolzoff spent much of his 20s in searc...,Google News
7,Morgan Smith,"How to be happier at work, according to author...",The people who were happiest in their careers ...,https://www.cnbc.com/2023/05/26/this-author-in...,2023-05-26 12:44:46,Simone Stolzoff spent much of his 20s in searc...,CNBC
8,ZeeBiz WebTeam,"BHEL Q4 Result: PAT slips more than 30%, reven...",BHEL Q4 Result: The earnings released showed t...,https://www.zeebiz.com/companies/news-bhel-q4-...,2023-05-26 12:38:16,BHEL Q4 Result BHEL or Bharat Heavy Electrica...,Zee Business
9,,A debt ceiling deal will help the market break...,"Tobin Marcus, Evercore ISI senior U.S. policy ...",https://www.youtube.com/watch?v=fwc51ZA-MDQ,2023-05-26 12:31:40,,YouTube


In [15]:
#func to get the lemma and stemmed form of word
def lemmatize_stemming(text):
    return SnowballStemmer(language='english').stem(WordNetLemmatizer().lemmatize(text, pos='v'))

    
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [16]:
print(search_news_raw['content'][19])

Artificial intelligence has been working behind the scenes of our apps and gadgets for many years now—interpreting our voice commands, recommending music playlists, picking out the next word for us on our phone keyboard—but in recent months AI technology has come to the fore like never before.  With AI now generating its own artwork and producing natural sounding text on demand, we ve moved up to a whole new level of sophistication. These powerful tools are being integrated into all of the apps we use every day, and that includes web browsers.  Microsoft is leading the way with its Edge browser, but others are following quickly behind, as we ll explain. Here we re going to cover built in browser features that put some AI magic at your fingertips, but there are also plenty of browser extensions available to help you get more out of AI too.  Microsoft Edge  Microsoft has been aggressively adding AI capabilities in as many of its products as possible, and the Edge browser is being used as

In [17]:
doc_sample = search_news_raw['content'][19]

print('original document: ')
words=[]
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['Artificial', 'intelligence', 'has', 'been', 'working', 'behind', 'the', 'scenes', 'of', 'our', 'apps', 'and', 'gadgets', 'for', 'many', 'years', 'now—interpreting', 'our', 'voice', 'commands,', 'recommending', 'music', 'playlists,', 'picking', 'out', 'the', 'next', 'word', 'for', 'us', 'on', 'our', 'phone', 'keyboard—but', 'in', 'recent', 'months', 'AI', 'technology', 'has', 'come', 'to', 'the', 'fore', 'like', 'never', 'before.', '', 'With', 'AI', 'now', 'generating', 'its', 'own', 'artwork', 'and', 'producing', 'natural', 'sounding', 'text', 'on', 'demand,', 'we', 've', 'moved', 'up', 'to', 'a', 'whole', 'new', 'level', 'of', 'sophistication.', 'These', 'powerful', 'tools', 'are', 'being', 'integrated', 'into', 'all', 'of', 'the', 'apps', 'we', 'use', 'every', 'day,', 'and', 'that', 'includes', 'web', 'browsers.', '', 'Microsoft', 'is', 'leading', 'the', 'way', 'with', 'its', 'Edge', 'browser,', 'but', 'others', 'are', 'following', 'quickly', 'behind,', 'as', 'w

In [18]:
processed_docs = search_news_raw['content'].fillna('').astype(str).map(preprocess)
processed_docs.head(10)

0    [chatgpt, longer, need, reli, model, plugin, p...
1    [appl, restrict, employe, tool, like, openai, ...
2    [height, wed, season, approach, peopl, turn, a...
3    [search, chatgpt, appl, store, chatbot, launch...
4    [capabl, tool, progress, rapid, googl, microso...
5    [chatgpt, scam, crypto, scam, meta, warn, meta...
6    [chatgpt, avail, itali, temporari, openai, say...
7    [month, googl, appear, remot, worri, search, b...
8    [journalist, thing, appreci, gari, marcus, mak...
9    [amazon, look, chatbot, technolog, alexa, repo...
Name: content, dtype: object

In [19]:
# Join multiple lists
l=''

for i in latest_news_raw['content']:
    #print(i)
    l=l+str(i)

print(l)


 In economic data, the Federal Reserves preferred inflation gauge,…  +957 chars The German publication Handelsblatt is in possession of more than 23,000 internal files and documents from Tesla after an employee leaked the data. The files include personal information on more than 100,000 current and former employees, as well as thousands of reports of problems with Tesla  advanced driving assistance systems, Autopilot, and  Full Self Driving.   The earliest complaints in the data trove date back to 2015, and the most recent to March 2022. Most of the complaints arise from the US, although European and Asian customer problems are also reflected in the data.  More than 2,400 complaints allege sudden unintended acceleration problems. Although Autopilot and FSD have been the focus of headlines for the last few years, during the mid 2010s there were plenty of reports of Teslas taking off on their own accord—at least 232 cases have been reported in the US, although  as often turns out in case

In [20]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [21]:
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(token)
    return result

In [22]:
pl=preprocess(l)

### Latest news wordcloud

In [23]:
import plotly.graph_objects as go
from dash import dcc
from wordcloud import STOPWORDS
from wordcloud import WordCloud
    # add search query to list of exclusions
excluded_words = list(STOPWORDS)
wordcloud_image = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(str(l)) 

fig = go.Figure()
fig.add_trace(go.Image(z=wordcloud_image))
fig.update_layout(
    height=600,
xaxis={"visible": False},
yaxis={"visible": False},
margin={"t": 0, "b": 0, "l": 0, "r": 0},
hovermode=False,
paper_bgcolor="#F9F9FA",
plot_bgcolor="#F9F9FA",
    )

    

In [24]:
import plotly.graph_objects as go
from dash import dcc
from wordcloud import STOPWORDS
from wordcloud import WordCloud
    # add search query to list of exclusions
excluded_words = list(STOPWORDS)
wordcloud_image = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(str(pl)) 

fig = go.Figure()
fig.add_trace(go.Image(z=wordcloud_image))
fig.update_layout(
    height=600,
xaxis={"visible": False},
yaxis={"visible": False},
margin={"t": 0, "b": 0, "l": 0, "r": 0},
hovermode=False,
paper_bgcolor="#F9F9FA",
plot_bgcolor="#F9F9FA",
    )

    

### Search news wordcloud

In [25]:
search_news_raw['content'].head()

0    ChatGPT no longer needs to rely on its own mod...
1    Apple has restricted employees from using AI t...
2    As the height of wedding season approaches, so...
3    If you’ve searched for “ChatGPT” in Apple’s Ap...
4    The capabilities of AI tools are progressing r...
Name: content, dtype: object

In [26]:
# Join multiple lists
s=''

for i in search_news_raw['content']:
    #print(i)
    s=s+str(i)

print(l)


 In economic data, the Federal Reserves preferred inflation gauge,…  +957 chars The German publication Handelsblatt is in possession of more than 23,000 internal files and documents from Tesla after an employee leaked the data. The files include personal information on more than 100,000 current and former employees, as well as thousands of reports of problems with Tesla  advanced driving assistance systems, Autopilot, and  Full Self Driving.   The earliest complaints in the data trove date back to 2015, and the most recent to March 2022. Most of the complaints arise from the US, although European and Asian customer problems are also reflected in the data.  More than 2,400 complaints allege sudden unintended acceleration problems. Although Autopilot and FSD have been the focus of headlines for the last few years, during the mid 2010s there were plenty of reports of Teslas taking off on their own accord—at least 232 cases have been reported in the US, although  as often turns out in case

In [27]:
import plotly.graph_objects as go
from dash import dcc
from wordcloud import STOPWORDS
from wordcloud import WordCloud
    # add search query to list of exclusions
excluded_words = list(STOPWORDS)
wordcloud_image = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(str(s)) 

fig = go.Figure()
fig.add_trace(go.Image(z=wordcloud_image))
fig.update_layout(
    height=600,
xaxis={"visible": False},
yaxis={"visible": False},
margin={"t": 0, "b": 0, "l": 0, "r": 0},
hovermode=False,
paper_bgcolor="#F9F9FA",
plot_bgcolor="#F9F9FA",
    )
if not os.path.exists("/work/news-topic-modeling/reports/figures/word-cloud.png"):
        fig.write_image("/work/news-topic-modeling/reports/figures/word-cloud.png")
    

NameError: name 'os' is not defined

In [28]:
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(token)
    return result

In [29]:
ps=preprocess(s)

In [30]:
# function to plot most frequent terms
def freq_words(x, terms = 30):
    #all_words = ' '.join([text for text in x])
    #all_words = x.split(' ')
    all_words = x

    fdist = nltk.FreqDist(all_words)
    words_df = pd.DataFrame({'word':list(fdist.keys()), 'count':list(fdist.values())})

    # selecting top 20 most frequent words
    d = words_df.nlargest(columns="count", n = terms) 
    d=d.sort_values(by='count',ascending=True)
    fig = px.bar(d,x = "count",y= "word",
             hover_data=['word', 'count'], color='count',
             labels={'count':'frequecy of the word'}, height=400)
    fig.show()
freq_words(ps)

In [31]:
# function to plot most frequent terms
def freq_words(x, terms = 30):
    #all_words = ' '.join([text for text in x])
    #all_words = x.split(' ')
    all_words = x

    fdist = nltk.FreqDist(all_words)
    words_df = pd.DataFrame({'word':list(fdist.keys()), 'count':list(fdist.values())})

    # selecting top 20 most frequent words
    d = words_df.nlargest(columns="count", n = terms) 
    d=d.sort_values(by='count',ascending=True)
    fig = px.bar(d,x = "count",y= "word",
             hover_data=['word', 'count'], color='count',
             labels={'count':'frequecy of the word'},title='Top words', height=400)
    fig.show()
freq_words(pl)

In [32]:
import nltk
nltk.download('stopwords')

In [60]:
import nltk
from nltk.util import ngrams
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

word_fd = nltk.FreqDist(pl)
bigram_fd = nltk.FreqDist(nltk.bigrams(ps))

l1=[]
l2=[]
bigram_fd.most_common()
for i in range(20):
    l1.append(bigram_fd.most_common()[i][0])
    l2.append(bigram_fd.most_common()[i][1])

df=pd.DataFrame({'two words':l1,'count':l2})

In [61]:
df['two words'][0]
def convertTuple(tup):
    str1 = ' '.join(tup)
    return str1

In [62]:
df['two words']=df['two words'].apply(convertTuple)

In [63]:
df

In [64]:

trigram_fd = nltk.FreqDist(nltk.trigrams(ps))

l1=[]
l2=[]
trigram_fd.most_common()
for i in range(20):
    l1.append(trigram_fd.most_common()[i][0])
    l2.append(trigram_fd.most_common()[i][1])

df1=pd.DataFrame({'three words':l1,'count':l2})

In [65]:
df1['three words']=df1['three words'].apply(convertTuple)

In [66]:
df1

In [72]:
# function to plot most frequent terms
def freq_words(df):
    df=df.sort_values(by='count',ascending=True)
    fig = px.bar(df,x = "count",y= "two words"
             , color='count',
             labels={'count':'frequecy'},title='Twp word phrases ', height=400)
    fig.show()
    if not os.path.exists("/work/news-topic-modeling/reports/figures/two-words-chart.png"):
        fig.write_image("/work/news-topic-modeling/reports/figures/two-words-chart.png")
freq_words(df)

In [73]:
def freq_words(df):
    df=df.sort_values(by='count',ascending=True)
    fig = px.bar(df,x = "count",y= "three words",
             hover_data=['three words', 'count'], color='count',
             labels={'count':'frequecy'},title='Three word phrases', height=400)
    fig.show()
    if not os.path.exists("/work/news-topic-modeling/reports/figures/three-words-chart.png"):
        fig.write_image("/work/news-topic-modeling/reports/figures/three-words-chart.png")
freq_words(df1)

In [42]:
search_news_raw

In [50]:
df=pd.DataFrame({'News source':search_news_raw['Source'].value_counts().index,'Count':search_news_raw['Source'].value_counts().values})

In [51]:
fig = px.pie(df, values='Count', names='News source', title='News source distribution')
fig.show()

In [48]:
df=pd.DataFrame({'News source':latest_news_raw['Source'].value_counts().index,'Count':latest_news_raw['Source'].value_counts().values})

In [49]:
fig = px.pie(df, values='Count', names='News source', title='News source distribution')
fig.show()

In [52]:
import os

if not os.path.exists("/work/news-topic-modeling/reports/figures/pie-chart.png"):
    fig.write_image("/work/news-topic-modeling/reports/figures/pie-chart.png")

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=1e25ccbc-e36d-4741-b648-71cf16013ae6' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>