In [2]:
#import required libraries
from flask import Flask, render_template, json, jsonify, Response, request, redirect
import requests
from bs4 import BeautifulSoup  
from textblob import TextBlob
import pandas as pd
from dotenv import load_dotenv
from pathlib import Path
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.probability import FreqDist
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ozank\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ozank\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ozank\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
URL = 'https://rss.nytimes.com/services/xml/rss/nyt/World.xml'

In [4]:
def wordCount(tdesc):
    text = re.sub('<[^<]+?>', '', tdesc)
    text = text.lower()
    text = text.replace('"','')
    text = text.replace('“','')
    text = text.replace('”','')
    text = text.replace("’",'')

    strip = str.maketrans('','', string.punctuation)
    text = text.translate(strip)

    tokenized_word=word_tokenize(text)
    tokenized_word = [word.lower() for word in tokenized_word]

    stop_words = set(stopwords.words('english'))
    filtered_word = []

    for word in tokenized_word:
        if word not in stop_words:
            filtered_word.append(word)

    lem = WordNetLemmatizer()
    lem_words = []
    for w in filtered_word:
        lem_words.append(lem.lemmatize(w,'v'))

    fdist = FreqDist(lem_words)
    most_common = fdist.most_common(100)
    wcDict = []
    for item in most_common:
        res = {'word' : item[0], 'count': item[1]}
        wcDict.append(res)
    return wcDict

In [5]:
newsURL = URL
    
newsGet = requests.get(newsURL)
newsSoup = BeautifulSoup(newsGet.content, features='xml')
news = newsSoup.findAll('item')
totalDescription = ''

newsArticles = []
for article in news:
    newsArticle = {}
    newsArticle['title'] = article.title.text
    newsArticle['description'] = article.description.text
    newsArticle['link'] = article.link.text
    newsArticle['date'] = article.pubDate.text
        

    useBlob = article.title.text
    blob = TextBlob(useBlob) 
    newsArticle['polarity'] = blob.sentiment.polarity
        
    if newsArticle['polarity'] > 0:
        newsArticle['sentiment'] = 'Positive'
    elif newsArticle['polarity'] < 0:
        newsArticle['sentiment'] = 'Negative'
    else:
        newsArticle['sentiment'] = 'Neutral'

    newsArticles.append(newsArticle)
    totalDescription += newsArticle['description']

df = pd.DataFrame(newsArticles,columns=['title','description','link','date','polarity','sentiment'])
rDict = wordCount(totalDescription)

In [6]:
df.head()

Unnamed: 0,title,description,link,date,polarity,sentiment
0,British Columbia's Flooding Is Worse Because o...,After a summer of deadly heat and uncontrolled...,https://www.nytimes.com/2021/11/21/canada-floo...,"Sun, 21 Nov 2021 15:46:22 +0000",-0.2,Negative
1,"In Hard Times, Afghan Farmers Are Turning to O...",The war’s intense conclusion and a drought com...,https://www.nytimes.com/2021/11/21/world/asia/...,"Sun, 21 Nov 2021 19:59:33 +0000",-0.291667,Negative
2,Chileans Will Vote For President on Sunday,The top contenders to lead Chile out of a turb...,https://www.nytimes.com/2021/11/21/world/ameri...,"Sun, 21 Nov 2021 16:03:40 +0000",0.0,Neutral
3,"Ousted in Coup, Sudan’s Prime Minister Returns...","Four weeks after he was detained, Prime Minist...",https://www.nytimes.com/2021/11/21/world/afric...,"Sun, 21 Nov 2021 20:11:49 +0000",-0.1,Negative
4,Israeli Is Killed by Palestinian Near Holiest ...,"The shooting, the first attack by a Palestinia...",https://www.nytimes.com/2021/11/21/world/middl...,"Sun, 21 Nov 2021 19:34:48 +0000",-0.05,Negative


In [26]:
rDict

[{'word': 'say', 'count': 11},
 {'word': 'president', 'count': 6},
 {'word': 'officials', 'count': 5},
 {'word': 'country', 'count': 5},
 {'word': 'would', 'count': 4},
 {'word': 'new', 'count': 4},
 {'word': 'government', 'count': 4},
 {'word': 'state', 'count': 4},
 {'word': 'people', 'count': 4},
 {'word': 'two', 'count': 4},
 {'word': 'help', 'count': 4},
 {'word': 'meet', 'count': 3},
 {'word': 'side', 'count': 3},
 {'word': 'monday', 'count': 3},
 {'word': 'sign', 'count': 3},
 {'word': 'conflict', 'count': 3},
 {'word': 'border', 'count': 3},
 {'word': 'trap', 'count': 3},
 {'word': 'year', 'count': 3},
 {'word': 'one', 'count': 3},
 {'word': 'days', 'count': 3},
 {'word': 'least', 'count': 3},
 {'word': 'since', 'count': 3},
 {'word': 'countrys', 'count': 3},
 {'word': 'duterte', 'count': 3},
 {'word': 'kill', 'count': 3},
 {'word': 'military', 'count': 3},
 {'word': 'experts', 'count': 3},
 {'word': 'men', 'count': 3},
 {'word': 'surge', 'count': 3},
 {'word': 'last', 'count':

In [9]:
df.to_csv("sentimentdata.csv", encoding='utf-8', index=False)

Azure Blob Storage v12.9.0 - Python quickstart sample
None
