In [1]:
#Vectorizing documents using NLTK

text = "I am not a sentimental person but I believe in the utility of sentiment analysis"

In [2]:
# Tokenization
from nltk.tokenize import word_tokenize
import nltk
#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('all')

In [3]:
tokens = word_tokenize(text)
print(tokens)

['I', 'am', 'not', 'a', 'sentimental', 'person', 'but', 'I', 'believe', 'in', 'the', 'utility', 'of', 'sentiment', 'analysis']


In [4]:
# Lemmatization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
tokens=[lemmatizer.lemmatize(word) for word in tokens]

In [5]:
tokens

['I',
 'am',
 'not',
 'a',
 'sentimental',
 'person',
 'but',
 'I',
 'believe',
 'in',
 'the',
 'utility',
 'of',
 'sentiment',
 'analysis']

In [6]:
# Stemming
from nltk.stem import PorterStemmer
tokens=word_tokenize(text.lower())
ps = PorterStemmer()
tokens=[ps.stem(word) for word in tokens]
print(tokens)

['i', 'am', 'not', 'a', 'sentiment', 'person', 'but', 'i', 'believ', 'in', 'the', 'util', 'of', 'sentiment', 'analysi']


In [7]:
# Stop words
import nltk
stopwords = nltk.corpus.stopwords.words('english')
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [8]:
tokens_new = [j for j in tokens if j not in stopwords]

In [9]:
tokens_new

['sentiment', 'person', 'believ', 'util', 'sentiment', 'analysi']

In [13]:
################################Vader Demo#####################################

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyser = SentimentIntensityAnalyzer()
#analyser.polarity_scores("This is a good course")
analyser.polarity_scores("This is an awesome course") # degree modifier
# analyser.polarity_scores("The instructor is so cool")
# analyser.polarity_scores("The instructor is so cool!!") # exclaimataion changes score
# analyser.polarity_scores("The instructor is so COOL!!") # Capitalization changes score
# analyser.polarity_scores("Machine learning makes me :)") #emoticons
# analyser.polarity_scores("His antics had me ROFL")
# analyser.polarity_scores("The movie SUX") #Slangs

{'neg': 0.0, 'neu': 0.494, 'pos': 0.506, 'compound': 0.6249}

In [17]:
################################Textblob Demo##################################

from textblob import TextBlob

# TextBlob("His").sentiment
# TextBlob("remarkable").sentiment
# TextBlob("work").sentiment
# TextBlob("ethic").sentiment
# TextBlob("impressed").sentiment
# TextBlob("me").sentiment
TextBlob("His remarkable work ethic impressed me").sentiment

Sentiment(polarity=0.875, subjectivity=0.875)

In [19]:
# import necessary libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [20]:
# Create lists to store scraped news urls, headlines and text
url_list = []
date_time = []
news_text = []
headlines = [] 

In [21]:
for i in range(1,3): #parameters of range function correspond to page numbers in the website with news listings
    #get the list of unique urls in the page
    url = 'https://oilprice.com/Energy/Crude-Oil/Page-{}.html'.format(i)
    request = requests.get(url)
    soup = BeautifulSoup(request.text, "html.parser")
    for links in soup.find_all('div', {'class': 'categoryArticle'}):
        for info in links.find_all('a'):
            if info.get('href') not in url_list:
                url_list.append(info.get('href'))

In [22]:
for www in url_list:
    #access each url
    headlines.append(www.split("/")[-1].replace('-',' '))
    request = requests.get(www)
    soup = BeautifulSoup(request.text, "html.parser")
    
    #store date and time of publication of the article
    for dates in soup.find_all('span', {'class': 'article_byline'}):
        date_time.append(dates.text.split('-')[-1])
    
    #store the text of the news
    temp = []
    for news in soup.find_all('p'):
            temp.append(news.text)
    
    #identify the last line of the news article
    for last_sentence in reversed(temp):
        if last_sentence.split(" ")[0]=="By" and last_sentence.split(" ")[-1]=="Oilprice.com":
            break
        elif last_sentence.split(" ")[0]=="By":
            break
    
    #prune non news related text from the scraped data to create the news text
    joined_text = ' '.join(temp[temp.index("More Info")+1:temp.index(last_sentence)])
    news_text.append(joined_text)

In [23]:
# save news text along with the news headline in a dataframe      
news_df = pd.DataFrame({ 'Date' : date_time,
                         'Headline': headlines,
                         'News': news_text,
                       })

In [24]:
# use VADER to perform sentiment analysis on stored news articles
analyser = SentimentIntensityAnalyzer()

def comp_score(text):
   return analyser.polarity_scores(text)["compound"]   
  
news_df["sentiment"] = news_df["News"].apply(comp_score)

In [26]:
news_df

Unnamed: 0,Date,Headline,News,sentiment
0,"Aug 01, 2022, 6:00 PM CDT",How Russian Oil Is Making Its Way From Europe ...,Russia has been ramping up oil exports to Asia...,0.9095
1,"Aug 01, 2022, 10:30 AM CDT",Oil Dips Amid Renewed Demand Concerns.html,Oil prices plunged by 4% at the start of Augus...,-0.9477
2,"Jul 29, 2022, 2:00 PM CDT",US Crude Production Sinks In May.html,U.S. field production of crude oil fell in May...,-0.893
3,"Jul 31, 2022, 6:00 PM CDT",Demand Destruction Could Help America Refill I...,U.S. petroleum inventories are still sitting a...,-0.9682
4,"Jul 31, 2022, 2:00 PM CDT",NOCs Not Big Oil Are Responsible For Most Emis...,,0.0
5,"Jul 30, 2022, 4:00 PM CDT",More Oil Discoveries Boost Guyanas Offshore Bo...,"The former British colony of Guyana, a nation ...",-0.987
6,"Jul 29, 2022, 10:00 AM CDT",US Refiners Havent Seen Fuel Demand Destructio...,U.S. refiners say there is no indication acros...,-0.9306
7,"Jul 28, 2022, 7:00 PM CDT",Big Oil Set For Blowout Performance But What C...,"Earnings season is here with us once again, wi...",0.9971
8,"Jul 28, 2022, 10:30 AM CDT",Shell And TotalEnergies See Risk Of Higher Oil...,As oil companies begin to report their impress...,-0.7773
9,"Jul 28, 2022, 11:00 AM CDT",Earnings Season Is Here And Energy Companies A...,"Much to the chagrin of snowflakes everywhere, ...",0.9842
