# Using trained Naive Bayes classifier to predict sentiment of a news article

In [27]:
import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics
import requests
from bs4 import BeautifulSoup

In [2]:
#############Importing trained classifier and fitted vectorizer################
nb_clf = pickle.load(open("data/nb_clf_crude_oil", 'rb'))
vectorizer = pickle.load(open("data/vectorizer_crude_oil", 'rb'))

In [4]:
##############Predict sentiment using the trained classifier###################

# Create lists to store scraped news urls, headlines and text
url_list = []
news_text = []
headlines = []

In [17]:
for i in range(4,6): #parameters of range function correspond to page numbers in the website with news listings
    url = 'https://oilprice.com/Energy/Crude-Oil/Page-{}.html'.format(i)
    request = requests.get(url)
    soup = BeautifulSoup(request.text, "html.parser")
    for links in soup.find_all('div', {'class': 'categoryArticle'}):
        for info in links.find_all('a'):
            if info.get('href') not in url_list:
                url_list.append(info.get('href'))

In [18]:
for www in url_list:
    temp = []
    headlines.append(www.split("/")[-1].replace('-',' '))
    request = requests.get(www)
    soup = BeautifulSoup(request.text, "html.parser")
    for news in soup.find_all('p'):
            temp.append(news.text)
    
    #identify the last line of the news article
    for last_sentence in reversed(temp):
        if last_sentence.split(" ")[0]=="By" and last_sentence.split(" ")[-1]=="Oilprice.com":
            break
        elif last_sentence.split(" ")[0]=="By":
            break
    
    #prune non news related text from the scraped data to create the news text
    joined_text = ' '.join(temp[temp.index("More Info")+1:temp.index(last_sentence)])
    news_text.append(joined_text)

In [19]:
# save news text along with the news headline in a dataframe      
news_df = pd.DataFrame({ 'Headline': headlines,
                         'News': news_text,
                       })

news_df.head()

Unnamed: 0,Headline,News
0,Russia OPEC To Gradually Ease Production Cuts ...,The OPEC+ alliance will gradually ease the col...
1,Oil Demand Could Peak By 2026 Goldman Sachs.html,Despite a bullish stance on the short-term fut...
2,3 Oil Stocks To Watch This Spring.html,Oil bulls who have been praying for oil prices...
3,Shell To Exhaust Dwindling Oil Gas Reserves By...,Shell expects to have produced 75 percent of i...
4,The Permian Faces An Empty Pipeline Crisis.html,"Just a few short years ago, the United States ..."


In [20]:
# export the news data into a csv file
news_df.to_csv("data/CrudeOil_News_Articles_test.csv",index=False)

In [21]:
# Import test data set
data_pred = pd.read_csv("data/CrudeOil_News_Articles_test.csv", encoding = "ISO-8859-1")

In [23]:
X_test     = data_pred.iloc[:,1] # extract column with news article
X_vec_test = vectorizer.transform(X_test) #don't use fit_transform here because the model is already fitted
X_vec_test = X_vec_test.todense() #convert sparse matrix to dense
X_vec_test

matrix([[4, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [4, 0, 0, ..., 0, 0, 0],
        [2, 0, 0, ..., 0, 0, 0]])

In [24]:
# Transform data by applying term frequency inverse document frequency (TF-IDF) 
tfidf = TfidfTransformer() #by default applies "l2" normalization
X_tfidf_test = tfidf.fit_transform(X_vec_test)
X_tfidf_test = X_tfidf_test.todense()
X_tfidf_test

matrix([[0.13815448, 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.12428614, 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.05689982, 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ]])

In [25]:
# Predict the sentiment values
y_pred = nb_clf.predict(X_tfidf_test)
y_pred

array(['Pos', 'Neutral', 'Neutral', 'Neutral', 'Neutral', 'Pos', 'Neg',
       'Neutral', 'Neutral', 'Neutral', 'Neg', 'Neutral', 'Neutral',
       'Neutral', 'Pos', 'Neutral', 'Neutral', 'Pos', 'Neutral',
       'Neutral', 'Pos', 'Neutral', 'Neutral', 'Neutral', 'Neutral',
       'Pos', 'Neg', 'Neutral', 'Neutral', 'Neutral', 'Neg', 'Neutral',
       'Neutral', 'Neutral', 'Pos', 'Neutral', 'Neutral', 'Pos',
       'Neutral', 'Neutral', 'Pos', 'Neutral', 'Neutral', 'Pos', 'Pos',
       'Pos', 'Neutral', 'Neutral', 'Neutral', 'Pos', 'Neutral',
       'Neutral', 'Neutral', 'Neutral', 'Neutral', 'Neutral', 'Pos',
       'Neutral', 'Pos', 'Neutral'], dtype='<U7')

In [55]:
pd.DataFrame(y_pred[:40], data_pred["Headline"].iloc[:40])

Unnamed: 0_level_0,0
Headline,Unnamed: 1_level_1
Russia OPEC To Gradually Ease Production Cuts In May July.html,Pos
Oil Demand Could Peak By 2026 Goldman Sachs.html,Neutral
3 Oil Stocks To Watch This Spring.html,Neutral
Shell To Exhaust Dwindling Oil Gas Reserves By 2040.html,Neutral
The Permian Faces An Empty Pipeline Crisis.html,Neutral
OPECs Bullish Demand Data Sparks Hope Of New Oil Rally.html,Pos
Chinas Oil Imports Surge Ahead Of Refinery Maintenance Season.html,Neg
Saudi Arabias Breakeven Oil Price To Drop To 65 Next Year.html,Neutral
The 7 Trillion Reason Saudi Arabia Is Cutting Oil Production.html,Neutral
Enis Latest Oil Discovery Could Be Big For Angola.html,Neutral
