# Training Naive Bayes classifier on news articles' sentiment

In [31]:
# import necessary libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import GaussianNB
import pandas as pd
import pickle

In [32]:
data = pd.read_csv("data/CrudeOil_News_Articles.csv", encoding = "ISO-8859-1")
data.tail()

Unnamed: 0,Headline,News,Sentiment
35,Oil Prices Rally Despite Bearish Backlash.html,Despite the re-emerging Covid 19 crisis in Ind...,Pos
36,European Oil Majors Out Traded US Peers In 202...,Last year might have been a harrowing one for ...,Neg
37,Is Californias Fracking Ban A Big Deal For The...,"âCalifornia needs to move beyond oil,â Gov...",Neutral
38,The 5 Most Influential Oil Companies In The Wo...,In a virtual climate summit with 41 world lead...,Neutral
39,Suriname Hopes To Become South Americas Newest...,As Guyana prepares to become a major oil produ...,Pos


In [33]:
X = data.iloc[:,1] # extract column with news article body
X.tail()

35    Despite the re-emerging Covid 19 crisis in Ind...
36    Last year might have been a harrowing one for ...
37    âCalifornia needs to move beyond oil,â Gov...
38    In a virtual climate summit with 41 world lead...
39    As Guyana prepares to become a major oil produ...
Name: News, dtype: object

In [34]:
CountVectorizer?

[0;31mInit signature:[0m
[0mCountVectorizer[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0minput[0m[0;34m=[0m[0;34m'content'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mencoding[0m[0;34m=[0m[0;34m'utf-8'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdecode_error[0m[0;34m=[0m[0;34m'strict'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mstrip_accents[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlowercase[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpreprocessor[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtokenizer[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mstop_words[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtoken_pattern[0m[0;34m=[0m[0;34m'(?u)\\b\\w\\w+\\b'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mngram_range[0m[0;34m=[0m[0;34m([0m[0;36m1[0m[0;34m,[0m [0;36m1

In [35]:
# tokenize the news text and convert data in matrix format
vectorizer = CountVectorizer(stop_words='english')
X_vec = vectorizer.fit_transform(X)
print(X_vec) # Scipy sparse matrix

  (0, 742)	4
  (0, 2083)	3
  (0, 3044)	1
  (0, 2027)	1
  (0, 2960)	2
  (0, 1007)	3
  (0, 2553)	6
  (0, 3505)	2
  (0, 2318)	2
  (0, 1258)	1
  (0, 2850)	3
  (0, 1759)	2
  (0, 1847)	3
  (0, 2423)	4
  (0, 3413)	2
  (0, 414)	2
  (0, 2742)	1
  (0, 1098)	5
  (0, 3218)	2
  (0, 1132)	1
  (0, 997)	1
  (0, 1975)	1
  (0, 1668)	2
  (0, 912)	2
  (0, 3457)	1
  :	:
  (39, 3988)	1
  (39, 2457)	1
  (39, 727)	1
  (39, 1286)	1
  (39, 1934)	1
  (39, 1009)	1
  (39, 2479)	1
  (39, 628)	1
  (39, 624)	1
  (39, 1947)	1
  (39, 1402)	2
  (39, 1652)	1
  (39, 1364)	1
  (39, 438)	1
  (39, 3376)	1
  (39, 294)	1
  (39, 715)	1
  (39, 925)	1
  (39, 808)	2
  (39, 437)	1
  (39, 1134)	1
  (39, 3654)	1
  (39, 3797)	1
  (39, 2001)	1
  (39, 378)	1


In [36]:
print(vectorizer.vocabulary_) # print out each feature/token



In [37]:
pickle.dump(vectorizer, open("data/vectorizer_crude_oil", 'wb')) # Save vectorizer for reuse

In [38]:
X_vec = X_vec.todense() # convert sparse matrix into dense matrix
X_vec

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 5, 0],
        [3, 0, 0, ..., 0, 0, 0],
        ...,
        [1, 0, 0, ..., 0, 1, 0],
        [0, 0, 0, ..., 1, 0, 0],
        [1, 0, 0, ..., 0, 0, 0]])

In [39]:
# Transform data by applying term frequency inverse document frequency (TF-IDF) 
tfidf = TfidfTransformer() #by default applies "l2" normalization
X_tfidf = tfidf.fit_transform(X_vec)
X_tfidf = X_tfidf.todense()
X_tfidf

matrix([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.14072562,
         0.        ],
        [0.08989077, 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.01394705, 0.        , 0.        , ..., 0.        , 0.021819  ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.02877387, 0.        ,
         0.        ],
        [0.014548  , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ]])

In [40]:
##################Apply Naive Bayes algorithm to train data####################

# Extract the news body and labels for training the classifier
X_train = X_tfidf[:39,:]
Y_train = data.iloc[:39,2]

In [41]:
# Train the NB classifier
clf = GaussianNB().fit(X_train, Y_train) 
pickle.dump(clf, open("data/nb_clf_crude_oil", 'wb')) # Save classifier for reuse