In [35]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [36]:
# Load the combined news/DJIA table and split it chronologically:
# rows dated before 2015-01-01 are used for training, rows dated after
# 2014-12-31 for testing.
# NOTE(review): 'Date' is compared against string literals, so this presumably
# relies on ISO 'YYYY-MM-DD' strings sorting lexicographically -- confirm.
data = pd.read_csv('Data/stocknews/Combined_News_DJIA.csv')
train = data.loc[data['Date'].lt('2015-01-01')]
test = data.loc[data['Date'].gt('2014-12-31')]

In [37]:
# Build one text document per training row by space-joining the cells at
# column positions 2..26 of that row.
# NOTE(review): str() is applied to every cell, so a missing value would end up
# in the document as the literal token 'nan'.
trainheadlines = [
    ' '.join(map(str, train.iloc[i, 2:27]))
    for i in range(len(train))
]

In [38]:
# TF-IDF over two-word phrases only (ngram_range=(2, 2)), with English stop
# words removed. min_df / max_df are given as floats; per the scikit-learn
# documentation these are document-frequency proportions.
advancedvectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(2, 2),
    min_df=0.03,
    max_df=0.2,
    max_features=200000,
)
# Learn the vocabulary from the training documents and vectorize them in one step.
advancedtrain = advancedvectorizer.fit_transform(trainheadlines)

In [39]:
# Show the dimensions of the vectorized training matrix produced by fit_transform.
print(advancedtrain.shape)

(1611, 85)


In [40]:
# Fit a multinomial Naive Bayes classifier on the training matrix;
# fit() returns the estimator itself, so the call can be chained.
advancedmodel = MultinomialNB(alpha=0.0001).fit(advancedtrain, train["Label"])

# Build the test documents the same way as the training ones: space-join the
# stringified cells at column positions 2..26 of each test row.
testheadlines = [
    ' '.join(map(str, test.iloc[i, 2:27]))
    for i in range(len(test))
]

# Reuse the already-fitted vectorizer (transform only, no refit), then score
# the predictions against the held-out labels.
advancedtest = advancedvectorizer.transform(testheadlines)
preds5 = advancedmodel.predict(advancedtest)
acc5 = accuracy_score(test['Label'], preds5)

In [41]:
# Report the test-set accuracy computed from preds5 in the previous cell.
print('NBayes 2 accuracy: ', acc5)

NBayes 2 accuracy:  0.5052910052910053


In [42]:
# Rank the vectorizer's features by the fitted Naive Bayes per-feature weights.

# TfidfVectorizer.get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() where it exists and fall back on older releases.
if hasattr(advancedvectorizer, 'get_feature_names_out'):
    advwords = list(advancedvectorizer.get_feature_names_out())
else:
    advwords = advancedvectorizer.get_feature_names()

# MultinomialNB.coef_ was deprecated in scikit-learn 0.24 and removed in 1.1.
# It mirrored feature_log_prob_[1:] for a two-class model and feature_log_prob_
# otherwise, so select the row that the old `coef_.tolist()[0]` returned.
# These values are log P(feature | class), not linear-model coefficients; the
# 'Coefficient' column name is kept for compatibility with later cells.
advcoeffs = advancedmodel.feature_log_prob_[
    1 if len(advancedmodel.classes_) == 2 else 0
].tolist()

advcoeffdf = pd.DataFrame({'Words' : advwords,
                        'Coefficient' : advcoeffs})
# Highest value first; ties are broken alphabetically by phrase.
advcoeffdf = advcoeffdf.sort_values(['Coefficient', 'Words'], ascending=[False, True])
advcoeffdf.head(10)

Unnamed: 0,Coefficient,Words
72,-3.588603,united states
76,-3.642958,west bank
10,-3.70462,bbc news
37,-3.734047,new zealand
56,-3.760085,saudi arabia
63,-3.773811,south korea
74,-3.925463,war crimes
39,-3.992305,north korean
7,-4.05192,al qaeda
33,-4.0602,middle east


In [43]:
# advcoeffdf is sorted by 'Coefficient' in descending order, so the last ten
# rows are the phrases with the lowest values.
advcoeffdf.tail(10)

Unnamed: 0,Coefficient,Words
78,-4.854493,wikileaks founder
50,-4.855103,pirate party
21,-4.876388,european parliament
2,-4.899953,10 years
79,-4.915401,world bank
42,-4.935968,nuclear weapons
1,-4.937108,10 000
43,-5.038358,officials say
5,-5.071895,30 years
48,-5.181443,phone hacking
