In [3]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from os import getcwd

In [4]:
getcwd()

'C:\\Users\\manas'

In [5]:
data = pd.read_csv('C:/Users/manas/OneDrive/Documents/practice/Combined_News_DJIA.csv')
train = data[data['Date']<'2015-01-01']
test = data[data['Date']>'2014-12-31']

In [6]:
trainhd = []
for row in range(0,len(train.index)):
    trainhd.append(''.join(str(x) for x in train.iloc[row,2:27]))
testhd = []
for row in range(0,len(test.index)):
    testhd.append(''.join(str(x) for x in test.iloc[row,2:27]))

In [7]:
vectorizern2 = CountVectorizer(ngram_range = (2,2))
trainn2 = vectorizern2.fit_transform(trainhd)
print(trainn2.shape)

(1611, 371577)


In [8]:
logimodel = LogisticRegression()
logimodel = logimodel.fit(trainn2, train['Label'])
testn2 = vectorizern2.transform(testhd)
print(testn2.shape)
testpred = logimodel.predict(testn2)
print(pd.crosstab(test["Label"],testpred, rownames = ["Actual"], colnames = ["Predicted"]))
print(logimodel.score(testn2, test["Label"]))

(378, 371577)
Predicted   0    1
Actual            
0          72  114
1          47  145
0.574074074074


In [9]:
wordsn2 = vectorizern2.get_feature_names()
coeffsn2 = logimodel.coef_.tolist()[0]
coeffdf = pd.DataFrame({'Words' : wordsn2, 
                        'Coefficient' : coeffsn2})
coeffdf = coeffdf.sort_values(['Coefficient', 'Words'], ascending=[0, 1])
print(coeffdf.tail(10))

        Coefficient            Words
119549    -0.209019          fire on
49189     -0.209340        bin laden
156179    -0.211252            if he
222388    -0.214659  nuclear weapons
667       -0.219754           10 000
32005     -0.221851       around the
345236    -0.223192            up in
325492    -0.224133         there is
331498    -0.224201          to kill
319269    -0.318302      the country


In [10]:
tfidf = TfidfVectorizer(min_df=0.03, max_df=0.97, max_features = 200000, ngram_range = (2, 2))
tfidftrain = tfidf.fit_transform(trainhd)
print(tfidftrain.shape)

(1611, 615)


In [12]:
tflogimodel = logimodel.fit(tfidftrain, train['Label'])
tfidftest = tfidf.transform(testhd)
testpredtfidf = logimodel.predict(tfidftest)
print(pd.crosstab(test["Label"],testpredtfidf, rownames = ["Actual"], colnames = ["Predicted"]))
print(tflogimodel.score(tfidftest, test["Label"]))

Predicted   0    1
Actual            
0          77  109
1          53  139
0.571428571429


In [13]:
wordstf = tfidf.get_feature_names()
coeffstf = tflogimodel.coef_.tolist()[0]
coeffdf1 = pd.DataFrame({'Words' : wordstf, 
                        'Coefficient' : coeffstf})
coeffdf1 = coeffdf1.sort_values(['Coefficient', 'Words'], ascending=[0, 1])
print(coeffdf1.tail(10))

     Coefficient            Words
63     -0.949706    been arrested
594    -0.955817          why the
525    -0.998790          to kill
83     -1.034238      children in
123    -1.118211          fire on
558    -1.122707            up in
36     -1.164176       around the
1      -1.180613           10 000
292    -1.215791  nuclear weapons
420    -1.330145      the country


In [47]:
nbmodel = MultinomialNB(alpha = 0.01)
nbmodeln2 = nbmodel.fit(trainn2, train["Label"])
nbpredn2 = nbmodeln2.predict(testn2)
print(pd.crosstab(test["Label"], nbpredn2, rownames = ["Actual"], colnames = ["Predicted"]))
print(nbmodeln2.score(testn2, test["Label"]))

Predicted   0    1
Actual            
0          43  143
1          53  139
0.481481481481


In [48]:
nbmodeltf = nbmodel.fit(tfidftrain, train["Label"])
nbpredtf = nbmodeltf.predict(tfidftest)
print(pd.crosstab(test["Label"], nbpredtf, rownames = ["Actual"], colnames = ["Predicted"]))
print(nbmodel.score(tfidftest, test["Label"]))

Predicted   0    1
Actual            
0          44  142
1          23  169
0.563492063492


In [28]:
rfmod = RandomForestClassifier(n_estimators=100, min_samples_leaf = 10, max_features = 0.3,
                                oob_score = True)
rfmodn2 = rfmod.fit(trainn2, train["Label"])
rfn2pred = rfmodn2.predict(testn2)
print(pd.crosstab(test["Label"],rfn2pred, rownames = ["Actual"], colnames = ["Predicted"]))
print(rfmodn2.score(testn2, test["Label"]))

Predicted   0    1
Actual            
0          41  145
1          49  143
0.486772486772


In [29]:
rfmodtf = rfmod.fit(tfidftrain, train["Label"])
rftfpred = rfmodtf.predict(tfidftest)
print(pd.crosstab(test["Label"],rftfpred, rownames = ["Actual"], colnames = ["Predicted"]))
print(rfmodtf.score(tfidftest, test["Label"]))

Predicted   0    1
Actual            
0          58  128
1          52  140
0.52380952381


In [54]:
tfvect = TfidfVectorizer(ngram_range = (2,2), max_features = 200000)
rf = RandomForestClassifier(n_jobs = -1, random_state = 42)
pipe = Pipeline(steps = [('tfvect',tfvect),('rf', rf)])
parameters = {'tfvect__max_df' : [0.7,0.2, 0.97], 'tfvect__min_df' : [0.03,0.04], 
              'rf__n_estimators' : [10,50,100], 'rf__min_samples_leaf' : [1,2,5,10],
              'rf__max_features' : ['auto', 0.3,0.4]}
g_search = GridSearchCV(pipe, parameters, scoring = 'accuracy')
g_fit = g_search.fit(trainhd, train["Label"])
best_est = g_fit.best_estimator_
print(best_est)
best_pred = g_fit.predict(testhd)
print(pd.crosstab(test["Label"],best_pred, rownames = ["Actual"], colnames = ["Predicted"]))

Pipeline(memory=None,
     steps=[('tfvect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.7, max_features=200000, min_df=0.03,
        ngram_range=(2, 2), norm='l2', preprocessor=None, smooth_idf=T...timators=50, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False))])
Predicted   0    1
Actual            
0          62  124
1          50  142


In [55]:
print(best_est.score(testhd, test["Label"]))

0.539682539683


In [53]:
nb = MultinomialNB()
pipe1 = Pipeline(steps = [('tfvect',tfvect),('nb', nb)])
parameters1 = {'tfvect__max_df' : [0.7, 0.97, 0.2, 0.3], 'tfvect__min_df' : [0.02, 0.03, 0.04], 
              'nb__alpha':[0.01,0.1, 0.001], 'nb__fit_prior': [True]}
g_search1 = GridSearchCV(pipe1, parameters1, scoring = 'accuracy')
g_fit1 = g_search1.fit(trainhd, train["Label"])
best_est1 = g_fit1.best_estimator_
print(best_est1)
best_pred1 = g_fit1.predict(testhd)
print(pd.crosstab(test["Label"],best_pred1, rownames = ["Actual"], colnames = ["Predicted"]))
print(best_est1.score(testhd, test["Label"]))

Pipeline(memory=None,
     steps=[('tfvect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.3, max_features=200000, min_df=0.03,
        ngram_range=(2, 2), norm='l2', preprocessor=None, smooth_idf=T...rue,
        vocabulary=None)), ('nb', MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True))])
Predicted   0    1
Actual            
0          47  139
1          25  167
0.566137566138
