In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import train_test_split

import re
import string
import nltk
from nltk.stem import WordNetLemmatizer

In [2]:
#input_data = pd.read_csv(r'C:\Users\Patrick\Documents\GitHub\bootcamp_capstone\kaggle_dataset\sentiment_analysis_financial_news\all-data.csv'
#                , encoding = "ISO-8859-1", header=None, names=['sentiment', 'text'])

input_data = pd.read_csv(r'C:\Users\Patrick\Documents\GitHub\bootcamp_capstone\kaggle_dataset\stock-market_sentiment\stock_data.csv',
                        encoding="ISO-8859-1", header=1, names=['text', 'sentiment'] )

input_data.head(100)

Unnamed: 0,text,sentiment
0,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
1,user I'd be afraid to short AMZN - they are lo...,1
2,MNTA Over 12.00,1
3,OI Over 21.37,1
4,PGNX Over 3.04,1
...,...,...
95,NG nhod - what do you see? check the weekly - ...,1
96,AIG American International Group Option Trader...,-1
97,P out balance +.32,1
98,VNG buys vs. Sells?,1


In [3]:
#install nltk package
import sys
!{sys.executable} -m pip install nltk



In [4]:
#download the necessary data 
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Patrick\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
#Data Cleaning
#first, remove stopwords
stopwords = nltk.corpus.stopwords.words('english')
lemmatizer = WordNetLemmatizer()


In [6]:
#data cleaning preprocessing
pattern = r'[^a-zA-Z0-9\s\%]'
cleaned_buffer = []
for x in input_data['text']:
    temp = re.sub(pattern, " ", x)
    temp = temp.lower()
    temp = temp.split()
    temp = [lemmatizer.lemmatize(word) for word in temp if not word in set(stopwords)]
    temp = ' '.join(temp)
    cleaned_buffer.append(temp)



In [7]:
cleaned_buffer

['user aap movie 55% return fea geed indicator 15 trade year awesome',
 'user afraid short amzn looking like near monopoly ebooks infrastructure service',
 'mnta 12 00',
 'oi 21 37',
 'pgnx 3 04',
 'aap user current downtrend break otherwise short term correction med term downtrend',
 'monday relative weakness nyx win tie tap ice int bmc aon c chk biib',
 'goog ower trend line channel test volume support',
 'aap watch tomorrow ong entry',
 'assuming fcx open tomorrow 34 25 trigger buy still much like setup',
 'really worry everyone expects market rally usually exact opposite happens every time shall see soon bac spx jpm',
 'aap gamco arry haverty apple extremely cheap great video',
 'user maykiljil posted agree msft going higher possibly north 30',
 'momentum coming back etfc broke ma200 resistance solid volume friday ong set',
 'ha hitting 35 65 mean resume targeting 42 level',
 'user gameplan shot today liked trend break may c h break oc weekly trend break back july 2011',
 'fcx gapp

In [8]:
input_data['cleaned'] = cleaned_buffer
input_data.head()

Unnamed: 0,text,sentiment,cleaned
0,user: AAP MOVIE. 55% return for the FEA/GEED i...,1,user aap movie 55% return fea geed indicator 1...
1,user I'd be afraid to short AMZN - they are lo...,1,user afraid short amzn looking like near monop...
2,MNTA Over 12.00,1,mnta 12 00
3,OI Over 21.37,1,oi 21 37
4,PGNX Over 3.04,1,pgnx 3 04


In [9]:
#split into training and test data sets
xtrain, xtest, ytrain, ytest = train_test_split( input_data['cleaned'], input_data['sentiment'],
                                                               test_size=.4, random_state=10)
xtrain


582                  msft take 26 29 gap fill jan 2012 26
3524    need liste little devil shoulder often 1000 mt...
134     aap top april aston kutcher star job kinda lik...
2621    goog market leader heading next century mark 8...
997     eversal confirmed closing nicely 20 day 9 47 a...
                              ...                        
1180                         aap da 2 % gef llt mir nicht
3441    new blog post followup yesterday watchlist ots...
1344    gtat holding 50sma ooks like stock could try t...
4623                  gw corning set emerge 18 month base
1289    user h broke support today may drop channel to...
Name: cleaned, Length: 3474, dtype: object

In [10]:
tfidf = TfidfVectorizer(ngram_range=(1,3))
xtrain_tf = tfidf.fit_transform(xtrain)
print("nsamples: %d, nfeatures: %d" % xtrain_tf.shape)

xtest_tf = tfidf.transform(xtest)
print("nsamples: %d, nfeatures: %d" % xtest_tf.shape)

nsamples: 3474, nfeatures: 57630
nsamples: 2316, nfeatures: 57630


In [11]:
print(xtest_tf)

  (0, 42903)	0.4415425405754121
  (0, 42881)	0.2667092374907307
  (0, 8352)	0.42038109765368015
  (0, 8348)	0.2839049348989414
  (0, 8306)	0.32686820349996853
  (0, 8216)	0.3691910893434322
  (0, 3523)	0.21834102665199873
  (0, 3395)	0.33998443920835736
  (0, 663)	0.2649093853408777
  (1, 45368)	0.510097570366633
  (1, 25696)	0.510097570366633
  (1, 25695)	0.510097570366633
  (1, 14855)	0.3210467802934407
  (1, 12960)	0.3410723837858898
  (2, 31576)	0.822655797157474
  (2, 31525)	0.4690720683613014
  (2, 5270)	0.32126131744492964
  (3, 53636)	0.278052157054106
  (3, 25370)	0.6107922209596574
  (3, 19603)	0.7019442429341386
  (3, 5270)	0.23852492654719917
  (4, 54614)	0.14534575715856426
  (4, 31856)	0.20963214640606526
  (4, 31775)	0.15848079748007413
  (4, 30806)	0.30651812917821897
  :	:
  (2313, 47638)	0.15337363921567604
  (2313, 44688)	0.26061093268006646
  (2313, 44557)	0.13860070835600663
  (2313, 43182)	0.19655511431514644
  (2313, 41719)	0.19292688597246394
  (2313, 35112)	0.1

In [12]:
#bayes classification
nb_classify = MultinomialNB()
nb_classify.fit(xtrain_tf, ytrain)

MultinomialNB()

In [13]:
predictions = nb_classify.predict(xtest_tf)

In [14]:
results = metrics.classification_report(ytest, predictions)
print(results) #naive bays

              precision    recall  f1-score   support

          -1       0.85      0.19      0.31       826
           1       0.69      0.98      0.81      1490

    accuracy                           0.70      2316
   macro avg       0.77      0.59      0.56      2316
weighted avg       0.74      0.70      0.63      2316



In [15]:
print(metrics.confusion_matrix(ytest, predictions))

[[ 158  668]
 [  28 1462]]


In [16]:
#Now do with random forest

In [18]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [19]:
randforest = RandomForestClassifier()
scores = cross_val_score(randforest, xtrain_tf, ytrain.values.ravel(), cv=5)
print(scores)

[0.75971223 0.74100719 0.74964029 0.73381295 0.71613833]


In [20]:
#this is hyperparameter tuning, but with very few params
params = { 'n_estimators' : [5, 10, 25, 50, 100], 'max_depth' : [2, 5, 10, 20, None]}

grid_search = GridSearchCV (randforest, params)
grid_search.fit(xtrain_tf, ytrain.values.ravel())

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'max_depth': [2, 5, 10, 20, None],
                         'n_estimators': [5, 10, 25, 50, 100]})

In [46]:
all_means = grid_search.cv_results_['mean_test_score']
all_std_dev = grid_search.cv_results_['std_test_score']
all_params = grid_search.cv_results_['params']
#print(all_params[1]['max_depth'])
for x in range(0, len(all_means)):
    print(all_params[x], "\t", all_means[x], "\t", all_std_dev[x])

{'max_depth': 2, 'n_estimators': 5} 	 0.632124064437211 	 0.0008149416010404049
{'max_depth': 2, 'n_estimators': 10} 	 0.6324126635291191 	 0.0009721290872219675
{'max_depth': 2, 'n_estimators': 25} 	 0.6315485248688658 	 0.00021230278025421434
{'max_depth': 2, 'n_estimators': 50} 	 0.6315485248688658 	 0.00021230278025421434
{'max_depth': 2, 'n_estimators': 100} 	 0.6315485248688658 	 0.00021230278025421434
{'max_depth': 5, 'n_estimators': 5} 	 0.6352895320631102 	 0.004131627117809366
{'max_depth': 5, 'n_estimators': 10} 	 0.6335629133580744 	 0.0022605693693205585
{'max_depth': 5, 'n_estimators': 25} 	 0.6318362946530384 	 0.0006613717490850005
{'max_depth': 5, 'n_estimators': 50} 	 0.6318362946530384 	 0.0006613717490850005
{'max_depth': 5, 'n_estimators': 100} 	 0.6315485248688658 	 0.00021230278025421434
{'max_depth': 10, 'n_estimators': 5} 	 0.6358663155930587 	 0.002416779609735369
{'max_depth': 10, 'n_estimators': 10} 	 0.6387473306657269 	 0.005300879968914325
{'max_depth': 1

In [22]:
grid_search.best_estimator_

RandomForestClassifier(n_estimators=50)

In [23]:
#doing this again but with arbitrarily picked, because why not?
forest1 = RandomForestClassifier(n_estimators=50, max_depth=5)
forest2 = RandomForestClassifier(n_estimators=25, max_depth=20)
forest3 = RandomForestClassifier(n_estimators=100, max_depth=None)

forest1.fit(xtrain_tf, ytrain.values.ravel())
forest2.fit(xtrain_tf, ytrain.values.ravel())
forest3.fit(xtrain_tf, ytrain.values.ravel())

RandomForestClassifier()

In [24]:
for x in [forest1, forest2, forest3]:
    predictions = x.predict(xtest_tf)
    results = metrics.classification_report(ytest, predictions)
    print(results)
    print("+++++++++++++++++++++++++++++++++++")

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00       826
           1       0.64      1.00      0.78      1490

    accuracy                           0.64      2316
   macro avg       0.32      0.50      0.39      2316
weighted avg       0.41      0.64      0.50      2316

+++++++++++++++++++++++++++++++++++
              precision    recall  f1-score   support

          -1       0.85      0.05      0.09       826
           1       0.65      1.00      0.79      1490

    accuracy                           0.66      2316
   macro avg       0.75      0.52      0.44      2316
weighted avg       0.72      0.66      0.54      2316

+++++++++++++++++++++++++++++++++++
              precision    recall  f1-score   support

          -1       0.66      0.58      0.62       826
           1       0.78      0.83      0.81      1490

    accuracy                           0.74      2316
   macro avg       0.72      0.71      0.71      2316
we

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [36]:
#now with official hyperparameter tuning
#creating the grid
nestimators = [int(x) for x in np.linspace(start=5, stop=300, num = 5)]
max_features = ['auto', 'sqrt']

max_depth = [int(x) for x in np.linspace(10, 100, num=5)]
max_depth.append(None)

min_samples_split = [1, 2, 5, 10, 20, 25, 50, 100]
min_samples_leaf = [1,2,3,4]
bootstrap = [True, False]

grid = {'n_estimators': nestimators, 'max_features':max_features, 'max_depth':max_depth,
       'min_samples_split':min_samples_split, 'min_samples_leaf':min_samples_leaf, 'bootstrap':bootstrap}


In [37]:
#if you can avoid running this again, that'd be nice. Took at least 5 hours with iter=1000/cv=5
rf_regress = RandomForestRegressor()
rf_rand = RandomizedSearchCV(estimator=rf_regress, param_distributions=grid, n_iter=1000, 
                            cv=5, verbose=2, random_state=10,n_jobs=-1)
rf_rand.fit(xtrain_tf, ytrain)

Fitting 5 folds for each of 1000 candidates, totalling 5000 fits


615 fits failed out of a total of 5000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
290 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Patrick\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Patrick\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 450, in fit
    trees = Parallel(
  File "C:\Users\Patrick\anaconda3\lib\site-packages\joblib\parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "C:\Users\Patrick\anaconda3\lib\site-packages\joblib\parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\Users\

RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_iter=1000,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 32, 55, 77, 100,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 3, 4],
                                        'min_samples_split': [1, 2, 5, 10, 20,
                                                              25, 50, 100],
                                        'n_estimators': [5, 78, 152, 226, 300]},
                   random_state=10, verbose=2)

In [38]:
rf_rand.best_estimator_

RandomForestRegressor(bootstrap=False, max_features='sqrt', min_samples_split=5,
                      n_estimators=78)

In [41]:
forest4 = RandomForestClassifier(bootstrap=False, max_features='sqrt', min_samples_split=5, n_estimators=78)
forest4.fit(xtrain_tf, ytrain)
forest4_predict = forest4.predict(xtest_tf)
results4 = metrics.classification_report(ytest, forest4_predict)

In [43]:
print(results4)


              precision    recall  f1-score   support

          -1       0.63      0.67      0.65       826
           1       0.81      0.78      0.80      1490

    accuracy                           0.74      2316
   macro avg       0.72      0.73      0.72      2316
weighted avg       0.75      0.74      0.75      2316



In [45]:
#turn into pickle file to load to the interwebs
#https://towardsdatascience.com/3-ways-to-deploy-machine-learning-models-in-production-cdba15b00e
#https://ianlondon.github.io/blog/pickling-basics/


#look at streamlit.io

#also, look into additional vectorizer methods.



import pickle

filename = "my_saved_model"
pickle.dump(forest4, open(filename, 'wb+') )

filename = "my_saved_tfidf"
pickle.dump(tfidf, open(filename, 'wb+'))