# Stock Prediction Models

## 1. Preprocessing

In [None]:
# Root folder of the extracted datasets.
# NOTE(review): the /content/drive prefix looks like a mounted Google Drive in
# Colab -- confirm the drive is mounted before running the cells below.
DATASET_PATH = (
    "/content/drive/MyDrive/4th Sem/CSE 573 - SWM"
    "/CSE 573 Project/DataSets/Extract"
)
# DATASET_PATH = "/content/drive/MyDrive/4th Sem/CSE 573 - SWM/Test Datasets"

In [None]:
import csv
import pandas as pd
import numpy as np
import re

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Import dataset

In [None]:
# NEWS_GROUP = "NewsGroupTime_240"
# DATASET_NAME = "Amazon240.csv"
# DATASET = f"{DATASET_PATH}/{NEWS_GROUP}/{DATASET_NAME}"
# Combined training file, located under DATASET_PATH/Train_Data.
DATASET = "/".join([DATASET_PATH, "Train_Data", "Para_News240_Combined240.csv"])

In [None]:
df = pd.read_csv(DATASET)

In [None]:
df.head()

Unnamed: 0,Source,DateTime,News,Movement,Stock
0,seekingalpha.com,2017-12-14 11:42:00+00:00,Bearish Calls Lumentum (NASDAQ: LITE ): It's a...,1,Apple
1,seekingalpha.com,2017-12-14 12:31:00+00:00,The areas to be concerned are that there are f...,1,Apple
2,seekingalpha.com,2017-12-15 12:04:00+00:00,Amazon will soon resume selling the Apple TV (...,1,Apple
3,seekingalpha.com,2017-12-15 12:04:00+00:00,Amazon will soon resume selling the Apple TV (...,1,Amazon
4,seekingalpha.com,2017-12-21 12:04:00+00:00,Walmart is experimenting with a cashier-less s...,-1,Amazon


In [None]:
# Work on a new frame without the 'Source' column so `df` itself is left untouched.
data = df.drop(columns=['Source'])
# Binary target: 1 where Movement == 1, 0 for every other Movement value.
data['Label'] = np.where(data['Movement'] == 1, 1, 0)

# Pull the columns used downstream out as plain NumPy arrays.
sentences, labels, times, stocks = (
    data[column].values for column in ('News', 'Label', 'DateTime', 'Stock')
)

In [None]:
# Built once and shared by every preprocess() call: creating a new stemmer and
# re-reading the stop-word corpus for each sentence was loop-invariant work.
_NON_LETTERS = re.compile('[^a-zA-Z]')
_STEMMER = PorterStemmer()
_STOP_WORDS = frozenset(stopwords.words('english'))


def preprocess(sentence):
    """Normalise one news text into a space-separated string of word stems.

    Every character outside a-z / A-Z is replaced by a space, the text is
    lower-cased and split on whitespace, NLTK English stop words are dropped
    and the remaining words are Porter-stemmed.

    Args:
        sentence: Raw news text. Non-string values (for example the NaN that
            pandas produces for an empty CSV field) are treated as empty text
            instead of raising TypeError inside the regex substitution.

    Returns:
        str: The stems joined by single spaces; '' when no word survives.
    """
    if not isinstance(sentence, str):
        return ''
    words = _NON_LETTERS.sub(' ', sentence).lower().split()
    return ' '.join(_STEMMER.stem(word) for word in words if word not in _STOP_WORDS)

In [None]:
len(sentences)

22548

In [None]:
# Clean the corpus in order, announcing progress before each run of 1000 rows.
news = []
for start in range(0, len(sentences), 1000):
    print("Completed", start)
    news.extend(preprocess(text) for text in sentences[start:start + 1000])
# news = sentences

Completed 0
Completed 1000
Completed 2000
Completed 3000
Completed 4000
Completed 5000
Completed 6000
Completed 7000
Completed 8000
Completed 9000
Completed 10000
Completed 11000
Completed 12000
Completed 13000
Completed 14000
Completed 15000
Completed 16000
Completed 17000
Completed 18000
Completed 19000
Completed 20000
Completed 21000
Completed 22000


In [None]:
news[0]

'bearish call lumentum nasdaq lite dicey stock cramer prefer finisar nasdaq fnsr appl nasdaq aapl invest snap nyse snap cramer fan'

## 2. Feature Extraction

In [None]:
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV

In [None]:
# Bigram-only count features, with the vocabulary capped at 12,500 entries.
vectorizer = CountVectorizer(ngram_range=(2, 2), max_features=12500)
# vectorizer = TfidfVectorizer(max_features=5000)
# vectorizer = HashingVectorizer()

In [None]:
# vect = vectorizer.fit(news)
# vectorized = vect.transform(news)

# svd = TruncatedSVD(n_components=5, random_state=42)

# X = svd.fit_transform(vectorized)
# Learn the vocabulary from the cleaned corpus. `f` is the fitted vectorizer and
# is reused further down to encode text that was not part of the training data.
f = vectorizer.fit(news)
X = f.transform(news)

# One row per article: [DateTime, Label, Stock]. The label travels together with
# its timestamp and stock so all three stay aligned through the train/test split;
# NumPy coerces the mixed-type rows to one common dtype, which is why the split
# cell converts the label column back with int().
Y = np.array([list(row) for row in zip(times, labels, stocks)])

In [None]:
# Fixed seed so that the 80/20 split -- and therefore every score reported in
# the evaluation section -- is the same after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)


def _unpack_targets(meta):
    """Split an (n, 3) array of [DateTime, Label, Stock] rows into its parts.

    Returns:
        tuple: (label column kept 2-D with shape (n, 1), list of DateTime
        values, list of Stock values, 1-D integer label array).
    """
    label_column = meta[:, 1:2]
    return (
        label_column,
        meta[:, 0].tolist(),
        meta[:, 2].tolist(),
        label_column[:, 0].astype(int),
    )


y_train_labels, y_train_times, y_train_stocks, y_train = _unpack_targets(y_train)
y_test_labels, y_test_times, y_test_stocks, y_test = _unpack_targets(y_test)

## 3. Training

In [None]:
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, cross_val_score
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
import xgboost as xgb

In [None]:
# Settings intended for the xgb.XGBClassifier(**param) option in the model cell.
param = {
    'max_depth': 10,
    'eta': 1,
    'objective': 'binary:logistic',
    'nthread': 4,
    'eval_metric': 'auc',
    'reg_lambda': 1,
}
# param['n_estimators'] = 50

In [None]:
# clf = xgb.XGBClassifier(**param)
# clf = DecisionTreeClassifier()
# clf = SVC(gamma='auto')
# clf = MultinomialNB()
# Seeded so that the forest's random choices -- and with them the reported
# scores -- are identical on every run of the notebook.
clf = RandomForestClassifier(n_estimators=10, random_state=42)
# clf = AdaBoostClassifier(n_estimators=100)
# clf = LogisticRegression()
# clf = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5, 2))

In [None]:
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

## 4. Evaluation

In [None]:
from sklearn.metrics import confusion_matrix , accuracy_score, f1_score, roc_auc_score, classification_report

In [None]:
y_pred = clf.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred)

0.6068736141906874

In [None]:
confusion_matrix(y_test, y_pred)

array([[1662,  735],
       [1038, 1075]])

In [None]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.62      0.69      0.65      2397
           1       0.59      0.51      0.55      2113

    accuracy                           0.61      4510
   macro avg       0.60      0.60      0.60      4510
weighted avg       0.61      0.61      0.60      4510



## 5. Save predictions

In [None]:
# newline="" is what the csv module requires for files it writes: without it the
# writer's own "\r\n" row terminator is translated a second time on Windows and
# every data row is followed by a blank line.
# NOTE(review): the file name says "News15 ... DT_TFIDF" while this notebook
# trains on the Combined240 file with a count vectorizer and a random forest --
# confirm the intended output name.
with open("Para_News15_Combined_15_Predictions_DT_TFIDF.csv", "w", newline="") as csvfile:
    writer = csv.DictWriter(csvfile, ['DateTime', 'Stock', 'PredictedMovement', 'ActualMovement'])
    writer.writeheader()

    # One row per test sample; class 0 is written as -1 and any other class as 1.
    writer.writerows(
        {
            "DateTime": when,
            "Stock": stock,
            "PredictedMovement": -1 if predicted == 0 else 1,
            "ActualMovement": -1 if actual == 0 else 1,
        }
        for predicted, actual, when, stock in zip(y_pred, y_test, y_test_times, y_test_stocks)
    )

## 6. Custom test

In [None]:
# Separate evaluation file, prepared the same way as the training data.
# NOTE: this rebinds df, data, sentences and labels, so the training-stage
# values of those names are no longer available after this cell runs.
CUSTOM_DATA = "Apple60_Major.csv"
df = pd.read_csv(CUSTOM_DATA)
data = df.drop(columns=['Source'])
# Binary target: 1 where Movement == 1, 0 for every other Movement value.
data['Label'] = np.where(data['Movement'] == 1, 1, 0)
sentences, labels = data['News'].values, data['Label'].values

In [None]:
# Clean the custom texts, then encode them with the vectorizer fitted on the
# training corpus (`f`) so the feature columns match what `clf` was trained on.
custom_docs = list(map(preprocess, sentences))
x_test = f.transform(custom_docs)
# Rebinds y_test: from here on it holds the custom file's labels.
y_test = np.array(labels)
x_test.shape, y_test.shape

In [None]:
y_pred = clf.predict(x_test)

In [None]:
accuracy_score(y_test, y_pred)

In [None]:
sample = """
Starting next year, all movies released by Sony Pictures will stream exclusively on Netflix after their theatrical and home entertainment releases.

The deal, which was announced Thursday, includes films such as “Morbius,” “Uncharted” and “Bullet Train.”

Netflix will also be able to place films such as “Spider-Man: Into the Spider-Verse” and future Venom and other Spider-Man films on its platform. Franchises such as Jumanji and Bad Boys are also included in the deal. Netflix will also have access to other titles from Sony’s movie library.

As part of this partnership, Sony will offer Netflix a first look at any films it plans on taking directly to streaming or decides to later license to streaming. Netflix has committed to make a number of these films over the course of the deal.

The length of the deal and its financials were not disclosed by the two companies.
"""

# sample = """
# Fears of weakness at Apple Inc. proved true Wednesday.

# The tech giant, which became the only public U.S. company to reach a $1 trillion valuation last year before a fourth-quarter collapse for its shares, confirmed the fears that led to the stock decline by lowering its forecast Wednesday afternoon. In a letter to shareholders, Chief Executive Tim Cook said that Apple will report much lower sales than previously expected, largely due to slowing iPhone sales and pressure in China.

# “While we anticipated some challenges in key emerging markets, we did not foresee the magnitude of the economic deceleration, particularly in Greater China,” Cook wrote. “In fact, most of our revenue shortfall to our guidance, and over 100% of our year-over-year world-wide revenue decline, occurred in Greater China across iPhone, Mac and iPad.”

# Apple’s AAPL, +2.02%  stock was halted in after-hours trading ahead of the announcement, then fell 7.6% in extended trading on volume of more than 6 million shares, the highest after-hours volume for an S&P 500 index stock Wednesday. The stock has dropped 31.1% in the past three months, as the S&P 500 SPX, +0.77%  has declined 14.3%.

# Other tech stocks fell in late trading after the Apple news hit. Apple suppliers were especially targeted, with Skyworks Solutions Inc. SWKS, -0.49%  and Qorvo Inc. QRVO, -0.86%  more than 5% and Broadcom Inc. AVGO, -0.08%  dropping 4.7%. Other members of the so-called “FAANG” grouping of tech stocks also declined: Amazon.com Inc. AMZN, +2.21%  dropped 2.8%, Facebook Inc. FB, -0.18%  was down 1.6%, Alphabet Inc. GOOG, +0.90%   GOOGL, +0.90%  declined 2.1%, and Netflix Inc. NFLX, +0.13%   declined 2.5%. Microsoft Corp. MSFT, +1.03%  , which took the title as most valuable public company in the U.S. from Apple late last year, fell 2.1%, while PC manufacturer HP Inc. HPQ, +1.85%  dropped 4.5%. Chip makers Nvidia Corp. NVDA, +0.58%   and Micron Technology Inc. MU, +0.01%  saw shares decline more than 3%. The selloff affected U.S. stock futures as well, with Dow, S&P 500 and Nasdaq futures all sinking more than 1% late Wednesday.

# Cook said that Apple now expects fiscal first-quarter revenue of about $84 billion, after previously stating expectations for sales of $89 billion to $93 billion. Apple shares have been pressured since the company originally gave its revenue forecast for the holiday season, as suppliers have reined in forecasts, causing doubts about the company’s iPhone sales.

# “Lower than anticipated iPhone revenue, primarily in Greater China, accounts for all of our revenue shortfall to our guidance and for much more than our entire year-over-year revenue decline,” Cook said Wednesday in his letter.

# China is not the only issue with iPhone sales, however, as Cook admitted later in his letter.

# “While macroeconomic challenges in some markets were a key contributor to this trend, we believe there are other factors broadly impacting our iPhone performance, including consumers adapting to a world with fewer carrier subsidies, U.S. dollar strength-related price increases, and some customers taking advantage of significantly reduced pricing for iPhone battery replacements.”
# """

# Last assignment to `sample` in this cell, so this is the text that the
# following cells preprocess and classify; it overrides the earlier value.
sample = """
Romeo Power, Inc. ("Romeo Power") (NYSE: RMO), an energy technology leader delivering large-scale electrification solutions for complex commercial applications, announced today a long-term supply agreement with PACCAR (Nasdaq: PCAR), a global technology leader in the design, manufacture and customer support of high-quality light-, medium- and heavy-duty trucks under the Kenworth, Peterbilt and DAF nameplates, to provide battery packs, modules and battery management systems (BMS) for PACCAR’s battery electric vehicles (BEVs).
"""

In [None]:
sample_news = preprocess(sample)
sample_news

'paccar commit industri lead qualiti innov said darrin siver paccar senior vice presid romeo power batteri technolog solut enabl paccar deliv state art transport solut enhanc custom oper environment impact pleas enter long term suppli agreement romeo power pave way cost effect electrif within commerci vehicl sector'

In [None]:
x_test = f.transform([sample_news])

In [None]:
clf.predict(x_test)

array([0])